# Experiment 002: Baseline Without Leakage

**Goal**: Establish true baseline performance by removing the data leakage feature (giver_username_if_known).

**Changes from exp_000:**
- Remove giver_username_if_known feature (confirmed leakage)
- Report both AUC and log loss metrics
- Keep all other features the same for fair comparison

In [None]:
import pandas as pd
import numpy as np
import json
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so NumPy-based randomness in this run is repeatable
np.random.seed(42)

# Load data
print("Loading data...")
train_path = "/home/data/train.json"
test_path = "/home/data/test.json"

# JSON text is UTF-8; decode it explicitly so non-ASCII characters in the
# request text do not depend on the platform's default locale encoding.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# One row per JSON record
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Target distribution: {train_df['requester_received_pizza'].value_counts(normalize=True)}")

def extract_features(df, is_train=True):
    """Build the model feature table from request-time columns only.

    Only columns intended to exist in both the train and test files are read,
    so the same call works for either frame.

    Args:
        df: Raw request records. Must contain the ``requester_*_at_request``
            columns and ``request_text_edit_aware`` read below.
        is_train: Accepted for call-site symmetry; not used by this function.

    Returns:
        A ``(features, categorical_features)`` tuple: the feature DataFrame and
        the names of the columns in it that still hold raw, unencoded values.
    """
    out = pd.DataFrame()

    # --- Requester history at request time (copied through unchanged) ---
    vote_and_activity = (
        ('upvotes_at_request', 'requester_upvotes_plus_downvotes_at_request'),
        ('upvotes_minus_downvotes', 'requester_upvotes_minus_downvotes_at_request'),
        ('num_comments_at_request', 'requester_number_of_comments_at_request'),
        ('num_posts_at_request', 'requester_number_of_posts_at_request'),
        ('num_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_request'),
        ('num_posts_in_raop_at_request', 'requester_number_of_posts_on_raop_at_request'),
    )
    for feature_name, source_column in vote_and_activity:
        out[feature_name] = df[source_column]

    # Comments per post; the +1 in the denominator avoids division by zero
    out['comments_per_post'] = out['num_comments_at_request'] / (out['num_posts_at_request'] + 1)
    out['raop_comments_per_post'] = (
        out['num_comments_in_raop_at_request'] / (out['num_posts_in_raop_at_request'] + 1)
    )

    account_profile = (
        ('account_age_at_request', 'requester_account_age_in_days_at_request'),
        ('days_since_first_raop_post', 'requester_days_since_first_post_on_raop_at_request'),
        ('num_subreddits', 'requester_number_of_subreddits_at_request'),
    )
    for feature_name, source_column in account_profile:
        out[feature_name] = df[source_column]

    # --- Text statistics from the request body ---
    body = df['request_text_edit_aware'].fillna('')

    out['text_length'] = body.str.len()
    out['word_count'] = body.str.split().str.len()
    # Runs of terminal punctuation, plus one
    out['sentence_count'] = body.str.count(r'[.!?]+') + 1
    out['avg_word_length'] = out['text_length'] / (out['word_count'] + 1)

    out['question_marks'] = body.str.count(r'\?')
    out['exclamation_marks'] = body.str.count(r'!')
    # NOTE(review): this is the period count plus one, same offset as sentence_count
    out['periods'] = body.str.count(r'\.') + 1

    # --- Keyword indicators (substring match on lowercased text) ---
    body_lower = body.str.lower()
    keyword_flags = (
        ('has_please', 'please'),
        ('has_thank', 'thank'),
        ('has_thanks', 'thanks'),
        ('has_sorry', 'sorry'),
        ('has_family', 'family'),
        ('has_kids', 'kid'),
        ('has_work', 'work'),
        ('has_money', 'money'),
        ('has_pay', 'pay'),
        ('has_hungry', 'hungry'),
        ('has_food', 'food'),
        ('has_help', 'help'),
        ('has_emergency', 'emergency'),
    )
    for flag_name, needle in keyword_flags:
        out[flag_name] = body_lower.str.contains(needle).astype(int)

    # --- Raw categorical columns (encoded later by the caller) ---
    # giver_username_if_known is intentionally left out: it leaks the target.
    categorical_features = []
    for column in ('request_title', 'requester_subreddits_at_request', 'requester_username'):
        if column not in df.columns:
            continue
        values = df[column]
        if column == 'requester_subreddits_at_request':
            values = values.fillna('N/A')
        out[column] = values
        categorical_features.append(column)

    return out, categorical_features

In [None]:
def extract_features(df, is_train=True):
    """Extract leak-free features using request-time information only.

    Retrieval-time columns (``*_at_retrieval``, ``post_was_edited``) are
    deliberately not read. This experiment is the no-leakage baseline, and
    only request-time columns are expected to exist in both the train and
    test files (TODO confirm against the test file schema).

    Args:
        df: Raw request records. Must contain the ``requester_*_at_request``
            columns and ``request_text_edit_aware`` read below.
        is_train: Accepted for call-site compatibility; not used.

    Returns:
        A ``(features, categorical_features)`` tuple: the feature DataFrame and
        the names of the columns in it that still need categorical encoding.
    """
    features = pd.DataFrame()

    # === TABULAR FEATURES (request-time columns only) ===

    # Requester vote totals at request time
    features['upvotes_at_request'] = df['requester_upvotes_plus_downvotes_at_request']
    features['upvotes_minus_downvotes'] = df['requester_upvotes_minus_downvotes_at_request']

    # Requester activity at request time
    features['num_comments_at_request'] = df['requester_number_of_comments_at_request']
    features['num_posts_at_request'] = df['requester_number_of_posts_at_request']
    features['num_comments_in_raop_at_request'] = df['requester_number_of_comments_in_raop_at_request']
    features['num_posts_in_raop_at_request'] = df['requester_number_of_posts_on_raop_at_request']

    # Activity ratios; the +1 in the denominator avoids division by zero
    features['comments_per_post'] = (
        features['num_comments_at_request'] / (features['num_posts_at_request'] + 1)
    )
    features['raop_comments_per_post'] = (
        features['num_comments_in_raop_at_request']
        / (features['num_posts_in_raop_at_request'] + 1)
    )

    # Account age in days at request time
    features['account_age_at_request'] = df['requester_account_age_in_days_at_request']

    # Days since the requester's first RAOP post
    features['days_since_first_raop_post'] = df['requester_days_since_first_post_on_raop_at_request']

    # Number of subreddits the requester was active in
    features['num_subreddits'] = df['requester_number_of_subreddits_at_request']

    # === TEXT FEATURES (from request_text_edit_aware) ===

    text_data = df['request_text_edit_aware'].fillna('')

    # Basic text stats
    features['text_length'] = text_data.str.len()
    features['word_count'] = text_data.str.split().str.len()
    features['sentence_count'] = text_data.str.count(r'[.!?]+') + 1
    features['avg_word_length'] = features['text_length'] / (features['word_count'] + 1)

    # Punctuation counts
    features['question_marks'] = text_data.str.count(r'\?')
    features['exclamation_marks'] = text_data.str.count(r'!')
    # NOTE(review): period count plus one, same offset as sentence_count
    features['periods'] = text_data.str.count(r'\.') + 1

    # Politeness and topic keywords (substring match on lowercased text)
    text_lower = text_data.str.lower()
    keywords = {
        'has_please': 'please',
        'has_thank': 'thank',
        'has_thanks': 'thanks',
        'has_sorry': 'sorry',
        'has_family': 'family',
        'has_kids': 'kid',
        'has_work': 'work',
        'has_money': 'money',
        'has_pay': 'pay',
        'has_hungry': 'hungry',
        'has_food': 'food',
        'has_help': 'help',
        'has_emergency': 'emergency',
    }
    for feature_name, keyword in keywords.items():
        features[feature_name] = text_lower.str.contains(keyword).astype(int)

    # === CATEGORICAL FEATURES (encoded later with LabelEncoder) ===

    # CRITICAL: EXCLUDE giver_username_if_known due to leakage
    categorical_features = []

    if 'request_title' in df.columns:
        features['request_title'] = df['request_title']
        categorical_features.append('request_title')

    if 'requester_subreddits_at_request' in df.columns:
        features['requester_subreddits_at_request'] = df['requester_subreddits_at_request'].fillna('N/A')
        categorical_features.append('requester_subreddits_at_request')

    if 'requester_username' in df.columns:
        features['requester_username'] = df['requester_username']
        categorical_features.append('requester_username')

    return features, categorical_features

# Build the feature tables for both splits with the same extractor.
# The categorical column list is taken from the training call only.
print("Extracting features...")
train_features, categorical_cols = extract_features(train_df)
test_features, _ = extract_features(test_df, is_train=False)

# Report what was produced
print("Train features shape:", train_features.shape)
print("Test features shape:", test_features.shape)
print("Categorical columns:", categorical_cols)

## Handle Missing Values and Encode Categorical Features

In [None]:
# Replace every missing value with 0 in both feature tables
print("Handling missing values...")
train_features, test_features = train_features.fillna(0), test_features.fillna(0)

# Integer-encode each categorical column that made it into the training table
print("Encoding categorical features...")
encoders = {}

for col in categorical_cols:
    if col not in train_features.columns:
        continue

    # Fit on train and test values together so every test value has a code
    combined = pd.concat([train_features[col], test_features[col]], axis=0).astype(str)
    encoder = LabelEncoder().fit(combined)

    train_features[col] = encoder.transform(train_features[col].astype(str))
    test_features[col] = encoder.transform(test_features[col].astype(str))

    # Keep the fitted encoder so codes can be mapped back later
    encoders[col] = encoder
    print(f"  Encoded {col}: {len(encoder.classes_)} classes")

print(f"Final train shape: {train_features.shape}")
print(f"Final test shape: {test_features.shape}")

## Cross-Validation Setup

Use 5-fold stratified CV so that every fold preserves the class ratio of the imbalanced target.

In [None]:
# Stratified splitter with a fixed shuffle seed so folds are reproducible
n_folds = 5
cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Model inputs: a private copy of the encoded features and the target array
X = train_features.copy()
y = train_df['requester_received_pizza'].values

# Per-fold accumulators filled in by the training loop
fold_predictions = []
fold_scores = dict(auc=[], logloss=[])
feature_importance_list = []

print(f"Starting {n_folds}-fold stratified CV...")
print(f"Class distribution: {np.bincount(y)}")

## Train LightGBM Model with CV

In [None]:
# LightGBM hyperparameters shared by every fold
params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    num_threads=4,
)

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    print(f"\n=== Fold {fold}/{n_folds} ===")

    # Slice this fold's training and validation rows
    X_train, y_train = X.iloc[train_idx], y[train_idx]
    X_val, y_val = X.iloc[val_idx], y[val_idx]

    # The validation Dataset references the training one
    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    # Up to 1000 rounds, early stopping with patience 50, per-round logging off
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[val_set],
        valid_names=['val'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
    )

    # Score the held-out rows at the best iteration found by early stopping
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_val, val_pred)
    logloss = log_loss(y_val, val_pred)

    print(f"Fold {fold} AUC: {auc:.4f}")
    print(f"Fold {fold} Log Loss: {logloss:.4f}")

    fold_scores['auc'].append(auc)
    fold_scores['logloss'].append(logloss)
    fold_predictions.append(val_pred)

    # Gain-based importance for this fold, most important feature first
    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importance(importance_type='gain'),
    })
    importance = importance.sort_values('importance', ascending=False)
    feature_importance_list.append(importance)

## Evaluate Results

In [None]:
# Aggregate the per-fold metrics into mean and standard deviation
print("\n=== CROSS-VALIDATION RESULTS ===")
mean_auc, std_auc = np.mean(fold_scores['auc']), np.std(fold_scores['auc'])
mean_logloss, std_logloss = np.mean(fold_scores['logloss']), np.std(fold_scores['logloss'])

print(f"AUC: {mean_auc:.4f} ± {std_auc:.4f}")
print(f"Log Loss: {mean_logloss:.4f} ± {std_logloss:.4f}")

# Compare against the hard-coded AUC quoted for exp_000
print("\nComparison to exp_000 (with leakage):")
print("  Previous AUC: 0.7850 (estimated)")
print(f"  Current AUC: {mean_auc:.4f}")
print(f"  Drop: {0.7850 - mean_auc:.4f}")

# Average each feature's gain importance across folds, largest first
print("\n=== FEATURE IMPORTANCE ANALYSIS ===")
all_fold_importance = pd.concat(feature_importance_list)
avg_importance = all_fold_importance.groupby('feature')['importance'].mean()
avg_importance = avg_importance.sort_values(ascending=False)
print("Top 10 features:")
print(avg_importance.head(10))

# Heuristic leakage check: flag a top feature with more than twice the
# importance of the runner-up
top_feature, top_importance = avg_importance.index[0], avg_importance.iloc[0]
second_importance = avg_importance.iloc[1]

print("\nLeakage check:")
print(f"Top feature: {top_feature} (importance: {top_importance:.1f})")
print(f"Second feature: {avg_importance.index[1]} (importance: {second_importance:.1f})")
print(f"Ratio: {top_importance/second_importance:.2f}x")

print(
    "⚠️  WARNING: Top feature has >2x importance - potential leakage!"
    if top_importance > 2 * second_importance
    else "✓ No obvious leakage detected"
)

## Generate Predictions for Test Set

In [None]:
# Train final model on full training data
print("\n=== TRAINING FINAL MODEL ===")

# The per-fold models were early-stopped (patience 50) on validation AUC, so
# the reported CV scores describe early-stopped models. A fixed 1000-round fit
# would not match what was evaluated. Re-run the same folds with the same
# stopping rule to choose the round count for the full-data fit.
cv_history = lgb.cv(
    params,
    lgb.Dataset(X, label=y),
    num_boost_round=1000,
    folds=cv,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
)
# lgb.cv returns one list per metric statistic; with early stopping these are
# expected to end at the best iteration, so their length is the round count.
# NOTE(review): confirm this truncation against the installed LightGBM version.
final_num_rounds = max(1, len(next(iter(cv_history.values()))))
print(f"Final model boosting rounds (from CV early stopping): {final_num_rounds}")

final_model = lgb.train(
    params,
    lgb.Dataset(X, label=y),
    num_boost_round=final_num_rounds,
    callbacks=[lgb.log_evaluation(0)]
)

# Predict on test set
test_pred = final_model.predict(test_features)

# Create submission: one predicted probability per test request
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_pred
})

print(f"Submission shape: {submission.shape}")
print(f"Prediction range: [{test_pred.min():.4f}, {test_pred.max():.4f}]")
print(f"Prediction mean: {test_pred.mean():.4f}")

# Save submission
submission_path = "/home/submission/submission_002.csv"
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")