# Honest Baseline: TF-IDF + LightGBM (NO LEAKAGE)

**CRITICAL**: This experiment EXCLUDES all leakage features identified in evolver_loop1_analysis:
- ❌ requester_user_flair (post-outcome reward badges)
- ❌ giver_username_if_known (only known after pizza given)
- ❌ All _at_retrieval features (collected after outcome)

**SAFE features used:**
- ✅ Text features (request_title, request_text/request_text_edit_aware)
- ✅ Text length and meta-features
- ✅ User activity at request time (comments, posts, upvotes)
- ✅ Account age and temporal features
- ✅ Cleaned post_was_edited boolean

This will give us an HONEST baseline to improve upon.

In [13]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Load Data

In [14]:
# Load training data
train_path = '/home/data/train.json'
with open(train_path, 'r') as f:
    train_data = json.load(f)
train_df = pd.DataFrame(train_data)

# Load test data
test_path = '/home/data/test.json'
with open(test_path, 'r') as f:
    test_data = json.load(f)
test_df = pd.DataFrame(test_data)

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Target distribution:")
print(train_df['requester_received_pizza'].value_counts(normalize=True))

Training samples: 2878
Test samples: 1162
Target distribution:
requester_received_pizza
False    0.751564
True     0.248436
Name: proportion, dtype: float64


## Feature Engineering (SAFE FEATURES ONLY)

In [15]:
# Text preprocessing and feature engineering - SAFE FEATURES ONLY
def engineer_features_safe(df, is_train=True):
    """Engineer features from the raw data - EXCLUDING ALL LEAKAGE"""
    features = pd.DataFrame(index=df.index)
    
    # 1. Text features - combine title and text for full context
    # Use request_text if available (train), otherwise use request_text_edit_aware (test)
    if 'request_text' in df.columns:
        text_col = 'request_text'
    else:
        text_col = 'request_text_edit_aware'
    
    features['full_text'] = df['request_title'].fillna('') + ' ' + df[text_col].fillna('')
    
    # 2. Text length features
    features['text_length'] = df[text_col].fillna('').str.len()
    features['title_length'] = df['request_title'].fillna('').str.len()
    features['total_text_length'] = features['text_length'] + features['title_length']
    features['text_word_count'] = df[text_col].fillna('').str.split().str.len()
    features['title_word_count'] = df['request_title'].fillna('').str.split().str.len()
    features['total_word_count'] = features['text_word_count'] + features['title_word_count']
    
    # 3. User activity features (at request time - SAFE)
    features['requester_number_of_subreddits_at_request'] = df['requester_number_of_subreddits_at_request']
    features['requester_account_age_in_days_at_request'] = df['requester_account_age_in_days_at_request']
    features['requester_days_since_first_post_on_raop_at_request'] = df['requester_days_since_first_post_on_raop_at_request']
    
    # 4. Requester activity metrics (at request time - SAFE)
    features['requester_upvotes_plus_downvotes_at_request'] = df['requester_upvotes_plus_downvotes_at_request']
    features['requester_upvotes_minus_downvotes_at_request'] = df['requester_upvotes_minus_downvotes_at_request']
    features['requester_number_of_posts_at_request'] = df['requester_number_of_posts_at_request']
    features['requester_number_of_comments_at_request'] = df['requester_number_of_comments_at_request']
    
    # 5. Temporal features
    features['request_hour'] = pd.to_datetime(df['unix_timestamp_of_request_utc'], unit='s').dt.hour
    features['request_day_of_week'] = pd.to_datetime(df['unix_timestamp_of_request_utc'], unit='s').dt.dayofweek
    
    # 6. Binary indicators for key patterns
    features['has_thanks'] = features['full_text'].str.lower().str.contains('thanks').astype(int)
    features['has_thank'] = features['full_text'].str.lower().str.contains('thank').astype(int)
    features['has_please'] = features['full_text'].str.lower().str.contains('please').astype(int)
    features['has_because'] = features['full_text'].str.lower().str.contains('because').astype(int)
    features['has_pay'] = features['full_text'].str.lower().str.contains('pay').astype(int)
    features['has_forward'] = features['full_text'].str.lower().str.contains('forward').astype(int)
    
    # 7. Post_was_edited (only in train, fill with 0 for test)
    if 'post_was_edited' in df.columns:
        features['post_was_edited'] = df['post_was_edited'].map({True: 1, False: 0})
    else:
        features['post_was_edited'] = 0
    
    return features

# Apply feature engineering to train and test data
print("Engineering features for training data...")
train_features = engineer_features_safe(train_df, is_train=True)

print("Engineering features for test data...")
test_features = engineer_features_safe(test_df, is_train=False)

print(f"Train features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")

Engineering features for training data...
Engineering features for test data...
Train features shape: (2878, 23)
Test features shape: (1162, 23)


## TF-IDF Vectorization

In [16]:
# Create TF-IDF features from text
# Use unigrams and bigrams, limit features to manage memory
tfidf = TfidfVectorizer(
    max_features=5000,  # Limit features for baseline
    ngram_range=(1, 2),  # Unigrams and bigrams
    stop_words='english',
    lowercase=True,
    min_df=2,  # Ignore very rare terms
    max_df=0.95  # Ignore very common terms
)

# Fit TF-IDF on combined train and test text for consistency
combined_text = pd.concat([
    train_features['full_text'],
    test_features['full_text']
], axis=0)

tfidf.fit(combined_text)

# Transform text to TF-IDF features
train_tfidf = tfidf.transform(train_features['full_text'])
test_tfidf = tfidf.transform(test_features['full_text'])

print(f"TF-IDF features shape: {train_tfidf.shape}")

# Convert TF-IDF to DataFrame for easier handling
tfidf_feature_names = [f'tfidf_{i}' for i in range(train_tfidf.shape[1])]
train_tfidf_df = pd.DataFrame(train_tfidf.toarray(), columns=tfidf_feature_names, index=train_features.index)
test_tfidf_df = pd.DataFrame(test_tfidf.toarray(), columns=tfidf_feature_names, index=test_features.index)

# Combine TF-IDF features with meta features
meta_feature_cols = [col for col in train_features.columns if col != 'full_text']
train_features_combined = pd.concat([train_features[meta_feature_cols], train_tfidf_df], axis=1)
test_features_combined = pd.concat([test_features[meta_feature_cols], test_tfidf_df], axis=1)

print(f"Combined train features shape: {train_features_combined.shape}")
print(f"Combined test features shape: {test_features_combined.shape}")

TF-IDF features shape: (2878, 5000)


Combined train features shape: (2878, 5022)
Combined test features shape: (1162, 5022)


## Model Training with Cross-Validation

In [17]:
# Prepare data for training
X = train_features_combined
y = train_df['requester_received_pizza'].astype(int)

# Stratified K-Fold to handle class imbalance
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

# Store predictions for ensembling
train_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(test_features_combined))
fold_scores = []

print(f"Starting {n_splits}-fold cross-validation with SAFE features only...")
print(f"This should produce a REALISTIC score (not 1.0) if leakage is properly removed\n")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}/{n_splits}")
    
    # Split data
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    # Calculate scale_pos_weight for handling class imbalance
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    print(f"  Scale pos weight: {scale_pos_weight:.2f}")
    
    # Train LightGBM model
    model = lgb.LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_SEED,
        scale_pos_weight=scale_pos_weight,
        n_jobs=-1
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    
    # Predictions
    valid_pred = model.predict_proba(X_valid)[:, 1]
    test_pred = model.predict_proba(test_features_combined)[:, 1]
    
    # Store predictions
    train_predictions[valid_idx] = valid_pred
    test_predictions += test_pred / n_splits
    
    # Calculate AUC
    fold_auc = roc_auc_score(y_valid, valid_pred)
    fold_scores.append(fold_auc)
    print(f"  Fold AUC: {fold_auc:.4f}\n")

# Overall CV score
cv_score = np.mean(fold_scores)
cv_std = np.std(fold_scores)
print(f"Overall CV Score: {cv_score:.4f} ± {cv_std:.4f}")
print(f"Individual fold scores: {[f'{score:.4f}' for score in fold_scores]}")

Starting 5-fold cross-validation with SAFE features only...
This should produce a REALISTIC score (not 1.0) if leakage is properly removed

Fold 1/5
  Scale pos weight: 3.02


[LightGBM] [Info] Number of positive: 572, number of negative: 1730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012576 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23900
[LightGBM] [Info] Number of data points in the train set: 2302, number of used features: 868
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248480 -> initscore=-1.106738
[LightGBM] [Info] Start training from score -1.106738
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[8]	valid_0's auc: 0.620343	valid_0's binary_logloss: 0.554811
  Fold AUC: 0.6203

Fold 2/5
  Scale pos weight: 3.02


[LightGBM] [Info] Number of positive: 572, number of negative: 1730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012470 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23819
[LightGBM] [Info] Number of data points in the train set: 2302, number of used features: 866
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248480 -> initscore=-1.106738
[LightGBM] [Info] Start training from score -1.106738
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[4]	valid_0's auc: 0.594519	valid_0's binary_logloss: 0.556467
  Fold AUC: 0.5945

Fold 3/5
  Scale pos weight: 3.02


[LightGBM] [Info] Number of positive: 572, number of negative: 1730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016418 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24053
[LightGBM] [Info] Number of data points in the train set: 2302, number of used features: 871
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248480 -> initscore=-1.106738
[LightGBM] [Info] Start training from score -1.106738
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[9]	valid_0's auc: 0.676004	valid_0's binary_logloss: 0.545344
  Fold AUC: 0.6760

Fold 4/5
  Scale pos weight: 3.03


[LightGBM] [Info] Number of positive: 572, number of negative: 1731
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011893 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23946
[LightGBM] [Info] Number of data points in the train set: 2303, number of used features: 858
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248372 -> initscore=-1.107316
[LightGBM] [Info] Start training from score -1.107316
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[4]	valid_0's auc: 0.586765	valid_0's binary_logloss: 0.556099
  Fold AUC: 0.5868

Fold 5/5
  Scale pos weight: 3.03


[LightGBM] [Info] Number of positive: 572, number of negative: 1731
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014217 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23972
[LightGBM] [Info] Number of data points in the train set: 2303, number of used features: 876
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248372 -> initscore=-1.107316
[LightGBM] [Info] Start training from score -1.107316
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[6]	valid_0's auc: 0.648771	valid_0's binary_logloss: 0.550615
  Fold AUC: 0.6488

Overall CV Score: 0.6253 ± 0.0334
Individual fold scores: ['0.6203', '0.5945', '0.6760', '0.5868', '0.6488']


## Feature Importance Analysis

In [18]:
# Get feature importance from the last fold model
feature_importance = model.booster_.feature_importance(importance_type='gain')
feature_names = X.columns.tolist()

# Create importance dataframe
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("Top 20 Most Important Features:")
print(importance_df.head(20))

# Analyze feature types
tfidf_features = importance_df[importance_df['feature'].str.startswith('tfidf_')]
meta_features = importance_df[~importance_df['feature'].str.startswith('tfidf_')]

print(f"\nFeature type summary:")
print(f"TF-IDF features: {len(tfidf_features)} features, total importance: {tfidf_features['importance'].sum():.2f}")
print(f"Meta features: {len(meta_features)} features, total importance: {meta_features['importance'].sum():.2f}")

print(f"\nTop 10 Meta Features:")
print(meta_features.head(10))

Top 20 Most Important Features:


                                                feature  importance
0                                           text_length  560.267405
10         requester_upvotes_minus_downvotes_at_request  492.960100
7              requester_account_age_in_days_at_request  298.813401
8     requester_days_since_first_post_on_raop_at_req...  263.273500
5                                      total_word_count  182.463097
9           requester_upvotes_plus_downvotes_at_request  173.939099
2                                     total_text_length  166.691002
1                                          title_length  164.766601
1144                                         tfidf_1122  141.499403
11                 requester_number_of_posts_at_request   94.801300
13                                         request_hour   90.783199
3194                                         tfidf_3172   85.780200
1448                                         tfidf_1426   80.232998
4520                                         tfi

## Create Submission

In [19]:
# Create submission file
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Ensure the format matches sample submission
print("Submission shape:", submission.shape)
print("\nFirst few rows:")
print(submission.head())

# Save submission
submission_path = '/home/submission/submission_002_honest_baseline.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Check distribution of predictions
print(f"\nPrediction distribution:")
print(f"Mean: {submission['requester_received_pizza'].mean():.4f}")
print(f"Std: {submission['requester_received_pizza'].std():.4f}")
print(f"Min: {submission['requester_received_pizza'].min():.4f}")
print(f"Max: {submission['requester_received_pizza'].max():.4f}")

# Compare to target distribution
print(f"\nTarget distribution in training:")
print(f"Mean (positive rate): {y.mean():.4f}")
print(f"Our prediction mean: {submission['requester_received_pizza'].mean():.4f}")
print(f"Difference: {abs(submission['requester_received_pizza'].mean() - y.mean()):.4f}")

if abs(submission['requester_received_pizza'].mean() - y.mean()) < 0.05:
    print("✅ Prediction distribution matches target distribution well")
else:
    print("⚠️  Prediction distribution differs significantly from target - check calibration")

Submission shape: (1162, 2)

First few rows:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.358005
1   t3_roiuw                  0.281126
2   t3_mjnbq                  0.278332
3   t3_t8wd1                  0.333561
4  t3_1m4zxu                  0.278972

Submission saved to: /home/submission/submission_002_honest_baseline.csv

Prediction distribution:
Mean: 0.2978
Std: 0.0391
Min: 0.2040
Max: 0.3869

Target distribution in training:
Mean (positive rate): 0.2484
Our prediction mean: 0.2978
Difference: 0.0494
✅ Prediction distribution matches target distribution well
