# Honest Baseline: TF-IDF + LightGBM (NO LEAKAGE)

**CRITICAL**: This experiment EXCLUDES all leakage features identified in evolver_loop1_analysis:
- ❌ requester_user_flair (post-outcome reward badges)
- ❌ giver_username_if_known (only known after pizza given)
- ❌ All `*_at_retrieval` features (collected after the outcome was known)

**SAFE features used:**
- ✅ Text features (request_title, request_text/request_text_edit_aware)
- ✅ Text length and meta-features
- ✅ User activity at request time (comments, posts, upvotes)
- ✅ Account age and temporal features
- ✅ Cleaned post_was_edited boolean

This will give us an HONEST baseline to improve upon.

In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Single seed shared across the notebook so repeated runs are comparable.
# np.random.seed only seeds NumPy's global RNG; the CV splitter and LightGBM
# are handed RANDOM_SEED explicitly where they are configured.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Load Data

In [None]:
# Load the raw request records. JSON text is UTF-8 by specification, so the
# encoding is given explicitly instead of relying on the platform's locale
# default (which can mis-decode or fail on non-ASCII request text).

# Load training data
train_path = '/home/data/train.json'
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
train_df = pd.DataFrame(train_data)

# Load test data
test_path = '/home/data/test.json'
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)
test_df = pd.DataFrame(test_data)

# Quick sanity check: dataset sizes and class balance of the target.
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Target distribution:")
print(train_df['requester_received_pizza'].value_counts(normalize=True))

## Feature Engineering (SAFE FEATURES ONLY)

In [None]:
# Text preprocessing and feature engineering - SAFE FEATURES ONLY
# (output feature, raw column) pairs for numeric fields measured at request
# time. They are copied in this order; a missing raw column becomes all zeros.
_REQUEST_TIME_NUMERIC = (
    ('comments_at_request', 'requester_number_of_comments_at_request'),
    ('posts_at_request', 'requester_number_of_posts_at_request'),
    ('votes_at_request', 'requester_upvotes_plus_downvotes_at_request'),
    ('raop_comments_at_request', 'requester_number_of_comments_in_raop_at_request'),
    ('raop_posts_at_request', 'requester_number_of_posts_on_raop_at_request'),
    ('account_age_days', 'requester_account_age_in_days_at_request'),
    ('days_since_first_raop', 'requester_days_since_first_post_on_raop_at_request'),
    ('subreddits_at_request', 'requester_number_of_subreddits_at_request'),
    ('net_votes_at_request', 'requester_upvotes_minus_downvotes_at_request'),
)


def _text_or_empty(df, column):
    """Return df[column] with missing values as '', or all '' if the column is absent."""
    if column in df.columns:
        return df[column].fillna('')
    return pd.Series('', index=df.index, dtype=object)


def engineer_features_safe(df, is_train=True):
    """Engineer features from the raw data - EXCLUDING ALL LEAKAGE.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw request records. Every column is optional; an absent column
        yields a zero (or empty-text) feature.
    is_train : bool, default True
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``. Contains the raw ``full_text`` column (title +
        body, to be vectorised by the caller) followed by numeric features:
        text lengths / word counts, ``post_was_edited_clean``, request-time
        activity counts, temporal features and activity ratios.
    """
    features = pd.DataFrame(index=df.index)

    # 1. Text features - combine title and text for full context.
    # Prefer the edit-aware body whenever it exists so every dataset that
    # carries it is featurised from the SAME field. The previous preference
    # for 'request_text' meant a frame with both columns used the unedited
    # field while a frame with only the edit-aware column used the other one.
    # NOTE(review): presumably 'request_text' can include edits made after
    # the outcome - confirm against the dataset description.
    if 'request_text_edit_aware' in df.columns:
        text_col = 'request_text_edit_aware'
    else:
        text_col = 'request_text'

    title = _text_or_empty(df, 'request_title')
    body = _text_or_empty(df, text_col)
    features['full_text'] = title + ' ' + body

    # 2. Text length features (characters and whitespace-separated words)
    features['text_length'] = body.str.len()
    features['title_length'] = title.str.len()
    features['total_text_length'] = features['text_length'] + features['title_length']
    features['text_word_count'] = body.str.split().str.len()
    features['title_word_count'] = title.str.split().str.len()

    # 3. Clean post_was_edited: 1 only when the value's string form is
    # 'true' (case-insensitive), 0 for everything else.
    # NOTE(review): any non-boolean value (e.g. an edit timestamp) maps to 0
    # here - confirm that is the intended cleaning.
    if 'post_was_edited' in df.columns:
        features['post_was_edited_clean'] = (
            df['post_was_edited'].apply(lambda value: str(value).lower() == 'true').astype(int)
        )
    else:
        features['post_was_edited_clean'] = 0

    # 4-8. Numeric fields captured AT REQUEST TIME (no post-outcome data)
    for feature_name, raw_column in _REQUEST_TIME_NUMERIC:
        if raw_column in df.columns:
            features[feature_name] = df[raw_column].fillna(0)
        else:
            features[feature_name] = 0

    # 9. Length of the edit-aware text. Equals text_length whenever the
    # edit-aware column is present; kept so the output schema is unchanged.
    if 'request_text_edit_aware' in df.columns:
        features['edit_aware_length'] = df['request_text_edit_aware'].fillna('').str.len()
    else:
        features['edit_aware_length'] = 0

    # 10. Temporal features derived from the request's unix timestamp (seconds)
    if 'unix_timestamp_of_request' in df.columns:
        timestamps = pd.to_datetime(df['unix_timestamp_of_request'], unit='s')
        features['hour_of_day'] = timestamps.dt.hour
        features['day_of_week'] = timestamps.dt.dayofweek
        features['is_weekend'] = features['day_of_week'].isin([5, 6]).astype(int)
        features['is_night'] = features['hour_of_day'].isin([0, 1, 2, 3, 4, 5]).astype(int)
    else:
        features['hour_of_day'] = 0
        features['day_of_week'] = 0
        features['is_weekend'] = 0
        features['is_night'] = 0

    # 11. Activity ratios; the +1 in each denominator avoids division by zero
    site_activity = features['comments_at_request'] + features['posts_at_request']
    raop_activity = features['raop_comments_at_request'] + features['raop_posts_at_request']
    features['comments_to_posts_ratio'] = features['comments_at_request'] / (features['posts_at_request'] + 1)
    features['raop_activity_ratio'] = raop_activity / (site_activity + 1)
    features['votes_per_comment'] = features['votes_at_request'] / (features['comments_at_request'] + 1)

    return features

# Build the leakage-free feature tables for both splits, then list the
# resulting columns (1-based) so the feature set can be eyeballed.
train_features = engineer_features_safe(train_df)
test_features = engineer_features_safe(test_df, is_train=False)

print(f"Engineered features shape: {train_features.shape}")
print(f"Feature columns ({len(train_features.columns)} total):")
for position, column_name in enumerate(train_features.columns, start=1):
    print(f"{position:2d}. {column_name}")

## TF-IDF Vectorization

In [None]:
# Turn the combined title+body text into TF-IDF weights and join them with
# the numeric meta-features into one dense matrix per split.
tfidf_settings = {
    'max_features': 5000,     # cap vocabulary size for the baseline
    'ngram_range': (1, 2),    # unigrams and bigrams
    'stop_words': 'english',
    'lowercase': True,
    'min_df': 2,              # drop terms seen in fewer than 2 documents
    'max_df': 0.95,           # drop terms seen in more than 95% of documents
}
tfidf = TfidfVectorizer(**tfidf_settings)

# Vocabulary and IDF weights are learned from train and test text together.
combined_text = pd.concat([train_features['full_text'], test_features['full_text']])
tfidf.fit(combined_text)

# Project each split onto the fitted vocabulary (sparse matrices).
train_tfidf = tfidf.transform(train_features['full_text'])
test_tfidf = tfidf.transform(test_features['full_text'])

print(f"TF-IDF features shape: {train_tfidf.shape}")

# Densify into DataFrames with positional column names (tfidf_0, tfidf_1, ...),
# aligned to the row index of the corresponding meta-feature frame.
n_terms = train_tfidf.shape[1]
tfidf_feature_names = [f'tfidf_{term_idx}' for term_idx in range(n_terms)]
train_tfidf_df = pd.DataFrame(
    train_tfidf.toarray(), index=train_features.index, columns=tfidf_feature_names
)
test_tfidf_df = pd.DataFrame(
    test_tfidf.toarray(), index=test_features.index, columns=tfidf_feature_names
)

# Meta-features (without the raw text column) side by side with TF-IDF columns.
train_meta = train_features.drop(columns=['full_text'])
test_meta = test_features.drop(columns=['full_text'])
train_features_combined = pd.concat([train_meta, train_tfidf_df], axis=1)
test_features_combined = pd.concat([test_meta, test_tfidf_df], axis=1)

# Report any column present in one split but not the other.
train_column_set = set(train_features_combined.columns)
test_column_set = set(test_features_combined.columns)
missing_in_test = train_column_set - test_column_set
missing_in_train = test_column_set - train_column_set

print(f"Missing in test: {missing_in_test}")
print(f"Missing in train: {missing_in_train}")

# Zero-fill whatever is missing so both matrices share one schema.
for absent_column in missing_in_test:
    test_features_combined[absent_column] = 0
for absent_column in missing_in_train:
    train_features_combined[absent_column] = 0

# Give the test matrix the exact column order of the training matrix.
test_features_combined = test_features_combined[train_features_combined.columns]

print(f"Final feature matrix shape: {train_features_combined.shape}")
print(f"Test feature matrix shape: {test_features_combined.shape}")

## Model Training with Cross-Validation

In [None]:
# Train one LightGBM model per stratified fold, collect out-of-fold
# predictions for the training rows, and average the fold models' test
# predictions.

# Prepare data for training
X = train_features_combined
y = train_df['requester_received_pizza'].astype(int)

# Stratified K-Fold keeps the positive rate similar in every fold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

# Out-of-fold predictions (train) and fold-averaged predictions (test)
train_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(test_features_combined))
fold_scores = []

# LightGBM parameters shared by every fold - conservative for baseline.
# Only scale_pos_weight depends on the fold and is added inside the loop.
base_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': RANDOM_SEED
}

print(f"Starting {n_splits}-fold cross-validation with SAFE features only...")
print(f"This should produce a REALISTIC score (not 1.0) if leakage is properly removed\n")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}/{n_splits}")

    # Split data
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Negative/positive ratio of this fold's training part, used to
    # up-weight the positive class
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    print(f"  Scale pos weight: {scale_pos_weight:.2f}")

    # Create LightGBM datasets. Deliberately NOT named train_data: that
    # module-level name holds the raw JSON records loaded earlier and must
    # not be overwritten by the training loop.
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

    params = {**base_params, 'scale_pos_weight': scale_pos_weight}

    # Train with early stopping on the validation fold (50 rounds patience,
    # per-iteration logging disabled)
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_valid],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(0)
        ]
    )

    # Predict on validation set at the best iteration
    valid_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    train_predictions[valid_idx] = valid_pred

    # NOTE: this AUC is measured on the same fold that selected the stopping
    # iteration, so it is slightly optimistic.
    fold_auc = roc_auc_score(y_valid, valid_pred)
    fold_scores.append(fold_auc)
    print(f"  Fold {fold + 1} AUC: {fold_auc:.4f}")

    # Each fold contributes 1/n_splits of the final test prediction
    test_pred = model.predict(test_features_combined, num_iteration=model.best_iteration)
    test_predictions += test_pred / n_splits

# Overall CV score over all out-of-fold predictions
cv_score = roc_auc_score(y, train_predictions)
print(f"\n{'='*60}")
print(f"CROSS-VALIDATION RESULTS (SAFE FEATURES ONLY):")
print(f"Mean AUC: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
print(f"Overall CV AUC: {cv_score:.4f}")
print(f"{'='*60}")

# Heuristic leakage check: a CV AUC above 0.90 is treated as suspicious
if cv_score > 0.90:
    print("⚠️  WARNING: CV score still > 0.90 - possible remaining leakage!")
    print("   Investigate features for hidden leakage.")
else:
    print("✅ SUCCESS: CV score < 0.90 - leakage likely removed!")
    print("   This is a realistic baseline to improve upon.")

## Feature Importance Analysis

In [None]:
# Gain-based importances taken from `model`, i.e. the model left over from
# the final CV fold (not an average across folds).
feature_names = list(X.columns)
feature_importance = model.feature_importance(importance_type='gain')

# One row per feature, most important first
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
).sort_values(by='importance', ascending=False)

print("Top 20 Most Important Features:")
print(importance_df.head(20))

# Split the ranking into TF-IDF columns vs. hand-built meta-features
is_tfidf = importance_df['feature'].str.startswith('tfidf_')
tfidf_features = importance_df[is_tfidf]
meta_features = importance_df[~is_tfidf]

print(f"\nFeature type summary:")
print(f"TF-IDF features: {len(tfidf_features)} features, total importance: {tfidf_features['importance'].sum():.2f}")
print(f"Meta features: {len(meta_features)} features, total importance: {meta_features['importance'].sum():.2f}")

print(f"\nTop 10 Meta Features:")
print(meta_features.head(10))

## Create Submission

In [None]:
# Assemble the submission: one predicted probability per test request id.
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Show what is about to be written
print("Submission shape:", submission.shape)
print("\nFirst few rows:")
print(submission.head())

# Write the CSV without the DataFrame index
submission_path = '/home/submission/submission_002_honest_baseline.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Summary statistics of the predicted probabilities
predicted = submission['requester_received_pizza']
predicted_mean = predicted.mean()
print(f"\nPrediction distribution:")
print(f"Mean: {predicted_mean:.4f}")
print(f"Std: {predicted.std():.4f}")
print(f"Min: {predicted.min():.4f}")
print(f"Max: {predicted.max():.4f}")

# Compare the mean prediction with the positive rate seen in training
positive_rate = y.mean()
mean_gap = abs(predicted_mean - positive_rate)
print(f"\nTarget distribution in training:")
print(f"Mean (positive rate): {positive_rate:.4f}")
print(f"Our prediction mean: {predicted_mean:.4f}")
print(f"Difference: {mean_gap:.4f}")

# A gap under 0.05 is treated as "close enough"; anything else is flagged
if mean_gap < 0.05:
    print("✅ Prediction distribution matches target distribution well")
else:
    print("⚠️  Prediction distribution differs significantly from target - check calibration")