# Evolver Loop 1 Analysis: Data Leakage Investigation

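This notebook investigates the data leakage issue identified by the evaluator (user flair values that perfectly predict the target) and compares which `_at_request` / `_at_retrieval` feature columns are available in the train and test data.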

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Load data
# The file is opened with an explicit UTF-8 encoding: JSON is UTF-8 by
# specification, and relying on the platform's default encoding can garble or
# reject any non-ASCII characters in the data.
print("Loading training data...")
train_path = "/home/data/train.json"
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# One DataFrame row per JSON record.
train_df = pd.DataFrame(train_data)
print(f"Training samples: {len(train_df)}")
# Only the first ten column names are shown to keep the output short.
print(f"Columns: {list(train_df.columns)[:10]}...")

In [None]:
# Investigate flair leakage: compare how often each user flair occurs with how
# often requests carrying that flair were successful.
print("=== FLAIR LEAKAGE INVESTIGATION ===")
print()

# Flair distribution (rows without a flair are counted as their own category)
flair_counts = train_df['requester_user_flair'].value_counts(dropna=False)
print("Flair distribution:")
print(flair_counts.head(10))
print()

# Success rate by flair.
# dropna=False keeps the rows without a flair as their own group, so this table
# covers the same categories as the distribution above; by default groupby
# silently drops rows whose key is missing.
flair_success = (
    train_df
    .groupby('requester_user_flair', dropna=False)['requester_received_pizza']
    .agg(['count', 'sum', 'mean'])
)
flair_success.columns = ['count', 'positives', 'success_rate']
flair_success = flair_success.sort_values('success_rate', ascending=False)
print("Success rate by flair:")
print(flair_success.head(10))
print()

# Focus on shroom and PIF (with the no-flair rows as the baseline)
shroom_mask = train_df['requester_user_flair'] == 'shroom'
pif_mask = train_df['requester_user_flair'] == 'PIF'
no_flair_mask = train_df['requester_user_flair'].isna()

# Success rate per group, keyed by the label used in the printed report.
flair_rates = {}
for label, mask in [
    ("Shroom flair", shroom_mask),
    ("PIF flair", pif_mask),
    ("No flair", no_flair_mask),
]:
    # .loc selects rows and column in one step (no chained indexing).
    # The mean of an empty selection is NaN and is printed as such.
    flair_rates[label] = train_df.loc[mask, 'requester_received_pizza'].mean()
    print(f"{label}: {mask.sum()} samples, {flair_rates[label]:.1%} success rate")
print()

# Only announce the leakage finding when the data actually shows it, instead
# of printing the conclusion unconditionally. (A NaN rate compares unequal to
# 1.0, so an absent flair never triggers the message.)
if flair_rates["Shroom flair"] == 1.0 and flair_rates["PIF flair"] == 1.0:
    # \U0001F534 is the red-circle emoji; the escape keeps the source ASCII-only
    # so the character cannot be mis-encoded again.
    print("\U0001F534 CRITICAL: 'shroom' and 'PIF' flairs have 100% success rates")
    print("   These are awarded AFTER successful pizza requests - clear data leakage!")
else:
    print("NOTE: 'shroom' and 'PIF' flairs do not both show a 100% success rate - re-check the leakage hypothesis")

In [None]:
# Investigate temporal split (_at_request vs _at_retrieval)
print("=== TEMPORAL SPLIT INVESTIGATION ===")
print()


def _snapshot_columns(df):
    """Return the columns of ``df`` whose name contains 'at_request' or 'at_retrieval'.

    The column order of ``df`` is preserved.
    """
    return [col for col in df.columns if 'at_request' in col or 'at_retrieval' in col]


# Check if we have both _at_request and _at_retrieval versions
time_cols = _snapshot_columns(train_df)
print("Time-related columns in train:")
for col in sorted(time_cols):
    print(f"  {col}")
print()

# Load test data to compare.
# Explicit UTF-8: JSON is UTF-8 by specification, so the platform's default
# encoding should not be relied on.
print("Loading test data...")
test_path = "/home/data/test.json"
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

test_df = pd.DataFrame(test_data)
print(f"Test samples: {len(test_df)}")
print()

test_time_cols = _snapshot_columns(test_df)
print("Time-related columns in test:")
for col in sorted(test_time_cols):
    print(f"  {col}")
print()

# Make the train/test comparison explicit rather than leaving the reader to
# diff the two listings above by eye.
train_only_time_cols = sorted(set(time_cols) - set(test_time_cols))
print("Time-related columns present in train but missing from test:")
for col in train_only_time_cols:
    print(f"  {col}")
if not train_only_time_cols:
    print("  (none)")
print()

# Check for flair in test
print("Flair in test data:")
if 'requester_user_flair' in test_df.columns:
    test_flair_counts = test_df['requester_user_flair'].value_counts(dropna=False)
    print(test_flair_counts.head())
else:
    print("  No 'requester_user_flair' column in test data")
    print("  (This is correct - test data shouldn't have flair)")

In [None]:
# Analyze legitimate features that should be kept
print("=== LEGITIMATE FEATURE ANALYSIS ===")
print()

# Features that are available at request time (safe to use)
safe_features = [
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'unix_timestamp_of_request'
]

received_pizza = train_df['requester_received_pizza']

# Check correlations with target for safe features
print("Correlations with target (safe features):")
correlations = []
missing_features = []
for feature in safe_features:
    if feature not in train_df.columns:
        # Remember the feature so the skip is reported instead of being silent.
        missing_features.append(feature)
        continue
    corr = train_df[feature].corr(received_pizza)
    correlations.append((feature, corr))
    print(f"  {feature}: {corr:.3f}")
if missing_features:
    print(f"  (skipped, not present in train data: {missing_features})")

# Sort by absolute correlation, strongest first.
# A correlation can be NaN (e.g. for a constant column); NaN does not compare
# consistently, so such entries are explicitly ranked last instead of leaving
# the resulting order undefined.
correlations.sort(key=lambda item: (bool(pd.isna(item[1])), -abs(item[1])))
print()
print("Top features by absolute correlation:")
for feature, corr in correlations[:5]:
    print(f"  {feature}: {corr:.3f}")
print()

# Text length features (also safe).
# These are stored as new columns on train_df itself, so they stay available
# to any later cell that reads them.
train_df['text_length'] = train_df['request_text_edit_aware'].str.len()
train_df['title_length'] = train_df['request_title'].str.len()

print("Text length correlations:")
print(f"  text_length: {train_df['text_length'].corr(received_pizza):.3f}")
print(f"  title_length: {train_df['title_length'].corr(received_pizza):.3f}")

In [None]:
# Summary of findings
print("=== SUMMARY OF FINDINGS ===")
print()
print("1. DATA LEAKAGE CONFIRMED:")
# The per-flair sample counts and success rates are computed from train_df
# rather than typed in by hand, so the summary cannot drift from the data.
for flair in ('shroom', 'PIF'):
    outcomes = train_df.loc[train_df['requester_user_flair'] == flair, 'requester_received_pizza']
    # An empty selection has no meaningful rate; avoid printing "nan%".
    rate_text = f"{outcomes.mean():.1%}" if len(outcomes) else "n/a"
    print(f"   - '{flair}' flair: {len(outcomes)} samples, {rate_text} success rate")
print("   - These are post-hoc indicators (awarded after success)")
print("   - Must be removed completely from training")
print()

print("2. TEMPORAL SPLIT:")
print("   - Train has _at_retrieval columns (post-outcome)")
print("   - Test has _at_request columns (pre-outcome)")
print("   - This is a temporal split - must use _at_request features only")
print()

print("3. SAFE FEATURES TO USE:")
print("   - All _at_request features (account age, votes, comments, etc.)")
print("   - Text features (TF-IDF, text length, title length)")
print("   - DO NOT use: _at_retrieval features, flair features")
print()

print("4. EXPECTED PERFORMANCE:")
print("   - Without leakage, expect AUC in 0.75-0.85 range")
print("   - Current 1.0 AUC is meaningless due to leakage")