# Evolver Loop 3 Analysis: Retrieval Feature Leakage Investigation

This notebook investigates whether retrieval features contain leakage and explains the flat score trajectory.

**Hypothesis**: Features with `_at_retrieval` contain post-outcome information and are causing score degradation.

In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG once so later stochastic steps are repeatable
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Locations of the raw JSON dumps for the two splits
train_path = '/home/data/train.json'
test_path = '/home/data/test.json'

# Parse each file with json.load, then wrap the parsed records in a DataFrame
with open(train_path, 'r') as train_file:
    train_data = json.load(train_file)
with open(test_path, 'r') as test_file:
    test_data = json.load(test_file)

train_df, test_df = pd.DataFrame(train_data), pd.DataFrame(test_data)

# Report the size of both frames
n_train_rows, n_train_cols = train_df.shape
n_test_rows, n_test_cols = test_df.shape
print(f"Training samples: {n_train_rows}")
print(f"Test samples: {n_test_rows}")
print(f"\nTraining columns: {n_train_cols}")
print(f"Test columns: {n_test_cols}")

# List every training column with its position so the request-time and
# retrieval-time variants can be told apart by name
print("\nAll training columns:")
for position, column_name in enumerate(train_df.columns):
    print(f"{position:2d}: {column_name}")

Training samples: 2878
Test samples: 1162

Training columns: 32
Test columns: 17

All training columns:
 0: giver_username_if_known
 1: number_of_downvotes_of_request_at_retrieval
 2: number_of_upvotes_of_request_at_retrieval
 3: post_was_edited
 4: request_id
 5: request_number_of_comments_at_retrieval
 6: request_text
 7: request_text_edit_aware
 8: request_title
 9: requester_account_age_in_days_at_request
10: requester_account_age_in_days_at_retrieval
11: requester_days_since_first_post_on_raop_at_request
12: requester_days_since_first_post_on_raop_at_retrieval
13: requester_number_of_comments_at_request
14: requester_number_of_comments_at_retrieval
15: requester_number_of_comments_in_raop_at_request
16: requester_number_of_comments_in_raop_at_retrieval
17: requester_number_of_posts_at_request
18: requester_number_of_posts_at_retrieval
19: requester_number_of_posts_on_raop_at_request
20: requester_number_of_posts_on_raop_at_retrieval
21: requester_number_of_subreddits_at_request
22

## Identify Retrieval vs Request Features

Let's categorize features based on whether they contain `_at_request` or `_at_retrieval`.

In [2]:
# Bucket every training column by the snapshot marker embedded in its name.
# The two markers are tested independently, so a name carrying both would be
# listed under both request and retrieval.
request_features, retrieval_features, other_features = [], [], []
for column in train_df.columns:
    is_request = '_at_request' in column
    is_retrieval = '_at_retrieval' in column
    if is_request:
        request_features.append(column)
    if is_retrieval:
        retrieval_features.append(column)
    if not (is_request or is_retrieval):
        other_features.append(column)

# Print each bucket under its own heading
sections = (
    ("=== REQUEST FEATURES (pre-outcome) ===", request_features),
    ("\n=== RETRIEVAL FEATURES (potential leakage) ===", retrieval_features),
    ("\n=== OTHER FEATURES ===", other_features),
)
for heading, members in sections:
    print(heading)
    for name in members:
        print(f"  - {name}")

print("\nCounts:")
print(f"  Request features: {len(request_features)}")
print(f"  Retrieval features: {len(retrieval_features)}")
print(f"  Other features: {len(other_features)}")

# Split the full column list by whether the test frame also has the column
print("\n=== FEATURE AVAILABILITY IN TEST DATA ===")
all_features = request_features + retrieval_features + other_features
test_columns = set(test_df.columns)

available_in_test = [name for name in all_features if name in test_columns]
not_in_test = [name for name in all_features if name not in test_columns]

print(f"Features available in test: {len(available_in_test)}")
print(f"Features NOT in test: {len(not_in_test)}")

if not_in_test:
    print("\nMissing from test:")
    for name in not_in_test:
        print(f"  - {name}")

=== REQUEST FEATURES (pre-outcome) ===
  - requester_account_age_in_days_at_request
  - requester_days_since_first_post_on_raop_at_request
  - requester_number_of_comments_at_request
  - requester_number_of_comments_in_raop_at_request
  - requester_number_of_posts_at_request
  - requester_number_of_posts_on_raop_at_request
  - requester_number_of_subreddits_at_request
  - requester_subreddits_at_request
  - requester_upvotes_minus_downvotes_at_request
  - requester_upvotes_plus_downvotes_at_request

=== RETRIEVAL FEATURES (potential leakage) ===
  - number_of_downvotes_of_request_at_retrieval
  - number_of_upvotes_of_request_at_retrieval
  - request_number_of_comments_at_retrieval
  - requester_account_age_in_days_at_retrieval
  - requester_days_since_first_post_on_raop_at_retrieval
  - requester_number_of_comments_at_retrieval
  - requester_number_of_comments_in_raop_at_retrieval
  - requester_number_of_posts_at_retrieval
  - requester_number_of_posts_on_raop_at_retrieval
  - requeste

## Analyze Feature Distributions

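Check whether each retrieval feature differs from its request-time counterpart (in correlation with the target, in typical value, and in missing rate), which would indicate that it carries different, potentially post-outcome, information.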

In [3]:
# Analyze distributions for numeric features
def analyze_feature_distributions(df=None, request_cols=None, retrieval_cols=None,
                                  target_col='requester_received_pizza'):
    """Compare each numeric `_at_request` feature with its `_at_retrieval` twin.

    For every numeric request feature whose name, with `_at_request` swapped for
    `_at_retrieval`, is also a numeric retrieval feature, one row is produced with:
    the correlation of each variant with the target, the mean absolute difference
    between the two variants, and the percentage of missing values in each.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level `train_df`.
    request_cols, retrieval_cols : list of str, optional
        Candidate column names. Default to the notebook-level
        `request_features` / `retrieval_features`.
    target_col : str
        Name of the label column used for the correlations.

    Returns
    -------
    pandas.DataFrame
        One row per matched pair, sorted by |retrieval_corr| descending. When no
        pair is matched an empty frame with the expected columns is returned
        (previously this case raised KeyError while sorting).
    """
    # Fall back to the notebook globals so the zero-argument call keeps working
    if df is None:
        df = train_df
    if request_cols is None:
        request_cols = request_features
    if retrieval_cols is None:
        retrieval_cols = retrieval_features

    def _numeric_columns(candidates):
        # Keep only names that exist in df and hold a numeric dtype
        return [c for c in candidates
                if c in df.columns and pd.api.types.is_numeric_dtype(df[c])]

    numeric_request = _numeric_columns(request_cols)
    numeric_retrieval = set(_numeric_columns(retrieval_cols))

    print("=== COMPARING REQUEST vs RETRIEVAL FEATURES ===\n")

    target = df[target_col]
    results = []

    # Compare features that have both _at_request and _at_retrieval versions
    for req_feat in numeric_request:
        base_name = req_feat.replace('_at_request', '')
        ret_feat = base_name + '_at_retrieval'
        if ret_feat not in numeric_retrieval:
            continue

        results.append({
            'feature_base': base_name,
            'request_corr': df[req_feat].corr(target),
            'retrieval_corr': df[ret_feat].corr(target),
            # Mean absolute gap between the two snapshots of the same quantity
            'value_diff_mean': (df[ret_feat] - df[req_feat]).abs().mean(),
            'request_missing_pct': df[req_feat].isna().mean() * 100,
            'retrieval_missing_pct': df[ret_feat].isna().mean() * 100,
        })

    result_columns = [
        'feature_base', 'request_corr', 'retrieval_corr',
        'value_diff_mean', 'request_missing_pct', 'retrieval_missing_pct',
    ]
    # Passing the column list keeps the schema stable even when nothing matched
    results_df = pd.DataFrame(results, columns=result_columns)
    if not results_df.empty:
        results_df = results_df.sort_values('retrieval_corr', key=abs, ascending=False)

    print("Features with both request and retrieval versions:")
    print(results_df.to_string(index=False, float_format='%.4f'))

    return results_df

# Run the request-vs-retrieval comparison and keep the resulting table
results_df = analyze_feature_distributions()

=== COMPARING REQUEST vs RETRIEVAL FEATURES ===

Features with both request and retrieval versions:
                           feature_base  request_corr  retrieval_corr  value_diff_mean  request_missing_pct  retrieval_missing_pct
      requester_number_of_posts_on_raop        0.1327          0.4622           1.1737               0.0000                 0.0000
   requester_number_of_comments_in_raop        0.1320          0.2771           1.9145               0.0000                 0.0000
requester_days_since_first_post_on_raop        0.1087          0.1273         502.6727               0.0000                 0.0000
           requester_number_of_comments        0.0367          0.1230         168.2384               0.0000                 0.0000
          requester_account_age_in_days        0.0434          0.1189         503.2616               0.0000                 0.0000
      requester_upvotes_minus_downvotes        0.0431          0.0885        1550.6397               0.0000       

## Test the Leakage Hypothesis

Run two experiments:
1. Only request features (safe, pre-outcome)
2. All features including retrieval

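If request-only performs better, the retrieval features are hurting the model. Caveat: leakage normally *inflates* cross-validated scores, so a large CV gain from retrieval features that are not available in the test data is the stronger leakage signal.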

In [None]:
def prepare_features(df, feature_list):
    """Prepare feature matrix from list of features.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    feature_list : iterable of str
        Column names to include; names absent from `df` are skipped silently.

    Returns
    -------
    pandas.DataFrame
        Same index as `df`. Numeric columns are copied with missing values
        replaced by 0. Every other column is label-encoded with
        `pd.Categorical(...).codes`, where missing values already map to -1.
    """
    def _as_hashable(value):
        # pd.Categorical needs hashable cells; lists/dicts/sets are not, so they
        # are replaced by their string form. Hashable values pass through as-is.
        try:
            hash(value)
        except TypeError:
            return str(value)
        return value

    features = pd.DataFrame(index=df.index)

    for f in feature_list:
        if f not in df.columns:
            continue

        column = df[f]
        if pd.api.types.is_numeric_dtype(column):
            # Fill missing values
            features[f] = column.fillna(0)
        else:
            # Only object columns can hold unhashable Python containers; other
            # dtypes go through the encoder unchanged.
            if column.dtype == object:
                column = column.map(_as_hashable)
            # For categorical, use label encoding (codes are never NaN, so no
            # further fill is needed)
            features[f] = pd.Categorical(column).codes

    return features

def run_cv_experiment(feature_set_name, feature_list, target_col='requester_received_pizza'):
    """Run 5-fold stratified CV with LightGBM on the given features of `train_df`.

    Parameters
    ----------
    feature_set_name : str
        Label printed in the section header.
    feature_list : list of str
        Columns of `train_df` to use. The target column is removed if present,
        because a model that sees its own label scores a meaningless AUC.
    target_col : str
        Name of the label column in `train_df`.

    Returns
    -------
    dict
        'score' (AUC of the out-of-fold predictions), 'std' (std of the per-fold
        AUCs), 'predictions' (out-of-fold probabilities), 'feature_importance'
        (mean importance per feature across folds, descending).

    Raises
    ------
    ValueError
        If none of the requested features is usable.

    Notes
    -----
    Each validation fold is used both for early stopping and for scoring, so the
    reported AUCs are optimistic. The bias applies equally to every feature set,
    which keeps relative comparisons meaningful.
    """
    print(f"\n=== {feature_set_name} ===")

    # Guard against target leakage through the feature list
    usable_features = [f for f in feature_list if f != target_col]
    if len(usable_features) != len(feature_list):
        print(f"  WARNING: removed target column '{target_col}' from the feature list")
    print(f"Features: {len(usable_features)}")

    # Prepare features
    X = prepare_features(train_df, usable_features)
    y = train_df[target_col]

    if X.shape[1] == 0:
        raise ValueError(
            f"No usable features for '{feature_set_name}': none of the requested "
            f"columns exist in train_df"
        )

    print(f"Feature matrix shape: {X.shape}")
    print(f"Missing values: {X.isna().sum().sum()}")

    # 5-fold stratified CV
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    scores = []
    predictions = np.zeros(len(train_df))

    feature_importance_list = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Train LightGBM
        model = lgb.LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            num_leaves=63,
            subsample=0.8,
            # LightGBM ignores `subsample` unless the bagging frequency is > 0
            subsample_freq=1,
            colsample_bytree=0.8,
            random_state=RANDOM_SEED,
            n_jobs=-1
        )

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )

        # Predict
        val_pred = model.predict_proba(X_val)[:, 1]
        predictions[val_idx] = val_pred

        # Score
        score = roc_auc_score(y_val, val_pred)
        scores.append(score)

        # Feature importance
        importance = pd.DataFrame({
            'feature': X.columns,
            'importance': model.feature_importances_,
            'fold': fold
        })
        feature_importance_list.append(importance)

        print(f"  Fold {fold}: AUC = {score:.4f}")

    # Overall score: AUC over the pooled out-of-fold predictions; the spread
    # shown next to it is the std of the per-fold AUCs
    overall_score = roc_auc_score(y, predictions)
    score_std = np.std(scores)
    print(f"  Overall CV AUC: {overall_score:.4f} ± {score_std:.4f}")

    # Feature importance analysis
    feature_importance = pd.concat(feature_importance_list)
    feature_importance = feature_importance.groupby('feature')['importance'].mean().sort_values(ascending=False)

    print(f"\n  Top 5 features:")
    for i, (feat, imp) in enumerate(feature_importance.head().items()):
        print(f"    {i+1}. {feat}: {imp:.1f}")

    # Prediction distribution
    pred_range = predictions.max() - predictions.min()
    print(f"\n  Prediction range: {predictions.min():.3f} - {predictions.max():.3f} (range: {pred_range:.3f})")

    return {
        'score': overall_score,
        'std': score_std,
        'predictions': predictions,
        'feature_importance': feature_importance
    }

# Define feature sets
TARGET_COL = 'requester_received_pizza'
test_columns = set(test_df.columns)

# An "other" column is only safe when it is not the label itself and it also
# exists in the test frame, i.e. it will be available at prediction time.
safe_other_features = [f for f in other_features if f != TARGET_COL and f in test_columns]
excluded_other_features = [f for f in other_features if f not in safe_other_features]

all_safe_features = request_features + safe_other_features
# Kept for compatibility with the earlier cell: every training column by category
all_features = request_features + retrieval_features + other_features
# The second experiment adds ONLY the retrieval features on top of the safe set,
# so any score difference is attributable to them alone.
safe_plus_retrieval_features = all_safe_features + retrieval_features

print("Preparing to test leakage hypothesis...")
print(f"Request features: {len(request_features)}")
print(f"Retrieval features: {len(retrieval_features)}")
print(f"Other features: {len(other_features)}")
print(f"Other features kept as safe: {len(safe_other_features)}")
if excluded_other_features:
    print("Other features excluded (target or missing from test):")
    for f in excluded_other_features:
        print(f"  - {f}")

# Run experiments
request_only_results = run_cv_experiment("REQUEST-ONLY FEATURES (SAFE)", all_safe_features)
all_features_results = run_cv_experiment("ALL FEATURES (WITH RETRIEVAL)", safe_plus_retrieval_features)

## Analyze Results

Compare the two experiments to determine if retrieval features are leaking.

In [None]:
# Compare results
print("\n" + "="*60)
print("LEAKAGE ANALYSIS RESULTS")
print("="*60)

request_auc = request_only_results['score']
retrieval_auc = all_features_results['score']
auc_gain = retrieval_auc - request_auc

# A difference smaller than the fold-to-fold spread cannot be told apart from
# noise, so the larger of the two per-fold stds is used as the decision margin
# (this replaces an exact float comparison that made "neutral" unreachable).
noise_floor = max(request_only_results['std'], all_features_results['std'])

# Features absent from the test frame cannot be used at prediction time
test_columns = set(test_df.columns)
retrieval_missing_from_test = [f for f in retrieval_features if f not in test_columns]

print(f"\nRequest-only features: {request_auc:.4f} ± {request_only_results['std']:.4f}")
print(f"All features (with retrieval): {retrieval_auc:.4f} ± {all_features_results['std']:.4f}")
print(f"Difference: {auc_gain:.4f}")
print(f"Noise floor (largest fold std): {noise_floor:.4f}")
print(f"Retrieval features missing from test: {len(retrieval_missing_from_test)} of {len(retrieval_features)}")

gain_is_real = auc_gain > noise_floor
loss_is_real = auc_gain < -noise_floor

# Leakage inflates cross-validated scores; it does not lower them. A real CV gain
# from features that do not exist at prediction time is therefore the leakage
# signature, while a real CV loss points to noise or overfitting instead.
if retrieval_missing_from_test and gain_is_real:
    print("\n✗ RETRIEVAL FEATURES ARE LEAKING (CV gain from features missing in test)")
    print("  Remove all _at_retrieval features immediately.")
    verdict = "RETRIEVAL FEATURES ARE LEAKING - REMOVE THEM."
elif retrieval_missing_from_test:
    print("\n✗ RETRIEVAL FEATURES ARE UNUSABLE (missing in test, no real CV gain)")
    print("  Remove all _at_retrieval features; they cannot be computed for test rows.")
    verdict = "Retrieval features are missing from test and add no CV signal - remove them."
elif gain_is_real:
    print("\n? RETRIEVAL FEATURES IMPROVE CV AND EXIST IN TEST")
    print("  Confirm they are known at request time before relying on them.")
    verdict = "Retrieval features improve CV and exist in test - verify their timing before use."
elif loss_is_real:
    print("\n✗ RETRIEVAL FEATURES HURT PERFORMANCE (noise or overfitting, not leakage)")
    print("  Remove all _at_retrieval features.")
    verdict = "Retrieval features hurt CV (not a leakage signature) - remove them."
else:
    print("\n? RETRIEVAL FEATURES ARE NEUTRAL (difference within noise)")
    print("  Can keep or remove - minimal impact.")
    verdict = "Retrieval features have no measurable impact."

# Check prediction distributions
print(f"\nRequest-only prediction range: {request_only_results['predictions'].min():.3f} - {request_only_results['predictions'].max():.3f}")
print(f"All features prediction range: {all_features_results['predictions'].min():.3f} - {all_features_results['predictions'].max():.3f}")

# Summarize findings (printed only; nothing is written to disk here)
finding = f"Retrieval feature leakage test: Request-only={request_auc:.4f}, All={retrieval_auc:.4f}. "
finding += verdict

print(f"\n{finding}")