# Evolver Loop 2 Analysis

Analyze the clean baseline results and identify opportunities for improvement.

Key questions:
1. Why is the clean baseline only 0.6406 AUC?
2. What features are most important in the clean model?
3. Where can we gain the most improvement?
4. Should we investigate temporal validation, given the split between `_at_request` and `_at_retrieval` features?

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report
from scipy.sparse import hstack, csr_matrix
import warnings
# NOTE: this suppresses every warning raised in this process from here on,
# which also hides any warnings the models fitted below might emit.
warnings.filterwarnings('ignore')

# Load data
print("Loading training data...")
train_path = "/home/data/train.json"
# Decode explicitly as UTF-8 so the request text is read the same way on every
# platform instead of depending on the locale's default encoding.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# presumably a list of per-request records, one row each — verify against the file
train_df = pd.DataFrame(train_data)
print(f"Training samples: {len(train_df)}")
# Mean of the target column = fraction of requests that received a pizza.
print(f"Positive rate: {train_df['requester_received_pizza'].mean():.3f}")

In [None]:
# Create clean features (same as exp_001)
# Builds length features from the request text/title, copies a handful of
# "*_at_request" account statistics under shorter names, and reports each
# feature's Pearson correlation with the target.
print("Creating clean features...")

# Text features
# Missing text is counted as length 0, consistent with the fillna('') applied
# to the same column before TF-IDF; a NaN length would otherwise propagate into
# the tabular feature matrix.
train_df['text_length'] = train_df['request_text_edit_aware'].fillna('').str.len()
train_df['title_length'] = train_df['request_title'].fillna('').str.len()
train_df['total_length'] = train_df['text_length'] + train_df['title_length']

# Safe tabular features (at_request only)
# NOTE(review): despite its name this is x / (x + 1) of the *total* vote count
# (upvotes + downvotes), i.e. a saturating transform of activity, not an upvote
# share. Left unchanged to stay comparable with exp_001 — confirm the intent.
train_df['requester_upvotes_ratio'] = train_df['requester_upvotes_plus_downvotes_at_request'] / (train_df['requester_upvotes_plus_downvotes_at_request'] + 1)
train_df['account_age_days'] = train_df['requester_account_age_in_days_at_request']
train_df['comments_at_request'] = train_df['requester_number_of_comments_in_raop_at_request']
train_df['posts_at_request'] = train_df['requester_number_of_posts_on_raop_at_request']
train_df['days_since_first_post'] = train_df['requester_days_since_first_post_on_raop_at_request']

# Check feature correlations with target
features_to_check = ['text_length', 'title_length', 'total_length', 
                     'requester_upvotes_ratio', 'account_age_days',
                     'comments_at_request', 'posts_at_request', 'days_since_first_post']

# Pearson correlation of each engineered feature with the binary target.
correlations = {}
for feat in features_to_check:
    corr = train_df[feat].corr(train_df['requester_received_pizza'])
    correlations[feat] = corr
    print(f"{feat}: {corr:.3f}")

# Sort by absolute correlation (strongest linear association first,
# regardless of sign)
sorted_corr = sorted(correlations.items(), key=lambda x: abs(x[1]), reverse=True)
print("\nFeatures by absolute correlation:")
for feat, corr in sorted_corr:
    print(f"  {feat}: {corr:.3f}")

In [None]:
# Build the same model as exp_001 to verify
# TF-IDF text features and standardized tabular features are stacked into one
# sparse matrix and scored with 5-fold out-of-fold logistic-regression
# probabilities.
print("Building clean baseline model...")

# TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english', 
                       ngram_range=(1, 2), min_df=2, max_df=0.95)
text_data = train_df['request_text_edit_aware'].fillna('')
# NOTE: the vectorizer (and the scaler below) are fitted on all rows before the
# CV split, so each validation fold has influenced the vocabulary/IDF weights
# and the scaling statistics. No labels are involved, but the CV estimate can
# be slightly optimistic.
tfidf_features = tfidf.fit_transform(text_data)

# Tabular features
tabular_features = train_df[features_to_check].values
scaler = StandardScaler()
tabular_features_scaled = scaler.fit_transform(tabular_features)

# Combine
X = hstack([tfidf_features, csr_matrix(tabular_features_scaled)])
y = train_df['requester_received_pizza'].values

print(f"Feature matrix shape: {X.shape}")

# Cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Get cross-validated predictions to analyze
# Column 1 of predict_proba is the probability of the positive class.
y_pred_proba = cross_val_predict(
    LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000),
    X, y, cv=skf, method='predict_proba', n_jobs=5
)[:, 1]

cv_auc = roc_auc_score(y, y_pred_proba)
print(f"\nCross-validated AUC: {cv_auc:.4f}")

# Actually compare against the exp_001 score instead of asserting a match
# unconditionally; agreement is judged at the 4-decimal precision reported.
exp_001_auc = 0.6406
if abs(cv_auc - exp_001_auc) < 5e-4:
    print(f"This matches exp_001 score of {exp_001_auc:.4f}")
else:
    print(f"WARNING: this does NOT match exp_001 score of {exp_001_auc:.4f} "
          f"(difference: {cv_auc - exp_001_auc:+.4f})")

In [None]:
# Analyze predictions vs actuals
# Summarizes the out-of-fold probabilities overall and per true class.
print("Analyzing prediction quality...")

# Overall distribution of the predicted probabilities
pred_mean, pred_std = y_pred_proba.mean(), y_pred_proba.std()
pred_min, pred_max = y_pred_proba.min(), y_pred_proba.max()
print(f"Prediction mean: {pred_mean:.3f}")
print(f"Prediction std: {pred_std:.3f}")
print(f"Prediction min: {pred_min:.3f}")
print(f"Prediction max: {pred_max:.3f}")

# Class separation: average score given to true positives vs true negatives
# (a coarse separation check rather than a calibration curve)
positive_mask = y == 1
negative_mask = y == 0
mean_score_positives = y_pred_proba[positive_mask].mean()
mean_score_negatives = y_pred_proba[negative_mask].mean()

print(f"\nMean prediction for positives: {mean_score_positives:.3f}")
print(f"Mean prediction for negatives: {mean_score_negatives:.3f}")

# The closer these two means are, the less the model separates the classes

In [None]:
# Feature importance analysis for the clean model
# Refits the logistic regression on all rows and ranks features by the
# absolute value of their coefficients.
print("Analyzing feature importance...")

# Train a single model to get coefficients
model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
model.fit(X, y)

# Get feature names
# Use the actual vocabulary terms (prefixed with 'tfidf_') rather than bare
# column indices, so the ranking below is interpretable. The order returned by
# the vectorizer matches the column order of tfidf_features.
tfidf_feature_names = [f'tfidf_{term}' for term in tfidf.get_feature_names_out()]
tabular_feature_names = list(features_to_check)
# Same order as the hstack that built X: TF-IDF columns first, then tabular.
all_feature_names = tfidf_feature_names + tabular_feature_names

# Get coefficients
coefficients = model.coef_[0]

# Find top positive and negative features
# NOTE(review): TF-IDF columns and standardized tabular columns are on
# different scales, so coefficient magnitudes are only a rough importance proxy.
feature_importance = list(zip(all_feature_names, coefficients))
feature_importance.sort(key=lambda x: abs(x[1]), reverse=True)

print("Top 20 most important features:")
for i, (name, coef) in enumerate(feature_importance[:20]):
    print(f"{i+1:2d}. {name}: {coef:.4f}")

# Count how many are TF-IDF vs tabular
top_features = feature_importance[:50]
tfidf_important = sum(1 for name, _ in top_features if name.startswith('tfidf_'))
tabular_important = len(top_features) - tfidf_important

print(f"\nAmong top 50 features:")
print(f"  TF-IDF features: {tfidf_important}")
print(f"  Tabular features: {tabular_important}")

# This tells us whether text or tabular features are driving predictions

In [None]:
# Investigate temporal nature of the split
# Derives calendar components from the request timestamp and checks how each
# relates to the target.
print("Investigating temporal split...")

# Convert the Unix timestamp (seconds) to a datetime, then pull out components
train_df['timestamp'] = pd.to_datetime(train_df['unix_timestamp_of_request'], unit='s')
request_time = train_df['timestamp'].dt
for column, component in (
    ('year', 'year'),
    ('month', 'month'),
    ('day_of_week', 'dayofweek'),
    ('hour', 'hour'),
):
    train_df[column] = getattr(request_time, component)

# Pearson correlation of each temporal feature with the target
temporal_features = ['unix_timestamp_of_request', 'year', 'month', 'day_of_week', 'hour']
print("Temporal feature correlations:")
for temporal_feat in temporal_features:
    temporal_corr = train_df[temporal_feat].corr(train_df['requester_received_pizza'])
    print(f"  {temporal_feat}: {temporal_corr:.3f}")

# Success rate and sample count per calendar month (1-12, pooled across years)
monthly_success = (
    train_df
    .groupby('month')['requester_received_pizza']
    .agg(['mean', 'count'])
)
print("\nMonthly success rates:")
print(monthly_success)

In [None]:
# Analyze misclassified examples to understand patterns
# Thresholds the out-of-fold probabilities at 0.5 and compares the activity
# features of the false negatives with the whole training set.
print("Analyzing misclassifications...")

# Add predictions to dataframe for analysis
train_df['pred_proba'] = y_pred_proba
train_df['pred_label'] = (y_pred_proba > 0.5).astype(int)
train_df['correct'] = (train_df['pred_label'] == train_df['requester_received_pizza'])

# False positives (predicted success but actually failed)
false_positives = train_df[(train_df['pred_label'] == 1) & (train_df['requester_received_pizza'] == 0)]
print(f"False positives: {len(false_positives)} ({len(false_positives)/len(train_df)*100:.1f}%)")

# False negatives (predicted failure but actually succeeded)
false_negatives = train_df[(train_df['pred_label'] == 0) & (train_df['requester_received_pizza'] == 1)]
print(f"False negatives: {len(false_negatives)} ({len(false_negatives)/len(train_df)*100:.1f}%)")

# Check characteristics of false negatives (these are costly errors)
print(f"\nFalse negatives - mean posts_at_request: {false_negatives['posts_at_request'].mean():.2f}")
print(f"All samples - mean posts_at_request: {train_df['posts_at_request'].mean():.2f}")

# Fixed: the original referenced the undefined name `false_negates` here,
# which raised NameError before the comparison could be printed.
print(f"\nFalse negatives - mean comments_at_request: {false_negatives['comments_at_request'].mean():.2f}")
print(f"All samples - mean comments_at_request: {train_df['comments_at_request'].mean():.2f}")

# This helps us understand what patterns the model is missing

In [None]:
# Summary of findings and recommendations
# Prints a recap of the numbers computed in the earlier cells and writes the
# key figures to a text file.
print("="*60)
print("ANALYSIS SUMMARY AND RECOMMENDATIONS")
print("="*60)

# Single source of truth for the target score; the gap is derived from the
# measured AUC so the console report and the saved file cannot disagree.
gold_threshold = 0.979080
auc_gap_to_gold = gold_threshold - cv_auc

print(f"\n1. CLEAN BASELINE PERFORMANCE: {cv_auc:.4f} AUC")
print("   - This is realistic (not leaked) but needs improvement")
print(f"   - Gold threshold is {gold_threshold:.6f}, need {auc_gap_to_gold:+.4f} points")

print(f"\n2. FEATURE CORRELATIONS (top 3):")
for feat, corr in sorted_corr[:3]:
    print(f"   - {feat}: {corr:.3f}")

print(f"\n3. MODEL RELIES HEAVILY ON:")
# Decided by which feature family holds more of the top-50 coefficients.
if tfidf_important > tabular_important:
    print("   - TEXT features (TF-IDF dominates top 50)")
    print("   - Recommendation: Enhance text representation")
else:
    print("   - TABULAR features (meta features dominate)")
    print("   - Recommendation: Engineer more interaction features")

print(f"\n4. PREDICTION QUALITY ISSUES:")
print(f"   - Mean prediction for positives: {y_pred_proba[positive_mask].mean():.3f}")
print(f"   - Mean prediction for negatives: {y_pred_proba[negative_mask].mean():.3f}")
print("   - Model struggles to separate classes well")

print(f"\n5. KEY OPPORTUNITIES:")
print("   a) Upgrade model: XGBoost/LightGBM (typically +0.05-0.10 AUC)")
print("   b) Better text features: sentiment, readability, character ngrams")
# Fixed mis-encoded multiplication sign (was the mojibake sequence for "×").
print("   c) Interaction features: posts × comments, activity ratios")
print("   d) Temporal features: hour, day_of_week patterns")
print("   e) Advanced text: BERT/RoBERTa embeddings")

print(f"\n6. NEXT STEPS (priority order):")
print("   1. Try XGBoost with current features (quick win)")
print("   2. Add text meta-features (sentiment, readability)")
print("   3. Engineer interaction features")
print("   4. Try BERT embeddings if needed")

# Save key findings
# Explicit encoding keeps the file identical across platforms.
with open('/home/code/analysis_summary_loop2.txt', 'w', encoding='utf-8') as f:
    f.write(f"Clean Baseline AUC: {cv_auc:.4f}\n")
    f.write(f"Need improvement: {auc_gap_to_gold:.4f} points to reach gold\n")
    f.write(f"Top feature: {sorted_corr[0][0]} (correlation: {sorted_corr[0][1]:.3f})\n")
    f.write(f"Text features in top 50: {tfidf_important}\n")
    f.write(f"Tabular features in top 50: {tabular_important}\n")

print(f"\nAnalysis complete. Summary saved to analysis_summary_loop2.txt")