# Evolver Loop 2 Analysis

## Goal: Diagnose failures and identify breakthrough strategies

Current Status:
- Best CV: 0.6374 (baseline)
- Latest: 0.6129 (enhanced features) - WORSE
- Evaluator verdict: UNRELIABLE due to data leakage
- Gap to gold: 0.3417

Focus areas:
1. Understand why enhanced features failed
2. Validate data leakage impact
3. Research BERT/transformer approaches
4. Design ensemble strategy

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Load data
def _load_json(path):
    """Read the JSON file at *path* and return the parsed object."""
    # JSON text is UTF-8 by specification; pinning the encoding keeps the
    # load independent of the platform's locale default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


print("Loading data...")
train_data = _load_json('/home/data/train.json')
train_df = pd.DataFrame(train_data)

test_data = _load_json('/home/data/test.json')
test_df = pd.DataFrame(test_data)

print(f"Train: {len(train_df)}, Test: {len(test_df)}")
print(f"Target distribution: {train_df['requester_received_pizza'].value_counts(normalize=True).to_dict()}")

# Check for data leakage issue
# Rebuild the train+test corpus (title + body, missing values as '') only to
# report how many documents a TF-IDF fit on the combined text would see.
print("\n=== DATA LEAKAGE ANALYSIS ===")
combined_text = pd.concat([
    train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna(''),
    test_df['request_title'].fillna('') + ' ' + test_df['request_text_edit_aware'].fillna('')
])
print(f"Combined text samples: {len(combined_text)}")
print(f"Train text samples: {len(train_df)}")
print(f"Test text samples: {len(test_df)}")
print(f"Leakage issue: TF-IDF fit on {len(combined_text)} samples instead of {len(train_df)} training samples")
print("This means test data influenced training - INVALID CV!")

In [None]:
# Analyze feature dimensionality issue
print("=== FEATURE DIMENSIONALITY ANALYSIS ===")

# Simulate the enhanced features setup
request_text = train_df['request_text_edit_aware']
request_title = train_df['request_title']

# Text features
train_df['text_length'] = request_text.str.len()
train_df['word_count'] = request_text.str.split().str.len()
train_df['title_length'] = request_title.str.len()
train_df['title_word_count'] = request_title.str.split().str.len()
# +1 in the denominator avoids division by zero for empty requests
train_df['avg_word_length'] = train_df['text_length'] / (train_df['word_count'] + 1)

# Persuasion features
persuasion_words = ['please', 'help', 'appreciate', 'grateful', 'thank', 'thanks', 'kind', 'generous', 'need']
# Lower-case once instead of once per word.
lowered_text = request_text.str.lower()
for word in persuasion_words:
    # regex=False: the words are literal substrings, not patterns.
    # na=False: a missing request text counts as "word absent"; without it the
    # NaN result cannot be cast to int and the cell raises.
    train_df[f'has_{word}'] = lowered_text.str.contains(word, regex=False, na=False).astype(int)

# Punctuation
# Series.str.count takes a regex, so '?' must be escaped; the raw string
# avoids the invalid '\?' escape sequence in a normal string literal.
train_df['exclamation_count'] = request_text.str.count('!')
train_df['question_count'] = request_text.str.count(r'\?')

# Interaction features (+1 in each denominator guards against division by zero)
train_df['user_credibility'] = (train_df['requester_upvotes_plus_downvotes_at_request']) / (train_df['requester_account_age_in_days_at_request'] + 1)
train_df['comments_per_post'] = train_df['requester_number_of_comments_at_request'] / (train_df['requester_number_of_posts_at_request'] + 1)
train_df['upvote_ratio'] = train_df['requester_upvotes_minus_downvotes_at_request'] / (train_df['requester_upvotes_plus_downvotes_at_request'] + 1)
train_df['subreddit_diversity'] = train_df['requester_number_of_subreddits_at_request'] / (train_df['requester_account_age_in_days_at_request'] + 1)
train_df['request_quality'] = train_df['word_count'] * train_df['upvote_ratio']

# Temporal
# Parse the unix timestamp once and derive both calendar features from it.
request_time = pd.to_datetime(train_df['unix_timestamp_of_request'], unit='s')
train_df['request_hour'] = request_time.dt.hour
train_df['request_day_of_week'] = request_time.dt.dayofweek
train_df['is_hour_15'] = (train_df['request_hour'] == 15).astype(int)

# Count features
base_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_account_age_in_days_at_request'
]

text_features = ['text_length', 'word_count', 'title_length', 'title_word_count', 'avg_word_length']
persuasion_features = [f'has_{word}' for word in persuasion_words]
punctuation_features = ['exclamation_count', 'question_count']
interaction_features = [
    'user_credibility', 'comments_per_post', 'upvote_ratio', 'subreddit_diversity',
    'request_quality', 'request_hour', 'request_day_of_week', 'is_hour_15'
]

all_num_features = base_features + text_features + persuasion_features + punctuation_features + interaction_features
available_features = [f for f in all_num_features if f in train_df.columns]

# TF-IDF vocabulary size assumed for the enhanced-features setup being analysed.
n_tfidf_features = 10000
n_total_features = len(available_features) + n_tfidf_features

print(f"Numerical features: {len(available_features)}")
print(f"TF-IDF features (enhanced): {n_tfidf_features:,}")
print(f"Total features: {n_total_features}")
print(f"Training samples: {len(train_df)}")
print(f"Feature-to-sample ratio: {n_total_features / len(train_df):.2f}:1")
print("\nThis is SEVERELY OVERPARAMETRIZED!")
print("Recommended ratio: < 0.1:1 for generalization")

# Check correlations among engineered features
print("\n=== FEATURE CORRELATION ANALYSIS ===")
corr_matrix = train_df[available_features].corr().abs()
corr_cols = corr_matrix.columns
corr_values = corr_matrix.to_numpy()
# Walk the strict upper triangle so each unordered pair is visited once.
# NaN correlations (e.g. constant columns) compare False and are skipped.
upper_rows, upper_cols = np.triu_indices(len(corr_cols), k=1)
high_corr = [
    (corr_cols[i], corr_cols[j], corr_values[i, j])
    for i, j in zip(upper_rows, upper_cols)
    if corr_values[i, j] > 0.9
]
# Sort strongest-first so the "top" pairs printed below really are the top.
high_corr.sort(key=lambda pair: pair[2], reverse=True)

print(f"Highly correlated feature pairs (>0.9): {len(high_corr)}")
if high_corr:
    print("Top correlated pairs:")
    for f1, f2, corr in high_corr[:5]:
        print(f"  {f1} <-> {f2}: {corr:.3f}")

In [None]:
# Test proper TF-IDF fitting (no leakage)
print("=== PROPER TF-IDF VALIDATION ===")

# Combine title and text
combined_text_train = train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna('')
combined_text_test = test_df['request_title'].fillna('') + ' ' + test_df['request_text_edit_aware'].fillna('')

# Test different TF-IDF configurations.
# "fit_on_test" states explicitly whether the vectorizer is fit on train+test.
# Deriving this from the display name is wrong: the substring "Leakage" also
# occurs in "Proper-No-Leakage", which silently put the clean configuration
# on the leaky code path.
tfidf_configs = [
    {"name": "Baseline (100 features)", "max_features": 100, "ngram_range": (1, 1), "fit_on_test": False},
    {"name": "Enhanced-Leakage (10k features)", "max_features": 10000, "ngram_range": (1, 3), "fit_on_test": True},
    {"name": "Proper-No-Leakage (2k features)", "max_features": 2000, "ngram_range": (1, 2), "fit_on_test": False},
]

# Loop-invariant setup: target vector, model parameters and the CV splitter
# are identical for every configuration, so build them once.
y = train_df['requester_received_pizza'].astype(int).values

# Simple LightGBM with few iterations
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'verbose': -1,
    'scale_pos_weight': 3.0
}

# Fixed random_state => the same folds for every configuration.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

results = []

for config in tfidf_configs:
    print(f"\nTesting: {config['name']}")

    # Create TF-IDF
    tfidf = TfidfVectorizer(
        max_features=config['max_features'],
        ngram_range=config['ngram_range'],
        stop_words='english',
        sublinear_tf=True,
        min_df=2,
        max_df=0.95
    )

    if config['fit_on_test']:
        # Fit on combined (WRONG - what exp_001 did)
        all_text = pd.concat([combined_text_train, combined_text_test])
        tfidf.fit(all_text)
        print(f"  Fitting on {len(all_text)} samples (TRAIN+TEST) - LEAKAGE!")
    else:
        # Fit on train only (CORRECT)
        tfidf.fit(combined_text_train)
        print(f"  Fitting on {len(combined_text_train)} samples (TRAIN ONLY) - VALID")

    # Transform
    # NOTE(review): the vectorizer is fit before the CV split, so each
    # validation fold still contributes to the vocabulary/IDF weights. Fitting
    # inside each fold would remove that residual leakage.
    tfidf_train = tfidf.transform(combined_text_train)

    # Quick validation with simple model
    fold_scores = []

    for train_idx, val_idx in skf.split(tfidf_train, y):
        X_tr, X_val = tfidf_train[train_idx], tfidf_train[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        # Named lgb_train/lgb_valid so the raw JSON records held in
        # `train_data` by the loading cell are not overwritten.
        lgb_train = lgb.Dataset(X_tr, label=y_tr)
        lgb_valid = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

        # NOTE(review): early stopping is driven by the same fold that is
        # scored below, so the fold AUC is optimistically biased.
        model = lgb.train(
            params,
            lgb_train,
            num_boost_round=100,
            valid_sets=[lgb_valid],
            callbacks=[lgb.early_stopping(10), lgb.log_evaluation(0)]
        )

        val_pred = model.predict(X_val, num_iteration=model.best_iteration)
        fold_scores.append(roc_auc_score(y_val, val_pred))

    cv_score = np.mean(fold_scores)
    cv_std = np.std(fold_scores)

    results.append({
        'config': config['name'],
        'score': cv_score,
        'std': cv_std,
        'n_features': tfidf_train.shape[1]
    })

    print(f"  CV Score: {cv_score:.4f} ± {cv_std:.4f}")
    print(f"  Features: {tfidf_train.shape[1]}")

# Summary
print("\n=== SUMMARY ===")
results_df = pd.DataFrame(results)
print(results_df)

# Row with the highest mean CV AUC.
best_config = results_df.loc[results_df['score'].idxmax()]
print(f"\nBest configuration: {best_config['config']}")
print(f"Score: {best_config['score']:.4f} ± {best_config['std']:.4f}")
print(f"Features: {best_config['n_features']}")

In [None]:
# Research BERT approach feasibility
print("=== BERT FEASIBILITY ANALYSIS ===")

# Check if GPU is available (BERT needs it for reasonable speed)
# This cell is only a feasibility probe, so a missing PyTorch install is
# reported and treated as "no GPU" instead of aborting the notebook.
try:
    import torch
except ImportError:
    torch = None
    print("✗ PyTorch is not installed - cannot probe for a GPU")

# Query CUDA once and reuse the answer for both reports below.
gpu_available = torch is not None and torch.cuda.is_available()

if gpu_available:
    print(f"✓ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("✗ No GPU available - BERT will be very slow on CPU")

# Estimate BERT processing time
n_samples = len(train_df) + len(test_df)
print(f"\nTotal samples to process: {n_samples}")
print(f"Average text length: {train_df['request_text_edit_aware'].str.len().mean():.0f} chars")

if gpu_available:
    print("Estimated processing time on GPU: ~5-10 minutes")
else:
    print("Estimated processing time on CPU: ~1-2 hours (NOT RECOMMENDED)")

# Check available pretrained models
print("\nRecommended models:")
print("1. distilbert-base-uncased (fastest, 768 dims)")
print("2. bert-base-uncased (standard, 768 dims)")
print("3. all-MiniLM-L6-v2 (very fast, 384 dims)")

# Memory requirements
embedding_dim = 768
# 4 bytes per value, i.e. this assumes float32 embeddings.
memory_per_sample = embedding_dim * 4 / 1e6  # MB
print("\nMemory requirements:")
print(f"  Embedding dimension: {embedding_dim}")
# One embedding is ~0.003 MB, which a two-decimal MB format renders as
# "0.00 MB"; report the per-sample figure in KB so it is readable.
print(f"  Memory per sample: {memory_per_sample * 1e3:.2f} KB")
print(f"  Total memory for all samples: {memory_per_sample * n_samples:.0f} MB")
print("  This is manageable even on CPU")

In [None]:
# Analyze what went wrong with enhanced features
# The whole report is assembled as a list of lines and emitted in one go;
# empty strings stand for the blank separator lines.
numeric_feature_count = len(available_features)
features_per_sample = (numeric_feature_count + 10000) / len(train_df)

report_lines = [
    "=== FAILURE ANALYSIS ===",
    "Why did enhanced features (0.6129) perform WORSE than baseline (0.6374)?",
    "",
    "1. DATA LEAKAGE (Primary Issue)",
    "   - TF-IDF fit on train+test data",
    "   - Test distribution influenced training",
    "   - CV scores are INVALID",
    "",
    "2. OVERPARAMETERIZATION (Secondary Issue)",
    f"   - 10,000 TF-IDF features + {numeric_feature_count} numerical features",
    f"   - {features_per_sample:.2f} features per sample",
    "   - Severe overfitting risk",
    "",
    "3. FEATURE QUALITY (Tertiary Issue)",
    "   - Many engineered features may be noisy",
    "   - No feature selection performed",
    "   - High correlation between some features",
    "",
    "4. MODEL INSTABILITY",
    "   - Early stopping varied 9-140 rounds across folds",
    "   - High learning rate (0.05) may cause instability",
    "",
    "=== WHAT TO DO NEXT ===",
    "",
    "1. FIX LEAKAGE: Fit TF-IDF only on training data",
    "2. REDUCE DIMENSIONALITY: Use 1,000-2,000 TF-IDF features max",
    "3. TRY BERT: Use pretrained embeddings (768 dims vs 10,000)",
    "4. ENSEMBLE: Combine multiple models for stability",
    "5. FEATURE SELECTION: Keep only important features",
    "",
    "Priority order: 3 > 1 > 4 > 2 > 5",
    "BERT is most likely to give breakthrough improvement",
]
print("\n".join(report_lines))