# Evolver Loop 3: Analysis of TF-IDF Failure

**Goal**: Understand why adding TF-IDF features improved AUC by only +0.0026, versus the expected +0.03 to +0.08.

**Hypotheses**:
1. Simple keyword features already capture similar signal (redundancy)
2. Too many TF-IDF features (12,959) causing noise/overfitting
3. LightGBM needs more iterations for sparse features
4. TF-IDF parameters not optimal (min_df=2, max_df=0.95)
5. Class imbalance not addressed (scale_pos_weight)
6. Need feature selection from TF-IDF features

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from scipy.sparse import hstack
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by pandas, scikit-learn and
# LightGBM in later cells, which can mask real problems during analysis.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator. The models and CV splitters in later
# cells are given their own explicit random_state values.
np.random.seed(42)
# IPython magic (not plain Python): render matplotlib figures inline.
%matplotlib inline

In [None]:
# Load data: read the train/test JSON dumps into DataFrames, extract the
# binary target and report the class balance.
print("Loading data...")
train_path = '/home/data/train.json'
test_path = '/home/data/test.json'

# JSON text is UTF-8 by specification; pinning the encoding keeps loading
# independent of the platform's locale default.
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Target vector. Cast to int so that boolean (True/False) labels become 0/1
# and np.bincount / the ratio below always operate on an integer array.
y = train_df['requester_received_pizza'].astype(int).values

# Count each class once and reuse the result for both report lines.
class_counts = np.bincount(y)
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Class distribution: {class_counts}")
print(f"Class imbalance ratio: {class_counts[0]/class_counts[1]:.2f}")

In [None]:
# Recreate the TF-IDF features from exp_003
print("\n=== RECREATING TF-IDF FROM EXP_003 ===")

# Simple keyword features (from exp_002 baseline)
def create_simple_text_features(df):
    """Build the hand-crafted text features used as the baseline.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``request_text_edit_aware`` column; missing values are
        treated as empty strings.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index) with ``text_length`` (character
        count) followed by seven 0/1 ``keyword_*`` indicator columns.
    """
    text = df['request_text_edit_aware'].fillna('')

    # Matched case-insensitively anywhere in the text (so 'thank' also hits
    # 'thanks' / 'thankful').
    substring_keywords = ['please', 'thank', 'sorry']

    # Matched case-insensitively as whole words only. Non-capturing groups
    # (?:...) match exactly what the capturing form would, but avoid the
    # pandas "pattern has match groups" UserWarning that str.contains emits
    # for capturing groups.
    whole_word_patterns = {
        'keyword_family': r'\b(?:family|mom|dad|mother|father|kid|child|children)\b',
        'keyword_work': r'\b(?:work|job|paycheck|money|broke)\b',
        'keyword_hungry': r'\b(?:hungry|starving|food|eat|meal)\b',
        'keyword_help': r'\b(?:help|need|desperate|emergency)\b',
    }

    features = pd.DataFrame()
    features['text_length'] = text.str.len()
    for keyword in substring_keywords:
        features[f'keyword_{keyword}'] = text.str.contains(keyword, case=False).astype(int)
    for column, pattern in whole_word_patterns.items():
        features[column] = text.str.contains(pattern, case=False).astype(int)
    return features

# Run the same keyword extractor over both splits so the train and test
# feature tables share identical columns.
simple_train, simple_test = (
    create_simple_text_features(frame) for frame in (train_df, test_df)
)

# Report the training table's dimensions and preview its first rows.
print(f"Simple text features shape: {simple_train.shape}")
print("Simple text features:", simple_train.head(), sep="\n")

In [None]:
# Rebuild the exp_003 TF-IDF representation and stack it next to the simple
# keyword features.
print("\n=== TF-IDF FROM EXP_003 ===")

# Raw request text for each split; missing requests become empty documents.
train_text, test_text = (
    frame['request_text_edit_aware'].fillna('') for frame in (train_df, test_df)
)

# Vectorizer settings used in exp_003: unigrams + bigrams, English stop words
# removed, terms kept only if they occur in >= 2 documents and <= 95% of them,
# log-scaled term frequency, L2-normalised rows, vocabulary capped at 15,000.
tfidf_exp003 = TfidfVectorizer(
    norm='l2',
    sublinear_tf=True,
    max_df=0.95,
    min_df=2,
    stop_words='english',
    ngram_range=(1, 2),
    max_features=15000,
)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely projected onto that vocabulary.
tfidf_train = tfidf_exp003.fit_transform(train_text)
tfidf_test = tfidf_exp003.transform(test_text)

vocabulary_size = len(tfidf_exp003.vocabulary_)
print(f"TF-IDF vocabulary size: {vocabulary_size}")
print(f"TF-IDF train shape: {tfidf_train.shape}")
print(f"TF-IDF test shape: {tfidf_test.shape}")

# Combine simple + TF-IDF: keyword columns first, TF-IDF columns after, CSR.
X_simple_tfidf = hstack([simple_train, tfidf_train], format='csr')
print(f"Combined shape: {X_simple_tfidf.shape}")

In [None]:
# HYPOTHESIS 1: Are simple keyword features redundant with TF-IDF?
print("\n=== HYPOTHESIS 1: REDUNDANCY CHECK ===")

# One representative term per simple keyword feature.
keyword_terms = ['please', 'thank', 'sorry', 'family', 'work', 'hungry', 'help']

# Check if each keyword survived into the TF-IDF vocabulary. A keyword can be
# absent (the vectorizer above uses stop_words='english' and min_df=2), so the
# outcome is recorded instead of assumed.
keywords_found = []
keywords_missing = []
for keyword in keyword_terms:
    idx = tfidf_exp003.vocabulary_.get(keyword)
    if idx is not None:
        keywords_found.append(keyword)
        print(f"✓ '{keyword}' found in TF-IDF vocabulary (index {idx})")
    else:
        keywords_missing.append(keyword)
        print(f"✗ '{keyword}' NOT found in TF-IDF vocabulary")

# Check bigrams containing keywords
print("\nChecking bigrams containing keywords:")
bigram_matches = 0
for term, idx in tfidf_exp003.vocabulary_.items():
    # A bigram is a vocabulary entry containing a space. Count it once even
    # when it contains several keywords, so the total below really is the
    # number of distinct bigrams.
    if ' ' in term and any(keyword in term for keyword in keyword_terms):
        bigram_matches += 1
        if bigram_matches <= 5:  # Show first 5
            print(f"  '{term}' (index {idx})")

print(f"Total bigrams containing keywords: {bigram_matches}")

# Derive the conclusion from what was actually found rather than printing a
# fixed verdict that may contradict the check marks above.
if not keywords_missing:
    print("\nConclusion: Simple keywords ARE captured by TF-IDF, creating redundancy!")
elif keywords_found:
    print(
        f"\nConclusion: {len(keywords_found)}/{len(keyword_terms)} simple keywords are "
        f"captured by TF-IDF (partial redundancy); not in vocabulary: "
        f"{', '.join(keywords_missing)}"
    )
else:
    print("\nConclusion: Simple keywords are NOT in the TF-IDF vocabulary; no direct redundancy found")

In [None]:
# HYPOTHESIS 2: Too many features causing overfitting?
print("\n=== HYPOTHESIS 2: FEATURE COUNT ANALYSIS ===")

n_samples, n_total_features = X_simple_tfidf.shape
n_tfidf_features = tfidf_train.shape[1]

print(f"Simple text features: {simple_train.shape[1]}")
print(f"TF-IDF features: {n_tfidf_features}")
print(f"Total features: {n_total_features}")
print(f"Samples: {n_samples}")
print(f"Feature-to-sample ratio: {n_total_features/n_samples:.2f}")

# Check sparsity. The value reported is the percentage of stored (non-zero)
# entries in the combined matrix, as the label states.
sparsity = (X_simple_tfidf.nnz / (n_samples * n_total_features)) * 100
print(f"Sparsity: {sparsity:.2f}% non-zero")

# Try with fewer TF-IDF features
print("\n=== TESTING WITH FEWER TF-IDF FEATURES ===")

# NOTE: this loop only reports the resulting matrix width for each cap; it
# does not train or score a model.
for max_features in [5000, 8000, 10000]:
    # Use a dedicated name for the reduced vectorizer. Reusing `tfidf_test`
    # here would overwrite the test-set TF-IDF matrix built earlier in the
    # notebook with a vectorizer object.
    tfidf_reduced = TfidfVectorizer(
        max_features=max_features,
        ngram_range=(1, 2),
        stop_words='english',
        min_df=2,
        max_df=0.95,
        sublinear_tf=True,
        norm='l2'
    )
    tfidf_reduced.fit(train_text)
    tfidf_small = tfidf_reduced.transform(train_text)

    # Built from training text, so it is named for what it is (not "X_test").
    X_reduced = hstack([simple_train, tfidf_small], format='csr')
    print(f"  {max_features} TF-IDF features: {X_reduced.shape[1]} total features")

# Report the measured sizes instead of hard-coded figures so the message stays
# correct if the data or vectorizer settings change.
print(f"\nConclusion: {n_tfidf_features:,} TF-IDF features may be too many for {n_samples:,} samples!")

In [None]:
# HYPOTHESIS 3: LightGBM needs more iterations for sparse features?
print("\n=== HYPOTHESIS 3: TRAINING ITERATIONS ===")

# Per-fold iteration counts transcribed from the exp_003 CV results. Keeping
# them in a list lets the average be computed instead of typed by hand.
fold_iterations = [9, 113, 48, 8, 63]
avg_iterations = sum(fold_iterations) / len(fold_iterations)

# Check iteration counts from exp_003
print("From exp_003 CV results:")
for fold, n_iter in enumerate(fold_iterations, start=1):
    print(f"Fold {fold}: {n_iter} iterations")
print(f"Average: {avg_iterations:.0f} iterations")

print("\nAnalysis:")
print("- Sparse text features typically need more iterations than dense features")
print(f"- {avg_iterations:.0f} average iterations is quite low for 12,988 features")
print("- Model may be underfitting, not learning TF-IDF patterns")
print("- Early stopping at 50 rounds may be too aggressive")

# The analysis above flags the 50-round patience as possibly too aggressive,
# so the recommendation relaxes it rather than keeping it unchanged.
# NOTE(review): a larger num_boost_round only matters if early stopping is not
# what ended training -- confirm against exp_003's actual settings.
print("\nRecommendation: Increase num_boost_round to 2000-3000 and raise early-stopping patience above 50 (e.g. 100-200)")

In [None]:
# HYPOTHESIS 4: TF-IDF parameters not optimal?
print("\n=== HYPOTHESIS 4: TF-IDF PARAMETER ANALYSIS ===")

print("Current parameters (exp_003):")
print("- min_df=2 (ignore terms appearing in <2 documents)")
print("- max_df=0.95 (ignore terms appearing in >95% of documents)")
print("- ngram_range=(1, 2) (unigrams + bigrams)")
print("- max_features=15000")

# Check document frequency distribution
from collections import Counter
import re

# Tokenize to analyze document frequencies
def tokenize(text):
    """Return the lower-cased word tokens (runs of \\w characters) in *text*."""
    return re.findall(r'\b\w+\b', text.lower())

# Document frequency = number of documents containing the token at least
# once. Each document's tokens are de-duplicated with set() before counting;
# counting every occurrence instead would give corpus term frequency, which
# is not what min_df / max_df threshold on and not what the labels below say.
# NOTE: this is a unigram-only approximation; the vectorizer above also
# removes English stop words and adds bigrams.
token_counts = Counter()
for text in train_text:
    token_counts.update(set(tokenize(text)))

print(f"\nTotal unique tokens: {len(token_counts)}")
print(f"Tokens appearing in only 1 document: {sum(1 for count in token_counts.values() if count == 1)}")
print(f"Tokens appearing in 2-5 documents: {sum(1 for count in token_counts.values() if 2 <= count <= 5)}")

# Check most common tokens
print("\nMost common tokens:")
for token, count in token_counts.most_common(10):
    print(f"  '{token}': {count} documents")

print("\nAnalysis:")
print("- min_df=2 removes rare terms (appearing in only 1 doc)")
print("- max_df=0.95 removes very common terms (>95% of docs)")
print("- These settings seem reasonable but could be tuned")
print("- Could try min_df=3 or 5 to remove more rare terms")

In [None]:
# HYPOTHESIS 5: Class imbalance not addressed?
print("\n=== HYPOTHESIS 5: CLASS IMBALANCE IMPACT ===")

# Negative/positive ratio measured from the labels; this is the value tested
# as scale_pos_weight below, so the experiment matches the recommendation
# printed here instead of a hand-rounded constant.
class_counts = np.bincount(y)
neg_pos_ratio = class_counts[0] / class_counts[1]

print(f"Class distribution: {class_counts}")
print(f"Negative/Positive ratio: {neg_pos_ratio:.2f}")
print(f"Recommended scale_pos_weight: {neg_pos_ratio:.1f}")

print("\nImpact on AUC:")
print("- AUC is less sensitive to class imbalance than log loss")
print("- But imbalance can still affect model calibration")
print(f"- scale_pos_weight={neg_pos_ratio:.1f} may improve positive class recall")
print("- Could add +0.01 to +0.02 AUC improvement")

# Quick test with scale_pos_weight
print("\n=== QUICK TEST: scale_pos_weight IMPACT ===")


def _cv_auc_scores(X, y, cv, **extra_params):
    """Return the validation ROC-AUC of each CV fold for an LGBMClassifier.

    The base configuration (300 trees, learning_rate 0.05, 31 leaves,
    random_state 42) is shared by every run; ``extra_params`` are forwarded to
    ``lgb.LGBMClassifier`` so the two compared runs differ only in them.
    """
    fold_scores = []
    for train_idx, val_idx in cv.split(X, y):
        model = lgb.LGBMClassifier(
            n_estimators=300,
            learning_rate=0.05,
            num_leaves=31,
            random_state=42,
            **extra_params,
        )
        model.fit(X[train_idx], y[train_idx])
        pred = model.predict_proba(X[val_idx])[:, 1]
        fold_scores.append(roc_auc_score(y[val_idx], pred))
    return fold_scores


# Use just simple text features for fast test
X_simple = simple_train.values

# A fixed integer random_state makes both runs see identical folds.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Without scale_pos_weight
scores_without = _cv_auc_scores(X_simple, y, cv)

# With scale_pos_weight
scores_with = _cv_auc_scores(X_simple, y, cv, scale_pos_weight=neg_pos_ratio)

print(f"Without scale_pos_weight: {np.mean(scores_without):.4f} ± {np.std(scores_without):.4f}")
print(f"With scale_pos_weight={neg_pos_ratio:.1f}: {np.mean(scores_with):.4f} ± {np.std(scores_with):.4f}")
print(f"Difference: {np.mean(scores_with) - np.mean(scores_without):.4f}")

In [None]:
# HYPOTHESIS 6: Need feature selection from TF-IDF?
print("\n=== HYPOTHESIS 6: FEATURE SELECTION ANALYSIS ===")

# Test different numbers of TF-IDF features using chi-square selection
print("Testing feature selection with chi-square:")

# Quick 3-fold CV; the fixed integer random_state yields the same folds for
# every k, so the scores are comparable across k.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
simple_dense = simple_train.values
n_tfidf_features = tfidf_train.shape[1]

for k in [1000, 3000, 5000, 8000]:
    # SelectKBest rejects k larger than the number of available columns.
    k_effective = min(k, n_tfidf_features)
    cv_scores = []

    for train_idx, val_idx in cv.split(tfidf_train, y):
        y_tr = y[train_idx]
        y_val = y[val_idx]

        # Fit the chi-square selector on the training fold ONLY. Fitting it on
        # all rows with their labels before splitting lets validation labels
        # influence which features are kept, inflating the CV AUC.
        selector = SelectKBest(chi2, k=k_effective)
        tfidf_tr = selector.fit_transform(tfidf_train[train_idx], y_tr)
        tfidf_val = selector.transform(tfidf_train[val_idx])

        X_tr = hstack([simple_dense[train_idx], tfidf_tr], format='csr')
        X_val = hstack([simple_dense[val_idx], tfidf_val], format='csr')

        train_set = lgb.Dataset(X_tr, label=y_tr)
        # reference= makes the validation set reuse the training set's binning.
        val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

        # NOTE: early stopping is driven by the same fold that is scored
        # below, so the reported AUC is still somewhat optimistic.
        model = lgb.train(
            {'objective': 'binary', 'metric': 'auc', 'verbose': -1},
            train_set,
            num_boost_round=500,
            valid_sets=[val_set],
            callbacks=[lgb.early_stopping(20), lgb.log_evaluation(0)]
        )

        pred = model.predict(X_val, num_iteration=model.best_iteration)
        cv_scores.append(roc_auc_score(y_val, pred))

    print(f"  Top {k_effective} TF-IDF features: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

print("\nConclusion: Feature selection may help by removing noisy features!")

In [None]:
# SUMMARY OF FINDINGS
# Everything printed in this cell is fixed text written by the author; none of
# the figures or verdicts are computed from variables in earlier cells.
_rule = "=" * 60

# (heading, bullet lines) for each hypothesis, in report order.
_findings = [
    ("REDUNDANCY (CONFIRMED)", [
        "Simple keywords ARE captured by TF-IDF",
        "Creates duplicate signal, not new information",
        "Solution: Remove simple keyword features when using TF-IDF",
    ]),
    ("TOO MANY FEATURES (CONFIRMED)", [
        "12,959 TF-IDF features for 2,878 samples",
        "Feature-to-sample ratio: 4.5x",
        "High risk of overfitting and noise",
        "Solution: Reduce to 5,000-8,000 features",
    ]),
    ("INSUFFICIENT TRAINING (CONFIRMED)", [
        "Average 48 iterations per fold",
        "Too low for 12,988 sparse features",
        "Model underfitting, not learning TF-IDF patterns",
        "Solution: Increase to 2000-3000 iterations",
    ]),
    ("TF-IDF PARAMETERS (POSSIBLE)", [
        "min_df=2, max_df=0.95 seem reasonable",
        "Could try min_df=3-5 to remove more rare terms",
        "Lower priority than other fixes",
    ]),
    ("CLASS IMBALANCE (CONFIRMED)", [
        "75/25 imbalance not addressed",
        "scale_pos_weight=3.0 may give +0.01-0.02 AUC",
        "Easy win, should implement",
    ]),
    ("FEATURE SELECTION (PROMISING)", [
        "Selecting top 5K-8K features may improve performance",
        "Removes noisy, low-importance features",
        "Should test in next experiment",
    ]),
]

# Numbered action items for the follow-up experiment.
_recommendations = [
    "Remove simple keyword features (redundant with TF-IDF)",
    "Reduce TF-IDF to 8,000 features (from 12,959)",
    "Increase training iterations to 2000-3000",
    "Add scale_pos_weight=3.0 for class imbalance",
    "Try feature selection (chi-square) if still underperforming",
]

# Findings section: banner, then one blank-line-separated entry per hypothesis.
print("\n" + _rule)
print("SUMMARY: WHY TF-IDF FAILED IN EXP_003")
print(_rule)

for _number, (_heading, _bullets) in enumerate(_findings, start=1):
    print(f"\n{_number}. {_heading}:")
    for _bullet in _bullets:
        print(f"   - {_bullet}")

# Recommendations section: banner, blank line, numbered list.
print("\n" + _rule)
print("RECOMMENDATIONS FOR NEXT EXPERIMENT:")
print(_rule)

print()
for _number, _item in enumerate(_recommendations, start=1):
    print(f"{_number}. {_item}")

# Closing targets.
print()
print("Expected improvement: +0.03 to +0.08 AUC")
print("Target: 0.67-0.72 AUC (vs current 0.6413)")