# Experiment 004: Fix TF-IDF Implementation

**Goal**: Address the 6 issues identified in evolver_loop3_analysis.ipynb that caused TF-IDF to only add +0.0026 AUC

**Changes from exp_003**:
1. Remove redundant simple keyword features (6 of 7 are in TF-IDF vocabulary)
2. Reduce TF-IDF features from 12,959 to 8,000 (max_features=8000)
3. Increase training iterations from ~50 to 2000 (num_boost_round=2000)
4. Add class imbalance handling (scale_pos_weight=3.0)
5. Keep all tabular features (upvotes_minus_downvotes, account_age_at_request, etc.)

**Expected outcome**: 0.6413 → 0.67-0.72 AUC (+0.03 to +0.08)

In [4]:
import pandas as pd
import numpy as np
import json
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

In [5]:
# Load data
print("Loading data...")
train_path = '/home/data/train.json'
test_path = '/home/data/test.json'

with open(train_path, 'r') as f:
    train_data = json.load(f)
with open(test_path, 'r') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

y = train_df['requester_received_pizza'].values
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Class distribution: {np.bincount(y)}")
print(f"Class imbalance ratio: {np.bincount(y)[0]/np.bincount(y)[1]:.2f}")

Loading data...
Train shape: (2878, 32)
Test shape: (1162, 17)
Class distribution: [2163  715]
Class imbalance ratio: 3.03


In [6]:
# Extract tabular features (same as exp_002 baseline)
print("\nExtracting tabular features...")

def extract_features(df):
    features = {}
    
    # Text length features
    features['text_length'] = df['request_text_edit_aware'].fillna('').str.len()
    features['word_count'] = df['request_text_edit_aware'].fillna('').str.split().str.len()
    features['avg_word_length'] = features['text_length'] / np.maximum(features['word_count'], 1)
    
    # Engagement features (using available columns)
    features['upvotes_minus_downvotes'] = df['requester_upvotes_minus_downvotes_at_request']
    features['upvotes_plus_downvotes'] = df['requester_upvotes_plus_downvotes_at_request']
    
    # Activity features
    features['num_posts_at_request'] = df['requester_number_of_posts_at_request']
    features['num_comments_at_request'] = df['requester_number_of_comments_at_request']
    features['comments_per_post'] = features['num_comments_at_request'] / np.maximum(features['num_posts_at_request'], 1)
    
    # Account age (already in days, no conversion needed)
    features['account_age_at_request'] = df['requester_account_age_in_days_at_request']
    
    # Subreddit count
    features['requester_subreddits_at_request'] = df['requester_subreddits_at_request'].apply(
        lambda x: len(x) if isinstance(x, list) else 0
    )
    
    return pd.DataFrame(features)

X_tabular_train = extract_features(train_df)
X_tabular_test = extract_features(test_df)

print(f"Tabular features shape: {X_tabular_train.shape}")
print(f"Tabular features: {list(X_tabular_train.columns)}")


Extracting tabular features...
Tabular features shape: (2878, 10)
Tabular features: ['text_length', 'word_count', 'avg_word_length', 'upvotes_minus_downvotes', 'upvotes_plus_downvotes', 'num_posts_at_request', 'num_comments_at_request', 'comments_per_post', 'account_age_at_request', 'requester_subreddits_at_request']


In [7]:
# Extract TF-IDF features (FIX #1 & #2: Remove simple keywords, reduce to 8000 features)
print("\nExtracting TF-IDF features...")

# Use same parameters as exp_003 but with max_features=8000
tfidf = TfidfVectorizer(
    max_features=8000,  # FIX #2: Reduced from 15000 to 8000
    ngram_range=(1, 2),
    stop_words='english',
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
    norm='l2'
)

train_text = train_df['request_text_edit_aware'].fillna('')
test_text = test_df['request_text_edit_aware'].fillna('')

tfidf.fit(train_text)
tfidf_train = tfidf.transform(train_text)
tfidf_test = tfidf.transform(test_text)

print(f"TF-IDF vocabulary size: {len(tfidf.vocabulary_)}")
print(f"TF-IDF train shape: {tfidf_train.shape}")
print(f"TF-IDF test shape: {tfidf_test.shape}")


Extracting TF-IDF features...


TF-IDF vocabulary size: 8000
TF-IDF train shape: (2878, 8000)
TF-IDF test shape: (1162, 8000)


In [8]:
# Combine features
print("\nCombining features...")
X_train = hstack([X_tabular_train, tfidf_train], format='csr')
X_test = hstack([X_tabular_test, tfidf_test], format='csr')

print(f"Final train shape: {X_train.shape}")
print(f"Final test shape: {X_test.shape}")
print(f"Total features: {X_train.shape[1]}")
print(f"Feature-to-sample ratio: {X_train.shape[1] / X_train.shape[0]:.2f}")


Combining features...
Final train shape: (2878, 8010)
Final test shape: (1162, 8010)
Total features: 8010
Feature-to-sample ratio: 2.78


In [12]:
# Cross-validation with FIXED parameters
print("\n" + "="*60)
print("TRAINING WITH FIXED TF-IDF IMPLEMENTATION")
print("="*60)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(y))
test_preds = np.zeros(len(test_df))
auc_scores = []
logloss_scores = []
fold_iterations = []

feature_importance_list = []

for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, y)):
    print(f"\nFold {fold+1}/5")
    
    X_tr = X_train[train_idx]
    X_val = X_train[val_idx]
    y_tr = y[train_idx]
    y_val = y[val_idx]
    
    # FIX #3 & #4: Increase iterations and add class imbalance handling
    train_set = lgb.Dataset(X_tr, label=y_tr)
    val_set = lgb.Dataset(X_val, label=y_val)
    
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'scale_pos_weight': 3.0  # FIX #4: Add class imbalance handling
    }
    
    model = lgb.train(
        params,
        train_set,
        num_boost_round=2000,  # FIX #3: Increased from default to 2000
        valid_sets=[val_set],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(100)
        ]
    )
    
    fold_iterations.append(model.best_iteration)
    
    # Predictions
    oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
    test_preds += model.predict(X_test, num_iteration=model.best_iteration) / 5
    
    # Metrics
    auc = roc_auc_score(y_val, oof_preds[val_idx])
    logloss = log_loss(y_val, oof_preds[val_idx])
    auc_scores.append(auc)
    logloss_scores.append(logloss)
    
    print(f"  AUC: {auc:.4f}, Log Loss: {logloss:.4f}, Iterations: {model.best_iteration}")
    
    # Feature importance
    importance = model.feature_importance(importance_type='gain')
    feature_importance_list.append(importance)

print("\n" + "="*60)
print("CROSS-VALIDATION RESULTS")
print("="*60)
print(f"AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")
print(f"Log Loss: {np.mean(logloss_scores):.4f} ± {np.std(logloss_scores):.4f}")
print(f"Avg Iterations: {np.mean(fold_iterations):.1f}")
print(f"Improvement from exp_003: {np.mean(auc_scores) - 0.6413:.4f} AUC")


TRAINING WITH FIXED TF-IDF IMPLEMENTATION

Fold 1/5
Training until validation scores don't improve for 50 rounds


[100]	valid_0's auc: 0.612688


[200]	valid_0's auc: 0.638092


Early stopping, best iteration is:
[204]	valid_0's auc: 0.639674
  AUC: 0.6397, Log Loss: 0.5906, Iterations: 204

Fold 2/5
Training until validation scores don't improve for 50 rounds


[100]	valid_0's auc: 0.612558


Early stopping, best iteration is:
[116]	valid_0's auc: 0.614626
  AUC: 0.6146, Log Loss: 0.5971, Iterations: 116

Fold 3/5
Training until validation scores don't improve for 50 rounds


[100]	valid_0's auc: 0.620924
Early stopping, best iteration is:
[55]	valid_0's auc: 0.634345
  AUC: 0.6343, Log Loss: 0.5922, Iterations: 55

Fold 4/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[23]	valid_0's auc: 0.614591
  AUC: 0.6146, Log Loss: 0.5791, Iterations: 23

Fold 5/5
Training until validation scores don't improve for 50 rounds


[100]	valid_0's auc: 0.59358
Early stopping, best iteration is:
[50]	valid_0's auc: 0.611483
  AUC: 0.6115, Log Loss: 0.5975, Iterations: 50

CROSS-VALIDATION RESULTS
AUC: 0.6229 ± 0.0117
Log Loss: 0.5913 ± 0.0067
Avg Iterations: 89.6
Improvement from exp_003: -0.0184 AUC


In [13]:
# Feature importance analysis
print("\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*60)

# Average importance across folds
mean_importance = np.mean(feature_importance_list, axis=0)
feature_names = list(X_tabular_train.columns) + [f'tfidf_{i}' for i in range(len(tfidf.vocabulary_))]

# Create importance dataframe
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': mean_importance
}).sort_values('importance', ascending=False)

# Top 20 features
print("\nTop 20 features:")
print(importance_df.head(20).to_string(index=False))

# Check for leakage (no feature should dominate >2x)
top_feature = importance_df.iloc[0]
second_feature = importance_df.iloc[1]
ratio = top_feature['importance'] / second_feature['importance']

print(f"\nTop feature: {top_feature['feature']} ({top_feature['importance']:.1f})")
print(f"Second feature: {second_feature['feature']} ({second_feature['importance']:.1f})")
print(f"Importance ratio: {ratio:.2f}")

if ratio > 2.0:
    print("⚠️  WARNING: Potential leakage detected! Top feature dominates >2x")
else:
    print("✓ No leakage detected (ratio < 2.0)")

# Count TF-IDF features in top 20
tfidf_in_top20 = sum(1 for f in importance_df.head(20)['feature'] if f.startswith('tfidf_'))
print(f"\nTF-IDF features in top 20: {tfidf_in_top20}")

if tfidf_in_top20 > 0:
    print("✓ TF-IDF features are being used by the model")
else:
    print("⚠️  WARNING: TF-IDF features not appearing in top 20")


FEATURE IMPORTANCE ANALYSIS

Top 20 features:
                        feature  importance
                    text_length 1123.948355
        upvotes_minus_downvotes  997.356684
                avg_word_length  836.148489
         account_age_at_request  796.516267
         upvotes_plus_downvotes  779.201386
                     word_count  756.666638
              comments_per_post  678.145814
                     tfidf_5501  670.155736
        num_comments_at_request  483.755933
           num_posts_at_request  446.864390
requester_subreddits_at_request  411.986361
                     tfidf_2652  331.184582
                     tfidf_3181  303.681979
                     tfidf_5993  298.322479
                     tfidf_2000  288.711841
                     tfidf_7020  263.642773
                     tfidf_1501  249.216445
                     tfidf_7473  234.093467
                     tfidf_3650  232.839103
                     tfidf_7684  209.167999

Top feature: text_length (11

In [14]:
# Create submission
print("\n" + "="*60)
print("CREATING SUBMISSION")
print("="*60)

# Load sample submission to get correct format
sample_sub = pd.read_csv('/home/data/sampleSubmission.csv')
print(f"Sample submission shape: {sample_sub.shape}")
print(f"Sample submission columns: {sample_sub.columns.tolist()}")

submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_preds
})

print(f"\nSubmission shape: {submission.shape}")
print(f"Submission columns: {submission.columns.tolist()}")
print(f"Prediction range: [{test_preds.min():.4f}, {test_preds.max():.4f}]")

# Save submission
submission_path = '/home/code/submission_candidates/candidate_008.csv'
submission.to_csv(submission_path, index=False)
print(f"\n✓ Submission saved to: {submission_path}")

# Verify format matches sample
if list(submission.columns) == list(sample_sub.columns):
    print("✓ Column format matches sample submission")
else:
    print("⚠️  Column format mismatch!")
    print(f"Expected: {sample_sub.columns.tolist()}")
    print(f"Got: {submission.columns.tolist()}")


CREATING SUBMISSION
Sample submission shape: (1162, 2)
Sample submission columns: ['request_id', 'requester_received_pizza']

Submission shape: (1162, 2)
Submission columns: ['request_id', 'requester_received_pizza']
Prediction range: [0.0711, 0.7803]

✓ Submission saved to: /home/code/submission_candidates/candidate_008.csv
✓ Column format matches sample submission


In [15]:
# Summary
print("\n" + "="*60)
print("EXPERIMENT 004 SUMMARY")
print("="*60)
print(f"Model: LightGBM with fixed TF-IDF implementation")
print(f"CV AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")
print(f"CV Log Loss: {np.mean(logloss_scores):.4f} ± {np.std(logloss_scores):.4f}")
print(f"Improvement from exp_003: {np.mean(auc_scores) - 0.6413:.4f} AUC")
print(f"Features: {X_train.shape[1]} total")
print(f"  - Tabular: {X_tabular_train.shape[1]}")
print(f"  - TF-IDF: {len(tfidf.vocabulary_)}")
print(f"Avg training iterations: {np.mean(fold_iterations):.1f}")
print(f"Class imbalance handling: scale_pos_weight=3.0")
print(f"\nKey changes from exp_003:")
print(f"1. ✓ Removed simple keyword features (redundant with TF-IDF)")
print(f"2. ✓ Reduced TF-IDF from 12,959 to 8,000 features")
print(f"3. ✓ Increased iterations from ~50 to {np.mean(fold_iterations):.0f}")
print(f"4. ✓ Added scale_pos_weight=3.0 for class imbalance")
print(f"\nSubmission: {submission_path}")


EXPERIMENT 004 SUMMARY
Model: LightGBM with fixed TF-IDF implementation
CV AUC: 0.6229 ± 0.0117
CV Log Loss: 0.5913 ± 0.0067
Improvement from exp_003: -0.0184 AUC
Features: 8010 total
  - Tabular: 10
  - TF-IDF: 8000
Avg training iterations: 89.6
Class imbalance handling: scale_pos_weight=3.0

Key changes from exp_003:
1. ✓ Removed simple keyword features (redundant with TF-IDF)
2. ✓ Reduced TF-IDF from 12,959 to 8,000 features
3. ✓ Increased iterations from ~50 to 90
4. ✓ Added scale_pos_weight=3.0 for class imbalance

Submission: /home/code/submission_candidates/candidate_008.csv
