# Experiment 003: Add TF-IDF Text Features

**Goal**: Add TF-IDF features to capture text signal more effectively than simple keyword counting.

**Changes from exp_002:**
- Add TF-IDF features (unigrams + bigrams) on request_text_edit_aware
- Cap the TF-IDF vocabulary at 10,000-15,000 features initially (this run uses `max_features=15000`)
- Keep all existing features to measure marginal gain
- Report both AUC and log loss
- Check for leakage via feature importance

**Expected outcome**: AUC improvement of +0.03 to +0.08 (target: 0.67-0.72 AUC)

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import warnings
# Blanket filter: with no category/module given, every warning raised after this
# point is suppressed (including pandas/sklearn/LightGBM deprecation notices).
warnings.filterwarnings('ignore')

# Set random seed
# This seeds NumPy's global RNG only. The CV splitter further down receives its
# own random_state explicitly.
np.random.seed(42)

In [None]:
def extract_features(df, is_train=True):
    """Build the tabular and hand-crafted text features for one data split.

    Only request-time columns are read, so the same function can be applied
    to both the train and the test frame.

    Args:
        df: DataFrame of raw request records. The columns read are the
            ``requester_*_at_request`` counters, ``request_title``,
            ``request_text_edit_aware``, ``requester_subreddits_at_request``
            and ``requester_username``.
        is_train: Currently unused; kept so existing callers keep working.

    Returns:
        A ``(features, categorical_features)`` tuple: the feature DataFrame
        (sharing ``df``'s index) and the list of column names in it that still
        hold raw values and must be encoded before modelling.
    """
    features = pd.DataFrame()

    # === TABULAR FEATURES (using only 'at_request' columns available in test) ===

    # Output feature name -> source column (votes and activity at request time).
    # Insertion order defines the column order of the result.
    direct_columns = {
        'upvotes_at_request': 'requester_upvotes_plus_downvotes_at_request',
        'upvotes_minus_downvotes': 'requester_upvotes_minus_downvotes_at_request',
        'num_comments_at_request': 'requester_number_of_comments_at_request',
        'num_posts_at_request': 'requester_number_of_posts_at_request',
        'num_comments_in_raop_at_request': 'requester_number_of_comments_in_raop_at_request',
        'num_posts_in_raop_at_request': 'requester_number_of_posts_on_raop_at_request',
    }
    for feature_name, source_column in direct_columns.items():
        features[feature_name] = df[source_column]

    # Activity ratios (+1 in the denominator avoids division by zero)
    features['comments_per_post'] = features['num_comments_at_request'] / (features['num_posts_at_request'] + 1)
    features['comments_in_raop_per_post'] = features['num_comments_in_raop_at_request'] / (features['num_posts_in_raop_at_request'] + 1)

    # Account age at request (convert from timestamp to days)
    # NOTE(review): this parses the column as a datetime and converts the
    # nanosecond epoch value to days. If the column already stores a plain
    # number of days, the result collapses to ~0 for every row - confirm the
    # column's unit against the dataset schema.
    features['account_age_at_request'] = pd.to_datetime(df['requester_account_age_at_request']).astype('int64') // 1e9 / 86400

    # Request title length
    features['title_length'] = df['request_title'].fillna('').str.len()

    # === TEXT FEATURES (from request_text_edit_aware - available in both train and test) ===

    # Text length and basic stats
    text = df['request_text_edit_aware'].fillna('')
    features['text_length'] = text.str.len()
    features['word_count'] = text.str.split().str.len()
    # Runs of terminal punctuation count as one sentence break; +1 keeps the
    # count >= 1 so the division below is always defined.
    features['sentence_count'] = text.str.count(r'[.!?]+') + 1
    features['avg_word_length'] = features['text_length'] / (features['word_count'] + 1)
    features['avg_sentence_length'] = features['word_count'] / features['sentence_count']

    # Punctuation and capitalization
    features['exclamation_count'] = text.str.count('!')
    features['question_count'] = text.str.count(r'\?')
    features['caps_count'] = text.str.count(r'[A-Z]')
    features['caps_ratio'] = features['caps_count'] / (features['text_length'] + 1)

    # Simple keyword indicators (keeping these for comparison with TF-IDF).
    # The alternations use non-capturing groups: `str.contains` warns when a
    # pattern has capture groups, and the groups are not needed for a yes/no match.
    keyword_patterns = {
        'please': 'please',
        'thank': 'thank',
        'sorry': 'sorry',
        'family': r'\b(?:family|mom|dad|mother|father|kid|child|children)\b',
        'work': r'\b(?:work|job|paycheck|money|broke)\b',
        'hungry': r'\b(?:hungry|starving|food|eat|meal)\b',
        'help': r'\b(?:help|need|desperate|emergency)\b',
    }
    for keyword, pattern in keyword_patterns.items():
        features[f'keyword_{keyword}'] = text.str.contains(pattern, case=False).astype(int)

    # Categorical features (will be encoded later)
    categorical_features = ['request_title', 'requester_subreddits_at_request', 'requester_username']
    for col in categorical_features:
        features[col] = df[col].fillna('N/A')

    return features, categorical_features

In [None]:
def encode_categorical_features(df, categorical_features):
    """Label-encode the given columns on a copy of ``df``.

    Args:
        df: Input DataFrame; it is not modified.
        categorical_features: Iterable of column names to encode. Names that
            are not columns of ``df`` are skipped.

    Returns:
        ``(encoded_df, encoders)`` where ``encoded_df`` is the copy with each
        processed column replaced by integer codes and ``encoders`` maps the
        column name to the ``LabelEncoder`` fitted on it.
    """
    encoded = df.copy()
    fitted_encoders = {}

    for name in categorical_features:
        if name not in encoded.columns:
            continue

        encoder = LabelEncoder()
        # Missing values become the literal category 'N/A'; every value is
        # then stringified so the encoder sees a single type.
        as_text = encoded[name].fillna('N/A').astype(str)
        encoded[name] = encoder.fit_transform(as_text)
        fitted_encoders[name] = encoder

    return encoded, fitted_encoders

In [None]:
# Load the raw JSON dumps and derive the hand-crafted feature frames
print("Loading data...")
train_path = '/home/data/train.json'
test_path = '/home/data/test.json'

import json

# Each file is parsed completely before being wrapped in a DataFrame.
with open(train_path, 'r') as train_file:
    train_data = json.load(train_file)
with open(test_path, 'r') as test_file:
    test_data = json.load(test_file)

train_df, test_df = pd.DataFrame(train_data), pd.DataFrame(test_data)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
target_share = train_df['requester_received_pizza'].value_counts(normalize=True)
print(f"Target distribution:\n{target_share}")

# Extract features
print("\nExtracting features...")
# The list of categorical column names is taken from the train call; the test
# call's copy of that list is discarded.
train_features, categorical_features = extract_features(train_df, is_train=True)
test_features = extract_features(test_df, is_train=False)[0]

print(f"Train features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")
print(f"Categorical features: {categorical_features}")

In [None]:
# === ADD TF-IDF FEATURES ===
print("\n=== ADDING TF-IDF FEATURES ===")

# Vectorizer configuration, kept in one place so it is easy to tweak
tfidf_settings = dict(
    max_features=15000,   # cap on vocabulary size
    ngram_range=(1, 2),   # unigrams + bigrams
    stop_words='english',
    min_df=2,             # drop terms seen in fewer than 2 documents
    max_df=0.95,          # drop terms seen in more than 95% of documents
    sublinear_tf=True,    # sublinear TF scaling
    norm='l2',            # L2-normalise each row
)
tfidf = TfidfVectorizer(**tfidf_settings)

# Missing request bodies are treated as empty documents
train_text = train_df['request_text_edit_aware'].fillna('')
test_text = test_df['request_text_edit_aware'].fillna('')

# The vocabulary and IDF weights come from the training text only; the test
# text is merely projected onto them.
# NOTE(review): the fit uses every training row, i.e. it happens before any CV
# split, so each validation fold contributes to the vocabulary/IDF statistics.
print("Fitting TF-IDF on training text...")
tfidf.fit(train_text)

print(f"TF-IDF vocabulary size: {len(tfidf.vocabulary_)}")

print("Transforming training text...")
train_tfidf = tfidf.transform(train_text)

print("Transforming test text...")
test_tfidf = tfidf.transform(test_text)

print(f"Train TF-IDF shape: {train_tfidf.shape}")
print(f"Test TF-IDF shape: {test_tfidf.shape}")

In [None]:
# Replace remaining NaNs in the tabular features with 0
print("\nHandling missing values...")
train_features, test_features = train_features.fillna(0), test_features.fillna(0)

# Turn the raw categorical columns into integer codes
print("Encoding categorical features...")
encoders = {}

for col in categorical_features:
    if col not in train_features.columns:
        continue

    # One encoder per column, fitted on the train and test values together so
    # that transforming either split never meets an unseen label.
    combined = pd.concat([train_features[col], test_features[col]], axis=0).astype(str)
    encoder = LabelEncoder()
    encoder.fit(combined)

    train_features[col] = encoder.transform(train_features[col].astype(str))
    test_features[col] = encoder.transform(test_features[col].astype(str))
    encoders[col] = encoder

    print(f"  Encoded {col}: {len(encoder.classes_)} classes")

print(f"Final train shape: {train_features.shape}")
print(f"Final test shape: {test_features.shape}")

In [None]:
# Prepare data for CV
print("\n=== PREPARING DATA FOR CV ===")

# Cast the tabular features to float32. Despite the `_sparse` suffix these are
# still dense DataFrames; they are handed to `hstack` as-is below.
train_features_sparse = train_features.astype(np.float32)
test_features_sparse = test_features.astype(np.float32)

# Combine tabular features with TF-IDF features (tabular columns come first,
# TF-IDF columns follow - the importance report relies on this order)
X = hstack([train_features_sparse, train_tfidf], format='csr')

# Cast the target to int: `np.bincount` below refuses boolean input (there is
# no "safe" cast from bool to int), and an integer 0/1 label is unambiguous
# for the metrics and for LightGBM. A column that is already integer is
# unaffected by the cast.
y = train_df['requester_received_pizza'].astype(int).values

print(f"Combined feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature types: {X.dtype}")
# The figure printed is the share of stored (non-zero) entries
print(f"Sparsity: {(X.nnz / (X.shape[0] * X.shape[1])) * 100:.2f}% non-zero")

# 5-fold stratified CV
n_folds = 5
cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Store results
fold_predictions = []
fold_scores = {'auc': [], 'logloss': []}
feature_importance_list = []

print(f"Starting {n_folds}-fold stratified CV...")
print(f"Class distribution: {np.bincount(y)}")

In [None]:
# LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'num_threads': 4
}

# Run the stratified folds. Per fold: train with early stopping on the held-out
# part, score it, and keep the metrics, the out-of-fold predictions and the
# gain-based feature importances.
# NOTE(review): the same validation fold drives early stopping and is then
# used for scoring, so the reported AUC is slightly optimistic.
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    print(f"\n=== Fold {fold}/{n_folds} ===")

    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Create LightGBM datasets (the validation set is built against the
    # training set via `reference`)
    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    # Train model: up to 1000 rounds, stopping once validation AUC has not
    # improved for 50 rounds; per-iteration logging is disabled
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[val_set],
        valid_names=['val'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(0)
        ]
    )

    # Predict on validation set using the best iteration found
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)

    # Calculate metrics
    val_auc = roc_auc_score(y_val, val_pred)
    val_logloss = log_loss(y_val, val_pred)

    print(f"Fold {fold} AUC: {val_auc:.4f}")
    print(f"Fold {fold} Log Loss: {val_logloss:.4f}")
    print(f"Fold {fold} best iteration: {model.best_iteration}")

    # Store results
    fold_scores['auc'].append(val_auc)
    fold_scores['logloss'].append(val_logloss)
    # Keep the out-of-fold predictions (one array per fold, in `val_idx` order)
    fold_predictions.append(val_pred)

    # Feature importance, labelled positionally as 'f<column index of X>'
    importance = pd.DataFrame({
        'feature': [f'f{i}' for i in range(X.shape[1])],
        'importance': model.feature_importance(importance_type='gain')
    })
    feature_importance_list.append(importance)

In [None]:
# Calculate overall CV scores
print("\n=== CROSS-VALIDATION RESULTS ===")
mean_auc = np.mean(fold_scores['auc'])
std_auc = np.std(fold_scores['auc'])
mean_logloss = np.mean(fold_scores['logloss'])
std_logloss = np.std(fold_scores['logloss'])

print(f"AUC: {mean_auc:.4f} ± {std_auc:.4f}")
print(f"Log Loss: {mean_logloss:.4f} ± {std_logloss:.4f}")

# Compare to previous baseline
baseline_auc = 0.6387  # exp_002 AUC (no TF-IDF)
print(f"\nComparison to exp_002 (no TF-IDF):")
print(f"  Previous AUC: {baseline_auc:.4f}")
print(f"  Current AUC: {mean_auc:.4f}")
print(f"  Improvement: {mean_auc - baseline_auc:.4f}")

# Feature importance analysis
print(f"\n=== FEATURE IMPORTANCE ANALYSIS ===")
# Average the per-fold gain importances for each positional feature label
avg_importance = pd.concat(feature_importance_list).groupby('feature')['importance'].mean().sort_values(ascending=False)

# Columns of X are the tabular features followed by the TF-IDF block, so a
# positional index below `n_tabular` is a tabular column and anything above is
# a TF-IDF column offset by `n_tabular`.
n_tabular = train_features.shape[1]
# `vocabulary_` maps term -> column index inside the TF-IDF block; invert it so
# important TF-IDF columns can be shown as actual terms.
tfidf_terms = {idx: term for term, idx in tfidf.vocabulary_.items()}


def _readable_feature_name(raw_name):
    """Translate a positional 'f<i>' label into a column name or TF-IDF term."""
    feature_idx = int(raw_name[1:])
    if feature_idx < n_tabular:
        return train_features.columns[feature_idx]
    tfidf_idx = feature_idx - n_tabular
    return f"TF-IDF feature {tfidf_idx} ('{tfidf_terms.get(tfidf_idx, '?')}')"


# Overall top 20 features (tabular and TF-IDF mixed, ranked by mean gain)
top_features = avg_importance.head(20)
print("Top 20 features:")
for i, (feature, importance) in enumerate(top_features.items(), 1):
    print(f"{i:2d}. {_readable_feature_name(feature)}: {importance:.1f}")

# Check for potential leakage in top features: a single feature that dwarfs
# the runner-up is treated as suspicious (crude heuristic, threshold 2x).
top_feature = avg_importance.index[0]
top_importance = avg_importance.iloc[0]
second_importance = avg_importance.iloc[1]
if second_importance > 0:
    ratio = top_importance / second_importance
else:
    # Avoid dividing by zero: a lone non-zero feature is maximally dominant,
    # while all-zero importances carry no signal at all.
    ratio = float('inf') if top_importance > 0 else 1.0

print(f"\nLeakage check:")
print(f"Top feature: {_readable_feature_name(top_feature)} (importance: {top_importance:.1f})")
print(f"Second feature: {_readable_feature_name(avg_importance.index[1])} (importance: {second_importance:.1f})")
print(f"Ratio: {ratio:.2f}x")
if ratio > 2.0:
    print("⚠️  WARNING: Potential leakage detected!")
else:
    print("✓ No obvious leakage detected")

In [None]:
# === GENERATE PREDICTIONS FOR TEST SET ===
print("\n=== TRAINING FINAL MODEL ===")

# Combine test features with TF-IDF (same column order as the training matrix)
X_test_combined = hstack([test_features_sparse, test_tfidf], format='csr')

# Choose the number of boosting rounds. The CV models were early-stopped, so
# training the final model for a fixed 1000 rounds would produce a model the CV
# scores do not describe (and one that is likely overfit). Reuse the round
# count found by early stopping on the last CV fold model, which is still in
# scope as `model`; fall back to 1000 if no best iteration was recorded.
cv_best_iteration = getattr(model, 'best_iteration', 0) or 0
final_num_rounds = cv_best_iteration if cv_best_iteration > 0 else 1000
print(f"Final model boosting rounds: {final_num_rounds}")

# Train final model on full training data
final_model = lgb.train(
    params,
    lgb.Dataset(X, label=y),
    num_boost_round=final_num_rounds,
    callbacks=[lgb.log_evaluation(0)]
)

# Predict on test set (predicted probability of the positive class)
test_pred = final_model.predict(X_test_combined)

# Create submission
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_pred
})

print(f"Submission shape: {submission.shape}")
print(f"Prediction range: [{test_pred.min():.4f}, {test_pred.max():.4f}]")
print(f"Prediction mean: {test_pred.mean():.4f}")

# Save submission
submission_path = "/home/submission/submission_003.csv"
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")