# Experiment 002: TF-IDF + Psycholinguistic Features

**Goal**: Add TF-IDF n-gram features and psycholinguistic word-category features to improve significantly on the baseline AUC of 0.6433.

**Strategy**: 
- Add TF-IDF vectorizer (1-3 n-grams, max_features=10000)
- Add psycholinguistic category counts (reciprocity, hardship, family, gratitude, food)
- Combine with existing metadata features
- Use LightGBM with class-imbalance handling via `scale_pos_weight` (computed from the training labels as negatives / positives, ≈3.025)
- Target: >0.75 AUC

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import lightgbm as lgb
import re
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

# Set seeds for reproducibility: both NumPy's global RNG and the stdlib
# `random` module are seeded with the same value.
np.random.seed(42)
import random
random.seed(42)

# Load data
# The JSON files are decoded as UTF-8 explicitly so the result does not depend
# on the platform's default locale encoding.
print("Loading data...")
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# len() is taken directly on the parsed JSON; the cells below iterate over
# these objects and index each element by string key.
print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

In [None]:
# Extract basic features (same as baseline)
def extract_basic_features(data):
    """Build the baseline metadata feature table, one row per request.

    Args:
        data: Iterable of request dicts. Every item must provide
            'request_text_edit_aware', 'unix_timestamp_of_request' and the
            five 'requester_*_at_request' fields read below.

    Returns:
        pandas.DataFrame with one row per item and the columns
        'text_length', 'word_count', 'account_age_days',
        'comments_at_request', 'posts_at_request', 'upvotes_at_request',
        'subreddits_at_request', 'request_hour' and 'request_dayofweek'.
        Empty input yields an empty DataFrame.

    Raises:
        KeyError: If an item is missing one of the required fields.
    """
    rows = []
    for item in data:
        text = item['request_text_edit_aware']

        # Parse the timestamp once (interpreted as seconds since the epoch);
        # both the hour and the weekday are read from the same value and no
        # timezone conversion is applied.
        requested_at = pd.to_datetime(item['unix_timestamp_of_request'], unit='s')

        rows.append({
            # Text size: characters and whitespace-separated tokens.
            'text_length': len(text),
            'word_count': len(text.split()),
            # Requester metadata, copied through unchanged.
            'account_age_days': item['requester_account_age_in_days_at_request'],
            'comments_at_request': item['requester_number_of_comments_at_request'],
            'posts_at_request': item['requester_number_of_posts_at_request'],
            'upvotes_at_request': item['requester_upvotes_plus_downvotes_at_request'],
            'subreddits_at_request': item['requester_number_of_subreddits_at_request'],
            # Time-of-request features.
            'request_hour': requested_at.hour,
            'request_dayofweek': requested_at.dayofweek,
        })

    return pd.DataFrame(rows)

# Extract psycholinguistic features
def extract_psycholinguistic_features(data):
    """Count lexicon hits per psycholinguistic category for each request.

    The lower-cased request text is split into alphabetic tokens and each
    category lexicon is matched against whole tokens. A lexicon word is
    counted at most once per request, so '<category>_count' is the number of
    distinct lexicon words present. Matching whole tokens avoids substring
    false positives such as 'son' in "person" or 'eat' in "great"; inflected
    forms only match when they are listed in the lexicon themselves.

    Args:
        data: Iterable of request dicts providing 'request_text_edit_aware'.

    Returns:
        pandas.DataFrame with one row per item and, for each of the five
        categories, a '<category>_count' column and a '<category>_ratio'
        column (count divided by the whitespace token count, with the
        denominator floored at 1). Empty input yields an empty DataFrame.

    Raises:
        KeyError: If an item has no 'request_text_edit_aware' field.
    """
    # Category lexicons. Sets make membership tests cheap and guarantee that
    # a word listed twice cannot be counted twice.
    categories = {
        'reciprocity': {'offer', 'pay', 'return', 'favor', 'back', 'help', 'kindness', 'generous', 'repay', 'owe'},
        'hardship': {'struggle', 'broke', 'bills', 'unemployed', 'hungry', 'hard', 'difficult', 'tough', 'poor', 'starving'},
        'family': {'family', 'kids', 'children', 'mother', 'father', 'daughter', 'son', 'wife', 'husband', 'parent', 'baby'},
        'gratitude': {'thank', 'appreciate', 'grateful', 'bless', 'kind', 'generous', 'thanks', 'blessed', 'appreciation'},
        'food_specific': {'pizza', 'hungry', 'food', 'meal', 'eat', 'starving', 'dinner', 'lunch', 'breakfast', 'craving'},
    }

    features = []
    for item in data:
        text = item['request_text_edit_aware'].lower()
        word_count = len(text.split())
        # Distinct alphabetic tokens; punctuation and digits act as separators.
        tokens = set(re.findall(r"[a-z]+", text))

        category_counts = {}
        for category, words in categories.items():
            count = len(words & tokens)
            category_counts[f'{category}_count'] = count
            # Normalize by text length to avoid bias toward longer texts;
            # max(..., 1) keeps the ratio defined for empty text.
            category_counts[f'{category}_ratio'] = count / max(word_count, 1)

        features.append(category_counts)

    return pd.DataFrame(features)

# Run both hand-crafted extractors over the train and test splits.
# map() applies the extractor to (train, test) in that order, and the pair is
# unpacked straight into the two result frames.
print("Extracting basic features...")
X_train_basic, X_test_basic = map(extract_basic_features, (train_data, test_data))

print("Extracting psycholinguistic features...")
X_train_psycho, X_test_psycho = map(
    extract_psycholinguistic_features, (train_data, test_data)
)

# Report (rows, columns) of the training frames only.
print(f"Basic features shape: {X_train_basic.shape}")
print(f"Psycholinguistic features shape: {X_train_psycho.shape}")

In [None]:
# Turn the request bodies into sparse TF-IDF n-gram features
print("Extracting TF-IDF features...")

# Same text field the hand-crafted extractors read; the original author
# describes request_text_edit_aware as the leakage-free version.
train_texts = [request['request_text_edit_aware'] for request in train_data]
test_texts = [request['request_text_edit_aware'] for request in test_data]

# Vectorizer settings: uni- to tri-grams with English stop words removed,
# document-frequency cut-offs at both ends, a 10k-term vocabulary cap and
# sublinear term-frequency scaling.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.9,
    max_features=10000,
    sublinear_tf=True,
)

# The vocabulary is fitted on the training texts only; the test texts are
# projected onto that fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

# Density = stored entries / total cells of the training matrix.
tfidf_rows, tfidf_cols = X_train_tfidf.shape
tfidf_density = X_train_tfidf.nnz / (tfidf_rows * tfidf_cols)

print(f"TF-IDF features shape: {X_train_tfidf.shape}")
print(f"TF-IDF density: {tfidf_density:.4f}")

In [None]:
# Combine all features into one sparse design matrix and build the target
print("Combining features...")

# Combine basic and psycholinguistic features (dense), column-wise
X_train_dense = pd.concat([X_train_basic, X_train_psycho], axis=1)
X_test_dense = pd.concat([X_test_basic, X_test_psycho], axis=1)

# Convert dense to sparse matrix for efficient stacking
from scipy.sparse import csr_matrix
X_train_dense_sparse = csr_matrix(X_train_dense.values)
X_test_dense_sparse = csr_matrix(X_test_dense.values)

# Stack TF-IDF (sparse) with dense features; TF-IDF columns come first.
# format='csr' is requested explicitly: without it hstack may return a COO
# matrix, which does not support the row indexing (X_train[train_idx]) that
# the cross-validation loop performs.
X_train = hstack([X_train_tfidf, X_train_dense_sparse], format='csr')
X_test = hstack([X_test_tfidf, X_test_dense_sparse], format='csr')

print(f"Final training features shape: {X_train.shape}")
print(f"Feature types: TF-IDF ({X_train_tfidf.shape[1]}) + Dense ({X_train_dense_sparse.shape[1]})")

# Target variable
y_train = np.array([item['requester_received_pizza'] for item in train_data])
print(f"Class distribution: {np.bincount(y_train)}")
print(f"Positive rate: {y_train.mean():.3f}")

# Calculate scale_pos_weight for LightGBM as negatives / positives
# NOTE(review): this divides by the positive count, so it presumes the
# training labels contain at least one positive example.
scale_pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()
print(f"Scale pos weight: {scale_pos_weight:.3f}")

In [None]:
# Prepare the stratified splitter and the accumulators the CV loop fills in
print("Setting up cross-validation...")
n_folds = 5
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=n_folds)

# Per-fold AUCs and per-fold feature-importance vectors start out empty
cv_scores, feature_importance_list = [], []

# One out-of-fold slot per training row and one averaged slot per test row
oof_predictions = np.zeros(len(train_data))
test_predictions = np.zeros(len(test_data))

print(f"Starting {n_folds}-fold CV...")

In [None]:
# Train LightGBM model with cross-validation
# For every fold: fit a booster on the training rows, early-stop against the
# held-out rows, store the held-out predictions in `oof_predictions`, add this
# fold's share of the test predictions, and record gain-based importances.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"\nFold {fold + 1}/{n_folds}")
    
    # Split data
    # Row-indexing with index arrays requires X_train to be in an indexable
    # sparse format (e.g. CSR) — NOTE(review): confirm what hstack returned.
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    # Create LightGBM datasets
    train_set = lgb.Dataset(X_tr, label=y_tr)
    val_set = lgb.Dataset(X_val, label=y_val)
    
    # Model parameters
    # The dict does not depend on the fold; it is rebuilt on each iteration.
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'scale_pos_weight': scale_pos_weight,  # Handle class imbalance
        'seed': 42
    }
    
    # Train model
    # Up to 1000 boosting rounds, monitored on the validation fold with an
    # early-stopping callback (argument 50) and a logging callback (argument 100).
    # NOTE(review): the fold used for early stopping is the same fold that is
    # scored below, so the reported AUC may be optimistic.
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[val_set],
        valid_names=['val'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(100)
        ]
    )
    
    # Predict on validation set
    # Predictions use the booster's best iteration and are written into the
    # out-of-fold slots that belong to this fold's validation rows.
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    oof_predictions[val_idx] = val_pred
    
    # Calculate AUC for this fold
    fold_auc = roc_auc_score(y_val, val_pred)
    cv_scores.append(fold_auc)
    print(f"Fold {fold + 1} AUC: {fold_auc:.4f}")
    
    # Predict on test set
    # Each fold contributes 1/n_folds, so the final value is the fold average.
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    test_predictions += test_pred / n_folds
    
    # Store feature importance
    importance = model.feature_importance(importance_type='gain')
    feature_importance_list.append(importance)

# Overall CV score
# Mean and standard deviation (np.std with default arguments) of the fold AUCs.
mean_auc = np.mean(cv_scores)
std_auc = np.std(cv_scores)
print(f"\n{'='*50}")
print(f"CV AUC: {mean_auc:.4f} ± {std_auc:.4f}")
print(f"{'='*50}")

# Overall OOF AUC
# A single AUC over the pooled out-of-fold predictions of all folds.
oof_auc = roc_auc_score(y_train, oof_predictions)
print(f"OOF AUC: {oof_auc:.4f}")

In [None]:
# Summarise which features the boosters relied on
print("Analyzing feature importance...")

# Element-wise mean of the per-fold importance vectors
mean_importance = np.mean(feature_importance_list, axis=0)

# Names follow the column order of the stacked matrix: positional
# placeholders for the TF-IDF block first, then the dense column names.
n_tfidf_features = X_train_tfidf.shape[1]
dense_feature_names = list(X_train_dense.columns)
feature_names = [
    *(f'tfidf_{i}' for i in range(n_tfidf_features)),
    *dense_feature_names,
]

# Importance table, most important feature first
importance_df = pd.DataFrame({'feature': feature_names, 'importance': mean_importance})
importance_df = importance_df.sort_values('importance', ascending=False)

print("\nTop 20 most important features:")
print(importance_df.head(20))

# How many of the top-ranked features come from the TF-IDF block
n_top_features = 50
tfidf_important = sum(
    name.startswith('tfidf_')
    for name in importance_df.head(n_top_features)['feature']
)
print(f"\nAmong top {n_top_features} features:")
print(f"- TF-IDF features: {tfidf_important}")
print(f"- Dense features: {n_top_features - tfidf_important}")

# Psycholinguistic features are picked out by category name appearing in the
# feature name
psycho_markers = ('reciprocity', 'hardship', 'family', 'gratitude', 'food_specific')
psycho_features = [
    name for name in feature_names
    if any(marker in name for marker in psycho_markers)
]
psycho_mask = importance_df['feature'].isin(psycho_features)
psycho_importance = importance_df[psycho_mask]
print(f"\nPsycholinguistic features total importance: {psycho_importance['importance'].sum():.2f}")
print("Top psycholinguistic features:")
print(psycho_importance.head(10))

In [None]:
# Create submission: write the fold-averaged test predictions to two CSV files
import os

print("\nCreating submission file...")

# Load sample submission to get format
# It is only inspected (shape and column names are printed); the submission
# frame below is built independently of it.
sample_sub = pd.read_csv('/home/data/sampleSubmission.csv')
print(f"Sample submission shape: {sample_sub.shape}")
print(f"Sample submission columns: {sample_sub.columns.tolist()}")

# Create submission dataframe: one row per test request, in test_data order
submission = pd.DataFrame({
    'request_id': [item['request_id'] for item in test_data],
    'requester_received_pizza': test_predictions
})

print(f"Submission shape: {submission.shape}")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")

# Save submission
# The target directory is created first, as is done for the experiment copy,
# so the write does not fail when the directory is missing.
submission_path = '/home/submission/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

# Also save to experiment folder
os.makedirs('/home/code/experiments/002_tfidf_text_features', exist_ok=True)
exp_submission_path = '/home/code/experiments/002_tfidf_text_features/submission.csv'
submission.to_csv(exp_submission_path, index=False)
print(f"Submission also saved to: {exp_submission_path}")

In [None]:
# Summary: collect the report lines first, then emit them in one print call
summary_rule = "=" * 60
summary_lines = [
    "\n" + summary_rule,
    "EXPERIMENT SUMMARY",
    summary_rule,
    f"Model: LightGBM with TF-IDF + psycholinguistic + metadata features",
    f"CV AUC: {mean_auc:.4f} ± {std_auc:.4f}",
    f"OOF AUC: {oof_auc:.4f}",
    # 0.6433 is the baseline AUC quoted in the notebook header
    f"Improvement over baseline: {mean_auc - 0.6433:.4f}",
    f"\nKey findings:",
    f"- TF-IDF features: {n_tfidf_features} terms",
    f"- Psycholinguistic categories: {len(psycho_features)} features",
    f"- Dense metadata features: {len(dense_feature_names)} features",
    f"- TF-IDF dominates top features: {tfidf_important}/{n_top_features}",
    f"- Psycholinguistic importance: {psycho_importance['importance'].sum():.2f}",
    summary_rule,
]
# sep="\n" puts each entry on its own line, matching one print per entry
print(*summary_lines, sep="\n")

In [None]:
# Summary
# NOTE(review): this cell repeats the summary cell directly above it line for
# line, so running the notebook prints the report twice — confirm whether one
# of the two cells can be dropped.
# Prints the model description, the CV and OOF AUCs, the gain over the 0.6433
# baseline, and the feature-group statistics computed by the earlier cells.
print("\n" + "="*60)
print("EXPERIMENT SUMMARY")
print("="*60)
print(f"Model: LightGBM with TF-IDF + psycholinguistic + metadata features")
print(f"CV AUC: {mean_auc:.4f} ± {std_auc:.4f}")
print(f"OOF AUC: {oof_auc:.4f}")
print(f"Improvement over baseline: {mean_auc - 0.6433:.4f}")
print(f"\nKey findings:")
print(f"- TF-IDF features: {n_tfidf_features} terms")
# len(psycho_features) counts feature columns, not categories.
print(f"- Psycholinguistic categories: {len(psycho_features)} features")
# dense_feature_names holds every dense column, psycholinguistic ones included.
print(f"- Dense metadata features: {len(dense_feature_names)} features")
print(f"- TF-IDF dominates top features: {tfidf_important}/{n_top_features}")
print(f"- Psycholinguistic importance: {psycho_importance['importance'].sum():.2f}")
print("="*60)