# Experiment 003: TF-IDF Noise Fixes

**Goal**: Fix the TF-IDF noise problem that caused a 0.0216 AUC drop in exp_002

**Key Changes**:
1. Remove stop_words='english' to keep domain-specific words
2. Reduce word TF-IDF features from 10,000 to 3,000 using chi-square selection (top 3,000 of a 5,000-term vocabulary)
3. Add character n-grams (3-5) for robust pattern matching
4. Improve psycholinguistic features with phrase patterns and word boundaries
5. Add high-value specific features (imgur links, key phrases)

**Expected Outcome**: Recover to the baseline AUC (0.6433) and ideally exceed 0.70

In [6]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack, csr_matrix
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility (this seeds NumPy's global RNG).
np.random.seed(42)

# Load data
print("Loading data...")
# Pass the encoding explicitly so decoding the JSON files does not depend on
# the platform's default locale encoding.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")
# Mean of the target flag over the training records.
print(f"Positive rate: {sum(x['requester_received_pizza'] for x in train_data) / len(train_data):.4f}")

# Convert to DataFrames for easier processing
train = pd.DataFrame(train_data)
test = pd.DataFrame(test_data)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Loading data...
Train samples: 2878
Test samples: 1162
Positive rate: 0.2484
Train shape: (2878, 32)
Test shape: (1162, 17)


import pandas as pd
import numpy as np
import json
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility (this seeds NumPy's global RNG).
np.random.seed(42)

# Load data
print("Loading data...")
# Pass the encoding explicitly so decoding the JSON files does not depend on
# the platform's default locale encoding.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")
# Mean of the target flag over the training records.
print(f"Positive rate: {sum(x['requester_received_pizza'] for x in train_data) / len(train_data):.4f}")

In [7]:
def extract_metadata_features(df):
    """Build the metadata feature table for a set of requests.

    Only the request text, the request timestamp and the requester's
    ``*_at_request`` statistics are read, which is what the notebook treats
    as safe from target leakage.

    Args:
        df: DataFrame containing ``request_text_edit_aware``,
            ``unix_timestamp_of_request`` and the ``requester_*_at_request``
            columns referenced in the body.

    Returns:
        A DataFrame with one row per input row and 16 feature columns
        (text size, account age, activity counts, vote statistics,
        subreddit count and request time).
    """
    # Shared intermediates, computed once.
    body = df['request_text_edit_aware'].fillna('')
    requested_at = pd.to_datetime(df['unix_timestamp_of_request'], unit='s')
    age_days = df['requester_account_age_in_days_at_request']
    net_votes = df['requester_upvotes_minus_downvotes_at_request']
    all_votes = df['requester_upvotes_plus_downvotes_at_request']

    features = pd.DataFrame()

    # Size of the request text; an empty text still counts as one sentence
    # because of the "+ 1".
    features['text_length'] = body.str.len()
    features['word_count'] = body.str.split().str.len()
    features['sentence_count'] = body.str.count(r'[.!?]+') + 1

    # Account age in days, raw and log1p-compressed.
    features['account_age_days'] = age_days
    features['account_age_log'] = np.log1p(age_days)

    # Requester activity at request time.
    features['total_comments_at_request'] = df['requester_number_of_comments_at_request']
    features['total_posts_at_request'] = df['requester_number_of_posts_at_request']
    features['raop_comments_at_request'] = df['requester_number_of_comments_in_raop_at_request']
    features['raop_posts_at_request'] = df['requester_number_of_posts_on_raop_at_request']

    # Vote statistics; the "+ 1" in the denominator avoids dividing by zero
    # when the requester has no votes.
    features['upvotes_minus_downvotes_at_request'] = net_votes
    features['upvotes_plus_downvotes_at_request'] = all_votes
    features['vote_ratio_at_request'] = net_votes / (all_votes + 1)

    # Number of subreddits the requester was active in.
    features['num_subreddits_at_request'] = df['requester_number_of_subreddits_at_request']

    # Calendar position of the request.
    features['hour_of_day'] = requested_at.dt.hour
    features['day_of_week'] = requested_at.dt.dayofweek

    # Days since the requester's first RAOP post.
    features['days_since_first_raop_at_request'] = df['requester_days_since_first_post_on_raop_at_request']

    return features

## 2. Improved TF-IDF Features

**Fixes**:
- Remove stop_words='english' to keep domain words
- Reduce word n-gram features to 3,000 using chi-square selection (from a 5,000-term vocabulary)
- Add character n-grams (3-5) for robust patterns

In [8]:
# Missing request bodies become empty strings before vectorizing.
train_text = train['request_text_edit_aware'].fillna('')
test_text = test['request_text_edit_aware'].fillna('')

print("Creating TF-IDF features...")

# Word-level TF-IDF over uni- to tri-grams. Stop words are deliberately kept
# (stop_words=None); the vocabulary is capped at 5000 before selection below.
word_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words=None,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.9,
    max_features=5000,
    sublinear_tf=True,
)

# Character-level TF-IDF over 3- to 5-grams, capped at 2000 features.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    lowercase=True,
    ngram_range=(3, 5),
    min_df=2,
    max_df=0.9,
    max_features=2000,
    sublinear_tf=True,
)

# Vocabularies are learned from the training text only; the test text is
# projected onto them.
X_word_train = word_vectorizer.fit_transform(train_text)
X_word_test = word_vectorizer.transform(test_text)
X_char_train = char_vectorizer.fit_transform(train_text)
X_char_test = char_vectorizer.transform(test_text)

print(f"Word TF-IDF shape: {X_word_train.shape}")
print(f"Char TF-IDF shape: {X_char_train.shape}")

# Keep the 3000 word n-grams with the highest chi-square score against the
# target.
# NOTE(review): the selector is fit on every training row, before the CV split
# performed later in the notebook, so CV scores may be optimistic - confirm
# whether selection should move inside the folds.
print("Applying chi-square feature selection...")
selector = SelectKBest(score_func=chi2, k=3000)
selector.fit(X_word_train, train['requester_received_pizza'])
X_word_train_selected = selector.transform(X_word_train)
X_word_test_selected = selector.transform(X_word_test)

print(f"Selected word TF-IDF shape: {X_word_train_selected.shape}")

Creating TF-IDF features...


Word TF-IDF shape: (2878, 5000)
Char TF-IDF shape: (2878, 2000)
Applying chi-square feature selection...
Selected word TF-IDF shape: (2878, 3000)


In [12]:
# Run the same metadata extractor over the train and test frames.
print("Extracting metadata features...")
X_meta_train, X_meta_test = (
    extract_metadata_features(frame) for frame in (train, test)
)

print(f"Metadata features shape: {X_meta_train.shape}")
print(f"Metadata features: {X_meta_train.columns.tolist()}")

Extracting metadata features...
Metadata features shape: (2878, 16)
Metadata features: ['text_length', 'word_count', 'sentence_count', 'account_age_days', 'account_age_log', 'total_comments_at_request', 'total_posts_at_request', 'raop_comments_at_request', 'raop_posts_at_request', 'upvotes_minus_downvotes_at_request', 'upvotes_plus_downvotes_at_request', 'vote_ratio_at_request', 'num_subreddits_at_request', 'hour_of_day', 'day_of_week', 'days_since_first_raop_at_request']


## 3. Improved Psycholinguistic Features

**Fixes**:
- Use phrase patterns instead of single words
- Add word boundaries to prevent false positives
- Add intensity modifiers (an intensifier such as "really", "so", or "very" followed by any word)

In [13]:
# Run the psycholinguistic extractor over the train and test frames.
# NOTE(review): create_psycholinguistic_features is defined in a later cell of
# this notebook, so this cell only works once that definition has been run.
print("Creating psycholinguistic features...")
X_psych_train, X_psych_test = (
    create_psycholinguistic_features(frame) for frame in (train, test)
)

print(f"Psycholinguistic features shape: {X_psych_train.shape}")
print(f"Psycholinguistic features: {X_psych_train.columns.tolist()}")

Creating psycholinguistic features...


Psycholinguistic features shape: (2878, 12)
Psycholinguistic features: ['reciprocity_phrases', 'hardship_phrases', 'gratitude_phrases', 'family_phrases', 'pizza_terms', 'intensity_modifiers', 'reciprocity_phrases_ratio', 'hardship_phrases_ratio', 'gratitude_phrases_ratio', 'family_phrases_ratio', 'pizza_terms_ratio', 'intensity_modifiers_ratio']


In [14]:
def create_psycholinguistic_features(df):
    """Count phrase-level psycholinguistic cues in each request text.

    The text is lower-cased and missing values are treated as empty. Each
    pattern in a group is counted separately and the counts are summed, so
    overlapping matches from different patterns each contribute.

    Args:
        df: DataFrame with a ``request_text_edit_aware`` column.

    Returns:
        A DataFrame with six count columns (``reciprocity_phrases``,
        ``hardship_phrases``, ``gratitude_phrases``, ``family_phrases``,
        ``pizza_terms``, ``intensity_modifiers``) followed by six
        ``<name>_ratio`` columns holding each count divided by the text's
        word count.
    """
    lowered = df['request_text_edit_aware'].fillna('').str.lower()

    def total_matches(patterns):
        # Sum, per row, the number of matches of every pattern in the group.
        compiled = [re.compile(p) for p in patterns]
        return lowered.apply(
            lambda doc: sum(len(rx.findall(doc)) for rx in compiled)
        )

    features = pd.DataFrame()

    # Promises to repay or pass the favour on.
    features['reciprocity_phrases'] = total_matches([
        r'\bpay back\b', r'\breturn the favor\b', r'\bwhen i get paid\b',
        r'\bpayday\b', r'\bnext week\b', r'\bnext month\b',
        r'\bwill pay\b', r'\bcan pay\b', r'\bpay it forward\b',
    ])

    # Statements of financial or personal hardship.
    features['hardship_phrases'] = total_matches([
        r'\blost my job\b', r'\bmedical bills\b', r'\bcar broke down\b',
        r'\beviction notice\b', r'\bsingle parent\b', r'\bno money\b',
        r'\bcan\'t afford\b', r'\bunemployed\b', r'\bhomeless\b',
    ])

    # Expressions of gratitude.
    features['gratitude_phrases'] = total_matches([
        r'\bwould appreciate\b', r'\bwould be grateful\b', r'\bthank you in advance\b',
        r'\bbless you\b', r'\bso thankful\b', r'\bvery grateful\b',
    ])

    # Mentions of family members.
    features['family_phrases'] = total_matches([
        r'\bmy kids\b', r'\bmy children\b', r'\bmy family\b',
        r'\bsingle mom\b', r'\bsingle dad\b', r'\bmy daughter\b', r'\bmy son\b',
    ])

    # Pizza vocabulary, matched as one alternation between word boundaries.
    pizza_words = ['pizza', 'pizzas', 'pizzeria', 'dominos', 'papa johns', 'pizza hut', 'little caesars']
    pizza_pattern = r'\b(' + '|'.join(pizza_words) + r')\b'
    features['pizza_terms'] = lowered.str.count(pizza_pattern)

    # An intensifier followed by whitespace and any word.
    features['intensity_modifiers'] = total_matches([
        r'\breally\s+\w+\b', r'\bso\s+\w+\b', r'\bvery\s+\w+\b',
        r'\bextremely\s+\w+\b', r'\bsuper\s+\w+\b',
    ])

    # Per-word rates; zero-word texts use a divisor of 1 to avoid dividing
    # by zero. The column list is snapshotted so the ratio columns added in
    # the loop are not themselves normalized.
    n_words = lowered.str.split().str.len().replace(0, 1)
    for count_col in list(features.columns):
        features[f'{count_col}_ratio'] = features[count_col] / n_words

    return features

# Build the psycholinguistic feature tables for both splits.
print("Creating psycholinguistic features...")
X_psych_train, X_psych_test = (
    create_psycholinguistic_features(frame) for frame in (train, test)
)

print(f"Psycholinguistic features shape: {X_psych_train.shape}")

Creating psycholinguistic features...


Psycholinguistic features shape: (2878, 12)


In [15]:
# Run the targeted-feature extractor over the train and test frames.
# NOTE(review): create_specific_features is defined in a later cell of this
# notebook, so this cell only works once that definition has been run.
print("Creating specific features...")
X_specific_train, X_specific_test = (
    create_specific_features(frame) for frame in (train, test)
)

print(f"Specific features shape: {X_specific_train.shape}")
print(f"Specific features: {X_specific_train.columns.tolist()}")

Creating specific features...


Specific features shape: (2878, 8)
Specific features: ['imgur_link_count', 'has_imgur_link', 'need_phrases', 'time_references', 'question_marks', 'exclamation_marks', 'need_phrases_ratio', 'time_references_ratio']


## 4. High-Value Specific Features

In [17]:
# Assemble the final train/test design matrices.
print("Combining all features...")

# The engineered tables are dense DataFrames; wrap their values as CSR so they
# can be stacked column-wise with the sparse TF-IDF matrices.
X_meta_train_sparse, X_meta_test_sparse = (
    csr_matrix(frame.values) for frame in (X_meta_train, X_meta_test)
)
X_psych_train_sparse, X_psych_test_sparse = (
    csr_matrix(frame.values) for frame in (X_psych_train, X_psych_test)
)
X_specific_train_sparse, X_specific_test_sparse = (
    csr_matrix(frame.values) for frame in (X_specific_train, X_specific_test)
)

# Column order: selected word n-grams, char n-grams, metadata,
# psycholinguistic, specific. Train and test must use the same order.
X_train_combined = hstack(
    [
        X_word_train_selected,
        X_char_train,
        X_meta_train_sparse,
        X_psych_train_sparse,
        X_specific_train_sparse,
    ],
    format='csr',
)

X_test_combined = hstack(
    [
        X_word_test_selected,
        X_char_test,
        X_meta_test_sparse,
        X_psych_test_sparse,
        X_specific_test_sparse,
    ],
    format='csr',
)

print(f"Combined training features shape: {X_train_combined.shape}")
print(f"Combined test features shape: {X_test_combined.shape}")

# Target vector, with a per-class count as a sanity check.
y = train['requester_received_pizza'].values
print(f"Target distribution: {np.bincount(y)}")

Combining all features...
Combined training features shape: (2878, 5036)
Combined test features shape: (1162, 5036)
Target distribution: [2163  715]


In [18]:
def create_specific_features(df):
    """Compute a small set of targeted text features for each request.

    The text is lower-cased and missing values are treated as empty.

    Args:
        df: DataFrame with a ``request_text_edit_aware`` column.

    Returns:
        A DataFrame with the columns ``imgur_link_count``,
        ``has_imgur_link``, ``need_phrases``, ``time_references``,
        ``question_marks``, ``exclamation_marks``, ``need_phrases_ratio``
        and ``time_references_ratio``.
    """
    lowered = df['request_text_edit_aware'].fillna('').str.lower()

    def total_matches(patterns):
        # Sum, per row, the number of matches of every pattern in the group.
        compiled = [re.compile(p) for p in patterns]
        return lowered.apply(
            lambda doc: sum(len(rx.findall(doc)) for rx in compiled)
        )

    features = pd.DataFrame()

    # Imgur links: raw count plus a 0/1 presence flag. (The notebook's earlier
    # analysis reports these as 2.6x more common in successful requests.)
    imgur_hits = lowered.str.count(r'imgur\.com')
    features['imgur_link_count'] = imgur_hits
    features['has_imgur_link'] = imgur_hits.gt(0).astype(int)

    # Single words signalling a request for help or appreciation.
    features['need_phrases'] = total_matches([
        r'\bneed\b', r'\bplease\b', r'\bhelp\b', r'\banyone\b',
        r'\bkind\b', r'\bgenerous\b', r'\bappreciate\b',
    ])

    # Near-term time references.
    features['time_references'] = total_matches([
        r'\bnext week\b', r'\bnext month\b', r'\bthis week\b',
        r'\bthis month\b', r'\btomorrow\b', r'\bsoon\b',
    ])

    # Punctuation counts.
    features['question_marks'] = lowered.str.count(r'\?')
    features['exclamation_marks'] = lowered.str.count(r'!')

    # Per-word rates for the two phrase counts; zero-word texts use a divisor
    # of 1 to avoid dividing by zero.
    n_words = lowered.str.split().str.len().replace(0, 1)
    features['need_phrases_ratio'] = features['need_phrases'] / n_words
    features['time_references_ratio'] = features['time_references'] / n_words

    return features

# Build the targeted feature tables for both splits.
print("Creating specific features...")
X_specific_train, X_specific_test = (
    create_specific_features(frame) for frame in (train, test)
)

print(f"Specific features shape: {X_specific_train.shape}")

Creating specific features...


Specific features shape: (2878, 8)


## 5. Combine All Features

In [19]:
# Assemble the final train/test design matrices.
print("Combining all features...")

# Sparse helpers used to stack the TF-IDF matrices with the dense tables.
from scipy.sparse import csr_matrix, hstack

# The engineered tables are dense DataFrames; wrap their values as CSR so they
# can be stacked column-wise with the sparse TF-IDF matrices.
X_meta_train_sparse, X_meta_test_sparse = (
    csr_matrix(frame.values) for frame in (X_meta_train, X_meta_test)
)
X_psych_train_sparse, X_psych_test_sparse = (
    csr_matrix(frame.values) for frame in (X_psych_train, X_psych_test)
)
X_specific_train_sparse, X_specific_test_sparse = (
    csr_matrix(frame.values) for frame in (X_specific_train, X_specific_test)
)

# Column order: selected word n-grams, char n-grams, metadata,
# psycholinguistic, specific. Train and test must use the same order.
X_train_combined = hstack(
    [
        X_word_train_selected,
        X_char_train,
        X_meta_train_sparse,
        X_psych_train_sparse,
        X_specific_train_sparse,
    ],
    format='csr',
)

X_test_combined = hstack(
    [
        X_word_test_selected,
        X_char_test,
        X_meta_test_sparse,
        X_psych_test_sparse,
        X_specific_test_sparse,
    ],
    format='csr',
)

print(f"Combined training features shape: {X_train_combined.shape}")
print(f"Combined test features shape: {X_test_combined.shape}")

# Target vector, with a per-class count as a sanity check.
y = train['requester_received_pizza'].values
print(f"Target distribution: {np.bincount(y)}")

Combining all features...
Combined training features shape: (2878, 5036)
Combined test features shape: (1162, 5036)
Target distribution: [2163  715]


## 6. Model Training with Cross-Validation

In [None]:
# 5-fold stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Single source of truth for the fold count, so the progress messages and the
# test-prediction average stay correct if n_splits above is changed.
n_splits = cv.get_n_splits()

# Class imbalance handling: weight positives by the negative/positive ratio.
scale_pos_weight = (len(y) - np.sum(y)) / np.sum(y)
print(f"Scale pos weight: {scale_pos_weight:.3f}")

# LightGBM parameters are identical for every fold, so build them once.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'scale_pos_weight': scale_pos_weight
}

# Store predictions
cv_scores = []
oof_predictions = np.zeros(len(train))    # out-of-fold predictions on train
test_predictions = np.zeros(len(test))    # average of the per-fold test predictions

print(f"\nTraining LightGBM model with {n_splits}-fold CV...")

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_combined, y)):
    print(f"Fold {fold + 1}/{n_splits}")

    # Split data
    X_train_fold = X_train_combined[train_idx]
    X_val_fold = X_train_combined[val_idx]
    y_train_fold = y[train_idx]
    y_val_fold = y[val_idx]

    # Create LightGBM datasets. Named lgb_train/lgb_valid so they do not
    # overwrite the raw `train_data` list loaded from JSON earlier in the
    # notebook.
    lgb_train = lgb.Dataset(X_train_fold, label=y_train_fold)
    lgb_valid = lgb.Dataset(X_val_fold, label=y_val_fold, reference=lgb_train)

    # Train with early stopping on the validation fold's AUC.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_valid],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    # Predict on validation set
    val_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)
    oof_predictions[val_idx] = val_pred

    # Calculate fold score
    fold_score = roc_auc_score(y_val_fold, val_pred)
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

    # Predict on test set; each fold contributes an equal share of the average.
    test_pred = model.predict(X_test_combined, num_iteration=model.best_iteration)
    test_predictions += test_pred / n_splits

# Overall CV score (mean and spread of the per-fold AUCs)
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)
print(f"\nCV AUC: {cv_mean:.4f} ± {cv_std:.4f}")

# OOF score (one AUC over all out-of-fold predictions)
oof_score = roc_auc_score(y, oof_predictions)
print(f"OOF AUC: {oof_score:.4f}")

## 7. Feature Importance Analysis

In [None]:
# Get feature names for analysis, in the same column order used when the
# feature matrices were stacked: selected word n-grams, char n-grams,
# metadata, psycholinguistic, specific.
word_feature_names = word_vectorizer.get_feature_names_out()
selected_mask = selector.get_support()
selected_features = word_feature_names[selected_mask]
char_feature_names = char_vectorizer.get_feature_names_out()

# Each group carries an explicit type label. Deriving the type from the text
# before the first underscore would merge the metadata column 'word_count'
# into the 'word_' n-gram group and scatter the other engineered columns
# across meaningless prefixes.
feature_groups = [
    ('word', [f'word_{f}' for f in selected_features]),
    ('char', [f'char_{f}' for f in char_feature_names]),
    ('meta', X_meta_train.columns.tolist()),
    ('psych', X_psych_train.columns.tolist()),
    ('specific', X_specific_train.columns.tolist()),
]

feature_names = []
feature_types = []
for group_label, group_names in feature_groups:
    feature_names.extend(group_names)
    feature_types.extend([group_label] * len(group_names))

print(f"Total features: {len(feature_names)}")

# Gain-based importances from `model`, i.e. the booster left bound by the last
# iteration of the CV loop (a single fold's model, not an ensemble).
importances = model.feature_importance(importance_type='gain')
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances,
    'feature_type': feature_types
}).sort_values('importance', ascending=False)

print("\nTop 20 features by importance:")
print(feature_importance_df[['feature', 'importance']].head(20))

# Total gain contributed by each feature group.
type_importance = feature_importance_df.groupby('feature_type')['importance'].sum().sort_values(ascending=False)

print("\nImportance by feature type:")
print(type_importance)

## 8. Create Submission

In [None]:
# Create submission file
import os

print("Creating submission file...")

# Check sample submission format (printed for visual comparison only).
sample_sub = pd.read_csv('/home/data/sampleSubmission.csv')
print(f"Sample submission columns: {sample_sub.columns.tolist()}")
print(f"Sample submission shape: {sample_sub.shape}")

# Create submission: one averaged test prediction per request id.
submission = pd.DataFrame({
    'request_id': test['request_id'],
    'requester_received_pizza': test_predictions
})

print(f"Submission shape: {submission.shape}")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")

# Save submission. Create the output directory first so to_csv does not fail
# when it is missing.
submission_path = '/home/submission/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

# Summary. The baseline AUC is kept in one place so the value shown and the
# improvement computed from it cannot drift apart.
baseline_auc = 0.6433
print(f"\n{'='*50}")
print(f"EXPERIMENT SUMMARY")
print(f"{'='*50}")
print(f"Model: LightGBM with improved TF-IDF features")
print(f"CV AUC: {cv_mean:.4f} ± {cv_std:.4f}")
print(f"OOF AUC: {oof_score:.4f}")
print(f"Baseline AUC: {baseline_auc:.4f}")
print(f"Improvement: {cv_mean - baseline_auc:.4f}")
print(f"{'='*50}")