# Enhanced Features Experiment

Based on EDA findings:
- Text length matters (0.1199 correlation)
- Interaction features stronger than raw features
- Temporal patterns exist
- High variance suggests need for robust features

Improvements:
1. Enhanced TF-IDF (n-grams, more features)
2. Better interaction features
3. Class weighting for imbalance
4. Feature selection

In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)
train_df = pd.DataFrame(train_data)

with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)
test_df = pd.DataFrame(test_data)

print(f"Train: {len(train_df)}, Test: {len(test_df)}")
print(f"Target distribution: {train_df['requester_received_pizza'].value_counts(normalize=True).to_dict()}")

Loading data...
Train: 2878, Test: 1162
Target distribution: {False: 0.7515635858234886, True: 0.24843641417651147}


In [2]:
# Enhanced text features
print("Creating enhanced text features...")

# Text length features (from EDA - high correlation)
train_df['text_length'] = train_df['request_text_edit_aware'].str.len()
test_df['text_length'] = test_df['request_text_edit_aware'].str.len()
train_df['word_count'] = train_df['request_text_edit_aware'].str.split().str.len()
test_df['word_count'] = test_df['request_text_edit_aware'].str.split().str.len()
train_df['title_length'] = train_df['request_title'].str.len()
test_df['title_length'] = test_df['request_title'].str.len()
train_df['title_word_count'] = train_df['request_title'].str.split().str.len()
test_df['title_word_count'] = test_df['request_title'].str.split().str.len()

# Readability and language features
train_df['avg_word_length'] = train_df['text_length'] / (train_df['word_count'] + 1)
test_df['avg_word_length'] = test_df['text_length'] / (test_df['word_count'] + 1)

# Persuasion language (from EDA - "been" is important)
persuasion_words = ['please', 'help', 'appreciate', 'grateful', 'thank', 'thanks', 'kind', 'generous', 'need']
for word in persuasion_words:
    train_df[f'has_{word}'] = train_df['request_text_edit_aware'].str.lower().str.contains(word).astype(int)
    test_df[f'has_{word}'] = test_df['request_text_edit_aware'].str.lower().str.contains(word).astype(int)

# Punctuation patterns
train_df['exclamation_count'] = train_df['request_text_edit_aware'].str.count('!')
test_df['exclamation_count'] = test_df['request_text_edit_aware'].str.count('!')
train_df['question_count'] = train_df['request_text_edit_aware'].str.count('\?')
test_df['question_count'] = test_df['request_text_edit_aware'].str.count('\?')

print(f"Text features created: {[col for col in train_df.columns if col not in ['request_text_edit_aware', 'request_title', 'requester_received_pizza']]}")

Creating enhanced text features...


Text features created: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_minus_downvotes_at_retrieval', 'requester_upvo

In [3]:
# Enhanced interaction features (from EDA - stronger than raw features)
print("Creating enhanced interaction features...")

# User credibility score (from EDA insight)
train_df['user_credibility'] = (train_df['requester_upvotes_plus_downvotes_at_request']) / (train_df['requester_account_age_in_days_at_request'] + 1)
test_df['user_credibility'] = (test_df['requester_upvotes_plus_downvotes_at_request']) / (test_df['requester_account_age_in_days_at_request'] + 1)

# Engagement ratio (from EDA)
train_df['comments_per_post'] = train_df['requester_number_of_comments_at_request'] / (train_df['requester_number_of_posts_at_request'] + 1)
test_df['comments_per_post'] = test_df['requester_number_of_comments_at_request'] / (test_df['requester_number_of_posts_at_request'] + 1)

# Upvote ratio (from EDA - 0.0953 correlation)
train_df['upvote_ratio'] = train_df['requester_upvotes_minus_downvotes_at_request'] / (train_df['requester_upvotes_plus_downvotes_at_request'] + 1)
test_df['upvote_ratio'] = test_df['requester_upvotes_minus_downvotes_at_request'] / (test_df['requester_upvotes_plus_downvotes_at_request'] + 1)

# Subreddit diversity
train_df['subreddit_diversity'] = train_df['requester_number_of_subreddits_at_request'] / (train_df['requester_account_age_in_days_at_request'] + 1)
test_df['subreddit_diversity'] = test_df['requester_number_of_subreddits_at_request'] / (test_df['requester_account_age_in_days_at_request'] + 1)

# Request quality score
train_df['request_quality'] = train_df['word_count'] * train_df['upvote_ratio']
test_df['request_quality'] = test_df['word_count'] * test_df['upvote_ratio']

# Temporal features (from EDA - hour 15 is best)
train_df['request_hour'] = pd.to_datetime(train_df['unix_timestamp_of_request'], unit='s').dt.hour
test_df['request_hour'] = pd.to_datetime(test_df['unix_timestamp_of_request'], unit='s').dt.hour
train_df['request_day_of_week'] = pd.to_datetime(train_df['unix_timestamp_of_request'], unit='s').dt.dayofweek
test_df['request_day_of_week'] = pd.to_datetime(test_df['unix_timestamp_of_request'], unit='s').dt.dayofweek

# Hour interaction (hour 15 is best from EDA)
train_df['is_hour_15'] = (train_df['request_hour'] == 15).astype(int)
test_df['is_hour_15'] = (test_df['request_hour'] == 15).astype(int)

print(f"Interaction features created")

Creating enhanced interaction features...
Interaction features created


In [None]:
# Enhanced TF-IDF features
print("Creating enhanced TF-IDF features...")

# Combine title and text for better context
combined_text_train = train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna('')
combined_text_test = test_df['request_title'].fillna('') + ' ' + test_df['request_text_edit_aware'].fillna('')

# Enhanced TF-IDF with n-grams and more features
tfidf = TfidfVectorizer(
    max_features=10000,  # More features than baseline
    ngram_range=(1, 3),  # Include phrases
    stop_words='english',
    sublinear_tf=True,   # Better scaling
    min_df=2,
    max_df=0.95
)

# Fit on combined train+test for consistency
all_text = pd.concat([combined_text_train, combined_text_test])
tfidf.fit(all_text)

# Transform
tfidf_train = tfidf.transform(combined_text_train)
tfidf_test = tfidf.transform(combined_text_test)

print(f"TF-IDF shape: {tfidf_train.shape}")

In [None]:
# Prepare numerical features
print("Preparing numerical features...")

# Define feature groups
base_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_account_age_in_days_at_request'
]

text_features = ['text_length', 'word_count', 'title_length', 'title_word_count', 'avg_word_length']
persuasion_features = [f'has_{word}' for word in persuasion_words]
punctuation_features = ['exclamation_count', 'question_count']
interaction_features = [
    'user_credibility', 'comments_per_post', 'upvote_ratio', 'subreddit_diversity', 
    'request_quality', 'request_hour', 'request_day_of_week', 'is_hour_15'
]

all_num_features = base_features + text_features + persuasion_features + punctuation_features + interaction_features

# Select only features that exist
available_features = [f for f in all_num_features if f in train_df.columns]
print(f"Using {len(available_features)} numerical features")

# Prepare data
X_num_train = train_df[available_features].values
X_num_test = test_df[available_features].values

# Scale numerical features
scaler = StandardScaler()
X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)

print(f"Numerical features shape: {X_num_train.shape}")

In [None]:
# Combine all features
print("Combining all features...")

from scipy.sparse import hstack

X_train_combined = hstack([tfidf_train, X_num_train_scaled])
X_test_combined = hstack([tfidf_test, X_num_test_scaled])

y = train_df['requester_received_pizza'].astype(int).values

print(f"Final training shape: {X_train_combined.shape}")
print(f"Final test shape: {X_test_combined.shape}")

In [None]:
# Combine all features
print("Combining all features...")

from scipy.sparse import hstack

X_train_combined = hstack([tfidf_train, X_num_train_scaled])
X_test_combined = hstack([tfidf_test, X_num_test_scaled])

y = train_df['requester_received_pizza'].astype(int).values

print(f"Final training shape: {X_train_combined.shape}")
print(f"Final test shape: {X_test_combined.shape}")

# Convert to csr format for better indexing
X_train_combined = X_train_combined.tocsr()
X_test_combined = X_test_combined.tocsr()

In [None]:
# Cross-validation with class weighting
print("Starting cross-validation...")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
oof_predictions = np.zeros(len(train_df))

# Class weighting for imbalance
scale_pos_weight = 3.0  # From strategy (75%/25% = 3.0)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_combined, y)):
    print(f"\nFold {fold+1}/5")
    
    X_tr, X_val = X_train_combined[train_idx], X_train_combined[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    
    # LightGBM parameters
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'scale_pos_weight': scale_pos_weight
    }
    
    # Create datasets
    train_data = lgb.Dataset(X_tr, label=y_tr)
    valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    # Train model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        valid_names=['valid'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    
    # Predict
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    oof_predictions[val_idx] = val_pred
    
    # Score
    fold_score = roc_auc_score(y_val, val_pred)
    fold_scores.append(fold_score)
    print(f"Fold {fold+1} AUC: {fold_score:.4f}")

# Overall CV score
cv_score = roc_auc_score(y, oof_predictions)
print(f"\nCV Score: {cv_score:.4f} Â± {np.std(fold_scores):.4f}")
print(f"Individual folds: {fold_scores}")

In [None]:
# Train final model and generate predictions
print("Training final model on full data...")

# Train on full training data
final_model = lgb.train(
    params,
    lgb.Dataset(X_train_combined, label=y),
    num_boost_round=1000,
    callbacks=[lgb.log_evaluation(0)]
)

# Generate predictions
test_predictions = final_model.predict(X_test_combined)

# Create submission
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved with {len(submission)} predictions")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
print(f"Mean prediction: {test_predictions.mean():.4f}")

In [None]:
# Feature importance analysis
print("Feature importance analysis...")

# Get feature names
tfidf_feature_names = [f'tfidf_{i}' for i in range(tfidf_train.shape[1])]
feature_names = tfidf_feature_names + available_features

# Get importance from final model (trained on full data)
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': final_model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

print("Top 20 features by importance:")
print(importance_df.head(20))

# Save importance
importance_df.to_csv('/home/code/experiments/002_feature_importance.csv', index=False)
print("Feature importance saved to experiments/002_feature_importance.csv")

In [None]:
# Train final model and generate predictions
print("Training final model on full data...")

# Train on full training data
final_model = lgb.train(
    params,
    lgb.Dataset(X_train_combined, label=y),
    num_boost_round=1000,
    callbacks=[lgb.log_evaluation(0)]
)

# Generate predictions
test_predictions = final_model.predict(X_test_combined)

# Create submission
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved with {len(submission)} predictions")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
print(f"Mean prediction: {test_predictions.mean():.4f}")