# Random Acts of Pizza - Baseline Model

## Strategy
- Simple LightGBM baseline with basic features
- TF-IDF for text features
- Stratified 5-fold CV
- Class weights to handle imbalance (24.8% success rate)

In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Load data: parse the training JSON file into an in-memory Python object
print("Loading training data...")
with open('/home/data/train.json', 'r') as train_file:
    train_data = json.load(train_file)

# Report how many records were read and how many fields the first record has
n_samples = len(train_data)
n_fields = len(train_data[0])
print(f"Training samples: {n_samples}")
print(f"Features per sample: {n_fields}")

In [None]:
# Build a DataFrame from the loaded records so columns can be manipulated directly
train_df = pd.DataFrame(train_data)
print("Training data shape:", train_df.shape)

# Share of each target value (normalize=True yields fractions rather than counts)
print("\nTarget distribution:")
target_share = train_df['requester_received_pizza'].value_counts(normalize=True)
print(target_share)

# Null counts for the columns the later cells read
print("\nMissing values in key features:")
key_features = ['requester_received_pizza', 'request_text_edit_aware', 'request_title',
                'requester_number_of_posts_on_raop_at_request', 'requester_number_of_comments_in_raop_at_request',
                'request_number_of_comments_at_retrieval']
n_rows = len(train_df)
# Columns absent from the frame are skipped silently rather than raising
present_features = [name for name in key_features if name in train_df.columns]
for name in present_features:
    n_missing = train_df[name].isnull().sum()
    print(f"{name}: {n_missing} ({n_missing/n_rows*100:.1f}%)")

In [None]:
# Basic feature engineering: derive simple numeric columns on train_df in place
print("Creating basic features...")

# Text-size features from the edit-aware body (chosen to avoid leakage) and the title
body_text = train_df['request_text_edit_aware'].fillna('')
title_text = train_df['request_title'].fillna('')
train_df['text_length'] = body_text.str.len()
train_df['title_length'] = title_text.str.len()
train_df['word_count'] = body_text.str.split().str.len()

# RAOP activity counts (missing -> 0) and their sum
raop_sources = {
    'raop_posts': 'requester_number_of_posts_on_raop_at_request',
    'raop_comments': 'requester_number_of_comments_in_raop_at_request',
}
for new_col, src_col in raop_sources.items():
    train_df[new_col] = train_df[src_col].fillna(0)
train_df['total_raop_activity'] = train_df['raop_posts'] + train_df['raop_comments']

# Request engagement, user reputation and account age, copied under shorter
# names with missing values replaced by 0.
# NOTE(review): 'request_comments' is sourced from an "..._at_retrieval" column,
# unlike the "..._at_request" columns used for everything else here -- confirm
# it is known at prediction time and is not leaking the outcome.
other_sources = {
    'request_comments': 'request_number_of_comments_at_retrieval',
    'upvotes_minus_downvotes': 'requester_upvotes_minus_downvotes_at_request',
    'upvotes_plus_downvotes': 'requester_upvotes_plus_downvotes_at_request',
    'account_age_days': 'requester_account_age_in_days_at_request',
}
for new_col, src_col in other_sources.items():
    train_df[new_col] = train_df[src_col].fillna(0)

# 0/1 indicators for the requester's flair: 'shroom', 'PIF', or missing
# NOTE(review): presumably flair can be assigned after a request is fulfilled;
# verify these indicators are not target leakage.
flair = train_df['requester_user_flair']
train_df['flair_shroom'] = flair.eq('shroom').astype(int)
train_df['flair_pif'] = flair.eq('PIF').astype(int)
train_df['flair_none'] = flair.isnull().astype(int)

print("Basic features created")
# Everything except the target, identifiers and raw text/list columns
non_feature_cols = ['requester_received_pizza', 'request_id', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_username', 'giver_username_if_known', 'requester_subreddits_at_request']
feature_cols = [col for col in train_df.columns if col not in non_feature_cols]
print(f"Feature columns: {feature_cols}")

In [None]:
# Prepare features for modeling: TF-IDF text block + numeric block -> one CSR matrix
print("Preparing feature matrix...")

# Imported here (not only at the top) because the test-data cell also relies on
# these two names being in scope.
from scipy.sparse import csr_matrix, hstack

# Select numeric features (exclude text, IDs, and target)
numeric_features = [
    'text_length', 'title_length', 'word_count',
    'raop_posts', 'raop_comments', 'total_raop_activity',
    'request_comments',
    'upvotes_minus_downvotes', 'upvotes_plus_downvotes',
    'account_age_days',
    'flair_shroom', 'flair_pif', 'flair_none'
]

# TF-IDF text features.
# The vectorizer is fitted in this cell because `tfidf_matrix` (used below) and
# `tfidf` (used by the test-data cell) are not defined by any other visible cell;
# without this step the notebook stops with a NameError.
# The text is title + edit-aware body, built exactly like the test cell builds it,
# so train and test share one vocabulary.
# NOTE(review): fitting on all training rows before the CV split lets the
# vocabulary/IDF statistics see validation-fold text (never labels). That is
# acceptable for a baseline; a per-fold fit would be stricter.
train_df['combined_text'] = (
    train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna('')
).str.strip()
tfidf = TfidfVectorizer(
    max_features=5000,      # cap the vocabulary so the matrix stays small
    ngram_range=(1, 2),     # unigrams and bigrams
    stop_words='english',
)
tfidf_matrix = tfidf.fit_transform(train_df['combined_text'])
print(f"TF-IDF vocabulary size: {len(tfidf.vocabulary_)}")

# Create feature matrix; force a float dtype so an object-typed column cannot
# silently turn the stacked matrix into an object matrix.
X_numeric = train_df[numeric_features].to_numpy(dtype=np.float64)
X_text = tfidf_matrix.tocsr()

# Combine features; request CSR directly so rows can be sliced by fold indices
X = hstack([X_text, csr_matrix(X_numeric)], format='csr')

y = train_df['requester_received_pizza'].values
request_ids = train_df['request_id'].values

print(f"Final feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Matrix type: {type(X)}")
print(f"Matrix format: {X.getformat()}")

In [None]:
# Stratified 5-fold CV: train one LightGBM model per fold with early stopping,
# collect out-of-fold predictions, and report per-fold and overall AUC.
print("Starting stratified 5-fold CV...")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
best_iterations = []  # early-stopped round count of each fold
oof_predictions = np.zeros(len(train_df))

# Class weights for imbalance: ratio of negative to positive samples
n_pos = y.sum()
n_neg = len(y) - n_pos
scale_pos_weight = n_neg / n_pos
print(f"Class imbalance: {n_pos/len(y)*100:.1f}% positive, scale_pos_weight: {scale_pos_weight:.2f}")

# LightGBM parameters -- identical for every fold, so built once outside the loop
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'scale_pos_weight': scale_pos_weight
}

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    print(f"\nFold {fold}...")

    # Row-slice the sparse matrix (CSR supports integer-array row indexing)
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Create datasets. Named train_set/val_set so the raw `train_data` list
    # loaded from JSON is not overwritten and earlier cells stay re-runnable.
    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val)

    # Train with early stopping on the validation fold (50 rounds of patience)
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[val_set],
        valid_names=['val'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    best_iterations.append(model.best_iteration)

    # Predict with the best iteration and store in the out-of-fold vector
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    oof_predictions[val_idx] = val_pred

    # Calculate AUC
    auc = roc_auc_score(y_val, val_pred)
    fold_scores.append(auc)
    print(f"Fold {fold} AUC: {auc:.4f}")

# Overall CV score
overall_auc = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_auc:.4f}")
# \u00b1 is the plus-minus sign, written as an escape so it cannot be mis-encoded
print(f"Mean fold AUC: {np.mean(fold_scores):.4f} \u00b1 {np.std(fold_scores):.4f}")
print(f"Mean best iteration: {np.mean(best_iterations):.0f}")

In [None]:
# Load test data and build the same feature matrix layout as for training
print("Loading test data...")
with open('/home/data/test.json', 'r') as test_file:
    test_data = json.load(test_file)

test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")

# Create same features for test data
# NOTE(review): this cell indexes 'request_number_of_comments_at_retrieval' and
# 'requester_user_flair' on test_df, so both columns must exist in test.json --
# confirm, otherwise the lookups below raise KeyError.
print("Creating test features...")

# Text-size features
test_body = test_df['request_text_edit_aware'].fillna('')
test_title = test_df['request_title'].fillna('')
test_df['text_length'] = test_body.str.len()
test_df['title_length'] = test_title.str.len()
test_df['word_count'] = test_body.str.split().str.len()

# RAOP activity counts (missing -> 0) and their sum
test_df['raop_posts'] = test_df['requester_number_of_posts_on_raop_at_request'].fillna(0)
test_df['raop_comments'] = test_df['requester_number_of_comments_in_raop_at_request'].fillna(0)
test_df['total_raop_activity'] = test_df['raop_posts'] + test_df['raop_comments']

# Request engagement, user reputation and account age (missing -> 0)
zero_filled_sources = {
    'request_comments': 'request_number_of_comments_at_retrieval',
    'upvotes_minus_downvotes': 'requester_upvotes_minus_downvotes_at_request',
    'upvotes_plus_downvotes': 'requester_upvotes_plus_downvotes_at_request',
    'account_age_days': 'requester_account_age_in_days_at_request',
}
for new_col, src_col in zero_filled_sources.items():
    test_df[new_col] = test_df[src_col].fillna(0)

# 0/1 indicators for the requester's flair: 'shroom', 'PIF', or missing
test_flair = test_df['requester_user_flair']
test_df['flair_shroom'] = test_flair.eq('shroom').astype(int)
test_df['flair_pif'] = test_flair.eq('PIF').astype(int)
test_df['flair_none'] = test_flair.isnull().astype(int)

# TF-IDF features: title + body, transformed with the vectorizer named `tfidf`
# (expected to have been fitted by an earlier cell)
test_df['combined_text'] = (test_title + ' ' + test_body).str.strip()
test_tfidf = tfidf.transform(test_df['combined_text'])

# Create test feature matrix, using the column list from the training cell
X_test_numeric = test_df[numeric_features].values
# Make sure the text block is CSR before stacking
X_test_text = test_tfidf if isinstance(test_tfidf, csr_matrix) else csr_matrix(test_tfidf)

X_test = hstack([X_test_text, X_test_numeric])

# hstack may hand back a different sparse format; normalise to CSR
if not isinstance(X_test, csr_matrix):
    X_test = csr_matrix(X_test)

print(f"Test feature matrix shape: {X_test.shape}")
print(f"Test matrix format: {X_test.getformat()}")

In [None]:
# Fit one model on the full training matrix, score the test matrix, and write
# the submission CSV.
print("Training final model on all training data...")

# Hyper-parameters for the final fit
params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    scale_pos_weight=scale_pos_weight,
)

# NOTE(review): this fit always runs 1000 boosting rounds with no validation set
# or early stopping, whereas the CV loop stops early. The CV AUC may therefore
# not describe this model -- consider training for the CV best-iteration count.
train_data = lgb.Dataset(X, label=y)
final_model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    callbacks=[lgb.log_evaluation(0)],
)

# Predicted probabilities for the test rows
test_predictions = final_model.predict(X_test)

# Submission frame: request id plus predicted probability
submission_df = test_df[['request_id']].assign(requester_received_pizza=test_predictions)

print("Submission preview:")
print(submission_df.head())
print(f"\nSubmission shape: {submission_df.shape}")

# Persist without the index column
submission_df.to_csv('/home/submission/submission.csv', index=False)
print("Submission saved to /home/submission/submission.csv")

In [None]:
# Train on all training data, predict the test set, and save the submission.
# NOTE(review): this cell performs the same steps as the cell directly before it
# (same parameters, same output path); running it retrains the model and rewrites
# the same submission file. Consider dropping one of the two.
print("Training final model on all training data...")

# Train on full training data
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'scale_pos_weight': scale_pos_weight,
}
n_rounds = 1000  # fixed round count; no early stopping is used in this fit
quiet_logging = [lgb.log_evaluation(0)]

train_data = lgb.Dataset(X, label=y)
final_model = lgb.train(params, train_data, num_boost_round=n_rounds, callbacks=quiet_logging)

# Predict on test set
test_predictions = final_model.predict(X_test)

# Create submission: one row per test request with its predicted probability
submission_columns = {
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)

print("Submission preview:")
print(submission_df.head())
print(f"\nSubmission shape: {submission_df.shape}")

# Save submission
submission_path = '/home/submission/submission.csv'
submission_df.to_csv(submission_path, index=False)
print("Submission saved to /home/submission/submission.csv")