# Baseline Model for Random Acts of Pizza

This notebook implements a baseline model for predicting pizza request success.

## Approach
1. Load and explore the data
2. Feature engineering:
   - Text features from request_title and request_text
   - Metadata features (user activity, karma, etc.)
3. Train LightGBM model with stratified CV
4. Generate predictions and create submission

In [5]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

In [6]:
# Load training data
print("Loading training data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")
print(f"Target distribution:")
print(train_df['requester_received_pizza'].value_counts(normalize=True))

# Check for missing text fields
print(f"\nMissing request_text: {train_df['request_text'].isna().sum()}")
print(f"Missing request_title: {train_df['request_title'].isna().sum()}")

Loading training data...
Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request'

In [8]:
# Load test data
print("Loading test data...")
with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")
print(f"Columns: {test_df.columns.tolist()}")

# Check for missing text fields in test
print(f"\nMissing request_text_edit_aware: {test_df['request_text_edit_aware'].isna().sum()}")
print(f"Missing request_title: {test_df['request_title'].isna().sum()}")

Loading test data...
Test data shape: (1162, 17)
Columns: ['giver_username_if_known', 'request_id', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'requester_number_of_comments_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_posts_at_request', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_request', 'requester_username', 'unix_timestamp_of_request', 'unix_timestamp_of_request_utc']

Missing request_text_edit_aware: 0
Missing request_title: 0


In [16]:
# Basic feature engineering
print("Creating features...")

# Text features - use request_text_edit_aware for test since request_text is not available
train_df['text_combined'] = train_df['request_title'].fillna('') + ' ' + train_df['request_text'].fillna('')
test_df['text_combined'] = test_df['request_title'].fillna('') + ' ' + test_df['request_text_edit_aware'].fillna('')

# Text length features
train_df['title_length'] = train_df['request_title'].fillna('').str.len()
test_df['title_length'] = test_df['request_title'].fillna('').str.len()

train_df['text_length'] = train_df['request_text'].fillna('').str.len()
test_df['text_length'] = test_df['request_text_edit_aware'].fillna('').str.len()

train_df['text_combined_length'] = train_df['text_combined'].str.len()
test_df['text_combined_length'] = test_df['text_combined'].str.len()

# User activity features (at request time only - avoid data leakage)
activity_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request', 
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request'
]

# Account age features
train_df['account_age_days'] = train_df['requester_account_age_in_days_at_request']
test_df['account_age_days'] = test_df['requester_account_age_in_days_at_request']

# Subreddit diversity
train_df['subreddit_count'] = train_df['requester_number_of_subreddits_at_request']
test_df['subreddit_count'] = test_df['requester_number_of_subreddits_at_request']

# Days since first post on RAOP
train_df['days_since_first_raop_post'] = train_df['requester_days_since_first_post_on_raop_at_request']
test_df['days_since_first_raop_post'] = test_df['requester_days_since_first_post_on_raop_at_request']

# DO NOT USE - Data leakage features:
# - requester_user_flair (indicates past pizza receipt)
# - Any _at_retrieval features (future information)
# - giver_username_if_known (only known after success)

print("Feature creation completed!")
print(f"Train columns: {train_df.columns.tolist()}")

Creating features...
Feature creation completed!
Train columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 

In [17]:
# Create TF-IDF features for text
print("Creating TF-IDF features...")

# Use a subset of features to keep it manageable for baseline
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5
)

# Fit on combined train and test text
combined_text = pd.concat([
    train_df['text_combined'],
    test_df['text_combined']
], axis=0)

vectorizer.fit(combined_text)

# Transform text
tfidf_train = vectorizer.transform(train_df['text_combined'])
tfidf_test = vectorizer.transform(test_df['text_combined'])

print(f"TF-IDF shape: {tfidf_train.shape}")

Creating TF-IDF features...


TF-IDF shape: (2878, 1000)


In [18]:
# Prepare features for modeling
print("Preparing feature matrix...")

# Select numeric features that exist in both train and test (no leakage)
numeric_features = [
    'title_length', 'text_length', 'text_combined_length',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'account_age_days',
    'subreddit_count',
    'days_since_first_raop_post'
]

# Ensure all features exist and fill missing values
X_numeric_train = train_df[numeric_features].fillna(0)
X_numeric_test = test_df[numeric_features].fillna(0)

# Combine numeric and text features
from scipy.sparse import hstack

X_train = hstack([X_numeric_train.values, tfidf_train])
X_test = hstack([X_numeric_test.values, tfidf_test])

y_train = train_df['requester_received_pizza'].astype(int)

print(f"Final training matrix shape: {X_train.shape}")
print(f"Final test matrix shape: {X_test.shape}")

Preparing feature matrix...
Final training matrix shape: (2878, 1012)
Final test matrix shape: (1162, 1012)


In [19]:
# Train model with stratified CV
print("Training LightGBM model with stratified CV...")

# Define CV strategy
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store predictions
train_predictions = np.zeros(len(train_df))
test_predictions = np.zeros(len(test_df))

# Model parameters (optimized for binary classification)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42
}

cv_scores = []

for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Training fold {fold + 1}/5...")
    
    # Create datasets - convert to proper format for LightGBM
    X_tr = X_train.tocsr()[train_idx]
    X_val = X_train.tocsr()[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_tr, label=y_tr)
    valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    # Train model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        valid_names=['valid'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    
    # Predictions
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    
    # Store predictions
    train_predictions[valid_idx] = val_pred
    test_predictions += test_pred / 5
    
    # Calculate fold score
    fold_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

# Overall CV score
overall_score = roc_auc_score(y_train, train_predictions)
print(f"\nOverall CV AUC: {overall_score:.4f}")
print(f"Mean CV AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

Training LightGBM model with stratified CV...
Training fold 1/5...
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[37]	valid's auc: 0.672863
Fold 1 AUC: 0.6729
Training fold 2/5...
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[62]	valid's auc: 0.670408
Fold 2 AUC: 0.6704
Training fold 3/5...
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[2]	valid's auc: 0.691573
Fold 3 AUC: 0.6916
Training fold 4/5...
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[29]	valid's auc: 0.642224
Fold 4 AUC: 0.6422
Training fold 5/5...
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[57]	valid's auc: 0.688131
Fold 5 AUC: 0.6881

Overall CV AUC: 0.6597
Mean CV AUC: 0.6730 ± 0.0175


In [20]:
# Create submission file
print("Creating submission file...")

submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Ensure the submission format matches the sample
print(f"Submission shape: {submission_df.shape}")
print(f"Submission head:")
print(submission_df.head())

# Save submission
submission_df.to_csv('/home/submission/submission.csv', index=False)
print("Submission saved to /home/submission/submission.csv")

Creating submission file...
Submission shape: (1162, 2)
Submission head:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.297432
1   t3_roiuw                  0.186182
2   t3_mjnbq                  0.193630
3   t3_t8wd1                  0.337752
4  t3_1m4zxu                  0.241093
Submission saved to /home/submission/submission.csv


In [21]:
# Feature importance analysis
print("Top 10 most important features:")
importance_df = pd.DataFrame({
    'feature': numeric_features + [f'tfidf_{i}' for i in range(tfidf_train.shape[1])],
    'importance': model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

print(importance_df.head(10))

Top 10 most important features:
                                          feature  importance
746                                     tfidf_734  389.863929
1                                     text_length  338.306659
7    requester_upvotes_minus_downvotes_at_request  327.167749
11                     days_since_first_raop_post  306.172060
2                            text_combined_length  299.699718
10                                subreddit_count  296.029971
249                                     tfidf_237  294.359919
9                                account_age_days  285.036020
0                                    title_length  276.030140
4            requester_number_of_posts_at_request  227.925449
