# Random Acts of Pizza - Baseline Model

This notebook creates a baseline model for predicting which pizza requests will be successful.

## Approach
- Load JSON data
- Extract basic numerical features
- Simple text features (length, word count)
- LightGBM classifier
- 5-fold cross-validation

In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading training data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

print(f"Training samples: {len(train_data)}")
print(f"Sample keys: {list(train_data[0].keys())}")

Loading training data...
Training samples: 2878
Sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 're

In [2]:
# Convert to DataFrame and explore
train_df = pd.DataFrame(train_data)
print("Training DataFrame shape:", train_df.shape)
print("\nTarget distribution:")
print(train_df['requester_received_pizza'].value_counts())
print(f"Positive rate: {train_df['requester_received_pizza'].mean():.3f}")

# Check for missing values in key columns
print("\nMissing values in key columns:")
key_cols = ['requester_received_pizza', 'request_text_edit_aware', 'request_title', 
           'requester_account_age_in_days_at_request']
for col in key_cols:
    if col in train_df.columns:
        missing = train_df[col].isnull().sum()
        print(f"{col}: {missing} ({missing/len(train_df)*100:.1f}%)")

Training DataFrame shape: (2878, 32)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64
Positive rate: 0.248

Missing values in key columns:
requester_received_pizza: 0 (0.0%)
request_text_edit_aware: 0 (0.0%)
request_title: 0 (0.0%)
requester_account_age_in_days_at_request: 0 (0.0%)


In [3]:
# Feature engineering - basic numerical features
print("Creating basic features...")

# Numerical features that should be available at prediction time
numerical_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_days_since_first_post_on_raop_at_request'
]

# Create feature DataFrame
features_df = train_df[numerical_features].copy()

# Fill missing values with median
for col in numerical_features:
    if features_df[col].isnull().any():
        median_val = features_df[col].median()
        features_df[col].fillna(median_val, inplace=True)
        print(f"Filled missing values in {col} with median: {median_val:.2f}")

# Text-based features - use request_text_edit_aware which is available in both train and test
print("\nCreating text features...")
features_df['title_length'] = train_df['request_title'].fillna('').str.len()
features_df['text_length'] = train_df['request_text_edit_aware'].fillna('').str.len()
features_df['title_word_count'] = train_df['request_title'].fillna('').str.split().str.len()
features_df['text_word_count'] = train_df['request_text_edit_aware'].fillna('').str.split().str.len()

print(f"Final feature shape: {features_df.shape}")
print(f"Features: {list(features_df.columns)}")

Creating basic features...

Creating text features...
Final feature shape: (2878, 13)
Features: ['requester_account_age_in_days_at_request', 'requester_number_of_comments_at_request', 'requester_number_of_posts_at_request', 'requester_number_of_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_posts_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_request', 'title_length', 'text_length', 'title_word_count', 'text_word_count']


In [4]:
# Prepare data for modeling
X = features_df.copy()
y = train_df['requester_received_pizza'].astype(int)

print(f"X shape: {X.shape}, y shape: {y.shape}")
print(f"Feature types:\n{X.dtypes.value_counts()}")

# Check for any remaining missing values
missing_summary = X.isnull().sum()
if missing_summary.sum() > 0:
    print("\nRemaining missing values:")
    print(missing_summary[missing_summary > 0])
else:
    print("\nNo missing values in features!")

X shape: (2878, 13), y shape: (2878,)
Feature types:
int64      11
float64     2
Name: count, dtype: int64

No missing values in features!


In [5]:
# Cross-validation setup
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Model parameters (conservative for baseline)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42
}

cv_scores = []
fold_predictions = np.zeros(len(X))

print(f"Starting {n_folds}-fold cross-validation...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\nFold {fold}/{n_folds}")
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    # Train model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        valid_names=['valid'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    
    # Predict and evaluate
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    fold_auc = roc_auc_score(y_val, val_pred)
    cv_scores.append(fold_auc)
    
    fold_predictions[val_idx] = val_pred
    
    print(f"Fold {fold} AUC: {fold_auc:.4f}")
    
    # Feature importance for this fold
    if fold == 1:  # Only show for first fold
        importance_df = pd.DataFrame({
            'feature': X.columns,
            'importance': model.feature_importance(importance_type='gain')
        }).sort_values('importance', ascending=False)
        print(f"\nTop 10 features by importance:")
        print(importance_df.head(10))

# Overall CV score
mean_auc = np.mean(cv_scores)
std_auc = np.std(cv_scores)
print(f"\n{'='*50}")
print(f"Cross-Validation Results:")
print(f"Mean AUC: {mean_auc:.4f} ± {std_auc:.4f}")
print(f"Individual folds: {cv_scores}")
print(f"{'='*50}")

# Overall ROC AUC
overall_auc = roc_auc_score(y, fold_predictions)
print(f"Overall ROC AUC (using out-of-fold predictions): {overall_auc:.4f}")

Starting 5-fold cross-validation...

Fold 1/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[17]	valid's auc: 0.635435
Fold 1 AUC: 0.6354

Top 10 features by importance:
                                            feature  importance
10                                      text_length  604.117201
4      requester_upvotes_minus_downvotes_at_request  525.117450
0          requester_account_age_in_days_at_request  454.082567
12                                  text_word_count  264.837350
1           requester_number_of_comments_at_request  238.979829
6   requester_number_of_comments_in_raop_at_request  234.705691
2              requester_number_of_posts_at_request  231.802262
9                                      title_length  230.719991
5       requester_upvotes_plus_downvotes_at_request  221.250430
3         requester_number_of_subreddits_at_request  173.186739

Fold 2/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[45]	valid's auc: 0.632843
Fold 2 AUC: 0.6328

Fold 3/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[19]	valid's auc: 0.68886
Fold 3 AUC: 0.6889

Fold 4/5
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	valid's auc: 0.590739
Fold 4 AUC: 0.5907

Fold 5/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[14]	valid's auc: 0.639334
Fold 5 AUC: 0.6393

Cross-Validation Results:
Mean AUC: 0.6374 ± 0.0312
Individual folds: [0.635435003795281, 0.6328429076696975, 0.6888596392060595, 0.5907391219891219, 0.6393340455840455]
Overall ROC AUC (using out-of-fold predictions): 0.6351


In [6]:
# Load test data and create predictions
print("Loading test data...")
with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

test_df = pd.DataFrame(test_data)
print(f"Test samples: {len(test_df)}")

# Create same features for test data
test_features = test_df[numerical_features].copy()

# Fill missing values with training median
for col in numerical_features:
    if test_features[col].isnull().any():
        median_val = features_df[col].median()  # Use training median
        test_features[col].fillna(median_val, inplace=True)

# Text features - use request_text_edit_aware
test_features['title_length'] = test_df['request_title'].fillna('').str.len()
test_features['text_length'] = test_df['request_text_edit_aware'].fillna('').str.len()
test_features['title_word_count'] = test_df['request_title'].fillna('').str.split().str.len()
test_features['text_word_count'] = test_df['request_text_edit_aware'].fillna('').str.split().str.len()

# Keep only features that were used in training
test_features = test_features[features_df.columns]

print(f"Test features shape: {test_features.shape}")

# Train final model on full training data
print("\nTraining final model on full training data...")
train_data = lgb.Dataset(X, label=y)
final_model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[train_data],
    callbacks=[lgb.log_evaluation(0)]
)

# Make predictions on test set
test_predictions = final_model.predict(test_features)
print(f"Test predictions shape: {test_predictions.shape}")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
print(f"Mean prediction: {test_predictions.mean():.4f}")

Loading test data...


Test samples: 1162
Test features shape: (1162, 13)

Training final model on full training data...


Test predictions shape: (1162,)
Prediction range: [0.0002, 0.9752]
Mean prediction: 0.2044


In [7]:
# Create submission file
submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print("Submission preview:")
print(submission_df.head())
print(f"\nSubmission shape: {submission_df.shape}")

# Save submission
submission_path = '/home/submission/submission.csv'
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Verify submission format matches sample
sample_sub = pd.read_csv('/home/data/sampleSubmission.csv')
print(f"\nSample submission shape: {sample_sub.shape}")
print(f"Column match: {list(submission_df.columns) == list(sample_sub.columns)}")
print(f"ID match (first 5): {all(submission_df['request_id'].head() == sample_sub['request_id'].head())}")

Submission preview:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.150631
1   t3_roiuw                  0.205975
2   t3_mjnbq                  0.195447
3   t3_t8wd1                  0.047313
4  t3_1m4zxu                  0.198751

Submission shape: (1162, 2)

Submission saved to: /home/submission/submission.csv

Sample submission shape: (1162, 2)
Column match: True
ID match (first 5): True
