# Random Acts of Pizza - Baseline Model

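This notebook trains a baseline LightGBM classifier on numeric requester and request features to predict whether a pizza request is fulfilled. It evaluates the model with 5-fold stratified cross-validation (ROC AUC) and writes a submission file with the fold-averaged test predictions.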

In [None]:
import json
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

# Load data
def _load_json_records(path):
    """Read a list of JSON records from ``path``.

    Accepts either a single JSON document (an array of records, or one
    lone record) or JSON Lines (one record per line). Blank lines in
    JSON Lines input are skipped instead of raising.

    Args:
        path: Filesystem path of the file to read.

    Returns:
        list: The parsed records, one element per record.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is neither a valid JSON
            document nor valid JSON Lines.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Not a single document: fall back to one record per line.
        # Split on '\n' only (not str.splitlines) so that unusual line
        # separators inside JSON strings do not cut a record in half.
        return [json.loads(line) for line in text.split('\n') if line.strip()]
    return parsed if isinstance(parsed, list) else [parsed]


print("Loading training data...")
train_data = _load_json_records('/home/data/train.json')

print("Loading test data...")
test_data = _load_json_records('/home/data/test.json')

print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

In [None]:
# Convert to DataFrames
# Build one frame per split from the parsed record lists.
train_df, test_df = (pd.DataFrame(records) for records in (train_data, test_data))

# Show the available columns and the class balance of the label.
received_pizza = train_df['requester_received_pizza']
print("Train columns:", train_df.columns.tolist())
print("\nTarget distribution:")
print(received_pizza.value_counts())
print(f"Success rate: {received_pizza.mean():.3f}")

In [None]:
# Basic feature engineering
print("Creating basic features...")

# Candidate numeric columns, split by the suffix of their name.
at_request_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_days_since_first_post_on_raop_at_request',
]
at_retrieval_features = [
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval',
]
numeric_features = at_request_features + at_retrieval_features

# Keep only candidates present in BOTH frames, preserving the order above;
# a column missing from either split is silently dropped here.
shared_columns = set(train_df.columns).intersection(test_df.columns)
available_features = [col for col in numeric_features if col in shared_columns]
print(f"Using {len(available_features)} numeric features")

# Missing values are replaced by a sentinel; the label is cast to integers.
missing_marker = -1
X_train = train_df[available_features].fillna(missing_marker)
X_test = test_df[available_features].fillna(missing_marker)
y_train = train_df['requester_received_pizza'].astype(int)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

In [None]:
# Train model with cross-validation
# Single source of truth for the fold count: it drives both the splitter and
# the averaging of test predictions, so the two can never drift apart.
N_SPLITS = 5
EARLY_STOPPING_ROUNDS = 50

print(f"Training LightGBM model with {N_SPLITS}-fold CV...")

# Parameters (simple baseline). Identical for every fold, so defined once.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 42
}

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
auc_scores = []

# Out-of-fold predictions for the training rows, and the running average of
# each fold model's predictions on the test rows.
oof_predictions = np.zeros(len(X_train))
test_predictions = np.zeros(len(X_test))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train), start=1):
    print(f"\nFold {fold}")

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

    # Create LightGBM datasets
    train_set = lgb.Dataset(X_tr, label=y_tr)
    valid_set = lgb.Dataset(X_val, label=y_val)

    # Train model.
    # NOTE: the validation fold drives early stopping and is also the fold
    # scored below, so the reported AUC is somewhat optimistic.
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS), lgb.log_evaluation(0)]
    )

    # Predictions
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)

    # Store predictions: each validation row is filled exactly once across
    # folds; test predictions are averaged over the fold models.
    oof_predictions[valid_idx] = val_pred
    test_predictions += test_pred / N_SPLITS

    # Calculate AUC
    auc = roc_auc_score(y_val, val_pred)
    auc_scores.append(auc)
    print(f"Fold {fold} AUC: {auc:.4f}")

print("\nCV Results:")
# "\u00b1" is the plus-minus sign, written as an escape so the source stays
# ASCII and cannot be garbled by an encoding round-trip.
print(f"Mean AUC: {np.mean(auc_scores):.4f} \u00b1 {np.std(auc_scores):.4f}")
print(f"OOF AUC: {roc_auc_score(y_train, oof_predictions):.4f}")

In [None]:
# Create submission file
print("Creating submission file...")

# Output location, defined once so the write and the log message agree.
SUBMISSION_PATH = '/home/submission/submission.csv'

# One predicted probability per test request, keyed by its request id.
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Ensure the submission has the correct format
print(f"Submission shape: {submission.shape}")
print(f"Submission columns: {submission.columns.tolist()}")
print("\nFirst few rows:")
print(submission.head())

# Save submission. to_csv does not create missing parent directories, so
# make sure the target directory exists before writing.
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"\nSubmission saved to {SUBMISSION_PATH}")