# Random Acts of Pizza - Baseline Model

This notebook creates a baseline model for predicting pizza request success.

In [8]:
import json
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
# Suppress every warning for the rest of the session (blanket 'ignore' filter).
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random state so repeated runs draw the same numbers.
np.random.seed(seed=42)

## Load Data

In [9]:
# Load the train and test splits. Each file holds a single JSON array of
# request records (`json` and `pd` come from the notebook's import cell).

def load_json_records(path):
    """Return the parsed contents of the JSON file at `path`.

    The file is opened explicitly as UTF-8: without an `encoding` argument
    `open` falls back to the platform's locale encoding, which can fail on or
    mis-decode non-ASCII text on systems whose default is not UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


train_data = load_json_records('/home/data/train.json')
train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")

test_data = load_json_records('/home/data/test.json')
test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")

# Numbered listing of every training column
print(f"\nAvailable columns:")
for i, col in enumerate(train_df.columns):
    print(f"{i+1:2d}. {col}")

# Class balance of the target, when the target column is present
if 'requester_received_pizza' in train_df.columns:
    print(f"\nTarget distribution:")
    print(train_df['requester_received_pizza'].value_counts())
    print(f"Positive rate: {train_df['requester_received_pizza'].mean():.3f}")
else:
    print("\nTarget column not found!")

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

## Feature Engineering

In [None]:
def _numeric_feature(df, column):
    """Return `df[column]` with missing values set to 0, or the scalar 0 if the column is absent."""
    if column in df.columns:
        return df[column].fillna(0)
    return 0


def _text_length(df, column):
    """Return per-row character counts of a text column (missing -> 0), or the scalar 0 if absent."""
    if column in df.columns:
        return df[column].fillna('').str.len()
    return 0


def engineer_features(df):
    """Build the metadata feature frame for a frame of raw request records.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. Every source column is optional: a feature whose source
        column is absent is filled with the constant 0 for all rows.

    Returns
    -------
    pandas.DataFrame
        One row per row of `df` (same index), 25 feature columns in a fixed
        order.

    Notes
    -----
    NOTE(review): several features read `*_at_retrieval` columns and
    `requester_user_flair`. For any frame lacking those columns the features
    are constant 0, so a model trained on a frame that has them sees a
    different distribution at prediction time - confirm this is intended.
    """
    features = pd.DataFrame(index=df.index)

    # Vote counts on the request and simple combinations of them
    features['upvotes'] = _numeric_feature(df, 'number_of_upvotes_of_request_at_retrieval')
    features['downvotes'] = _numeric_feature(df, 'number_of_downvotes_of_request_at_retrieval')
    features['net_votes'] = features['upvotes'] - features['downvotes']
    features['total_votes'] = features['upvotes'] + features['downvotes']

    # Comment count on the request
    features['num_comments'] = _numeric_feature(df, 'request_number_of_comments_at_retrieval')

    # Account age, in days and in years
    features['account_age_days'] = _numeric_feature(df, 'requester_account_age_in_days_at_request')
    features['account_age_years'] = features['account_age_days'] / 365.25

    # Requester activity
    features['requester_posts'] = _numeric_feature(df, 'requester_number_of_posts_at_request')
    features['requester_comments'] = _numeric_feature(df, 'requester_number_of_comments_at_request')
    features['requester_total_activity'] = features['requester_posts'] + features['requester_comments']

    # RAOP-specific activity
    features['raop_posts'] = _numeric_feature(df, 'requester_number_of_posts_on_raop_at_request')
    features['raop_comments'] = _numeric_feature(df, 'requester_number_of_comments_in_raop_at_request')
    features['raop_total_activity'] = features['raop_posts'] + features['raop_comments']

    # Subreddit diversity
    features['num_subreddits'] = _numeric_feature(df, 'requester_number_of_subreddits_at_request')

    # Ratios; the +1 in the denominator avoids division by zero when there are no votes
    features['upvote_ratio'] = features['upvotes'] / (features['total_votes'] + 1)
    features['comment_to_vote_ratio'] = features['num_comments'] / (features['total_votes'] + 1)

    # Calendar features derived from the request's Unix timestamp (seconds)
    if 'unix_timestamp_of_request' in df.columns:
        request_datetime = pd.to_datetime(df['unix_timestamp_of_request'], unit='s')
        features['request_hour'] = request_datetime.dt.hour
        features['request_dayofweek'] = request_datetime.dt.dayofweek
        features['request_month'] = request_datetime.dt.month
    else:
        features['request_hour'] = 0
        features['request_dayofweek'] = 0
        features['request_month'] = 0

    # Ordinal flair encoding; any value not in the map (including missing) becomes 0
    if 'requester_user_flair' in df.columns:
        flair_map = {'None': 0, 'shroom': 1, 'PIF': 2}
        features['user_flair'] = df['requester_user_flair'].map(flair_map).fillna(0)
    else:
        features['user_flair'] = 0

    # Text lengths in characters
    features['title_length'] = _text_length(df, 'request_title')
    features['text_length'] = _text_length(df, 'request_text')
    features['text_length_edit_aware'] = _text_length(df, 'request_text_edit_aware')
    features['total_text_length'] = features['title_length'] + features['text_length']

    # Binary edited flag. Collapse to strictly 0/1: a plain int cast raises on
    # missing values and would pass any non-boolean numeric value through
    # unchanged, which is not a binary feature.
    if 'post_was_edited' in df.columns:
        edited = df['post_was_edited']
        # notna() masks NaN, whose truth value would otherwise count as True.
        features['post_was_edited'] = (edited.notna() & edited.astype(bool)).astype(int)
    else:
        features['post_was_edited'] = 0

    return features

# Build the metadata feature frames for both splits (train first, then test).
print("Engineering features...")
train_features, test_features = (
    engineer_features(split) for split in (train_df, test_df)
)

print(f"Train features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")

# Zero-fill whatever NaNs remain, then put the test frame on exactly the
# training column layout (columns missing from test are created as 0).
train_features = train_features.fillna(0)
test_features = test_features.fillna(0).reindex(
    columns=train_features.columns, fill_value=0
)
print(f"After alignment - Train: {train_features.shape}, Test: {test_features.shape}")

## Text Feature Extraction

In [None]:
# Extract TF-IDF features from the request title and (edit-aware) body text
print("Extracting text features...")

n_samples = len(train_df)

# Concatenate title and body into one document per request. Missing values
# are replaced with '' first (as engineer_features does for the same columns):
# otherwise a single NaN makes the whole concatenation NaN, and the
# vectorizer rejects NaN documents.
train_text = train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna('')
test_text = test_df['request_title'].fillna('') + ' ' + test_df['request_text_edit_aware'].fillna('')

# Keep the vocabulary small (top 1000 uni/bi-grams) so this stays fast
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9
)

# Learn the vocabulary on training text only, then apply it to both splits
X_text_train = vectorizer.fit_transform(train_text)
X_text_test = vectorizer.transform(test_text)

print(f"TF-IDF features shape: {X_text_train.shape}")

# The baseline model below is trained on the metadata features only; the
# TF-IDF matrices above are computed but not fed to it yet. Combining them
# with the metadata is left for a later iteration.
X_train_meta = train_features.values
X_test_meta = test_features.values

y_train = train_df['requester_received_pizza'].values

print(f"Training with {X_train_meta.shape[1]} metadata features")

## Model Training with Cross-Validation

In [None]:
# Stratified k-fold cross-validation of a LightGBM classifier on the metadata
# features. Produces out-of-fold predictions for scoring and fold-averaged
# predictions for the test set.
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Booster parameters are the same for every fold, so they are built once
# here instead of being rebuilt inside the loop (simplified for baseline).
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 42
}

# Out-of-fold predictions (one per training row) and averaged test predictions
oof_predictions = np.zeros(len(train_df))
test_predictions = np.zeros(len(test_df))

# Per-fold feature importance frames, averaged after the loop
feature_importance_list = []

print(f"Training LightGBM model with {n_folds}-fold CV...")

fold_scores = []
for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train_meta, y_train)):
    print(f"\nFold {fold + 1}/{n_folds}")

    # Split data
    X_tr, X_val = X_train_meta[train_idx], X_train_meta[valid_idx]
    y_tr, y_val = y_train[train_idx], y_train[valid_idx]

    # Tie the validation Dataset to the training Dataset explicitly so both
    # are constructed against the same feature binning.
    train_set = lgb.Dataset(X_tr, label=y_tr)
    valid_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    # NOTE(review): early stopping monitors the same fold that is scored
    # below, so the reported AUC is tuned on its own evaluation data and is
    # likely optimistic.
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    # Predict with the best iteration; each fold contributes 1/n_folds of the
    # final test prediction.
    oof_predictions[valid_idx] = model.predict(X_val, num_iteration=model.best_iteration)
    test_predictions += model.predict(X_test_meta, num_iteration=model.best_iteration) / n_folds

    # Store the fold score as a plain float so the list printed below shows
    # bare numbers rather than NumPy scalar wrappers.
    fold_score = float(roc_auc_score(y_val, oof_predictions[valid_idx]))
    fold_scores.append(fold_score)
    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

    # Gain-based importance, in the column order of train_features
    importance_df = pd.DataFrame({
        'feature': train_features.columns,
        'importance': model.feature_importance(importance_type='gain')
    })
    feature_importance_list.append(importance_df)

# AUC over all out-of-fold predictions at once, plus the per-fold summary
cv_score = roc_auc_score(y_train, oof_predictions)
print(f"\nOverall CV AUC: {cv_score:.4f}")
print(f"Fold scores: {fold_scores}")
print(f"Mean ± Std: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")

# Mean importance per feature across folds, highest first
feature_importance = pd.concat(feature_importance_list).groupby('feature')['importance'].mean().sort_values(ascending=False)
print(f"\nTop 10 features:")
print(feature_importance.head(10))

## Create Submission

In [None]:
# Assemble the submission: one row per test request with its predicted
# probability of receiving a pizza.
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Ensure the prediction column is stored as float
submission['requester_received_pizza'] = submission['requester_received_pizza'].astype(float)

# Save submission. `to_csv` does not create missing parent directories, so
# make sure the target directory exists first (no-op when it already does).
submission_path = '/home/submission/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"Submission saved to {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"\nPrediction statistics:")
print(submission['requester_received_pizza'].describe())

# Show first few predictions
print(f"\nFirst 5 predictions:")
print(submission.head())