# Baseline Model - Random Acts of Pizza

This notebook creates a baseline model for predicting pizza request success.

## Approach
1. Load JSON data
2. Extract structured features (numerical/meta data)
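3. Extract simple text features (lengths, word counts, punctuation counts, keyword counts) from the title and request text
4. Train a LightGBM classifier
5. Evaluate with stratified 5-fold cross-validation, scored by ROC AUC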

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this notebook is repeatable from run to run.
np.random.seed(42)

In [None]:
# Load training data
print("Loading training data...")
# Decode explicitly as UTF-8 so parsing does not depend on the machine's
# locale-derived default encoding.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

print(f"Number of training samples: {len(train_data)}")
print("\nFirst sample keys:", list(train_data[0].keys())[:10])

# Summarise the class balance of the binary target.
print("\nTarget distribution:")
targets = [d['requester_received_pizza'] for d in train_data]
# Compute each count once instead of re-summing the list for every print.
n_total = len(targets)
n_positive = sum(targets)
n_negative = n_total - n_positive
print(f"Total: {n_total}")
print(f"Received pizza: {n_positive} ({n_positive/n_total:.2%})")
print(f"No pizza: {n_negative} ({n_negative/n_total:.2%})")

In [None]:
# Load test data
print("Loading test data...")
# Decode explicitly as UTF-8 so parsing does not depend on the machine's
# locale-derived default encoding.
with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

print(f"Number of test samples: {len(test_data)}")
# Show a few keys of the first record as a quick schema sanity check.
print("\nFirst sample keys:", list(test_data[0].keys())[:10])

In [None]:
# Feature engineering function
# Keyword lists for the crude sentiment counters. Defined once at module level
# instead of being rebuilt for every sample.
_POSITIVE_WORDS = ('thank', 'thanks', 'please', 'kind', 'generous', 'appreciate', 'grateful', 'bless', 'wonderful', 'amazing')
_NEGATIVE_WORDS = ('desperate', 'starving', 'hungry', 'broke', 'poor', 'need', 'help', 'urgent', 'emergency')

# Numeric codes for the known flair values; anything else (including a
# missing flair) is encoded as 0.
_FLAIR_CODES = {'shroom': 1, 'PIF': 2}


def _numeric_field(sample, key):
    """Return sample[key], or 0 when the key is absent or its value is None."""
    value = sample.get(key)
    return 0 if value is None else value


def _text_field(sample, key):
    """Return sample[key], or '' when the key is absent or its value is None."""
    value = sample.get(key)
    return '' if value is None else value


def _keyword_hits(text, keywords):
    """Count how many of the keywords occur as substrings of text."""
    return sum(1 for word in keywords if word in text)


def extract_features(data):
    """Build a table of hand-crafted features, one row per request.

    Args:
        data: Iterable of dicts, one per request. Each dict must contain
            'request_id'; every other field is optional. A field that is
            missing or explicitly None is treated as 0 (numeric fields) or
            as an empty string (title / request text).

    Returns:
        pandas.DataFrame with one row per input record, holding
        'request_id' plus the numeric feature columns, in insertion order.

    Raises:
        KeyError: If a record has no 'request_id'.
    """
    features = []

    for sample in data:
        feat = {}

        # Basic metadata features
        feat['request_id'] = sample['request_id']

        # NOTE(review): the '*_at_retrieval' fields and the user flair appear
        # to describe state observed after the request was posted. Confirm
        # they are present in the test records and legitimate to use at
        # prediction time; when absent they silently become 0 here.

        # Vote features
        feat['upvotes'] = _numeric_field(sample, 'number_of_upvotes_of_request_at_retrieval')
        feat['downvotes'] = _numeric_field(sample, 'number_of_downvotes_of_request_at_retrieval')
        feat['vote_ratio'] = feat['upvotes'] / (feat['downvotes'] + 1)  # +1 avoids division by zero
        feat['net_votes'] = feat['upvotes'] - feat['downvotes']

        # Comment features
        feat['num_comments'] = _numeric_field(sample, 'request_number_of_comments_at_retrieval')

        # Account age features
        feat['account_age_days'] = _numeric_field(sample, 'requester_account_age_in_days_at_request')
        feat['account_age_years'] = feat['account_age_days'] / 365.25

        # Requester activity features
        feat['requester_comments'] = _numeric_field(sample, 'requester_number_of_comments_at_request')
        feat['requester_posts'] = _numeric_field(sample, 'requester_number_of_posts_at_request')
        feat['requester_comments_raop'] = _numeric_field(sample, 'requester_number_of_comments_in_raop_at_request')
        feat['requester_posts_raop'] = _numeric_field(sample, 'requester_number_of_posts_on_raop_at_request')

        # Upvote/downvote features for requester
        feat['requester_upvotes_minus_downvotes'] = _numeric_field(sample, 'requester_upvotes_minus_downvotes_at_request')
        feat['requester_upvotes_plus_downvotes'] = _numeric_field(sample, 'requester_upvotes_plus_downvotes_at_request')

        # User flair as an integer code (unknown or missing flair -> 0)
        feat['user_flair'] = _FLAIR_CODES.get(sample.get('requester_user_flair'), 0)

        # Time features
        timestamp = _numeric_field(sample, 'unix_timestamp_of_request_utc')
        feat['timestamp'] = timestamp
        # Derive hour and weekday; fall back to 0 for an unparseable timestamp
        # instead of swallowing every exception with a bare except.
        try:
            dt = pd.to_datetime(timestamp, unit='s')
            hour_of_day, day_of_week = dt.hour, dt.dayofweek
        except (ValueError, TypeError, OverflowError):
            hour_of_day, day_of_week = 0, 0
        feat['hour_of_day'] = hour_of_day
        feat['day_of_week'] = day_of_week

        # Text features - title
        title = _text_field(sample, 'request_title')
        feat['title_length'] = len(title)
        feat['title_word_count'] = len(title.split())
        feat['title_exclamation_marks'] = title.count('!')
        feat['title_question_marks'] = title.count('?')
        feat['title_all_caps'] = 1 if title.isupper() else 0

        # Text features - request text
        text = _text_field(sample, 'request_text')
        feat['text_length'] = len(text)
        feat['text_word_count'] = len(text.split())
        feat['text_exclamation_marks'] = text.count('!')
        feat['text_question_marks'] = text.count('?')
        feat['text_all_caps'] = 1 if text.isupper() else 0

        # Combined text features (lower-cased so keyword matching ignores case)
        combined_text = (title + ' ' + text).lower()
        feat['combined_length'] = len(combined_text)

        # Simple sentiment indicators: number of distinct keywords present.
        # Matching is by substring, so 'thanks' also counts as 'thank'.
        feat['positive_word_count'] = _keyword_hits(combined_text, _POSITIVE_WORDS)
        feat['negative_word_count'] = _keyword_hits(combined_text, _NEGATIVE_WORDS)
        feat['sentiment_ratio'] = feat['positive_word_count'] / (feat['negative_word_count'] + 1)

        # Punctuation and formatting
        feat['total_exclamation_marks'] = combined_text.count('!')
        feat['total_question_marks'] = combined_text.count('?')
        feat['total_periods'] = combined_text.count('.')
        feat['total_commas'] = combined_text.count(',')

        features.append(feat)

    return pd.DataFrame(features)

In [None]:
# Turn the raw JSON records of both splits into feature tables.
print("Extracting features from training data...")
train_df = extract_features(train_data)
print(f"Training features shape: {train_df.shape}")

print("\nExtracting features from test data...")
test_df = extract_features(test_data)
print(f"Test features shape: {test_df.shape}")

# Attach the label column to the training table only; this happens after the
# shape print above, so that shape excludes the target column.
labels = [record['requester_received_pizza'] for record in train_data]
train_df['target'] = labels

print("\nFeature columns:")
print(list(train_df.columns))

In [None]:
# Assemble the model inputs: every column except the identifier and the label.
non_feature_cols = ('request_id', 'target')
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]

y = train_df['target']
X = train_df[feature_cols]
# Select test columns by the training column list so both matrices line up.
X_test = test_df[feature_cols]

print(f"Training data shape: {X.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Number of features: {len(feature_cols)}")

# Report missing cells before they are overwritten below.
print(f"\nMissing values in training: {X.isna().sum().sum()}")
print(f"Missing values in test: {X_test.isna().sum().sum()}")

# Replace any missing cell with 0.
X, X_test = X.fillna(0), X_test.fillna(0)

In [None]:
# Cross-validation setup
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# LightGBM parameters. They are identical for every fold, so the dict is
# built once instead of inside the loop.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 42,
}

# train_predictions holds the out-of-fold prediction for every training row;
# test_predictions accumulates the average of the per-fold test predictions.
train_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))

cv_scores = []

print(f"Starting {n_folds}-fold cross-validation...")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}/{n_folds}")

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Create LightGBM datasets. These use dedicated names so the raw
    # `train_data` record list loaded earlier is not overwritten, which would
    # break any re-run of the feature-extraction cell.
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

    # Train with early stopping on the validation fold (patience of 50
    # rounds); log_evaluation(0) is passed to keep per-round output quiet.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_valid],
        valid_names=['valid'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
    )

    # Predict with the best iteration found by early stopping.
    valid_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # Store predictions: each training row is filled exactly once (by the
    # fold in which it was held out); test predictions are averaged.
    train_predictions[valid_idx] = valid_pred
    test_predictions += test_pred / n_folds

    # Calculate AUC on the held-out fold
    auc_score = roc_auc_score(y_valid, valid_pred)
    cv_scores.append(auc_score)

    print(f"Fold {fold + 1} AUC: {auc_score:.4f}")
    print(f"Best iteration: {model.best_iteration}")

print(f"\n{'='*50}")
print("Cross-validation results:")
# The plus-minus sign is written as an escape so it cannot be re-garbled by a
# wrong file encoding (it previously printed as mojibake).
print(f"Mean AUC: {np.mean(cv_scores):.4f} \u00b1 {np.std(cv_scores):.4f}")
print(f"Individual folds: {[f'{score:.4f}' for score in cv_scores]}")

# AUC over all out-of-fold predictions pooled together. The label below says
# "training", but every prediction comes from a model that did not see that row.
overall_auc = roc_auc_score(y, train_predictions)
print(f"Overall training AUC: {overall_auc:.4f}")

In [None]:
# Rank features by gain-based importance.
# `model` is whatever booster the cross-validation loop assigned last, so
# this table reflects that single fold's model only.
gain_per_feature = model.feature_importance(importance_type='gain')
feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': gain_per_feature}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("Top 20 most important features:")
print(feature_importance.head(20))

In [None]:
# Assemble the submission table: one fold-averaged probability per test request.
prediction_col = 'requester_received_pizza'
submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    prediction_col: test_predictions,
})

# Clamp predictions into the valid probability range [0, 1].
submission_df[prediction_col] = submission_df[prediction_col].clip(lower=0, upper=1)

print("Submission file preview:")
print(submission_df.head())

# Write the CSV without the DataFrame index.
submission_path = '/home/submission/submission_001_baseline.csv'
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Final sanity check on size and spread of the predictions.
print(f"\nSubmission shape: {submission_df.shape}")
print("Prediction distribution:")
print(submission_df[prediction_col].describe())

In [None]:
# Persist the out-of-fold predictions next to their ids and true labels so a
# later stacking step can reuse them.
oof_path = '/home/submission/oof_001_baseline.csv'
oof_columns = {
    'request_id': train_df['request_id'],
    'prediction': train_predictions,
    'target': y,
}
oof_df = pd.DataFrame(oof_columns)

oof_df.to_csv(oof_path, index=False)
print(f"Out-of-fold predictions saved to: {oof_path}")
print(f"Out-of-fold predictions saved to: {oof_path}")