# Random Acts of Pizza - Baseline Model

This notebook creates a baseline model for predicting pizza request success using both text and metadata features.

In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack, csr_matrix
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

## Load Data

In [2]:
# Load training data
print("Loading training data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

# Load test data
print("Loading test data...")
with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

Loading training data...
Loading test data...
Training samples: 2878
Test samples: 1162


## Create DataFrames

In [3]:
# Convert to DataFrames for easier manipulation
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print("Training DataFrame shape:", train_df.shape)
print("Test DataFrame shape:", test_df.shape)

# Check target distribution
print("\nTarget distribution:")
print(train_df['requester_received_pizza'].value_counts())
print(f"Success rate: {train_df['requester_received_pizza'].mean():.3f}")

Training DataFrame shape: (2878, 32)
Test DataFrame shape: (1162, 17)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64
Success rate: 0.248


## Feature Engineering

In [4]:
# Text features - combine title and text
train_df['combined_text'] = train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna('')
test_df['combined_text'] = test_df['request_title'].fillna('') + ' ' + test_df['request_text_edit_aware'].fillna('')

# Basic metadata features
def extract_metadata_features(df, is_train=True):
    features = pd.DataFrame()
    
    # Account age features
    features['account_age_days'] = df['requester_account_age_in_days_at_request']
    features['account_age_days'].fillna(0, inplace=True)
    
    # Activity features
    features['num_comments'] = df['requester_number_of_comments_at_request']
    features['num_posts'] = df['requester_number_of_posts_at_request']
    features['num_subreddits'] = df['requester_number_of_subreddits_at_request']
    
    # RAOP-specific activity
    features['num_raop_comments'] = df['requester_number_of_comments_in_raop_at_request']
    features['num_raop_posts'] = df['requester_number_of_posts_on_raop_at_request']
    
    # Vote features
    features['upvotes_minus_downvotes'] = df['requester_upvotes_minus_downvotes_at_request']
    features['upvotes_plus_downvotes'] = df['requester_upvotes_plus_downvotes_at_request']
    
    # Request features (only available in training data)
    if is_train:
        # Check if these columns exist
        if 'request_number_of_comments_at_retrieval' in df.columns:
            features['request_comments'] = df['request_number_of_comments_at_retrieval']
        else:
            features['request_comments'] = 0
            
        if 'number_of_upvotes_of_request_at_retrieval' in df.columns:
            features['request_upvotes'] = df['number_of_upvotes_of_request_at_retrieval']
        else:
            features['request_upvotes'] = 0
            
        if 'number_of_downvotes_of_request_at_retrieval' in df.columns:
            features['request_downvotes'] = df['number_of_downvotes_of_request_at_retrieval']
        else:
            features['request_downvotes'] = 0
    else:
        # For test data, we'll use zeros for these features
        features['request_comments'] = 0
        features['request_upvotes'] = 0
        features['request_downvotes'] = 0
    
    # Time features
    features['unix_timestamp'] = df['unix_timestamp_of_request']
    
    # User flair (categorical) - only in training data
    if is_train:
        if 'requester_user_flair' in df.columns:
            features['user_flair'] = df['requester_user_flair'].fillna('None')
        else:
            features['user_flair'] = 'None'
    else:
        features['user_flair'] = 'None'  # Default for test data
    
    # Post edited (only available in training data)
    if is_train:
        if 'post_was_edited' in df.columns:
            features['post_edited'] = df['post_was_edited'].astype(int)
        else:
            features['post_edited'] = 0
    else:
        features['post_edited'] = 0
    
    return features

train_meta = extract_metadata_features(train_df, is_train=True)
test_meta = extract_metadata_features(test_df, is_train=False)

print("Metadata features shape:", train_meta.shape)
print("Test metadata features shape:", test_meta.shape)

Metadata features shape: (2878, 14)
Test metadata features shape: (1162, 14)


## Handle Categorical Features

In [5]:
# One-hot encode user flair
train_meta = pd.get_dummies(train_meta, columns=['user_flair'], prefix='flair')
test_meta = pd.get_dummies(test_meta, columns=['user_flair'], prefix='flair')

# Ensure same columns in both train and test
missing_cols = set(train_meta.columns) - set(test_meta.columns)
for col in missing_cols:
    test_meta[col] = 0

test_meta = test_meta[train_meta.columns]

print("After one-hot encoding:", train_meta.shape)

After one-hot encoding: (2878, 16)


## Text Vectorization

In [6]:
# Vectorize text using TF-IDF
print("Vectorizing text...")
tfidf = TfidfVectorizer(
    max_features=5000,  # Limit features for speed
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95
)

# Fit on training text and transform both train and test
train_text_features = tfidf.fit_transform(train_df['combined_text'])
test_text_features = tfidf.transform(test_df['combined_text'])

print("Text features shape:", train_text_features.shape)

Vectorizing text...


Text features shape: (2878, 5000)


## Scale Numerical Features

In [7]:
# Scale metadata features
scaler = StandardScaler()
train_meta_scaled = scaler.fit_transform(train_meta)
test_meta_scaled = scaler.transform(test_meta)

# Convert to sparse matrix for compatibility with text features
train_meta_sparse = csr_matrix(train_meta_scaled)
test_meta_sparse = csr_matrix(test_meta_scaled)

print("Scaled metadata shape:", train_meta_sparse.shape)

Scaled metadata shape: (2878, 16)


## Combine Features

In [8]:
# Combine text and metadata features
X_train = hstack([train_text_features, train_meta_sparse])
X_test = hstack([test_text_features, test_meta_sparse])

print("Final training features shape:", X_train.shape)
print("Final test features shape:", X_test.shape)

# Target variable
y_train = train_df['requester_received_pizza'].astype(int).values
print("Target shape:", y_train.shape)

Final training features shape: (2878, 5016)
Final test features shape: (1162, 5016)
Target shape: (2878,)


## Cross-Validation Setup

In [9]:
# Cross-validation setup
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Model - Logistic Regression as baseline
model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight='balanced'  # Handle class imbalance
)

print(f"Running {n_splits}-fold cross-validation...")

Running 5-fold cross-validation...


## Cross-Validation

In [10]:
# Cross-validation
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    # Train model
    model.fit(X_tr, y_tr)
    
    # Predict
    val_pred = model.predict_proba(X_val)[:, 1]
    
    # Calculate ROC AUC
    auc_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(auc_score)
    
    print(f"Fold {fold + 1} ROC AUC: {auc_score:.4f}")

print(f"\nCV ROC AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

Fold 1 ROC AUC: 1.0000
Fold 2 ROC AUC: 1.0000
Fold 3 ROC AUC: 1.0000
Fold 4 ROC AUC: 1.0000
Fold 5 ROC AUC: 1.0000

CV ROC AUC: 1.0000 ± 0.0000


## Train on Full Data and Predict

In [11]:
# Train on full training data
print("Training on full dataset...")
model.fit(X_train, y_train)

# Predict on test set
test_predictions = model.predict_proba(X_test)[:, 1]
print("Test predictions shape:", test_predictions.shape)

Training on full dataset...
Test predictions shape: (1162,)


## Create Submission

In [12]:
# Create submission file
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Save submission
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)

print("Submission file created:", submission_path)
print("\nSubmission preview:")
print(submission.head())

# Check submission format
print(f"\nSubmission shape: {submission.shape}")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")

Submission file created: /home/submission/submission.csv

Submission preview:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.000704
1   t3_roiuw                  0.000976
2   t3_mjnbq                  0.000965
3   t3_t8wd1                  0.000865
4  t3_1m4zxu                  0.000696

Submission shape: (1162, 2)
Prediction range: [0.0006, 0.0044]
