# Random Acts of Pizza - Baseline Model

This notebook creates a baseline model for predicting pizza request success using both text and metadata features.

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack, csr_matrix
import warnings
# Silence every warning category so library chatter does not clutter cell output
warnings.simplefilter('ignore')

# Seed NumPy's global RNG so repeated runs draw the same random numbers
np.random.seed(seed=42)

## Load Data

In [None]:
def load_json(path):
    """Read a JSON file and return the parsed Python object.

    The file is opened explicitly as UTF-8 so decoding does not depend on
    the platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load training data
print("Loading training data...")
train_data = load_json('/home/data/train.json')

# Load test data
print("Loading test data...")
test_data = load_json('/home/data/test.json')

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

## Create DataFrames

In [None]:
# Text features - combine title and text
train_df['combined_text'] = train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna('')
test_df['combined_text'] = test_df['request_title'].fillna('') + ' ' + test_df['request_text_edit_aware'].fillna('')

# Basic metadata features
def extract_metadata_features(df, is_train=True):
    features = pd.DataFrame()
    
    # Account age features
    features['account_age_days'] = df['requester_account_age_in_days_at_request']
    features['account_age_days'].fillna(0, inplace=True)
    
    # Activity features
    features['num_comments'] = df['requester_number_of_comments_at_request']
    features['num_posts'] = df['requester_number_of_posts_at_request']
    features['num_subreddits'] = df['requester_number_of_subreddits_at_request']
    
    # RAOP-specific activity
    features['num_raop_comments'] = df['requester_number_of_comments_in_raop_at_request']
    features['num_raop_posts'] = df['requester_number_of_posts_on_raop_at_request']
    
    # Vote features
    features['upvotes_minus_downvotes'] = df['requester_upvotes_minus_downvotes_at_request']
    features['upvotes_plus_downvotes'] = df['requester_upvotes_plus_downvotes_at_request']
    
    # Request features (only available in training data)
    if is_train:
        features['request_comments'] = df['request_number_of_comments_at_retrieval']
        features['request_upvotes'] = df['number_of_upvotes_of_request_at_retrieval']
        features['request_downvotes'] = df['number_of_downvotes_of_request_at_retrieval']
    else:
        # For test data, we'll use zeros or NaN for these features
        features['request_comments'] = 0
        features['request_upvotes'] = 0
        features['request_downvotes'] = 0
    
    # Time features
    features['unix_timestamp'] = df['unix_timestamp_of_request']
    
    # User flair (categorical)
    features['user_flair'] = df['requester_user_flair'].fillna('None')
    
    # Post edited (only available in training data)
    if is_train:
        features['post_edited'] = df['post_was_edited'].astype(int)
    else:
        features['post_edited'] = 0
    
    return features

train_meta = extract_metadata_features(train_df, is_train=True)
test_meta = extract_metadata_features(test_df, is_train=False)

print("Metadata features shape:", train_meta.shape)
print("Test metadata features shape:", test_meta.shape)

## Feature Engineering

In [None]:
# Text features - combine title and text
train_df['combined_text'] = train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna('')
test_df['combined_text'] = test_df['request_title'].fillna('') + ' ' + test_df['request_text_edit_aware'].fillna('')

# Basic metadata features
def extract_metadata_features(df):
    features = pd.DataFrame()
    
    # Account age features
    features['account_age_days'] = df['requester_account_age_in_days_at_request']
    features['account_age_days'].fillna(0, inplace=True)
    
    # Activity features
    features['num_comments'] = df['requester_number_of_comments_at_request']
    features['num_posts'] = df['requester_number_of_posts_at_request']
    features['num_subreddits'] = df['requester_number_of_subreddits_at_request']
    
    # RAOP-specific activity
    features['num_raop_comments'] = df['requester_number_of_comments_in_raop_at_request']
    features['num_raop_posts'] = df['requester_number_of_posts_on_raop_at_request']
    
    # Vote features
    features['upvotes_minus_downvotes'] = df['requester_upvotes_minus_downvotes_at_request']
    features['upvotes_plus_downvotes'] = df['requester_upvotes_plus_downvotes_at_request']
    
    # Request features
    features['request_comments'] = df['request_number_of_comments_at_retrieval']
    features['request_upvotes'] = df['number_of_upvotes_of_request_at_retrieval']
    features['request_downvotes'] = df['number_of_downvotes_of_request_at_retrieval']
    
    # Time features
    features['unix_timestamp'] = df['unix_timestamp_of_request']
    
    # User flair (categorical)
    features['user_flair'] = df['requester_user_flair'].fillna('None')
    
    # Post edited
    features['post_edited'] = df['post_was_edited'].astype(int)
    
    return features

train_meta = extract_metadata_features(train_df)
test_meta = extract_metadata_features(test_df)

print("Metadata features shape:", train_meta.shape)

## Handle Categorical Features

In [None]:
# Expand the categorical flair column into one indicator column per value
train_meta, test_meta = [
    pd.get_dummies(frame, columns=['user_flair'], prefix='flair')
    for frame in (train_meta, test_meta)
]

# Align test to the training layout: columns absent from test are added as 0,
# columns absent from train are dropped, and the order follows train
test_meta = test_meta.reindex(columns=train_meta.columns, fill_value=0)

print("After one-hot encoding:", train_meta.shape)

## Text Vectorization

In [None]:
# TF-IDF settings for the combined title + body text
tfidf_params = dict(
    max_features=5000,  # Limit features for speed
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
)

print("Vectorizing text...")
tfidf = TfidfVectorizer(**tfidf_params)

# The vectorizer is fitted on the training text only; the test text is
# transformed with that same fitted vectorizer
train_corpus = train_df['combined_text']
test_corpus = test_df['combined_text']
train_text_features = tfidf.fit_transform(train_corpus)
test_text_features = tfidf.transform(test_corpus)

print("Text features shape:", train_text_features.shape)

## Scale Numerical Features

In [None]:
# Fit the scaler on the training metadata only, then apply it to both splits
scaler = StandardScaler().fit(train_meta)
train_meta_scaled = scaler.transform(train_meta)
test_meta_scaled = scaler.transform(test_meta)

# Wrap the scaled arrays as CSR so they can be stacked next to the text features
train_meta_sparse, test_meta_sparse = (
    csr_matrix(scaled) for scaled in (train_meta_scaled, test_meta_scaled)
)

print("Scaled metadata shape:", train_meta_sparse.shape)

## Combine Features

In [None]:
# Combine text and metadata features column-wise.
# Request CSR explicitly: the cross-validation cell selects rows with
# X_train[train_idx], which needs a row-indexable format, and hstack's
# default output format is not guaranteed.
X_train = hstack([train_text_features, train_meta_sparse], format='csr')
X_test = hstack([test_text_features, test_meta_sparse], format='csr')

print("Final training features shape:", X_train.shape)
print("Final test features shape:", X_test.shape)

# Target variable as a 0/1 integer array
y_train = train_df['requester_received_pizza'].astype(int).values
print("Target shape:", y_train.shape)

## Cross-Validation Setup

In [None]:
# Number of stratified folds used for validation
n_splits = 5

# Shuffled splitter with a fixed seed so fold membership is repeatable
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=n_splits)

# Model - Logistic Regression as baseline
lr_params = {
    'class_weight': 'balanced',  # Handle class imbalance
    'max_iter': 1000,
    'random_state': 42,
}
model = LogisticRegression(**lr_params)

print(f"Running {n_splits}-fold cross-validation...")

## Cross-Validation

In [None]:
# Cross-validation: refit the model on each training fold and score the
# held-out fold with ROC AUC.
# NOTE(review): X_train was vectorized and scaled on all training rows before
# this split, so fold scores may be slightly optimistic - confirm whether the
# preprocessing should move inside the loop.
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    # Train model on this fold's training rows
    model.fit(X_tr, y_tr)

    # Probability of the positive class for the held-out rows
    val_pred = model.predict_proba(X_val)[:, 1]

    # Calculate ROC AUC
    auc_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(auc_score)

    print(f"Fold {fold} ROC AUC: {auc_score:.4f}")

# \u00b1 is the plus-minus sign, written as an escape so it cannot be
# mangled by a file-encoding round trip
print(f"\nCV ROC AUC: {np.mean(cv_scores):.4f} \u00b1 {np.std(cv_scores):.4f}")

## Train on Full Data and Predict

In [None]:
# Refit the model on every training row
print("Training on full dataset...")
fitted_model = model.fit(X_train, y_train)

# Keep column 1 of predict_proba (probability of the positive class) for each test row
test_proba = fitted_model.predict_proba(X_test)
test_predictions = test_proba[:, 1]
print("Test predictions shape:", test_predictions.shape)

## Create Submission

In [None]:
# Destination of the submission CSV
submission_path = '/home/submission/submission.csv'

# Pair each test request id with its predicted probability
submission = pd.DataFrame({'request_id': test_df['request_id']})
submission['requester_received_pizza'] = test_predictions

# Write without the index so the file holds only the two submission columns
submission.to_csv(submission_path, index=False)

print("Submission file created:", submission_path)
print("\nSubmission preview:")
print(submission.head())

# Quick format check: row/column count and the span of predicted probabilities
lowest, highest = test_predictions.min(), test_predictions.max()
print(f"\nSubmission shape: {submission.shape}")
print(f"Prediction range: [{lowest:.4f}, {highest:.4f}]")