# Baseline Model for Random Acts of Pizza

This notebook creates a proper baseline using only features available in both train and test data.

Key findings:
- Train: 2878 records, Test: 1162 records
- Leakage feature 'requester_user_flair' exists only in train (not usable)
- Using only request-time features (common to both datasets)

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)
with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

print(f"Train records: {len(train_data)}")
print(f"Test records: {len(test_data)}")

# Convert to DataFrames
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print("\nTrain columns:", len(train_df.columns))
print("Test columns:", len(test_df.columns))
print("\nCommon columns:", len(set(train_df.columns) & set(test_df.columns)))

Loading data...
Train records: 2878
Test records: 1162

Train columns: 32
Test columns: 17

Common columns: 17


In [2]:
# Identify common features (available in both train and test)
train_features = set(train_df.columns)
test_features = set(test_df.columns)
common_features = train_features & test_features

print("Common features:")
for f in sorted(common_features):
    print(f"  - {f}")

print(f"\nTotal common features: {len(common_features)}")

# Remove non-predictive features
exclude_features = {
    'request_id',  # ID column
    'giver_username_if_known',  # Not known at request time
    'requester_username',  # High cardinality
    'requester_subreddits_at_request'  # Complex list data
}

usable_features = common_features - exclude_features
print(f"\nUsable features after exclusion: {len(usable_features)}")
print("\nUsable features:")
for f in sorted(usable_features):
    print(f"  - {f}")

Common features:
  - giver_username_if_known
  - request_id
  - request_text_edit_aware
  - request_title
  - requester_account_age_in_days_at_request
  - requester_days_since_first_post_on_raop_at_request
  - requester_number_of_comments_at_request
  - requester_number_of_comments_in_raop_at_request
  - requester_number_of_posts_at_request
  - requester_number_of_posts_on_raop_at_request
  - requester_number_of_subreddits_at_request
  - requester_subreddits_at_request
  - requester_upvotes_minus_downvotes_at_request
  - requester_upvotes_plus_downvotes_at_request
  - requester_username
  - unix_timestamp_of_request
  - unix_timestamp_of_request_utc

Total common features: 17

Usable features after exclusion: 13

Usable features:
  - request_text_edit_aware
  - request_title
  - requester_account_age_in_days_at_request
  - requester_days_since_first_post_on_raop_at_request
  - requester_number_of_comments_at_request
  - requester_number_of_comments_in_raop_at_request
  - requester_numb

In [3]:
# Prepare features - handle text fields and simple preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# Text features to use
text_features = ['request_title', 'request_text_edit_aware']
numeric_features = list(usable_features - set(text_features))

print("Numeric features:", len(numeric_features))
print("Text features:", len(text_features))

# Create baseline numeric features only (simplest approach)
X_train = train_df[numeric_features].copy()
X_test = test_df[numeric_features].copy()

# Handle any missing values
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

y_train = train_df['requester_received_pizza'].astype(int)

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Positive rate: {y_train.mean():.3f}")

Numeric features: 11
Text features: 2
Training data shape: (2878, 11)
Test data shape: (1162, 11)
Positive rate: 0.248


In [4]:
# Stratified CV with Random Forest
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_scores = []
oof_predictions = np.zeros(len(X_train))

print("Running 5-fold Stratified CV...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # Train Random Forest
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=50,
        min_samples_leaf=20,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    
    model.fit(X_tr, y_tr)
    
    # Predict on validation set
    val_pred = model.predict_proba(X_val)[:, 1]
    oof_predictions[val_idx] = val_pred
    
    # Calculate AUC
    auc = roc_auc_score(y_val, val_pred)
    fold_scores.append(auc)
    
    print(f"Fold {fold + 1}: AUC = {auc:.4f}")

print(f"\nMean AUC: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
print(f"OOF AUC: {roc_auc_score(y_train, oof_predictions):.4f}")

Running 5-fold Stratified CV...


Fold 1: AUC = 0.6807


Fold 2: AUC = 0.6427


Fold 3: AUC = 0.6963


Fold 4: AUC = 0.6447


Fold 5: AUC = 0.6693

Mean AUC: 0.6667 ± 0.0207
OOF AUC: 0.6658


In [5]:
# Train final model on full training data
print("Training final model on full data...")
final_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_split=40,
    min_samples_leaf=15,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

final_model.fit(X_train, y_train)

# Make predictions on test set
test_predictions = final_model.predict_proba(X_test)[:, 1]
print(f"Test predictions shape: {test_predictions.shape}")
print(f"Test prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
print(f"Test prediction mean: {test_predictions.mean():.4f}")

Training final model on full data...


Test predictions shape: (1162,)
Test prediction range: [0.0765, 0.7699]
Test prediction mean: 0.4248


In [6]:
# Create submission file
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print("Submission format:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Save submission
submission_path = '/home/submission/submission_002_baseline.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Verify format matches sample
sample_sub = pd.read_csv('/home/data/sampleSubmission.csv')
print(f"\nSample submission shape: {sample_sub.shape}")
print(f"Columns match: {list(submission.columns) == list(sample_sub.columns)}")
print(f"ID count match: {len(submission) == len(sample_sub)}")

Submission format:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.390106
1   t3_roiuw                  0.465169
2   t3_mjnbq                  0.479372
3   t3_t8wd1                  0.501811
4  t3_1m4zxu                  0.320677

Submission shape: (1162, 2)

Submission saved to: /home/submission/submission_002_baseline.csv

Sample submission shape: (1162, 2)
Columns match: True
ID count match: True
