In [None]:
# Random Acts of Pizza - Baseline Model
# Competition: Predict successful pizza requests
# Strategy: Random Forest on request-time features shared by train and test
# (user flair is a leakage feature missing from test; it is inspected for analysis only)

import json
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

# Load training data
# The parsed JSON is indexed with [0] here and later handed to pd.DataFrame,
# so it is expected to be a list of per-request dicts.
print("Loading training data...")
# Explicit UTF-8: JSON text is UTF-8 by specification, whereas open()'s default
# encoding follows the platform locale and can mis-decode non-ASCII request text.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

print(f"Training samples: {len(train_data)}")
# Guard the [0] lookup so an empty file reports 0 instead of raising IndexError.
print(f"Features per sample: {len(train_data[0]) if train_data else 0}")

In [None]:
# Convert to DataFrame for easier manipulation
df_train = pd.DataFrame(train_data)

# Target distribution: class shares, then the absolute number of positives
print("Target distribution:")
print(df_train['requester_received_pizza'].value_counts(normalize=True))
print(f"\nPositive samples: {df_train['requester_received_pizza'].sum()} ({df_train['requester_received_pizza'].mean()*100:.1f}%)")

# Check the leakage feature: user flair
# value_counts() and groupby() both drop missing keys by default, which would
# silently hide every request without a flair. Label missing values explicitly
# so the "no flair" group appears in both tables and can serve as the baseline.
# (df_train itself is not modified; when nothing is missing the output is unchanged.)
flair = df_train['requester_user_flair'].fillna('(no flair)')

print("\nUser flair distribution:")
flair_counts = flair.value_counts()
print(flair_counts.head(10))

# Check success rate by flair
# count = requests in the group, sum = successful requests, mean = success rate
print("\nSuccess rate by user flair (top 10):")
flair_success = df_train.groupby(flair)['requester_received_pizza'].agg(['count', 'sum', 'mean']).round(3)
flair_success = flair_success[flair_success['count'] >= 5]  # Only flair with >=5 samples
flair_success = flair_success.sort_values('mean', ascending=False)
print(flair_success.head(10))

In [None]:
# Feature engineering
# Adds derived columns to df_train and builds `feature_cols`, the ordered list of
# model input columns that the later cells reuse for both train and test.
print("Creating features...")

# IMPORTANT: Test data does NOT have 'requester_user_flair' column
# So we CANNOT use the leakage feature for actual predictions
# Let's create features that exist in BOTH train and test

# 1. Text length features (use edit_aware version which exists in both train and test)
request_text = df_train['request_text_edit_aware']
df_train['title_length'] = df_train['request_title'].str.len()
df_train['text_length'] = request_text.str.len()

# 2. User activity features (use 'at_request' versions to avoid leakage)
activity_features = [
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request'
]

# 3. Post features
# NOTE(review): 'post_was_edited' has no '_at_request' suffix, so it is not
# covered by the request-time convention used for the activity features above.
# Confirm that it exists in the test data and does not encode post-request
# information before relying on it.
post_features = [
    'post_was_edited'
]

# 4. Politeness markers (simple keyword counting) - use edit_aware text
# str.count() does pattern (substring) matching, so 'thank' is also counted
# inside 'thanks' and 'kind' inside 'kindness'.
politeness_words = ['please', 'thank', 'thanks', 'appreciate', 'grateful', 'kind', 'help']
# Lowercase once instead of once per keyword; the result is loop-invariant.
request_text_lower = request_text.str.lower()
for word in politeness_words:
    df_train[f'politeness_{word}'] = request_text_lower.str.count(word)

# Combine all features (WITHOUT the leakage feature since it's not in test data)
feature_cols = ['title_length', 'text_length'] + activity_features + post_features + [f'politeness_{word}' for word in politeness_words]

print(f"Total features: {len(feature_cols)}")
print(f"Feature columns: {feature_cols}")

# For analysis only: Check what the leakage feature would have given us
# 'flair_leakage' is written to df_train but deliberately left out of feature_cols.
if 'requester_user_flair' in df_train.columns:
    df_train['flair_leakage'] = df_train['requester_user_flair'].isin(['shroom', 'PIF']).astype(int)
    leakage_coverage = df_train['flair_leakage'].sum()
    print(f"\nNOTE: Leakage feature would cover {leakage_coverage} samples ({df_train['flair_leakage'].mean()*100:.1f}%) in training data")
    print("BUT: This feature is NOT available in test data, so we cannot use it for predictions")

In [None]:
# Assemble the modeling inputs: target vector y and feature matrix X.
# Both are copies, so later edits to them do not touch df_train.
y = df_train['requester_received_pizza'].copy()
X = df_train.loc[:, feature_cols].copy()

# Show the five features with the most missing entries before imputing
print("Missing values per feature:")
missing_per_feature = X.isna().sum()
print(missing_per_feature.sort_values(ascending=False).head())

# Simple imputation: every missing value becomes 0
X = X.fillna(value=0)

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Summarize how many columns there are of each dtype
print("\nData types:")
dtype_summary = X.dtypes.value_counts()
print(dtype_summary)

In [None]:
# Stratified K-Fold validation
# Trains one Random Forest per fold, records the validation AUC, and collects
# out-of-fold (OOF) probabilities in `predictions` (one entry per training row).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = []
fold_importances = []  # one feature_importances_ vector per fold
predictions = np.zeros(len(df_train))

print("Running 5-fold stratified CV...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Initialize and train Random Forest
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced'  # Handle class imbalance
    )

    model.fit(X_train, y_train)

    # Predict on validation set (probability of the positive class)
    val_pred = model.predict_proba(X_val)[:, 1]
    val_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(val_score)

    # Store out-of-fold predictions
    predictions[val_idx] = val_pred
    fold_importances.append(model.feature_importances_)

    print(f"Fold {fold + 1}: AUC = {val_score:.4f}")

# The plus-minus sign is written as an escape so the message survives any
# re-encoding of the source file (it was previously garbled in the output).
print(f"\nCV Results: {np.mean(cv_scores):.4f} \u00b1 {np.std(cv_scores):.4f}")
print(f"OOF AUC: {roc_auc_score(y, predictions):.4f}")

# Feature importance
# Averaged over all folds rather than read from the last fold's model only,
# so the ranking does not depend on a single train/validation split.
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': np.mean(fold_importances, axis=0)
}).sort_values('importance', ascending=False)

print(f"\nTop 10 features by importance:")
print(feature_importance.head(10))

In [None]:
# Load test data and generate predictions
# Rebuilds the engineered features on the test set, fits the final model on the
# full training data, and stores positive-class probabilities in `test_predictions`.
print("Loading test data...")
# Explicit UTF-8: JSON text is UTF-8 by specification, whereas open()'s default
# encoding follows the platform locale.
with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

df_test = pd.DataFrame(test_data)
print(f"Test samples: {len(df_test)}")

# Apply same feature engineering to test data
# Text length features (use edit_aware version which exists in both train and test)
df_test['title_length'] = df_test['request_title'].str.len()
df_test['text_length'] = df_test['request_text_edit_aware'].str.len()  # Use edit_aware version

# Politeness markers (use edit_aware text); lowercase once, not once per keyword
test_text_lower = df_test['request_text_edit_aware'].str.lower()
for word in politeness_words:
    df_test[f'politeness_{word}'] = test_text_lower.str.count(word)

# Guard against training features that the test data does not provide:
# df_test[feature_cols] would raise KeyError on the first missing column.
# The final model is therefore restricted to the features present in both sets.
# When nothing is missing, this is exactly the original feature list.
missing_in_test = [col for col in feature_cols if col not in df_test.columns]
if missing_in_test:
    print(f"WARNING: features missing from test data, excluded from the final model: {missing_in_test}")
    print("         CV scores reported earlier were computed with these features included.")
test_feature_cols = [col for col in feature_cols if col not in missing_in_test]

# Prepare test features (same zero imputation as for training)
X_test = df_test[test_feature_cols].copy()
X_test = X_test.fillna(0)

print(f"Test feature matrix shape: {X_test.shape}")

# Train final model on full training data (same hyperparameters as in CV)
final_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'
)

# Select the same columns, in the same order, as X_test
final_model.fit(X[test_feature_cols], y)

# Generate predictions
test_predictions = final_model.predict_proba(X_test)[:, 1]
print(f"Test predictions shape: {test_predictions.shape}")
print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
print(f"Mean prediction: {test_predictions.mean():.4f}")

In [None]:
# Create submission file
# One row per test request: its id and the predicted probability of success.
submission = pd.DataFrame({
    'request_id': df_test['request_id'],
    'requester_received_pizza': test_predictions
})

# Round the predicted probabilities to 6 decimals; the values stay continuous
# probabilities and are NOT thresholded to 0/1 labels.
submission['requester_received_pizza'] = submission['requester_received_pizza'].round(6)

print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Save submission
submission_path = '/home/submission/submission.csv'
# to_csv does not create parent directories, so make sure the target exists.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Verify submission format matches sample (informational only: a mismatch is
# printed, not raised, and the file has already been written at this point)
sample_sub = pd.read_csv('/home/data/sampleSubmission.csv')
print(f"\nSample submission columns: {list(sample_sub.columns)}")
print(f"Our submission columns: {list(submission.columns)}")
print(f"Columns match: {list(submission.columns) == list(sample_sub.columns)}")