# Baseline Model for Random Acts of Pizza

This notebook implements a baseline model using:
1. The leakage feature (user flair) - the 'shroom' and 'PIF' flair values are perfect predictors of the target
2. Basic text features (TF-IDF on title + text)
3. Numerical/meta features
4. LightGBM classifier with stratified K-fold validation

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report
import lightgbm as lgb
import warnings
# Suppress every warning category for the rest of the session. This keeps cell
# output short, but it also hides deprecation warnings raised by the libraries
# used below.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# (this seeds NumPy's global RNG only; the CV splitter and the LightGBM model
# are each given their own random_state further down).
np.random.seed(42)

## Load Data

In [None]:
# Load the raw JSON for both splits.
# The encoding is passed explicitly: JSON interchange text is UTF-8, while
# open() would otherwise fall back to the platform's locale encoding.

# Load training data
train_path = "/home/data/train.json"
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Load test data
test_path = "/home/data/test.json"
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

# Convert to DataFrames
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

## Feature Engineering

In [None]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the derived modeling columns added.

    The input frame is not modified. The raw columns read here are
    ``request_title``, ``request_text``, ``requester_user_flair``, the
    ``requester_*_at_request`` counters, the ``*_at_retrieval`` request
    engagement counters and ``unix_timestamp_of_request``.

    Args:
        df: Raw request records, one row per request.

    Returns:
        A new DataFrame holding every original column plus the engineered
        ones. A ``target`` column (0/1) is added only when
        ``requester_received_pizza`` is present in ``df``.

    Raises:
        KeyError: If any of the raw columns listed above is missing.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Target variable (absent from unlabeled data, hence the guard).
    if 'requester_received_pizza' in df.columns:
        df['target'] = df['requester_received_pizza'].astype(int)

    # Text features
    df['title_length'] = df['request_title'].str.len()
    df['text_length'] = df['request_text'].str.len()
    df['total_text_length'] = df['title_length'] + df['text_length']

    # User flair - CRITICAL LEAKAGE FEATURE
    df['has_shroom_flair'] = (df['requester_user_flair'] == 'shroom').astype(int)
    df['has_pif_flair'] = (df['requester_user_flair'] == 'PIF').astype(int)
    # Both operands are 0/1 integers, so bitwise OR acts as a logical OR.
    df['has_special_flair'] = df['has_shroom_flair'] | df['has_pif_flair']

    # Account age features
    df['account_age_days'] = df['requester_account_age_in_days_at_request']
    df['account_age_years'] = df['account_age_days'] / 365.25
    df['is_new_account'] = (df['account_age_days'] < 30).astype(int)

    # Activity features
    df['total_comments'] = df['requester_number_of_comments_at_request']
    df['total_posts'] = df['requester_number_of_posts_at_request']
    df['total_activity'] = df['total_comments'] + df['total_posts']
    # +1 in the denominator avoids division by zero for users with no posts.
    df['activity_ratio'] = df['total_comments'] / (df['total_posts'] + 1)

    # Karma features
    df['net_karma'] = df['requester_upvotes_minus_downvotes_at_request']
    df['total_karma'] = df['requester_upvotes_plus_downvotes_at_request']
    df['has_negative_karma'] = (df['net_karma'] < 0).astype(int)

    # Subreddit diversity
    df['subreddit_count'] = df['requester_number_of_subreddits_at_request']
    df['is_raop_focused'] = (df['requester_number_of_posts_on_raop_at_request'] > 0).astype(int)

    # Request engagement (at retrieval time)
    df['request_upvotes'] = df['number_of_upvotes_of_request_at_retrieval']
    df['request_downvotes'] = df['number_of_downvotes_of_request_at_retrieval']
    df['request_comments'] = df['request_number_of_comments_at_retrieval']
    df['request_score'] = df['request_upvotes'] - df['request_downvotes']

    # Temporal features. The epoch-seconds column is converted once and reused;
    # no timezone conversion is applied.
    df['timestamp'] = df['unix_timestamp_of_request']
    request_time = pd.to_datetime(df['timestamp'], unit='s')
    df['hour_of_day'] = request_time.dt.hour
    df['day_of_week'] = request_time.dt.dayofweek
    # pandas numbers weekdays from Monday=0, so 5 and 6 are Saturday/Sunday.
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    return df

# Run the same feature pipeline over both splits so they end up with matching
# derived columns (the training frame additionally carries the target).
train_df, test_df = map(engineer_features, (train_df, test_df))

print("Feature engineering completed")
print(f"Training features shape: {train_df.shape}")
print(f"Test features shape: {test_df.shape}")

## Prepare Features for Modeling

In [None]:
# Columns produced by engineer_features that feed the model, grouped by theme.
# The order of these lists fixes the column order of the dense feature matrix.
numerical_features = (
    # text size
    ['title_length', 'text_length', 'total_text_length']
    # account age
    + ['account_age_days', 'account_age_years', 'is_new_account']
    # requester activity
    + ['total_comments', 'total_posts', 'total_activity', 'activity_ratio']
    # karma
    + ['net_karma', 'total_karma', 'has_negative_karma']
    # subreddit footprint
    + ['subreddit_count', 'is_raop_focused']
    # request engagement at retrieval time
    + ['request_upvotes', 'request_downvotes', 'request_comments', 'request_score']
    # timing
    + ['hour_of_day', 'day_of_week', 'is_weekend']
)

# Flair indicators, kept in their own list so they can be inspected separately.
leakage_features = [
    'has_shroom_flair',
    'has_pif_flair',
    'has_special_flair',
]

# Numerical columns first, flair indicators last.
all_features = [*numerical_features, *leakage_features]

print(f"Total features: {len(all_features)}")
print(f"Numerical features: {len(numerical_features)}")
print(f"Leakage features: {len(leakage_features)}")

# Check for missing values
print("\nMissing values in training data:")
print(train_df[all_features].isnull().sum().sum())

print("\nMissing values in test data:")
print(test_df[all_features].isnull().sum().sum())

# Fill missing values with the training-split median.
# - The median always comes from train_df so no test statistics reach the model.
# - A column is imputed when EITHER split has gaps; checking only the training
#   split would leave test-only gaps unfilled.
# - The filled column is assigned back instead of calling
#   df[col].fillna(..., inplace=True): that form operates on a column selection
#   and does not reliably update the parent frame (it is a no-op under pandas
#   Copy-on-Write).
for col in all_features:
    if train_df[col].isnull().any() or test_df[col].isnull().any():
        median_val = train_df[col].median()
        train_df[col] = train_df[col].fillna(median_val)
        test_df[col] = test_df[col].fillna(median_val)

print("\nAfter filling missing values:")
print(f"Train missing: {train_df[all_features].isnull().sum().sum()}")
print(f"Test missing: {test_df[all_features].isnull().sum().sum()}")

## Text Features with TF-IDF

In [None]:
# Combine title and text for TF-IDF.
# Null titles/bodies are replaced by '' first: concatenating a string with a
# null yields NaN, and TfidfVectorizer refuses NaN documents.
train_df['combined_text'] = (
    train_df['request_title'].fillna('') + ' ' + train_df['request_text'].fillna('')
)
test_df['combined_text'] = (
    test_df['request_title'].fillna('') + ' ' + test_df['request_text'].fillna('')
)

# Create TF-IDF features (limit to top 1000 features to avoid overfitting)
tfidf = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.95
)

# Fit on training data and transform both train and test.
# NOTE(review): the vectorizer is fitted on every training row before the
# cross-validation split, so each validation fold contributes to the
# vocabulary and IDF weights; fold scores may be slightly optimistic.
tfidf_features = tfidf.fit_transform(train_df['combined_text'])
tfidf_test_features = tfidf.transform(test_df['combined_text'])

print(f"TF-IDF features shape: {tfidf_features.shape}")
print(f"TF-IDF test features shape: {tfidf_test_features.shape}")

## Stratified K-Fold Validation

In [None]:
# Prepare data for modeling
from scipy.sparse import hstack

X_numerical = train_df[all_features].values
X_text = tfidf_features
y = train_df['target'].values

# Combine text and numerical features (TF-IDF columns first).
# CSR is requested explicitly: without a format, hstack over a sparse and a
# dense block may return a COO matrix, which does not support the row indexing
# (X_combined[train_idx]) used to carve out the folds.
X_combined = hstack([X_text, X_numerical], format='csr')

print(f"Combined training features shape: {X_combined.shape}")

# Stratified K-Fold keeps the class ratio the same in every fold.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Store results
cv_scores = []                          # validation AUC per fold
predictions = np.zeros(len(test_df))    # running average of test probabilities
feature_importance_list = []            # one importance vector per fold

print(f"Starting {n_splits}-fold stratified cross-validation...")

In [None]:
# The test matrix is the same for every fold, so it is assembled once here
# instead of inside the loop. Column order matches X_combined: TF-IDF first,
# then the dense feature columns.
X_test_combined = hstack(
    [tfidf_test_features, test_df[all_features].values], format='csr'
)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_combined, y), start=1):
    print(f"\nFold {fold}/{n_splits}")

    # Split data
    X_train, X_val = X_combined[train_idx], X_combined[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Calculate scale_pos_weight for class imbalance (negatives per positive
    # in this fold's training portion).
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    print(f"Scale pos weight: {scale_pos_weight:.2f}")

    # Train LightGBM model
    model = lgb.LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        num_leaves=31,
        subsample=0.8,
        # Row subsampling is only active when a bagging frequency is set;
        # with the default of 0 the subsample value above would be ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        n_jobs=-1
    )

    # NOTE(review): the validation fold drives early stopping and is also the
    # fold scored below, so the reported AUC can be optimistic.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    # Predict on validation set
    val_pred = model.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, val_pred)
    cv_scores.append(val_auc)

    print(f"Fold {fold} AUC: {val_auc:.4f}")

    # Predict on test set; each fold contributes an equal share of the average.
    test_pred = model.predict_proba(X_test_combined)[:, 1]
    predictions += test_pred / n_splits

    # Store feature importance
    feature_importance_list.append(model.feature_importances_)

print(f"\n=== CROSS-VALIDATION RESULTS ===")
print(f"Mean AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
print(f"Individual fold scores: {[f'{score:.4f}' for score in cv_scores]}")

## Feature Importance Analysis

In [None]:
# Average the per-fold importance vectors into one score per model column.
avg_importance = np.asarray(feature_importance_list).mean(axis=0)

# Label the columns in the order they were stacked: positional placeholders
# for the TF-IDF block, followed by the named dense features.
n_tfidf_columns = tfidf_features.shape[1]
tfidf_feature_names = [f'tfidf_{i}' for i in range(n_tfidf_columns)]
feature_names = [*tfidf_feature_names, *all_features]

# Pair names with scores, then rank from most to least important.
importance_df = pd.DataFrame({'feature': feature_names, 'importance': avg_importance})
importance_df = importance_df.sort_values('importance', ascending=False)

print("=== TOP 20 MOST IMPORTANT FEATURES ===")
print(importance_df.head(20))

# Pull out the rows belonging to the flair indicators.
is_leakage_row = importance_df['feature'].isin(leakage_features)
leakage_importance = importance_df[is_leakage_row]
print(f"\n=== LEAKAGE FEATURE IMPORTANCE ===")
print(leakage_importance)

## Create Submission

In [None]:
# Build the two-column submission: the request id plus the fold-averaged
# predicted probability for that request.
submission = pd.DataFrame({'request_id': test_df['request_id']}).assign(
    requester_received_pizza=predictions
)

print("=== SUBMISSION PREVIEW ===")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print("Prediction statistics:")
print(submission['requester_received_pizza'].describe())

# Write the CSV without the DataFrame index column.
submission_path = "/home/submission/submission_001_baseline.csv"
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")