# Baseline Model for Random Acts of Pizza

This notebook creates a baseline model for predicting pizza request success using LightGBM with basic features.

In [1]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from textstat import flesch_reading_ease, flesch_kincaid_grade
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

## Load Data

In [2]:
# Load training data
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

# Load test data
with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

# Convert to DataFrames for easier manipulation
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Check class distribution
print(f"\nClass distribution in training data:")
print(train_df['requester_received_pizza'].value_counts())
print(f"Positive rate: {train_df['requester_received_pizza'].mean():.3f}")

Training samples: 2878
Test samples: 1162

Class distribution in training data:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64
Positive rate: 0.248


## Feature Engineering

In [3]:
def extract_text_features(text):
    """Extract basic text features"""
    if pd.isna(text) or text == '':
        return {
            'text_length': 0,
            'word_count': 0,
            'sentence_count': 0,
            'avg_word_length': 0,
            'exclamation_count': 0,
            'question_count': 0,
            'caps_count': 0,
            'digit_count': 0
        }
    
    # Basic text stats
    text_length = len(text)
    words = text.split()
    word_count = len(words)
    sentences = re.split(r'[.!?]+', text)
    sentence_count = max(1, len([s for s in sentences if s.strip()]))
    
    # Character-based features
    avg_word_length = np.mean([len(word) for word in words]) if words else 0
    exclamation_count = text.count('!')
    question_count = text.count('?')
    caps_count = sum(1 for c in text if c.isupper())
    digit_count = sum(1 for c in text if c.isdigit())
    
    return {
        'text_length': text_length,
        'word_count': word_count,
        'sentence_count': sentence_count,
        'avg_word_length': avg_word_length,
        'exclamation_count': exclamation_count,
        'question_count': question_count,
        'caps_count': caps_count,
        'digit_count': digit_count
    }

def extract_metadata_features(row):
    """Extract metadata features - ONLY use features available at request time"""
    features = {}
    
    # Account age at request (NOT retrieval)
    features['account_age_days'] = row.get('requester_account_age_in_days_at_request', 0)
    features['account_age_log'] = np.log1p(features['account_age_days'])
    
    # Activity features at request (NOT retrieval)
    features['total_comments'] = row.get('requester_number_of_comments_at_request', 0)
    features['total_posts'] = row.get('requester_number_of_posts_at_request', 0)
    features['raop_comments'] = row.get('requester_number_of_comments_in_raop_at_request', 0)
    features['raop_posts'] = row.get('requester_number_of_posts_on_raop_at_request', 0)
    
    # Voting features at request (NOT retrieval)
    features['upvotes_minus_downvotes'] = row.get('requester_upvotes_minus_downvotes_at_request', 0)
    features['upvotes_plus_downvotes'] = row.get('requester_upvotes_plus_downvotes_at_request', 0)
    features['vote_ratio'] = features['upvotes_minus_downvotes'] / max(1, features['upvotes_plus_downvotes'])
    
    # Subreddit diversity at request
    features['num_subreddits'] = row.get('requester_number_of_subreddits_at_request', 0)
    
    # Time features from request timestamp
    timestamp = row.get('unix_timestamp_of_request', 0)
    if timestamp and timestamp > 0:
        features['hour_of_day'] = pd.to_datetime(timestamp, unit='s').hour
        features['day_of_week'] = pd.to_datetime(timestamp, unit='s').dayofweek
    else:
        features['hour_of_day'] = 0
        features['day_of_week'] = 0
    
    # User flair (this is a static attribute, not time-dependent)
    flair = row.get('requester_user_flair')
    features['has_flair'] = 1 if flair is not None else 0
    features['flair_is_shroom'] = 1 if flair == 'shroom' else 0
    features['flair_is_pif'] = 1 if flair == 'PIF' else 0
    
    # Days since first post on RAOP at request
    features['days_since_first_raop'] = row.get('requester_days_since_first_post_on_raop_at_request', 0)
    
    return features

def create_features(df):
    """Create all features for a dataframe"""
    features_list = []
    
    for idx, row in df.iterrows():
        # Text features from title and request text
        title_features = extract_text_features(row.get('request_title', ''))
        text_features = extract_text_features(row.get('request_text_edit_aware', ''))
        
        # Metadata features
        meta_features = extract_metadata_features(row)
        
        # Combine all features
        combined_features = {}
        for prefix, feat_dict in [('title_', title_features), ('text_', text_features), ('meta_', meta_features)]:
            for key, value in feat_dict.items():
                combined_features[prefix + key] = value
        
        features_list.append(combined_features)
    
    return pd.DataFrame(features_list)

# Create features for train and test
print("Creating features for training data...")
train_features = create_features(train_df)

print("Creating features for test data...")
test_features = create_features(test_df)

print(f"Train features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")

# Check for any NaN values
print(f"\nNaN values in train features: {train_features.isnull().sum().sum()}")
print(f"NaN values in test features: {test_features.isnull().sum().sum()}")

# Fill any NaN values with 0
train_features = train_features.fillna(0)
test_features = test_features.fillna(0)

Creating features for training data...


Creating features for test data...


Train features shape: (2878, 32)
Test features shape: (1162, 32)

NaN values in train features: 0
NaN values in test features: 0


## Model Training with Cross-Validation

In [4]:
# Prepare data for training
X = train_features
y = train_df['requester_received_pizza'].astype(int)

# Define cross-validation strategy
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Store predictions and scores
cv_scores = []
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(test_features))

print(f"Training with {n_splits}-fold stratified cross-validation...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}/{n_splits}")
    
    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_val, label=y_val)
    
    # Define parameters
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'seed': 42
    }
    
    # Train model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    
    # Make predictions
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    test_pred = model.predict(test_features, num_iteration=model.best_iteration)
    
    # Store predictions
    oof_predictions[val_idx] = val_pred
    test_predictions += test_pred / n_splits
    
    # Calculate fold score
    fold_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

# Calculate overall CV score
overall_score = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_score:.4f}")
print(f"Mean CV AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

Training with 5-fold stratified cross-validation...

Fold 1/5
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 1
Fold 1 AUC: 1.0000

Fold 2/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[2]	valid_0's auc: 1
Fold 2 AUC: 1.0000

Fold 3/5
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 1
Fold 3 AUC: 1.0000

Fold 4/5
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 1
Fold 4 AUC: 1.0000

Fold 5/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[2]	valid_0's auc: 1
Fold 5 AUC: 1.0000

Overall CV AUC: 1.0000
Mean CV AUC: 1.0000 ± 0.0000


## Feature Importance Analysis

In [None]:
# Get feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

print("Top 20 most important features:")
print(feature_importance.head(20))

## Generate Submission

In [None]:
# Create submission file
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Ensure the submission has the correct format
submission['requester_received_pizza'] = submission['requester_received_pizza'].astype(float)

# Save submission
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)

print(f"Submission saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"\nPrediction distribution:")
print(submission['requester_received_pizza'].describe())

# Show first few rows
print(f"\nFirst 5 rows of submission:")
print(submission.head())