# Baseline Experiment - Random Acts of Pizza

## Strategy
- Combine text features (title + text) with numerical/categorical features
- Use TF-IDF for text representation
- Use LightGBM for classification
- 5-fold stratified cross-validation
- Evaluate with AUC-ROC

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

In [2]:
import json
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from scipy.sparse import hstack
import lightgbm as lgb
import xgboost as xgb
from collections import Counter
import re

# Load data
print("Loading training data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

print("Loading test data...")
with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

# Extract labels
train_labels = [item['requester_received_pizza'] for item in train_data]
print(f"Positive rate in training: {sum(train_labels)/len(train_labels):.3f}")

Loading training data...
Loading test data...
Training samples: 2878
Test samples: 1162
Positive rate in training: 0.248


In [3]:
# Convert JSON data to DataFrames for easier manipulation
print("Converting to DataFrames...")
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Training DataFrame shape: {train_df.shape}")
print(f"Test DataFrame shape: {test_df.shape}")

# Show basic info about the target
print("\nTarget distribution:")
print(train_df['requester_received_pizza'].value_counts())
print(f"Positive rate: {train_df['requester_received_pizza'].mean():.3f}")

# Debug: Check column differences
print("\nColumns in train but not in test:")
train_cols = set(train_df.columns)
test_cols = set(test_df.columns)
missing_in_test = train_cols - test_cols
print(missing_in_test)

print("\nColumns in test but not in train:")
missing_in_train = test_cols - train_cols
print(missing_in_train)

# Add missing columns to test data with default values
for col in missing_in_test:
    if col == 'requester_received_pizza':
        continue  # This is the target, don't add to test
    
    # Determine appropriate default value
    if train_df[col].dtype in ['int64', 'float64']:
        default_val = 0
    elif train_df[col].dtype == 'bool':
        default_val = False
    else:
        default_val = ''
    
    test_df[col] = default_val
    print(f"Added missing column '{col}' to test data with default value: {default_val}")

Converting to DataFrames...
Training DataFrame shape: (2878, 32)
Test DataFrame shape: (1162, 17)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64
Positive rate: 0.248

Columns in train but not in test:
{'requester_received_pizza', 'number_of_upvotes_of_request_at_retrieval', 'requester_number_of_posts_on_raop_at_retrieval', 'request_text', 'requester_user_flair', 'requester_upvotes_minus_downvotes_at_retrieval', 'request_number_of_comments_at_retrieval', 'post_was_edited', 'requester_number_of_posts_at_retrieval', 'requester_account_age_in_days_at_retrieval', 'number_of_downvotes_of_request_at_retrieval', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_upvotes_plus_downvotes_at_retrieval', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_comments_at_retrieval'}

Columns in test but not in train:
set()
Added missing column 'number_of_upvotes_of_request_at_retrieval' to test data with default

In [4]:
# Basic feature engineering
print("Creating features...")

# Debug: Check for missing text fields
print("Checking for missing text fields in training data...")
missing_text_train = train_df['request_text'].isna().sum()
missing_title_train = train_df['request_title'].isna().sum()
print(f"Missing request_text in train: {missing_text_train}")
print(f"Missing request_title in train: {missing_title_train}")

print("Checking for missing text fields in test data...")
missing_text_test = test_df['request_text'].isna().sum() if 'request_text' in test_df.columns else len(test_df)
missing_title_test = test_df['request_title'].isna().sum() if 'request_title' in test_df.columns else len(test_df)
print(f"Missing request_text in test: {missing_text_test}")
print(f"Missing request_title in test: {missing_title_test}")

# Handle missing columns in test data
if 'request_text' not in test_df.columns:
    test_df['request_text'] = ''
if 'request_title' not in test_df.columns:
    test_df['request_title'] = ''

# Fill missing values
train_df['request_text'] = train_df['request_text'].fillna('')
train_df['request_title'] = train_df['request_title'].fillna('')
test_df['request_text'] = test_df['request_text'].fillna('')
test_df['request_title'] = test_df['request_title'].fillna('')

# Text features
train_df['combined_text'] = train_df['request_title'] + ' ' + train_df['request_text']
test_df['combined_text'] = test_df['request_title'] + ' ' + test_df['request_text']

# Length features
train_df['title_length'] = train_df['request_title'].str.len()
train_df['text_length'] = train_df['request_text'].str.len()
train_df['combined_length'] = train_df['combined_text'].str.len()

test_df['title_length'] = test_df['request_title'].str.len()
test_df['text_length'] = test_df['request_text'].str.len()
test_df['combined_length'] = test_df['combined_text'].str.len()

# Ratios - handle division by zero
train_df['upvote_ratio'] = train_df['number_of_upvotes_of_request_at_retrieval'] / (train_df['number_of_upvotes_of_request_at_retrieval'] + train_df['number_of_downvotes_of_request_at_retrieval'] + 1)
train_df['requester_vote_ratio'] = train_df['requester_upvotes_minus_downvotes_at_request'] / (train_df['requester_upvotes_plus_downvotes_at_request'] + 1)
train_df['account_age_years'] = train_df['requester_account_age_in_days_at_request'] / 365.25

test_df['upvote_ratio'] = test_df['number_of_upvotes_of_request_at_retrieval'] / (test_df['number_of_upvotes_of_request_at_retrieval'] + test_df['number_of_downvotes_of_request_at_retrieval'] + 1)
test_df['requester_vote_ratio'] = test_df['requester_upvotes_minus_downvotes_at_request'] / (test_df['requester_upvotes_plus_downvotes_at_request'] + 1)
test_df['account_age_years'] = test_df['requester_account_age_in_days_at_request'] / 365.25

# Activity rates - handle division by zero
train_df['comments_per_day'] = train_df['requester_number_of_comments_at_request'] / (train_df['requester_account_age_in_days_at_request'] + 1)
train_df['posts_per_day'] = train_df['requester_number_of_posts_at_request'] / (train_df['requester_account_age_in_days_at_request'] + 1)
train_df['subreddit_diversity'] = train_df['requester_number_of_subreddits_at_request'] / (train_df['requester_number_of_posts_at_request'] + 1)

test_df['comments_per_day'] = test_df['requester_number_of_comments_at_request'] / (test_df['requester_account_age_in_days_at_request'] + 1)
test_df['posts_per_day'] = test_df['requester_number_of_posts_at_request'] / (test_df['requester_account_age_in_days_at_request'] + 1)
test_df['subreddit_diversity'] = test_df['requester_number_of_subreddits_at_request'] / (test_df['requester_number_of_posts_at_request'] + 1)

print("Feature engineering completed!")

Creating features...
Checking for missing text fields in training data...
Missing request_text in train: 0
Missing request_title in train: 0
Checking for missing text fields in test data...
Missing request_text in test: 0
Missing request_title in test: 0
Feature engineering completed!


In [5]:
# Prepare numerical and categorical features
print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Fill missing values in numerical features
for col in numerical_features:
    if col in train_df.columns:
        train_df[col] = train_df[col].fillna(0)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(0)

# No categorical features to encode since requester_user_flair is missing in test
print("Feature preparation completed!")

Numerical features: 21
Categorical features: 1


In [6]:
# Prepare text features with TF-IDF
print("Vectorizing text...")

# Use a subset of TF-IDF features to keep memory manageable
vectorizer = TfidfVectorizer(
    max_features=5000,  # Limit features for speed
    stop_words='english',
    ngram_range=(1, 2),  # Unigrams and bigrams
    min_df=2,
    max_df=0.95
)

text_tfidf = vectorizer.fit_transform(train_df['combined_text'])
print(f"TF-IDF shape: {text_tfidf.shape}")

Vectorizing text...


TF-IDF shape: (2878, 5000)


In [7]:
# Combine all features
from scipy.sparse import hstack, csr_matrix

# Prepare numerical features as sparse matrix
numerical_matrix = train_df[numerical_features + ['requester_user_flair_encoded']].values

# Combine text TF-IDF with numerical features
X = hstack([text_tfidf, numerical_matrix])
X = csr_matrix(X)  # Convert to CSR for efficient row indexing
y = train_df['requester_received_pizza'].values

print(f"Final feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")

Final feature matrix shape: (2878, 5022)
Target shape: (2878,)


In [8]:
# Cross-validation setup
print("Setting up cross-validation...")
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

fold_scores = []
oof_predictions = np.zeros(len(train_df))

print(f"Starting {n_folds}-fold cross-validation...")

Setting up cross-validation...
Starting 5-fold cross-validation...


In [None]:
# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}/{n_folds}")
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    # Parameters (simplified for baseline)
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'seed': 42
    }
    
    # Train model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=500,
        valid_sets=[val_data],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(0)
        ]
    )
    
    # Predict on validation set
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    oof_predictions[val_idx] = val_pred
    
    # Calculate AUC
    fold_auc = roc_auc_score(y_val, val_pred)
    fold_scores.append(fold_auc)
    
    print(f"Fold {fold + 1} AUC: {fold_auc:.4f}")

# Overall CV score
mean_auc = np.mean(fold_scores)
std_auc = np.std(fold_scores)
print(f"\nCV Score: {mean_auc:.4f} Â± {std_auc:.4f}")

# OOF AUC
oof_auc = roc_auc_score(y, oof_predictions)
print(f"OOF AUC: {oof_auc:.4f}")

In [None]:
# Load test data
print("Loading test data...")
with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")

# Apply same feature engineering to test data
print("Engineering test features...")

# Handle missing columns in test data (in case they weren't added earlier)
if 'request_text' not in test_df.columns:
    test_df['request_text'] = ''
if 'request_title' not in test_df.columns:
    test_df['request_title'] = ''

# Fill missing values
test_df['request_text'] = test_df['request_text'].fillna('')
test_df['request_title'] = test_df['request_title'].fillna('')

# Text features
test_df['combined_text'] = test_df['request_title'] + ' ' + test_df['request_text']

# Length features
test_df['title_length'] = test_df['request_title'].str.len()
test_df['text_length'] = test_df['request_text'].str.len()
test_df['combined_length'] = test_df['combined_text'].str.len()

# Ratios
test_df['upvote_ratio'] = test_df['number_of_upvotes_of_request_at_retrieval'] / (test_df['number_of_upvotes_of_request_at_retrieval'] + test_df['number_of_downvotes_of_request_at_retrieval'] + 1)
test_df['requester_vote_ratio'] = test_df['requester_upvotes_plus_downvotes_at_request'] / (test_df['requester_upvotes_plus_downvotes_at_request'] + 1)

# Account age features
test_df['account_age_years'] = test_df['requester_account_age_in_days_at_request'] / 365.25

# Activity features
test_df['comments_per_day'] = test_df['requester_number_of_comments_at_request'] / (test_df['requester_account_age_in_days_at_request'] + 1)
test_df['posts_per_day'] = test_df['requester_number_of_posts_at_request'] / (test_df['requester_account_age_in_days_at_request'] + 1)
test_df['subreddit_diversity'] = test_df['requester_number_of_subreddits_at_request'] / (test_df['requester_number_of_posts_at_request'] + 1)

# Handle categorical features
test_df['requester_user_flair'] = test_df['requester_user_flair'].fillna('missing')
test_df['requester_user_flair_encoded'] = le.transform(test_df['requester_user_flair'])

print("Test feature engineering completed!")

In [None]:
# Transform test text with TF-IDF
print("Transforming test text...")
test_text_tfidf = vectorizer.transform(test_df['combined_text'])

# Prepare test numerical features
test_numerical_matrix = test_df[numerical_features + ['requester_user_flair_encoded']].values

# Combine test features
X_test = hstack([test_text_tfidf, test_numerical_matrix])
print(f"Test feature matrix shape: {X_test.shape}")

In [None]:
# Train final model on full training data and predict on test set
print("Training final model on full training data...")

final_train_data = lgb.Dataset(X, label=y)

final_model = lgb.train(
    params,
    final_train_data,
    num_boost_round=500,
    valid_sets=[final_train_data],
    callbacks=[
        lgb.log_evaluation(0)
    ]
)

print("Making predictions on test set...")
test_predictions = final_model.predict(X_test)

# Create submission file
submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

submission_path = '/home/submission/submission.csv'
submission_df.to_csv(submission_path, index=False)

print(f"Submission file saved to: {submission_path}")
print(f"Submission shape: {submission_df.shape}")
print(f"Prediction distribution:")
print(submission_df['requester_received_pizza'].describe())

In [17]:
# Check what columns actually exist in train and test
print("Train columns:")
print(train_df.columns.tolist())
print(f"\nTotal train columns: {len(train_df.columns)}")

print("\nTest columns:")
print(test_df.columns.tolist())
print(f"\nTotal test columns: {len(test_df.columns)}")

# Check which numerical features actually exist
print("\nChecking which numerical features exist in train:")
for col in numerical_features:
    exists = col in train_df.columns
    print(f"{col}: {exists}")

Train columns:
['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', '

In [18]:
# Check which features are missing in test
print("Features missing in test data:")
missing_in_test = []
for col in numerical_features:
    if col not in test_df.columns:
        missing_in_test.append(col)
        print(f"  {col}")

print(f"\nTotal missing: {len(missing_in_test)}")

# Also check for categorical features
print(f"\nCategorical features in test:")
for col in categorical_features:
    exists = col in test_df.columns
    print(f"  {col}: {exists}")

Features missing in test data:
  number_of_upvotes_of_request_at_retrieval
  number_of_downvotes_of_request_at_retrieval
  request_number_of_comments_at_retrieval
  upvote_ratio
  requester_vote_ratio
  account_age_years
  comments_per_day
  posts_per_day
  subreddit_diversity

Total missing: 9

Categorical features in test:
  requester_user_flair: False


In [19]:
# Revised feature lists based on what's available in both train and test
numerical_features = [
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'title_length',
    'text_length',
    'combined_length'
]

# Categorical features - requester_user_flair is missing in test, so we'll skip it
categorical_features = []  # Empty for now

print(f"Revised numerical features: {len(numerical_features)}")
print(f"Revised categorical features: {len(categorical_features)}")

# Verify all features exist in both train and test
print("\nVerifying features exist in both datasets:")
for col in numerical_features:
    train_exists = col in train_df.columns
    test_exists = col in test_df.columns
    print(f"  {col}: train={train_exists}, test={test_exists}")

Revised numerical features: 12
Revised categorical features: 0

Verifying features exist in both datasets:
  requester_account_age_in_days_at_request: train=True, test=True
  requester_days_since_first_post_on_raop_at_request: train=True, test=True
  requester_number_of_comments_at_request: train=True, test=True
  requester_number_of_posts_at_request: train=True, test=True
  requester_number_of_comments_in_raop_at_request: train=True, test=True
  requester_number_of_posts_on_raop_at_request: train=True, test=True
  requester_number_of_subreddits_at_request: train=True, test=True
  requester_upvotes_minus_downvotes_at_request: train=True, test=True
  requester_upvotes_plus_downvotes_at_request: train=True, test=True
  title_length: train=True, test=True
  text_length: train=True, test=True
  combined_length: train=True, test=True
