# Baseline Experiment - Random Acts of Pizza

## Strategy
- Combine text features (title + text) with numerical/categorical features
- Use TF-IDF for text representation
- Use LightGBM for classification
- 5-fold stratified cross-validation
- Evaluate with AUC-ROC

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so nothing is exempt).
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# This seeds NumPy's global RNG only; the CV splitter and LightGBM receive
# their own explicit seeds where they are configured further down.
np.random.seed(42)

In [2]:
# Load training data
print("Loading training data...")
# Pin the encoding: JSON text is UTF-8, and open() would otherwise decode
# with whatever the current locale happens to be.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# The feature-engineering cells below operate on `train_df`; build it here
# from the raw records, mirroring how `test_df` is built from `test_data`.
train_df = pd.DataFrame(train_data)

print(f"Number of training samples: {len(train_data)}")
print(f"Keys in first sample: {list(train_data[0].keys())}")

Loading training data...
Number of training samples: 2878
Keys in first sample: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddi

In [3]:
from datetime import datetime, timezone

# Keyword lists for the crude sentiment counters. Each keyword is matched as
# a case-insensitive *substring* of the request text and contributes at most
# 1, so 'thanks' also satisfies 'thank' and 'kind' also matches 'kinda'.
_POSITIVE_WORDS = (
    'thank', 'thanks', 'appreciate', 'grateful', 'please',
    'kind', 'generous', 'awesome', 'amazing', 'wonderful',
)
_NEGATIVE_WORDS = (
    'desperate', 'starving', 'broke', 'hungry', 'poor',
    'suffering', 'depressed', 'suicidal', 'dying',
)


def _numeric_field(item, key):
    """Return ``item[key]``, treating a missing key or a null value as 0."""
    value = item.get(key)
    return 0 if value is None else value


def extract_features(data):
    """Extract comprehensive features from pizza request data.

    Args:
        data: Iterable of request dicts (one per request, as loaded from the
            JSON file). Missing keys and null values are tolerated and are
            treated as empty text / 0.

    Returns:
        A list with one ``dict`` of feature name -> number per input item,
        in input order. Every dict has the same keys in the same order, so
        the list can be passed directly to ``pd.DataFrame``.
    """
    features = []

    for item in data:
        feat = {}

        # Get text - use request_text_edit_aware first, fall back to request_text.
        # `or ''` normalises explicit nulls so the string operations below are safe.
        text = item.get('request_text_edit_aware') or item.get('request_text') or ''
        title = item.get('request_title') or ''
        full_text = f"{title} {text}" if title and text else (title or text)

        # Text features
        feat['text_length'] = len(full_text)
        feat['word_count'] = len(full_text.split())
        feat['char_count'] = len(full_text.replace(' ', ''))

        # Sentiment indicators: number of distinct keywords present in the text
        text_lower = full_text.lower()
        feat['positive_word_count'] = sum(1 for word in _POSITIVE_WORDS if word in text_lower)
        feat['negative_word_count'] = sum(1 for word in _NEGATIVE_WORDS if word in text_lower)
        feat['sentiment_score'] = feat['positive_word_count'] - feat['negative_word_count']

        # Punctuation and style
        feat['exclamation_count'] = full_text.count('!')
        feat['question_count'] = full_text.count('?')
        feat['caps_count'] = sum(1 for c in full_text if c.isupper())
        # max(..., 1) guards the division for empty text
        feat['caps_ratio'] = feat['caps_count'] / max(feat['char_count'], 1)

        # Requester account features
        feat['account_age_days'] = _numeric_field(item, 'requester_account_age_in_days_at_request')
        feat['account_age_years'] = feat['account_age_days'] / 365.25
        feat['is_new_account'] = 1 if feat['account_age_days'] < 30 else 0

        # Posting history
        feat['total_posts'] = _numeric_field(item, 'requester_number_of_posts_at_request')
        feat['total_comments'] = _numeric_field(item, 'requester_number_of_comments_at_request')
        feat['raop_posts'] = _numeric_field(item, 'requester_number_of_posts_on_raop_at_request')
        feat['raop_comments'] = _numeric_field(item, 'requester_number_of_comments_in_raop_at_request')

        # Karma features
        feat['upvotes_minus_downvotes'] = _numeric_field(item, 'requester_upvotes_minus_downvotes_at_request')
        feat['upvotes_plus_downvotes'] = _numeric_field(item, 'requester_upvotes_plus_downvotes_at_request')
        feat['karma_ratio'] = feat['upvotes_minus_downvotes'] / max(feat['upvotes_plus_downvotes'], 1)

        # Subreddit diversity
        subreddits = item.get('requester_subreddits_at_request') or []
        feat['num_subreddits'] = len(subreddits)
        feat['has_diverse_subreddits'] = 1 if feat['num_subreddits'] > 10 else 0

        # Time features. The timestamp field is a UTC epoch, so decode it in
        # UTC explicitly; a naive fromtimestamp() would shift hour/weekday by
        # the local timezone of whichever machine runs this.
        timestamp = _numeric_field(item, 'unix_timestamp_of_request_utc')
        if timestamp:
            dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
            feat['hour_of_day'] = dt.hour
            feat['day_of_week'] = dt.weekday()
            feat['is_weekend'] = 1 if dt.weekday() >= 5 else 0
            feat['is_evening'] = 1 if 18 <= dt.hour <= 23 else 0
        else:
            # Missing (or zero) timestamp: fall back to all-zero time features
            feat['hour_of_day'] = 0
            feat['day_of_week'] = 0
            feat['is_weekend'] = 0
            feat['is_evening'] = 0

        # Request metadata
        feat['post_was_edited'] = 1 if item.get('post_was_edited', False) else 0
        feat['title_length'] = len(title)
        feat['title_word_count'] = len(title.split())

        # Interaction features (denominators clamped to 1 to avoid division by zero)
        feat['comments_per_post'] = feat['total_comments'] / max(feat['total_posts'], 1)
        feat['karma_per_comment'] = feat['upvotes_minus_downvotes'] / max(feat['total_comments'], 1)
        feat['karma_per_post'] = feat['upvotes_minus_downvotes'] / max(feat['total_posts'], 1)

        # Add all features to list
        features.append(feat)

    return features

Training data shape: (2878, 32)

Target distribution:
requester_received_pizza
False    2163
True      715
Name: count, dtype: int64
Positive rate: 0.248


In [4]:
# Basic feature engineering
print("Creating features...")

# Null-safe views of the two raw text columns (missing values become '')
title_col = train_df['request_title'].fillna('')
body_col = train_df['request_text'].fillna('')

# Text features - title and body joined by a single space
train_df['combined_text'] = title_col + ' ' + body_col

# Length features (character counts)
train_df['title_length'] = title_col.str.len()
train_df['text_length'] = body_col.str.len()
train_df['combined_length'] = train_df['combined_text'].str.len()

# Upvote/downvote ratios; the +1 keeps the denominator non-zero when a
# request has no votes.
# NOTE(review): these inputs are "*_at_retrieval" values - presumably recorded
# after the request was posted; confirm they are legitimate at prediction time.
request_ups = train_df['number_of_upvotes_of_request_at_retrieval']
request_downs = train_df['number_of_downvotes_of_request_at_retrieval']
train_df['upvote_ratio'] = request_ups / (request_ups + request_downs + 1)

# NOTE(review): this is total / (total + 1), not a ratio of up- to down-votes;
# confirm that is what was intended.
requester_votes = train_df['requester_upvotes_plus_downvotes_at_request']
train_df['requester_vote_ratio'] = requester_votes / (requester_votes + 1)

# Account age features (convert to years)
account_age_days = train_df['requester_account_age_in_days_at_request']
train_df['account_age_years'] = account_age_days / 365.25

# Activity features: counts per day of account age (+1 day avoids dividing by zero)
days_active = account_age_days + 1
train_df['comments_per_day'] = train_df['requester_number_of_comments_at_request'] / days_active
train_df['posts_per_day'] = train_df['requester_number_of_posts_at_request'] / days_active

# Subreddit diversity: subreddits per post (+1 avoids dividing by zero)
post_count = train_df['requester_number_of_posts_at_request']
train_df['subreddit_diversity'] = train_df['requester_number_of_subreddits_at_request'] / (post_count + 1)

print("Feature engineering completed!")

Creating features...
Feature engineering completed!


In [5]:
# Prepare numerical and categorical features
# Columns fed to the model as-is: raw request/requester counters first,
# then the engineered columns created in the previous cell.
numerical_features = [
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval',
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'title_length',
    'text_length',
    'combined_length',
    'upvote_ratio',
    'requester_vote_ratio',
    'account_age_years',
    'comments_per_day',
    'posts_per_day',
    'subreddit_diversity',
]

# Handle categorical features
categorical_features = ['requester_user_flair']

# Fill missing values: every numerical column gets 0 in place of NaN
train_df[numerical_features] = train_df[numerical_features].fillna(0)

# Encode categorical features: missing flair becomes the literal label 'None',
# then each distinct label is mapped to an integer code. The fitted encoder is
# kept so the same mapping can be reused on the test set.
flair_encoder = LabelEncoder()
flair_labels = train_df['requester_user_flair'].fillna('None')
train_df['requester_user_flair_encoded'] = flair_encoder.fit(flair_labels).transform(flair_labels)

print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

Numerical features: 21
Categorical features: 1


In [6]:
# Prepare text features with TF-IDF
print("Vectorizing text...")

# Vectorizer settings, kept in one place. The vocabulary is capped to keep
# memory and training time manageable.
tfidf_settings = {
    'max_features': 5000,     # Limit features for speed
    'stop_words': 'english',
    'ngram_range': (1, 2),    # Unigrams and bigrams
    'min_df': 2,              # Drop terms that occur in fewer than 2 documents
    'max_df': 0.95,           # Drop terms that occur in more than 95% of documents
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary on the training text and transform it in one pass;
# the fitted vectorizer is reused later for the test text.
text_tfidf = vectorizer.fit_transform(train_df['combined_text'])
print(f"TF-IDF shape: {text_tfidf.shape}")

Vectorizing text...


TF-IDF shape: (2878, 5000)


In [7]:
# Combine all features
from scipy.sparse import hstack, csr_matrix

# Dense block: the numerical columns followed by the integer-encoded flair
dense_columns = numerical_features + ['requester_user_flair_encoded']
numerical_matrix = train_df[dense_columns].to_numpy()

# Place the TF-IDF columns first and the dense columns after them, then store
# the result as CSR so the CV loop can slice rows efficiently.
X = csr_matrix(hstack([text_tfidf, numerical_matrix]))
y = train_df['requester_received_pizza'].to_numpy()

print(f"Final feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")

Final feature matrix shape: (2878, 5022)
Target shape: (2878,)


In [8]:
# Cross-validation setup
print("Setting up cross-validation...")
n_folds = 5

# Stratified, shuffled splitter with a fixed seed so folds are reproducible
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Out-of-fold prediction buffer (one slot per training row) and per-fold AUCs
oof_predictions = np.zeros(train_df.shape[0])
fold_scores = []

print(f"Starting {n_folds}-fold cross-validation...")

Setting up cross-validation...
Starting 5-fold cross-validation...


In [None]:
# Cross-validation loop

# Parameters (simplified for baseline). Defined once, outside the loop: they
# do not change between folds, and the final-model cell reuses `params`.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 42
}

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}/{n_folds}")

    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Create LightGBM datasets. Deliberately NOT named `train_data`: that name
    # holds the raw JSON records loaded at the top of the notebook.
    fold_train_set = lgb.Dataset(X_train, label=y_train)
    fold_val_set = lgb.Dataset(X_val, label=y_val, reference=fold_train_set)

    # Train model; stop once validation AUC has not improved for 50 rounds
    model = lgb.train(
        params,
        fold_train_set,
        num_boost_round=500,
        valid_sets=[fold_val_set],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(0)
        ]
    )

    # Predict on validation set with the best iteration found by early stopping
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    oof_predictions[val_idx] = val_pred

    # Calculate AUC
    # NOTE(review): the same fold drives early stopping and is scored here, so
    # per-fold AUC is likely somewhat optimistic.
    fold_auc = roc_auc_score(y_val, val_pred)
    fold_scores.append(fold_auc)

    print(f"Fold {fold + 1} AUC: {fold_auc:.4f}")

# Overall CV score (mean and standard deviation of the per-fold AUCs)
mean_auc = np.mean(fold_scores)
std_auc = np.std(fold_scores)
# \u00b1 is the plus-minus sign, written as an escape so it survives re-encoding
print(f"\nCV Score: {mean_auc:.4f} \u00b1 {std_auc:.4f}")

# OOF AUC: a single AUC over all out-of-fold predictions pooled together
oof_auc = roc_auc_score(y, oof_predictions)
print(f"OOF AUC: {oof_auc:.4f}")

In [None]:
# Load test data
print("Loading test data...")
# Pin the encoding: JSON text is UTF-8, and open() would otherwise decode
# with whatever the current locale happens to be.
with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")

# Apply same feature engineering to test data. These definitions must stay
# identical to the ones used for train_df, otherwise the model sees skewed inputs.
# NOTE(review): several inputs are "*_at_retrieval" fields and user flair -
# confirm test.json actually contains them; a missing column raises KeyError here.
print("Engineering test features...")

# Text features
test_df['combined_text'] = test_df['request_title'].fillna('') + ' ' + test_df['request_text'].fillna('')

# Length features
test_df['title_length'] = test_df['request_title'].fillna('').str.len()
test_df['text_length'] = test_df['request_text'].fillna('').str.len()
test_df['combined_length'] = test_df['combined_text'].str.len()

# Ratios
test_df['upvote_ratio'] = test_df['number_of_upvotes_of_request_at_retrieval'] / (test_df['number_of_upvotes_of_request_at_retrieval'] + test_df['number_of_downvotes_of_request_at_retrieval'] + 1)
test_df['requester_vote_ratio'] = test_df['requester_upvotes_plus_downvotes_at_request'] / (test_df['requester_upvotes_plus_downvotes_at_request'] + 1)

# Account age
test_df['account_age_years'] = test_df['requester_account_age_in_days_at_request'] / 365.25

# Activity features
test_df['comments_per_day'] = test_df['requester_number_of_comments_at_request'] / (test_df['requester_account_age_in_days_at_request'] + 1)
test_df['posts_per_day'] = test_df['requester_number_of_posts_at_request'] / (test_df['requester_account_age_in_days_at_request'] + 1)

# Subreddit diversity
test_df['subreddit_diversity'] = test_df['requester_number_of_subreddits_at_request'] / (test_df['requester_number_of_posts_at_request'] + 1)

# Fill missing values
for col in numerical_features:
    test_df[col] = test_df[col].fillna(0)

# Encode categorical features with the mapping learned on the training set.
# LabelEncoder.transform raises ValueError on a label it was not fitted on, so
# map through the fitted classes instead: known labels get exactly the code
# transform() would give, unseen labels fall back to the code for 'None'
# (missing flair), or -1 if 'None' never occurred in training.
flair_codes = {label: code for code, label in enumerate(flair_encoder.classes_)}
unknown_flair_code = flair_codes.get('None', -1)
test_flair = test_df['requester_user_flair'].fillna('None')
unseen_flair = ~test_flair.isin(flair_encoder.classes_)
if unseen_flair.any():
    print(f"Warning: {int(unseen_flair.sum())} test rows have a flair value not seen in training")
test_df['requester_user_flair_encoded'] = (
    test_flair.map(flair_codes).fillna(unknown_flair_code).astype(int)
)

print("Test feature engineering completed!")

In [None]:
# Transform test text with TF-IDF
print("Transforming test text...")
# transform() only: reuse the vocabulary and IDF weights fitted on the training text
test_text_tfidf = vectorizer.transform(test_df['combined_text'])

# Prepare test numerical features (same columns, same order as for training)
test_numerical_matrix = test_df[numerical_features + ['requester_user_flair_encoded']].values

# Combine test features. Request CSR output so X_test has the same storage
# format as the training matrix X instead of hstack's default COO.
X_test = hstack([test_text_tfidf, test_numerical_matrix], format='csr')
print(f"Test feature matrix shape: {X_test.shape}")

In [None]:
# Train final model on full training data and predict on test set
import os

print("Training final model on full training data...")

final_train_data = lgb.Dataset(X, label=y)

# No validation set is passed: scoring the training data itself every round
# gave no usable signal (logging was disabled and there is no early stopping)
# and does not affect the fitted model.
# NOTE(review): this always runs the full 500 rounds, whereas the CV loop uses
# early stopping - consider training for the CV best-iteration count instead.
final_model = lgb.train(
    params,
    final_train_data,
    num_boost_round=500,
)

print("Making predictions on test set...")
test_predictions = final_model.predict(X_test)

# Create submission file: one predicted probability per test request
submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

submission_path = '/home/submission/submission.csv'
# Make sure the target directory exists so to_csv cannot fail after training
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_df.to_csv(submission_path, index=False)

print(f"Submission file saved to: {submission_path}")
print(f"Submission shape: {submission_df.shape}")
print(f"Prediction distribution:")
print(submission_df['requester_received_pizza'].describe())