# Baseline Experiment: LightGBM with Text and Metadata Features

This notebook implements a baseline model following the seed prompt strategy:
- LightGBM gradient boosting on engineered features
- TF-IDF vectors for text features
- Metadata features from the dataset
- Stratified 5-fold cross-validation
- AUC-ROC evaluation metric

In [3]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG so any np.random-based draws in this notebook
# repeat across runs. The CV splitter further down is seeded separately through
# its own random_state argument.
np.random.seed(42)

## Load Data

In [4]:
# Load training data.
# The encoding is pinned explicitly: without it, open() falls back to the
# locale's default codec, so non-ASCII characters in the request text could be
# decoded differently (or fail) from one machine to another.
# NOTE(review): assumes the file is UTF-8, the usual encoding for JSON — confirm.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# One row per record in the JSON file
train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")
print(f"\nTarget distribution:")
print(train_df['requester_received_pizza'].value_counts(normalize=True))

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

In [5]:
# Load test data.
# The encoding is pinned explicitly: without it, open() falls back to the
# locale's default codec, so non-ASCII characters in the request text could be
# decoded differently (or fail) from one machine to another.
# NOTE(review): assumes the file is UTF-8, the usual encoding for JSON — confirm.
with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# One row per record in the JSON file
test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")
print(f"Columns: {test_df.columns.tolist()}")

Test data shape: (1162, 17)
Columns: ['giver_username_if_known', 'request_id', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'requester_number_of_comments_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_posts_at_request', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_request', 'requester_username', 'unix_timestamp_of_request', 'unix_timestamp_of_request_utc']


## Feature Engineering

In [None]:
def create_metadata_features(df, include_flair=False):
    """Build the numeric metadata feature table for a frame of requests.

    Only ``*_at_request`` columns are read, so the same function can be applied
    to both the training and the test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``requester_*_at_request`` count/karma columns and
        ``unix_timestamp_of_request`` (seconds since the epoch).
    include_flair : bool, default False
        When True and ``requester_user_flair`` is present, encode it as
        None -> 0, 'shroom' -> 1, 'PIF' -> 2. It is off by default because the
        column is missing from the test frame: a model trained on real flair
        values would see a constant 0 at prediction time, and cross-validation
        scores would be optimistic.
        NOTE(review): flair also looks like it may be assigned as a consequence
        of the outcome being predicted (target leakage) — confirm against the
        dataset description before enabling.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same index). ``user_flair_encoded`` is always
        present so the column layout does not depend on ``include_flair``.
    """
    features = pd.DataFrame(index=df.index)

    # Account age features (only at_request available in test)
    features['account_age_at_request'] = df['requester_account_age_in_days_at_request']

    # Karma features (only at_request available in test)
    karma = df['requester_upvotes_minus_downvotes_at_request']
    total_votes = df['requester_upvotes_plus_downvotes_at_request']
    features['karma_at_request'] = karma
    features['total_votes_at_request'] = total_votes

    # Karma ratio; rows with no votes at all get 0 instead of a division by zero
    features['karma_ratio_at_request'] = np.where(total_votes > 0, karma / total_votes, 0)

    # Activity metrics (only at_request available in test)
    features['comments_at_request'] = df['requester_number_of_comments_at_request']
    features['posts_at_request'] = df['requester_number_of_posts_at_request']

    # RAOP-specific activity (only at_request available in test)
    features['raop_comments_at_request'] = df['requester_number_of_comments_in_raop_at_request']
    features['raop_posts_at_request'] = df['requester_number_of_posts_on_raop_at_request']

    # Subreddit diversity
    features['num_subreddits'] = df['requester_number_of_subreddits_at_request']

    # User flair: opt-in only (see docstring); otherwise a constant placeholder
    # so train and test features are built identically.
    if include_flair and 'requester_user_flair' in df.columns:
        flair_map = {'None': 0, 'shroom': 1, 'PIF': 2}
        # Missing / unmapped flair values fall back to 0
        features['user_flair_encoded'] = df['requester_user_flair'].map(flair_map).fillna(0)
    else:
        features['user_flair_encoded'] = 0

    # Time-based features (convert the timestamp once, reuse for both parts)
    request_time = pd.to_datetime(df['unix_timestamp_of_request'], unit='s')
    features['unix_timestamp'] = df['unix_timestamp_of_request']
    features['hour_of_day'] = request_time.dt.hour
    features['day_of_week'] = request_time.dt.dayofweek

    # Activity density (+1 day in the denominator avoids division by zero for new accounts)
    account_age_plus_one = features['account_age_at_request'] + 1
    features['comments_per_day'] = features['comments_at_request'] / account_age_plus_one
    features['posts_per_day'] = features['posts_at_request'] / account_age_plus_one

    return features

## TF-IDF Features

In [None]:
# TF-IDF representations of the request body and the request title.
# Both vectorizers learn their vocabulary from the training frame only and are
# then applied unchanged to the test frame.

# Body text column: the edit-aware variant wins when the training frame has it
if 'request_text_edit_aware' in train_df.columns:
    text_col = 'request_text_edit_aware'
else:
    text_col = 'request_text'

# Request body: unigrams + bigrams, capped at 5000 terms
tfidf_text = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=5000,
    min_df=2,
    max_df=0.95,
)
# Missing texts are treated as empty strings
train_tfidf_text = tfidf_text.fit_transform(train_df[text_col].fillna(''))
test_tfidf_text = tfidf_text.transform(test_df[text_col].fillna(''))

print(f"TF-IDF text features shape: {train_tfidf_text.shape}")

# Request title: same settings, smaller cap of 1000 terms
tfidf_title = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.95,
)
train_tfidf_title = tfidf_title.fit_transform(train_df['request_title'].fillna(''))
test_tfidf_title = tfidf_title.transform(test_df['request_title'].fillna(''))

print(f"TF-IDF title features shape: {train_tfidf_title.shape}")

## Combine All Features

In [None]:
# Combine all feature groups into one sparse design matrix per split.
#
# NOTE(review): train_meta_features / test_meta_features and
# train_text_features / test_text_features are read here but no cell visible
# in this notebook assigns them. They must be created (e.g. the metadata frames
# via create_metadata_features) before this cell or it raises NameError on a
# fresh kernel — confirm where they are supposed to come from.
from scipy.sparse import csr_matrix

# Dense feature frames -> sparse, with missing values replaced by 0
train_meta_sparse = csr_matrix(train_meta_features.fillna(0).values)
test_meta_sparse = csr_matrix(test_meta_features.fillna(0).values)

train_text_sparse = csr_matrix(train_text_features.fillna(0).values)
test_text_sparse = csr_matrix(test_text_features.fillna(0).values)

# Stack all features column-wise. The output format is requested explicitly as
# CSR because the cross-validation loop selects rows with X_train[train_idx];
# without it the result format is left to SciPy and may not support row indexing.
X_train = hstack([
    train_tfidf_text,
    train_tfidf_title,
    train_text_sparse,
    train_meta_sparse
], format='csr')

X_test = hstack([
    test_tfidf_text,
    test_tfidf_title,
    test_text_sparse,
    test_meta_sparse
], format='csr')

# Binary target as a 0/1 integer array
y_train = train_df['requester_received_pizza'].astype(int).values

print(f"Final training features shape: {X_train.shape}")
print(f"Final test features shape: {X_test.shape}")

## Model Training with Cross-Validation

In [None]:
# Stratified K-Fold cross-validation.
# The fold count lives in one place: it drives the splitter, the progress
# messages and the weight each fold's test prediction gets in the average.
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Row selection with an index array needs a row-sliceable sparse format, so
# convert here instead of relying on whatever format the stacking step produced.
X_train_csr = X_train.tocsr()
X_test_csr = X_test.tocsr()

fold_scores = []                       # validation AUC per fold
predictions = np.zeros(len(test_df))   # running mean of per-fold test predictions
feature_importance_list = []           # gain importance vector per fold

# Parameters (conservative for baseline); the same for every fold, so built once
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'num_threads': 4
}

print(f"Training LightGBM model with {N_SPLITS}-fold CV...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_csr, y_train)):
    print(f"\nFold {fold + 1}/{N_SPLITS}")

    X_train_fold = X_train_csr[train_idx]
    y_train_fold = y_train[train_idx]
    X_val_fold = X_train_csr[val_idx]
    y_val_fold = y_train[val_idx]

    # Create LightGBM datasets. Named lgb_* so they do not overwrite the raw
    # train_data list loaded from JSON at the top of the notebook.
    lgb_train = lgb.Dataset(X_train_fold, label=y_train_fold)
    lgb_val = lgb.Dataset(X_val_fold, label=y_val_fold, reference=lgb_train)

    # Train model; stop once validation AUC has not improved for 50 rounds
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_val],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(100)
        ]
    )

    # Predict on validation set using the best iteration found by early stopping
    val_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)
    val_score = roc_auc_score(y_val_fold, val_pred)
    fold_scores.append(val_score)

    print(f"Fold {fold + 1} AUC: {val_score:.4f}")

    # Predict on test set; each fold contributes an equal share of the average
    test_pred = model.predict(X_test_csr, num_iteration=model.best_iteration)
    predictions += test_pred / N_SPLITS

    # Store feature importance
    importance = model.feature_importance(importance_type='gain')
    feature_importance_list.append(importance)

print(f"\n{'='*50}")
print(f"Mean CV AUC: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
print(f"Individual fold scores: {fold_scores}")
print(f"{'='*50}")

## Feature Importance Analysis

In [None]:
# Label every column of the stacked feature matrix, in stacking order:
# body TF-IDF terms, title TF-IDF terms, then the columns of
# train_text_features and train_meta_features.
feature_names = (
    [f'text_tfidf_{i}' for i in range(train_tfidf_text.shape[1])]
    + [f'title_tfidf_{i}' for i in range(train_tfidf_title.shape[1])]
    + train_text_features.columns.tolist()
    + train_meta_features.columns.tolist()
)

# Average the per-fold importance vectors element-wise
mean_importance = np.asarray(feature_importance_list).mean(axis=0)

# Rank features from most to least important
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': mean_importance}
).sort_values('importance', ascending=False)

print("Top 20 most important features:")
print(importance_df.head(20))

## Create Submission

In [None]:
# Build the submission table: request ids from the test frame paired with the
# fold-averaged predictions.
submission = test_df[['request_id']].assign(requester_received_pizza=predictions)

# Quick visual check of the table before writing it
print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Write the CSV without the index column
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

In [None]:
# Compare our submission's column layout with the provided sample file
sample_sub = pd.read_csv('/home/data/sampleSubmission.csv')

print("Sample submission format:")
print(sample_sub.head())

print("\nOur submission format:")
print(submission.head())

# True only when the column names agree, in the same order
columns_match = submission.columns.tolist() == sample_sub.columns.tolist()
print(f"\nFormat matches: {columns_match}")