# Baseline Experiment: LightGBM with Engineered Features

This notebook implements a baseline model using LightGBM on engineered features from the Reddit pizza requests dataset.

## Strategy
- Account for the user flair feature: 'shroom' and 'PIF' have a 100% success rate in training, but flair is absent from the test set, so it needs special handling
- Engineer basic text features (length, word count, etc.)
- Use metadata features (account age, activity metrics)
- Apply stratified k-fold cross-validation
- Handle class imbalance with appropriate techniques

In [3]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# This seeds NumPy's global RNG only; the CV splitter further down is given its
# own random_state. Presumably LightGBM relies on its own seed parameters
# rather than this call - TODO confirm.
np.random.seed(42)

## Load and Explore Data

In [4]:
# Read the raw training records from disk
with open('/home/data/train.json', 'r') as train_file:
    train_data = json.load(train_file)

# Tabular view of the parsed JSON
train_df = pd.DataFrame(train_data)

# Quick sanity summary: dimensions and available columns
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")

# Class balance of the target, expressed as fractions rather than counts
print(f"\nTarget distribution:")
target_share = train_df['requester_received_pizza'].value_counts(normalize=True)
print(target_share)

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

In [5]:
# Read the raw test records from disk
with open('/home/data/test.json', 'r') as test_file:
    test_data = json.load(test_file)

# Tabular view of the parsed JSON
test_df = pd.DataFrame(test_data)

# Dimensions and columns, for comparison against the training frame
print(f"Test data shape: {test_df.shape}")
print(f"Columns: {test_df.columns.tolist()}")

Test data shape: (1162, 17)
Columns: ['giver_username_if_known', 'request_id', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'requester_number_of_comments_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_posts_at_request', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_request', 'requester_username', 'unix_timestamp_of_request', 'unix_timestamp_of_request_utc']


## Feature Engineering

In [7]:
def engineer_features(df):
    """Build the hand-crafted feature table for a frame of pizza requests.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw request data. Must contain 'request_title', the '*_at_request'
        requester columns read below and 'unix_timestamp_of_request'.
        'requester_user_flair' and 'request_text_edit_aware' are optional.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index) with flair, text-length,
        account-activity, vote and time features.

    Side effects
    ------------
    Adds a 'combined_text' column (title + ' ' + body) to ``df`` in place.
    The TF-IDF cell later in this notebook reads that column from the raw
    frames, so the in-place write is kept deliberately.
    """
    features = pd.DataFrame(index=df.index)

    # User flair is only present in the training data; when the column is
    # missing (test set) the flair features fall back to constant placeholders
    # so both frames end up with the same columns.
    if 'requester_user_flair' in df.columns:
        features['user_flair'] = df['requester_user_flair'].fillna('None')
        features['has_flair'] = (features['user_flair'] != 'None').astype(int)
        features['is_shroom'] = (features['user_flair'] == 'shroom').astype(int)
        features['is_pif'] = (features['user_flair'] == 'PIF').astype(int)
    else:
        features['user_flair'] = 'None'
        features['has_flair'] = 0
        features['is_shroom'] = 0
        features['is_pif'] = 0

    # Prefer the edit-aware body when available to avoid leakage from edits
    text_col = 'request_text_edit_aware' if 'request_text_edit_aware' in df.columns else 'request_text'

    # Fill missing titles once so that every text feature treats a missing
    # title as the empty string (previously title_length became NaN while
    # combined_text used '').
    title = df['request_title'].fillna('')
    df['combined_text'] = title + ' ' + df[text_col].fillna('')

    # Basic text length features
    features['text_length'] = df['combined_text'].str.len()
    features['word_count'] = df['combined_text'].str.split().str.len()
    features['title_length'] = title.str.len()

    # Account activity features (at_request versions; at_retrieval is not in test)
    features['account_age_days'] = df['requester_account_age_in_days_at_request']
    features['num_comments'] = df['requester_number_of_comments_at_request']
    features['num_posts'] = df['requester_number_of_posts_at_request']
    features['num_subreddits'] = df['requester_number_of_subreddits_at_request']

    # Activity ratios; the +1 keeps the denominator non-zero for users with no posts
    features['comments_per_post'] = features['num_comments'] / (features['num_posts'] + 1)
    features['subreddits_per_post'] = features['num_subreddits'] / (features['num_posts'] + 1)

    # Vote features (at_request versions)
    features['upvotes_minus_downvotes'] = df['requester_upvotes_minus_downvotes_at_request']
    features['upvotes_plus_downvotes'] = df['requester_upvotes_plus_downvotes_at_request']

    # Time features: parse the epoch-seconds timestamp once and reuse it
    request_time = pd.to_datetime(df['unix_timestamp_of_request'], unit='s')
    features['timestamp'] = df['unix_timestamp_of_request']
    features['hour_of_day'] = request_time.dt.hour
    features['day_of_week'] = request_time.dt.dayofweek

    return features

# Run the same feature builder over both splits (train first, then test)
train_features, test_features = [
    engineer_features(frame) for frame in (train_df, test_df)
]

# Report the size of the training feature table and show its first rows
print(f"Engineered features shape: {train_features.shape}")
print(f"Sample features:\n{train_features.head()}")

Engineered features shape: (2878, 18)
Sample features:
  user_flair  has_flair  is_shroom  is_pif  text_length  word_count  \
0       None          0          0       0          280          54   
1       None          0          0       0          292          50   
2       None          0          0       0          780         161   
3     shroom          1          1       0         1068         208   
4       None          0          0       0          197          37   

   title_length  account_age_days  num_comments  num_posts  num_subreddits  \
0            65          0.000000             0          0               1   
1           122         99.526863            40         11               7   
2            85          0.000000             0          0               1   
3            39        491.088264            46          1               5   
4            33        369.417558           195         12              29   

   comments_per_post  subreddits_per_post  upvote

In [None]:
# Handle categorical / flair features
# Show which flair values occur in training (informational only). The guard
# keeps this cell re-runnable after the column has been dropped below.
print("User flair distribution in training:")
if 'user_flair' in train_features.columns:
    print(train_features['user_flair'].value_counts())

# 'requester_user_flair' is not among the test columns, so every flair-derived
# feature is a constant placeholder at prediction time. Training on them lets
# the model lean on a signal that does not exist for the test set and inflates
# the cross-validated AUC. The 100% success rate noted for 'shroom'/'PIF'
# suggests the flair encodes the outcome itself (presumably assigned after a
# pizza was received - confirm against the dataset description).
# They are therefore excluded from the model inputs.
flair_features = ['has_flair', 'is_shroom', 'is_pif']
leaky_columns = ['user_flair'] + flair_features

# errors='ignore' makes the drop idempotent, so re-running the cell is safe
train_features = train_features.drop(columns=leaky_columns, errors='ignore')
test_features = test_features.drop(columns=leaky_columns, errors='ignore')

# Ensure all features are numeric and handle any remaining NaNs
train_features = train_features.fillna(0)
test_features = test_features.fillna(0)

# Both splits must expose identical columns in identical order before stacking
assert list(train_features.columns) == list(test_features.columns), \
    "train/test engineered feature columns differ"

print(f"Final feature shape: {train_features.shape}")
print(f"Feature types:")
print(train_features.dtypes.value_counts())

## TF-IDF Features for Text

In [None]:
# TF-IDF configuration, kept small on purpose for a baseline model
tfidf_settings = dict(
    max_features=1000,      # cap the vocabulary size
    stop_words='english',
    ngram_range=(1, 2),     # unigrams and bigrams
    min_df=5,               # skip very rare terms
    max_df=0.95,            # skip terms present in almost every document
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# 'combined_text' is the title+body column that engineer_features adds to the
# raw frames. Vocabulary and IDF weights are learned on training text only.
train_text = train_df['combined_text'].fillna('')
tfidf_train = vectorizer.fit_transform(train_text)

# Test text is projected onto the vocabulary fitted above
test_text = test_df['combined_text'].fillna('')
tfidf_test = vectorizer.transform(test_text)

print(f"TF-IDF features shape: {tfidf_train.shape}")

## Combine Features

In [None]:
# Convert engineered features to sparse matrix format
from scipy.sparse import csr_matrix

# NaNs were already replaced with 0 when the feature tables were finalised,
# so the frames can be converted directly.
train_engineered = csr_matrix(train_features.values)
test_engineered = csr_matrix(test_features.values)

# Combine TF-IDF and engineered features (TF-IDF columns first).
# The output format is requested explicitly: the CV loop slices rows with
# X_train[train_idx], which needs a row-indexable format such as CSR, and
# hstack's default output format is not guaranteed.
X_train = hstack([tfidf_train, train_engineered], format='csr')
X_test = hstack([tfidf_test, test_engineered], format='csr')

# Train and test must share the same feature layout for prediction to be valid
assert X_train.shape[1] == X_test.shape[1], "train/test feature counts differ"

y_train = train_df['requester_received_pizza'].values

print(f"Final training matrix shape: {X_train.shape}")
print(f"Final test matrix shape: {X_test.shape}")

## Model Training with Cross-Validation

In [None]:
# Cross-validation configuration: stratified, shuffled, fixed seed
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Accumulators: out-of-fold scores for every training row, fold-averaged test
# scores, and the validation AUC of each fold
oof_predictions = np.zeros(len(train_df))
test_predictions = np.zeros(len(test_df))
cv_scores = []

# LightGBM settings shared by all folds; the class weight is added per fold
shared_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'num_threads': 4
}

print(f"Training LightGBM model with {n_splits}-fold stratified CV...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"\nFold {fold + 1}/{n_splits}")

    # Rows used for fitting vs. rows held out for this fold
    X_tr, y_tr = X_train[train_idx], y_train[train_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]

    # Class-imbalance weight: negatives-to-positives ratio of the fitting rows
    pos_rate = y_tr.mean()
    params = {**shared_params, 'scale_pos_weight': (1 - pos_rate) / pos_rate}

    # Fit with the held-out fold as the only monitored validation set
    model = lgb.train(
        params,
        lgb.Dataset(X_tr, label=y_tr),
        num_boost_round=1000,
        valid_sets=[lgb.Dataset(X_val, label=y_val)],
        valid_names=['val'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
    )

    # Score the held-out rows and the test set at the best iteration found
    best_iter = model.best_iteration
    val_pred = model.predict(X_val, num_iteration=best_iter)
    oof_predictions[val_idx] = val_pred
    test_predictions += model.predict(X_test, num_iteration=best_iter) / n_splits

    # Per-fold validation AUC
    fold_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(fold_score)

    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

# AUC over all out-of-fold predictions, plus mean/std of the per-fold AUCs
overall_score = roc_auc_score(y_train, oof_predictions)
print(f"\nOverall CV AUC: {overall_score:.4f}")
print(f"Mean CV AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

## Feature Importance Analysis

In [None]:
# Importance is read from `model`, i.e. the booster left over from the last CV fold.
# Names follow the column order of the stacked matrix: TF-IDF terms first,
# engineered columns after.
feature_names = [*vectorizer.get_feature_names_out(), *train_features.columns]
gain_importance = model.feature_importance(importance_type='gain')

importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': gain_importance}
).sort_values('importance', ascending=False)

print("Top 20 most important features:")
print(importance_df.head(20))

## Create Submission

In [None]:
# Assemble the submission: one row per test request with its averaged score,
# bounded to the [0, 1] probability range
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': np.clip(test_predictions, 0, 1)
})

# Preview and basic statistics of what is about to be written
print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print(f"Prediction distribution:")
print(submission['requester_received_pizza'].describe())

# Write the CSV without the index column
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)
print("\nSubmission saved to /home/submission/submission.csv")