# Baseline Model: LightGBM with Tabular Features

This notebook implements a baseline model following the seed prompt strategy:
- Uses only safe features available in both train and test sets
- Handles class imbalance with appropriate weighting
- Uses stratified k-fold validation
- Generates predictions in the correct submission format

In [1]:
import json
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
# Silence everything issued through the `warnings` module for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and runtime warnings are hidden too.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
# The fold splitter further down is given its own random_state; the LightGBM params
# dict does not set an explicit seed.
np.random.seed(42)

## Load and Prepare Data

In [2]:
# Read the raw training records from the JSON file on disk
train_path = '/home/data/train.json'
with open(train_path, mode='r') as train_file:
    train_data = json.load(train_file)

# Wrap the parsed JSON in a DataFrame and report its basic shape
train_df = pd.DataFrame(train_data)
target_share = train_df['requester_received_pizza'].value_counts(normalize=True)

print(f"Training data shape: {train_df.shape}")
print(f"Columns: {list(train_df.columns)}")
print("\nTarget distribution:")
print(target_share)

Training data shape: (2878, 32)
Columns: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requester_upvotes_minu

## Select Safe Features Only

Following the seed prompt strategy, we only use features available in both train and test sets to avoid leakage.

In [3]:
# Columns this notebook treats as safe, i.e. available in both train and test.
# The order of this list fixes the column order of `train_safe`.
safe_features = [
    # free text (edit-aware body is used to avoid leakage)
    'request_text_edit_aware',
    'request_title',
    # requester statistics (`*_at_request` columns)
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    # subreddit membership and usernames
    'requester_subreddits_at_request',
    'requester_username',
    'giver_username_if_known',
    # request timestamps
    'unix_timestamp_of_request',
    'unix_timestamp_of_request_utc',
    # identifier only, not a model input
    'request_id',
]

# Keep the safe columns plus the target, detached from train_df
target_col = 'requester_received_pizza'
train_safe = train_df.loc[:, safe_features + [target_col]].copy()

reported_features = [name for name in safe_features if name != 'request_id']
print(f"Safe features shape: {train_safe.shape}")
print(f"Safe features used: {reported_features}")

Safe features shape: (2878, 18)
Safe features used: ['request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'requester_number_of_comments_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_posts_at_request', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'requester_upvotes_minus_downvotes_at_request', 'requester_upvotes_plus_downvotes_at_request', 'requester_subreddits_at_request', 'requester_username', 'giver_username_if_known', 'unix_timestamp_of_request', 'unix_timestamp_of_request_utc']


## Feature Engineering

Create simple features from text, timestamps, and requester activity counts (ratios and interactions).

In [4]:
# --- Text size features -------------------------------------------------
# Missing titles/bodies are treated as empty strings before measuring them.
title_text = train_safe['request_title'].fillna('')
body_text = train_safe['request_text_edit_aware'].fillna('')

train_safe['title_length'] = title_text.str.len()
train_safe['text_length'] = body_text.str.len()
train_safe['text_word_count'] = body_text.str.split().str.len()

# --- Calendar features --------------------------------------------------
# The unix timestamp (seconds) is converted once; hour and weekday derive from it.
request_time = pd.to_datetime(train_safe['unix_timestamp_of_request'], unit='s')
train_safe['request_timestamp'] = request_time
train_safe['request_hour'] = request_time.dt.hour
train_safe['request_dayofweek'] = request_time.dt.dayofweek

# --- Activity ratios ----------------------------------------------------
# Denominators are shifted by +1 to avoid dividing by a zero count.
raop_posts = train_safe['requester_number_of_posts_on_raop_at_request']
all_posts = train_safe['requester_number_of_posts_at_request']
raop_comments = train_safe['requester_number_of_comments_in_raop_at_request']
all_comments = train_safe['requester_number_of_comments_at_request']
votes_sum = train_safe['requester_upvotes_plus_downvotes_at_request']
votes_diff = train_safe['requester_upvotes_minus_downvotes_at_request']

train_safe['raop_activity_ratio'] = raop_posts / (all_posts + 1)
train_safe['raop_comment_ratio'] = raop_comments / (all_comments + 1)
# NOTE(review): votes_diff can be negative, so the denominator is zero when it equals -1
# (result is inf or NaN); the later fillna(0) replaces NaN but not inf.
train_safe['upvote_downvote_ratio'] = votes_sum / (votes_diff + 1)

# --- Interactions with text length --------------------------------------
body_length = train_safe['text_length']
train_safe['text_length_x_raop_posts'] = body_length * raop_posts
train_safe['text_length_x_account_age'] = body_length * train_safe['requester_account_age_in_days_at_request']

print("Feature engineering completed")
print(f"New shape: {train_safe.shape}")

Feature engineering completed
New shape: (2878, 29)


## Prepare Features for Modeling

In [None]:
# Columns that must not reach the model: the target, the ID, raw text,
# username / subreddit columns, and the intermediate datetime column.
exclude_cols = [
    'requester_received_pizza',
    'request_id',
    'request_text_edit_aware',
    'request_title',
    'requester_subreddits_at_request',
    'requester_username',
    'giver_username_if_known',
    'request_timestamp',
]

# Everything else, in the frame's existing column order, is a model feature
all_columns = train_safe.columns
feature_cols = all_columns[~all_columns.isin(exclude_cols)].tolist()
print(f"Number of features: {len(feature_cols)}")
print(f"Features: {feature_cols}")

# Feature matrix with missing values replaced by 0, and the target vector
X = train_safe[feature_cols].fillna(0)
y = train_safe['requester_received_pizza'].copy()

print(f"X shape: {X.shape}, y shape: {y.shape}")
print(f"Class distribution: {y.value_counts(normalize=True)}")

## Stratified K-Fold Validation

Use stratified k-fold to maintain class distribution across folds.

In [None]:
# Shuffled, stratified splitter with a fixed random_state
n_splits = 5
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=n_splits)

# Negative-to-positive count ratio, passed to LightGBM as scale_pos_weight
negative_count = (y == 0).sum()
positive_count = (y == 1).sum()
scale_pos_weight = negative_count / positive_count
print(f"Scale pos weight: {scale_pos_weight:.2f} (ratio of negative to positive samples)")

# Containers filled by the training loop: one OOF prediction per row, one AUC per fold
oof_predictions = np.zeros(X.shape[0])
fold_scores = list()

## Train LightGBM Model

In [None]:
# Train one LightGBM model per stratified fold and score it on the held-out rows.
# Reads X, y, skf, n_splits and scale_pos_weight; fills oof_predictions and fold_scores.
# After the loop, `model` is the booster trained on the LAST fold only.

# Hyper-parameters are the same for every fold, so the dict is built once.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'scale_pos_weight': scale_pos_weight
}

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}/{n_splits}")

    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Create LightGBM datasets. Named lgb_train / lgb_val so the raw JSON
    # `train_data` loaded at the top of the notebook is not overwritten.
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val)

    # Train with early stopping on the validation fold (patience of 50 rounds),
    # logging the validation metric every 100 rounds.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_val],
        valid_names=['val'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(100)
        ]
    )

    # Predict on the validation fold using the best iteration found by early stopping
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    oof_predictions[val_idx] = val_pred

    # Per-fold AUC. The same fold also drove early stopping, so this score
    # is measured on the data that selected the iteration count.
    fold_auc = roc_auc_score(y_val, val_pred)
    fold_scores.append(fold_auc)
    print(f"Fold {fold + 1} AUC: {fold_auc:.4f}")

# Overall OOF score across all rows, plus mean and spread of the per-fold scores
overall_auc = roc_auc_score(y, oof_predictions)
print(f"\nOverall OOF AUC: {overall_auc:.4f}")
print(f"Mean CV AUC: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")

## Feature Importance Analysis

In [None]:
# Gain-based importance per feature.
# `model` is whatever the training loop left bound, i.e. the booster from its final fold.
gain_values = model.feature_importance(importance_type='gain')
importance_table = pd.DataFrame({'feature': feature_cols, 'importance': gain_values})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("Top 15 Most Important Features:")
print(feature_importance.head(15))

## Prepare Test Data and Generate Predictions

In [None]:
# Read the raw test records from the JSON file on disk
test_path = '/home/data/test.json'
with open(test_path, mode='r') as test_file:
    test_data = json.load(test_file)

test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")

# The steps below repeat the training feature engineering on test_df.

# --- Text size features -------------------------------------------------
test_title = test_df['request_title'].fillna('')
test_body = test_df['request_text_edit_aware'].fillna('')

test_df['title_length'] = test_title.str.len()
test_df['text_length'] = test_body.str.len()
test_df['text_word_count'] = test_body.str.split().str.len()

# --- Calendar features --------------------------------------------------
test_request_time = pd.to_datetime(test_df['unix_timestamp_of_request'], unit='s')
test_df['request_timestamp'] = test_request_time
test_df['request_hour'] = test_request_time.dt.hour
test_df['request_dayofweek'] = test_request_time.dt.dayofweek

# --- Activity ratios ----------------------------------------------------
test_raop_posts = test_df['requester_number_of_posts_on_raop_at_request']
test_all_posts = test_df['requester_number_of_posts_at_request']
test_raop_comments = test_df['requester_number_of_comments_in_raop_at_request']
test_all_comments = test_df['requester_number_of_comments_at_request']
test_votes_sum = test_df['requester_upvotes_plus_downvotes_at_request']
test_votes_diff = test_df['requester_upvotes_minus_downvotes_at_request']

test_df['raop_activity_ratio'] = test_raop_posts / (test_all_posts + 1)
test_df['raop_comment_ratio'] = test_raop_comments / (test_all_comments + 1)
# NOTE(review): the denominator is zero when test_votes_diff equals -1 (inf or NaN result)
test_df['upvote_downvote_ratio'] = test_votes_sum / (test_votes_diff + 1)

# --- Interactions with text length --------------------------------------
test_body_length = test_df['text_length']
test_df['text_length_x_raop_posts'] = test_body_length * test_raop_posts
test_df['text_length_x_account_age'] = test_body_length * test_df['requester_account_age_in_days_at_request']

# Select the model's feature columns in training order and replace missing values with 0
X_test = test_df[feature_cols].fillna(0)

print(f"Test features shape: {X_test.shape}")

In [None]:
# Score the test set and write the submission file.
# `model` is the booster left bound by the training loop (its final fold);
# predictions use the iteration chosen by early stopping for that booster.
test_predictions = model.predict(X_test, num_iteration=model.best_iteration)

# Create submission dataframe: one row per test request, ID plus predicted score
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print("Submission format:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Save submission. The output directory is created first so that to_csv
# does not fail when /home/submission does not exist yet.
submission_path = '/home/submission/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

## Summary

- Used only safe features available in both train and test sets
- Applied stratified 5-fold cross-validation
- Handled class imbalance with scale_pos_weight
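- Generated predictions in the correct submission format
- Overall OOF AUC: see the `Overall OOF AUC` line printed by the training cell (a Markdown cell cannot evaluate the `{overall_auc:.4f}` placeholder)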