# Baseline Model - LightGBM

Simple baseline using LightGBM, built from basic features and a stratified cross-validation scheme:
- Text length features
- User flair encoding
- Numeric features
- StratifiedKFold for cross-validation

In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from pathlib import Path
import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Make sure the experiments directory is present; missing parents are
# created and an already-existing directory is not an error.
experiments_dir = Path('/home/code/experiments')
experiments_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Load data
def _read_json_lines(path):
    """Parse a JSON Lines file into a list of decoded records.

    NOTE(review): this assumes one JSON document per line; a file holding a
    single (possibly pretty-printed) JSON array would need ``json.load``
    instead -- confirm against the actual data files.

    Args:
        path: Location of the file to read.

    Returns:
        list: The decoded documents, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. an extra empty line at the end of the file) are
        # skipped because json.loads rejects empty input.
        return [json.loads(line) for line in f if line.strip()]


print("Loading training data...")
train_data = _read_json_lines('/home/data/train.json')

train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {list(train_df.columns)}")

print("\nLoading test data...")
test_data = _read_json_lines('/home/data/test.json')

test_df = pd.DataFrame(test_data)
print(f"Test data shape: {test_df.shape}")

In [None]:
# Basic feature engineering
print("Creating basic features...")

# Target
target = 'requester_received_pizza'

# Text length features: raw text column -> name of its character-count feature.
text_length_features = {
    'request_title': 'title_length',
    'request_text': 'text_length',
    'request_text_edit_aware': 'text_edit_length',
}

# User flair encoding (strong predictive feature according to EDA)
# NOTE(review): flair and the '*_at_retrieval' columns below look like they
# could be recorded after the outcome is known -- confirm they do not leak
# the target and that the test file actually contains them.
flair_mapping = {'None': 0, 'shroom': 1, 'PIF': 2}

# Numeric columns used as-is; unparseable or missing values become 0.
numeric_features = [
    'number_of_downvotes_of_request_at_retrieval',
    'number_of_upvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval',
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_number_of_subreddits_at_request'
]

# Run the exact same transformations on both frames so the train and test
# feature definitions cannot drift apart.
for df in (train_df, test_df):
    for source_col, feature_name in text_length_features.items():
        # Missing text counts as an empty string, i.e. length 0.
        df[feature_name] = df[source_col].fillna('').str.len()

    # A missing flair (None/NaN) is folded into the 'None' category first;
    # otherwise .map() would leave it as NaN and the 0 code would only ever
    # be hit by the literal string 'None'. Flair values absent from the
    # mapping still come out as NaN.
    df['user_flair_encoded'] = (
        df['requester_user_flair'].fillna('None').map(flair_mapping)
    )

    for col in numeric_features:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Select features for modeling
feature_cols = numeric_features + [
    'title_length', 'text_length', 'text_edit_length', 'user_flair_encoded'
]

print(f"Using {len(feature_cols)} features: {feature_cols}")

X = train_df[feature_cols]
y = train_df[target]
X_test = test_df[feature_cols]

print(f"Training features shape: {X.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"Target distribution: {y.mean():.3f} (positive rate)")

In [None]:
# Cross-validation setup
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = []
# Out-of-fold predictions: every training row is scored by the single fold
# model that did not train on it.
oof_predictions = np.zeros(len(X))
# Test predictions are the average over the fold models.
test_predictions = np.zeros(len(X_test))

# LightGBM parameters (basic) shared by all folds; only the seed differs
# per fold and is added inside the loop.
base_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
}

print(f"Starting {n_folds}-fold cross-validation...")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold + 1}/{n_folds}")

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    params = {**base_params, 'random_state': 42 + fold}

    # Create datasets. Named *_set so the raw `train_data` record list
    # loaded earlier in the notebook is not overwritten.
    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_valid, label=y_valid, reference=train_set)

    # Train model: up to 1000 rounds, stopping once validation AUC has not
    # improved for 50 rounds; per-iteration logging is disabled.
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(0)
        ]
    )

    # Predictions at the early-stopped iteration
    valid_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # Store predictions
    oof_predictions[valid_idx] = valid_pred
    test_predictions += test_pred / n_folds

    # Calculate fold score.
    # NOTE: the same fold picks the stopping iteration and produces this
    # score, so the reported AUC is somewhat optimistic.
    fold_score = roc_auc_score(y_valid, valid_pred)
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

# Overall CV score (mean and population standard deviation across folds)
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)
print(f"\nCV Score: {mean_cv_score:.4f} ± {std_cv_score:.4f}")

# OOF score: a single AUC over all out-of-fold predictions pooled together
oof_score = roc_auc_score(y, oof_predictions)
print(f"OOF AUC: {oof_score:.4f}")

In [None]:
# Feature importance
# `model` is whatever the cross-validation loop assigned last, so this
# ranking reflects only the final fold's booster, not an average over folds.
gain_per_feature = model.feature_importance(importance_type='gain')

feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': gain_per_feature}
)
# Largest gain first.
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("Top 10 features by importance:")
print(feature_importance.head(10))

In [None]:
# Create submission: one row per test request with its fold-averaged
# predicted probability.
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Preview for a manual check against the sample submission format
# (nothing is validated automatically here).
print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Save submission
submission_path = '/home/submission/submission.csv'
# to_csv does not create missing directories, so make sure the target
# directory exists before writing.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")

# Check distribution of the predicted probabilities as a quick sanity check
predicted = submission['requester_received_pizza']
print("\nPrediction distribution:")
print(f"Mean: {predicted.mean():.4f}")
print(f"Std: {predicted.std():.4f}")
print(f"Min: {predicted.min():.4f}")
print(f"Max: {predicted.max():.4f}")