# Baseline Experiment: Meta Features Only with LightGBM

This is the first baseline experiment following the seed strategy:
- Use only meta/numerical features (no text processing yet)
- LightGBM with class imbalance handling
- Stratified K-Fold validation (k=5)
- Focus on AUC-ROC metric

In [4]:
import pandas as pd
import numpy as np
import json
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

## Load Data

In [5]:
# Load training data
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Target distribution: {train_df['requester_received_pizza'].value_counts().to_dict()}")
print(f"Positive rate: {train_df['requester_received_pizza'].mean():.3f}")

# Load test data
with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

test_df = pd.DataFrame(test_data)
print(f"\nTest data shape: {test_df.shape}")

Training data shape: (2878, 32)
Target distribution: {False: 2163, True: 715}
Positive rate: 0.248

Test data shape: (1162, 17)


## Feature Engineering - Meta Features Only

Based on EDA findings, we'll use:
- User activity metrics (posts, comments on RAOP)
- Account age
- Text length features (correlate with success)
- Temporal features
- Exclude post-retrieval features to prevent leakage

In [6]:
def engineer_features(df):
    """Engineer features from the raw data"""
    features = {}
    
    # Basic text length features (strong correlation with target per EDA)
    # Use request_text if available, otherwise use request_text_edit_aware
    if 'request_text' in df.columns:
        features['request_text_length'] = df['request_text'].str.len()
    else:
        features['request_text_length'] = df['request_text_edit_aware'].str.len()
    
    features['request_title_length'] = df['request_title'].str.len()
    
    # User activity on RAOP (highly predictive per EDA - 0.46 correlation)
    features['requester_number_of_posts_on_raop_at_request'] = df['requester_number_of_posts_on_raop_at_request']
    features['requester_number_of_comments_in_raop_at_request'] = df['requester_number_of_comments_in_raop_at_request']
    
    # Account age
    features['requester_account_age_in_days_at_request'] = df['requester_account_age_in_days_at_request']
    features['requester_days_since_first_post_on_raop_at_request'] = df['requester_days_since_first_post_on_raop_at_request']
    
    # Subreddit diversity (number of unique subreddits)
    features['requester_number_of_subreddits_at_request'] = df['requester_number_of_subreddits_at_request']
    
    # Post timing (hour of day)
    features['request_hour'] = pd.to_datetime(df['unix_timestamp_of_request_utc'], unit='s').dt.hour
    features['request_day_of_week'] = pd.to_datetime(df['unix_timestamp_of_request_utc'], unit='s').dt.dayofweek
    
    # Ratios
    features['posts_to_comments_ratio'] = (
        df['requester_number_of_posts_on_raop_at_request'] / 
        (df['requester_number_of_comments_in_raop_at_request'] + 1)
    )
    
    features['account_age_to_posts_ratio'] = (
        df['requester_account_age_in_days_at_request'] / 
        (df['requester_number_of_posts_on_raop_at_request'] + 1)
    )
    
    # Convert to DataFrame
    feature_df = pd.DataFrame(features)
    
    # Fill missing values
    feature_df = feature_df.fillna(0)
    
    return feature_df

# Engineer features for train and test
print("Engineering features...")
X_train = engineer_features(train_df)
X_test = engineer_features(test_df)

print(f"Feature matrix shape: {X_train.shape}")
print(f"Features: {list(X_train.columns)}")

# Target variable
y_train = train_df['requester_received_pizza'].astype(int)

print(f"\nFeature statistics:")
print(X_train.describe())

Engineering features...
Feature matrix shape: (2878, 11)
Features: ['request_text_length', 'request_title_length', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_comments_in_raop_at_request', 'requester_account_age_in_days_at_request', 'requester_days_since_first_post_on_raop_at_request', 'requester_number_of_subreddits_at_request', 'request_hour', 'request_day_of_week', 'posts_to_comments_ratio', 'account_age_to_posts_ratio']

Feature statistics:
       request_text_length  request_title_length  \
count          2878.000000           2878.000000   
mean            402.521543             71.572967   
std             362.393727             36.233487   
min               0.000000              7.000000   
25%             182.000000             46.000000   
50%             308.000000             64.000000   
75%             503.750000             90.000000   
max            4460.000000            272.000000   

       requester_number_of_posts_on_raop_at_request  \
co

## Model Training with Stratified K-Fold

Using LightGBM with class imbalance handling (scale_pos_weight ~ 3:1 ratio)

In [7]:
# Setup cross-validation
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Model parameters
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': SEED
}

# Calculate scale_pos_weight for class imbalance
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"Scale pos weight: {scale_pos_weight:.2f}")
params['scale_pos_weight'] = scale_pos_weight

# Cross-validation
fold_scores = []
oof_predictions = np.zeros(len(X_train))
test_predictions = np.zeros(len(X_test))

print(f"\nStarting {N_FOLDS}-fold cross-validation...")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"\nFold {fold + 1}/{N_FOLDS}")
    
    # Split data
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
    
    # Create LightGBM datasets
    train_set = lgb.Dataset(X_tr, label=y_tr)
    valid_set = lgb.Dataset(X_val, label=y_val)
    
    # Train model
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(100)
        ]
    )
    
    # Predict on validation set
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    oof_predictions[valid_idx] = val_pred
    
    # Calculate fold score
    fold_score = roc_auc_score(y_val, val_pred)
    fold_scores.append(fold_score)
    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")
    
    # Predict on test set
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    test_predictions += test_pred / N_FOLDS

# Overall CV score
cv_score = roc_auc_score(y_train, oof_predictions)
print(f"\n{'='*50}")
print(f"Cross-validation AUC: {cv_score:.4f}")
print(f"Mean fold AUC: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
print(f"Fold scores: {fold_scores}")
print(f"{'='*50}")

Scale pos weight: 3.03

Starting 5-fold cross-validation...

Fold 1/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[24]	valid's auc: 0.651989
Fold 1 AUC: 0.6520

Fold 2/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[19]	valid's auc: 0.622773
Fold 2 AUC: 0.6228

Fold 3/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[21]	valid's auc: 0.689166
Fold 3 AUC: 0.6892

Fold 4/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[14]	valid's auc: 0.566434
Fold 4 AUC: 0.5664

Fold 5/5
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[41]	valid's auc: 0.630423
Fold 5 AUC: 0.6304

Cross-validation AUC: 0.6283
Mean fold AUC: 0.6322 ± 0.0401
Fold scores: [0.651988888709443, 0.6227733006023999, 0.6891664917069072, 0.5664335664335665, 0.630422817922818]


## Feature Importance Analysis

In [8]:
# Get feature importance from the last fold model
feature_importance = model.feature_importance(importance_type='gain')
feature_names = X_train.columns.tolist()

# Create importance DataFrame
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("Top 10 most important features:")
print(importance_df.head(10))

Top 10 most important features:
                                              feature   importance
0                                 request_text_length  3030.400134
1                                request_title_length  2083.183783
4            requester_account_age_in_days_at_request  1663.927594
6           requester_number_of_subreddits_at_request  1457.335270
7                                        request_hour  1241.976861
10                         account_age_to_posts_ratio  1174.820701
8                                 request_day_of_week   927.619119
5   requester_days_since_first_post_on_raop_at_req...   665.895947
3     requester_number_of_comments_in_raop_at_request   356.397734
2        requester_number_of_posts_on_raop_at_request    82.926881


## Create Submission File

In [9]:
# Load sample submission to get the format
sample_sub = pd.read_csv('/home/data/sampleSubmission.csv')
print(f"Sample submission shape: {sample_sub.shape}")
print(sample_sub.head())

# Create submission
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print(f"\nSubmission shape: {submission.shape}")
print(submission.head())

# Save submission
submission_path = '/home/submission/submission_001_baseline_meta_only.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

Sample submission shape: (1162, 2)
  request_id  requester_received_pizza
0  t3_1aw5zf                         0
1   t3_roiuw                         0
2   t3_mjnbq                         0
3   t3_t8wd1                         0
4  t3_1m4zxu                         0

Submission shape: (1162, 2)
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.452669
1   t3_roiuw                  0.384399
2   t3_mjnbq                  0.288113
3   t3_t8wd1                  0.356146
4  t3_1m4zxu                  0.389680

Submission saved to: /home/submission/submission_001_baseline_meta_only.csv
