# Baseline Model: LightGBM with TF-IDF Features

This notebook implements a baseline model using LightGBM with basic feature engineering:
- TF-IDF on merged text fields
- Metadata feature processing
- Stratified K-fold validation
- Class imbalance handling

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading data...")
with open('/home/data/train.json', 'r') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Target distribution: {train_df['requester_received_pizza'].value_counts().to_dict()}")
print(f"Success rate: {train_df['requester_received_pizza'].mean():.3f}")

Loading data...
Train shape: (2878, 32)
Test shape: (1162, 17)
Target distribution: {False: 2163, True: 715}
Success rate: 0.248


In [3]:
# Feature Engineering
print("\nEngineering features...")

# Text features: merge title and text
train_df['combined_text'] = train_df['request_title'].fillna('') + ' ' + train_df['request_text_edit_aware'].fillna('')
test_df['combined_text'] = test_df['request_title'].fillna('') + ' ' + test_df['request_text_edit_aware'].fillna('')

# Create TF-IDF features
print("Creating TF-IDF features...")
tfidf = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95
)

tfidf_features_train = tfidf.fit_transform(train_df['combined_text'])
tfidf_features_test = tfidf.transform(test_df['combined_text'])

print(f"TF-IDF shape: {tfidf_features_train.shape}")

# Metadata features (only those available at request time)
metadata_features = [
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request'
]

# Process metadata features
meta_train = train_df[metadata_features].copy()
meta_test = test_df[metadata_features].copy()

# Handle missing values and log transform skewed features
for col in metadata_features:
    # Fill missing values with median
    median_val = meta_train[col].median()
    meta_train[col].fillna(median_val, inplace=True)
    meta_test[col].fillna(median_val, inplace=True)
    
    # Log transform highly skewed features
    if meta_train[col].skew() > 2:
        meta_train[col] = np.log1p(meta_train[col])
        meta_test[col] = np.log1p(meta_test[col])

# One-hot encode requester_user_flair if it exists
if 'requester_user_flair' in train_df.columns and 'requester_user_flair' in test_df.columns:
    flair_train = pd.get_dummies(train_df['requester_user_flair'], prefix='flair')
    flair_test = pd.get_dummies(test_df['requester_user_flair'], prefix='flair')
    
    # Align flair columns (test might not have all categories)
    for col in flair_train.columns:
        if col not in flair_test.columns:
            flair_test[col] = 0
    for col in flair_test.columns:
        if col not in flair_train.columns:
            flair_train[col] = 0
    
    # Ensure same column order
    flair_test = flair_test[flair_train.columns]
    
    print(f"Flair shape: {flair_train.shape}")
else:
    # Create dummy features if flair doesn't exist
    flair_train = pd.DataFrame(index=train_df.index)
    flair_test = pd.DataFrame(index=test_df.index)
    print("No flair feature found, skipping...")

print(f"Metadata shape: {meta_train.shape}")

# Combine all features
from scipy.sparse import hstack

X_train = hstack([tfidf_features_train, meta_train.values, flair_train.values])
X_test = hstack([tfidf_features_test, meta_test.values, flair_test.values])

y_train = train_df['requester_received_pizza'].values

print(f"Final feature matrix shape: {X_train.shape}")


Engineering features...
Creating TF-IDF features...


TF-IDF shape: (2878, 10000)
No flair feature found, skipping...
Metadata shape: (2878, 9)
Final feature matrix shape: (2878, 10009)


In [6]:
# Model Training with Stratified K-Fold
print("\nTraining model with Stratified K-Fold...")

# Convert sparse matrices to CSR format for efficient indexing
from scipy.sparse import csr_matrix
X_train_csr = X_train.tocsr()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

auc_scores = []
oof_predictions = np.zeros(len(train_df))
test_predictions = np.zeros(len(test_df))

fold = 1
for train_idx, val_idx in skf.split(X_train_csr, y_train):
    print(f"\nFold {fold}/5")
    
    X_tr = X_train_csr[train_idx]
    X_val = X_train_csr[val_idx]
    y_tr = y_train[train_idx]
    y_val = y_train[val_idx]
    
    # Calculate scale_pos_weight for handling class imbalance
    neg_count = (y_tr == 0).sum()
    pos_count = (y_tr == 1).sum()
    scale_pos_weight = neg_count / pos_count
    
    # Train LightGBM
    model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        n_jobs=-1
    )
    
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    
    # Predictions
    val_pred = model.predict_proba(X_val)[:, 1]
    test_pred = model.predict_proba(X_test)[:, 1]
    
    # Calculate AUC
    auc = roc_auc_score(y_val, val_pred)
    auc_scores.append(auc)
    print(f"Fold {fold} AUC: {auc:.4f}")
    
    # Store OOF predictions
    oof_predictions[val_idx] = val_pred
    test_predictions += test_pred / 5
    
    fold += 1

print(f"\n{'='*50}")
print(f"Mean AUC: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")
print(f"OOF AUC: {roc_auc_score(y_train, oof_predictions):.4f}")
print(f"{'='*50}")


Training model with Stratified K-Fold...

Fold 1/5
[LightGBM] [Info] Number of positive: 572, number of negative: 1730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22354
[LightGBM] [Info] Number of data points in the train set: 2302, number of used features: 846
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248480 -> initscore=-1.106738
[LightGBM] [Info] Start training from score -1.106738
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[5]	valid_0's auc: 0.62971	valid_0's binary_logloss: 0.553298
Fold 1 AUC: 0.6297

Fold 2/5
[LightGBM] [Info] Number of positive: 572, number of negative: 1730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014715 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22157
[LightGBM] [Info] Number of data points in the train set: 2302, number of used features: 833
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248480 -> initscore=-1.106738
[LightGBM] [Info] Start training from score -1.106738
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[1]	valid_0's auc: 0.524322	valid_0's binary_logloss: 0.560253
Fold 2 AUC: 0.5243

Fold 3/5
[LightGBM] [Info] Number of positive: 572, number of negative: 1730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22425
[LightGBM] [Info] Number of data points in the train set: 2302, number of used features: 848
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248480 -> initscore=-1.106738
[LightGBM] [Info] Start training from score -1.106738
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[6]	valid_0's auc: 0.593259	valid_0's binary_logloss: 0.556542
Fold 3 AUC: 0.5933

Fold 4/5
[LightGBM] [Info] Number of positive: 572, number of negative: 1731
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22353
[LightGBM] [Info] Number of data points in the train set: 2303, number of used features: 838
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248372 -> initscore=-1.107316
[LightGBM] [Info] Start training from score -1.107316
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[2]	valid_0's auc: 0.569582	valid_0's binary_logloss: 0.558515
Fold 4 AUC: 0.5696

Fold 5/5
[LightGBM] [Info] Number of positive: 572, number of negative: 1731
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22307
[LightGBM] [Info] Number of data points in the train set: 2303, number of used features: 849
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248372 -> initscore=-1.107316
[LightGBM] [Info] Start training from score -1.107316
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[5]	valid_0's auc: 0.634356	valid_0's binary_logloss: 0.552978
Fold 5 AUC: 0.6344

Mean AUC: 0.5902 ± 0.0407
OOF AUC: 0.5827


In [7]:
# Create submission file
print("\nCreating submission file...")

submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Ensure submission format matches sample
print(f"Submission shape: {submission.shape}")
print(f"Submission columns: {submission.columns.tolist()}")
print(f"Sample predictions: {test_predictions[:5]}")

# Save submission
submission.to_csv('/home/submission/submission.csv', index=False)
print("Submission saved to /home/submission/submission.csv")

# Also save OOF predictions for potential stacking
pd.DataFrame({
    'request_id': train_df['request_id'],
    'oof_prediction': oof_predictions,
    'target': y_train
}).to_csv('/home/submission/oof_predictions.csv', index=False)
print("OOF predictions saved to /home/submission/oof_predictions.csv")


Creating submission file...
Submission shape: (1162, 2)
Submission columns: ['request_id', 'requester_received_pizza']
Sample predictions: [0.26907746 0.26704352 0.26075883 0.30887097 0.27959236]
Submission saved to /home/submission/submission.csv
OOF predictions saved to /home/submission/oof_predictions.csv
