# Baseline Model: TF-IDF + LightGBM

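This notebook implements a baseline model built from:
- TF-IDF features from text (`request_title` + `request_text_edit_aware`) — computed below, but not yet fed to the model in this baseline
- Tabular metadata features (currently the only inputs the LightGBM model is trained on)
- A LightGBM classifier that handles class imbalance via `is_unbalance=True`
- Stratified 5-fold cross-validation, scored with ROC AUC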

In [None]:
import json
import warnings
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
SEED = 42
np.random.seed(SEED)

## Load Data

In [None]:
def load_json_records(path):
    """Parse the JSON file at `path` and return its content.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load training data
train_path = '/home/data/train.json'
train_data = load_json_records(train_path)

# Load test data
test_path = '/home/data/test.json'
test_data = load_json_records(test_path)

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

# Convert the parsed records to DataFrames
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Check class distribution (as fractions, not counts)
print(f"\nClass distribution in training data:")
print(train_df['requester_received_pizza'].value_counts(normalize=True))

## Feature Engineering

In [None]:
def add_text_features(df):
    """Add `combined_text`, `text_length` and `word_count` columns to `df`.

    `combined_text` is the request title and the edit-aware request body
    joined by a single space, with missing values treated as empty strings.
    `text_length` is its character count and `word_count` its number of
    whitespace-separated tokens. The frame is modified in place and returned.
    """
    df['combined_text'] = df['request_title'].fillna('') + ' ' + df['request_text_edit_aware'].fillna('')
    df['text_length'] = df['combined_text'].str.len()
    df['word_count'] = df['combined_text'].str.split().str.len()
    return df


# Apply the same text features to both frames (one helper instead of copy-paste)
train_df = add_text_features(train_df)
test_df = add_text_features(test_df)

# Candidate tabular features
# NOTE(review): the *_at_retrieval columns look like they are measured after the
# request was posted, which would make them target leakage — confirm before relying on them.
candidate_features = [
    'requester_account_age_in_days_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'request_number_of_comments_at_retrieval',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'text_length',
    'word_count'
]

# A feature is only usable if it exists in BOTH frames: selecting a column that is
# absent from test_df would raise a KeyError, and a model trained on it could not
# score the test set anyway.
missing_features = [
    col for col in candidate_features
    if col not in train_df.columns or col not in test_df.columns
]
if missing_features:
    print(f"Dropping features not present in both train and test: {missing_features}")

# Tabular features to use (later cells read this list, e.g. for feature importance)
numeric_features = [col for col in candidate_features if col not in missing_features]

# Handle missing values and create feature matrix
X_train_tabular = train_df[numeric_features].fillna(0)
X_test_tabular = test_df[numeric_features].fillna(0)

print(f"Tabular feature shape: {X_train_tabular.shape}")
print(f"Text samples for TF-IDF: {len(train_df['combined_text'])}")
print(f"Columns in train_df: {train_df.columns.tolist()}")

## TF-IDF Vectorization

In [None]:
# Vectorizer settings gathered in one mapping so they are easy to scan and tweak
tfidf_settings = dict(
    max_features=5000,    # cap on vocabulary size, for speed
    stop_words='english',
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=2,
    max_df=0.95,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

train_corpus = train_df['combined_text']
test_corpus = test_df['combined_text']

# The vocabulary and IDF weights are learned from the training text only;
# the test text is then transformed with that fitted vectorizer.
X_train_text = vectorizer.fit_transform(train_corpus)
X_test_text = vectorizer.transform(test_corpus)

print(f"TF-IDF feature shape: {X_train_text.shape}")

## Stratified K-Fold Cross-Validation

In [None]:
# Target variable, cast to integers
y = train_df['requester_received_pizza'].astype(int)

# Class counts used for the balanced class weights below
n_samples = len(y)
n_positive = y.sum()
n_negative = n_samples - n_positive

# Balanced weights: n_samples / (2 * class_count). They are only printed in this cell.
pos_class_weight = n_samples / (2 * n_positive)
neg_class_weight = n_samples / (2 * n_negative)

print(f"Positive class weight: {pos_class_weight:.2f}")
print(f"Negative class weight: {neg_class_weight:.2f}")

# Shuffled, seeded stratified splitter
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold AUC scores and an out-of-fold prediction slot for every training row
fold_scores = list()
oof_predictions = np.zeros(len(train_df))

print(f"\nStarting {n_splits}-fold stratified cross-validation...")

In [None]:
# LightGBM hyper-parameters are the same for every fold, so define them once
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'is_unbalance': True  # Handle class imbalance
}

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, y)):
    print(f"\nFold {fold + 1}/{n_splits}")

    # Split data
    # For simplicity, only tabular features are used in this baseline.
    # In future experiments, text and tabular features can be combined.
    X_train_tab_fold = X_train_tabular.iloc[train_idx]
    X_val_tab_fold = X_train_tabular.iloc[val_idx]

    y_train_fold = y.iloc[train_idx]
    y_val_fold = y.iloc[val_idx]

    # Fold-specific names: reusing `train_data` here would overwrite the raw
    # JSON records loaded earlier in the notebook with an lgb.Dataset.
    fold_train_set = lgb.Dataset(X_train_tab_fold, label=y_train_fold)
    fold_val_set = lgb.Dataset(X_val_tab_fold, label=y_val_fold, reference=fold_train_set)

    # Train with early stopping on the validation fold.
    # NOTE(review): the same fold is used for early stopping and for the reported
    # AUC, so the score is likely somewhat optimistic.
    model = lgb.train(
        params,
        fold_train_set,
        num_boost_round=1000,
        valid_sets=[fold_val_set],
        valid_names=['val'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )

    # Predict on validation set using the early-stopped iteration
    val_pred = model.predict(X_val_tab_fold, num_iteration=model.best_iteration)
    oof_predictions[val_idx] = val_pred

    # Calculate AUC score
    fold_auc = roc_auc_score(y_val_fold, val_pred)
    fold_scores.append(fold_auc)

    print(f"Fold {fold + 1} AUC: {fold_auc:.4f}")

# Overall CV score: mean and standard deviation of the per-fold AUCs
cv_score = np.mean(fold_scores)
cv_std = np.std(fold_scores)

print(f"\n{'='*50}")
print(f"Cross-Validation AUC: {cv_score:.4f} ± {cv_std:.4f}")
print(f"{'='*50}")

# OOF AUC: a single AUC over all out-of-fold predictions pooled together
oof_auc = roc_auc_score(y, oof_predictions)
print(f"Out-of-Fold AUC: {oof_auc:.4f}")

## Train Final Model and Generate Predictions

In [None]:
# Hyper-parameters for the final model (same values as in cross-validation)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'is_unbalance': True
}

# Pick the number of boosting rounds with early-stopped cross-validation.
# The CV models above stop early, so training the final model for a fixed
# 1000 rounds would produce a different (longer-trained) model than the one
# the CV score describes.
rounds_search_set = lgb.Dataset(X_train_tabular, label=y)
cv_history = lgb.cv(
    params,
    rounds_search_set,
    num_boost_round=1000,
    nfold=n_splits,
    stratified=True,
    shuffle=True,
    seed=42,
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
)
# With early stopping, each metric history is cut at the best round, so its
# length is the selected round count (key names differ between LightGBM versions).
final_num_rounds = max(1, len(next(iter(cv_history.values()))))
print(f"Boosting rounds selected by CV: {final_num_rounds}")

# Train final model on full training data
train_data_full = lgb.Dataset(X_train_tabular, label=y)
final_model = lgb.train(
    params,
    train_data_full,
    num_boost_round=final_num_rounds
)

# Generate predictions on test set
test_predictions = final_model.predict(X_test_tabular)

print(f"Test predictions shape: {test_predictions.shape}")
print(f"Test predictions range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")

# Create submission dataframe
submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print(f"\nSubmission shape: {submission_df.shape}")
print(submission_df.head())

# Save submission; create the output directory first so to_csv cannot fail on a missing folder
submission_path = '/home/submission/submission_001_baseline.csv'
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

## Feature Importance

In [None]:
# Gain-based importance for each feature of the final model
gain_per_feature = final_model.feature_importance(importance_type='gain')

# Pair each feature name with its gain and order from most to least important
feature_importance = pd.DataFrame(
    {'feature': numeric_features, 'importance': gain_per_feature}
).sort_values(by='importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))