# Baseline Experiment: LightGBM with Simple Features

This notebook creates a baseline model using LightGBM with simple features extracted from both text and tabular data.

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
import lightgbm as lgb
import re
from collections import Counter
import warnings
# Suppress every warning category for the rest of the session so the
# notebook output stays compact.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# RANDOM_SEED is reused below for the StratifiedKFold splitter and the
# LightGBM 'seed' parameter; np.random.seed only seeds NumPy's global
# generator.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Load the train/test JSON files and wrap them in DataFrames.
print("Loading data...")

# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# decoding does not depend on the platform's default locale.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print("Target distribution:")
# normalize=True reports class fractions rather than raw counts.
print(train_df['requester_received_pizza'].value_counts(normalize=True))

## Feature Engineering

In [None]:
# Names of the numeric statistics, in output column order.
_TEXT_STAT_NAMES = (
    'text_length',
    'word_count',
    'sentence_count',
    'exclamation_count',
    'question_count',
    'caps_ratio',
)

# Keyword groups behind the binary ``has_*`` flags, in output column order.
# A flag is 1 when ANY of its keywords occurs as a substring of the
# lower-cased text (so 'eat' also matches 'great', 'fam' matches 'famous').
_TEXT_KEYWORD_GROUPS = {
    'has_please': ('please',),
    'has_thank': ('thank', 'thanks', 'thx'),
    'has_sorry': ('sorry',),
    'has_because': ('because',),
    'has_family': ('family', 'fam', 'parent', 'mother', 'father'),
    'has_kids': ('kid', 'kids', 'child', 'children', 'baby', 'babies'),
    # 'money' used to be listed here as well, which made has_work fire on
    # every mention of money; it belongs to has_money only.
    'has_work': ('work', 'job', 'employ'),
    'has_money': ('money', 'cash', 'dollar', 'bucks'),
    'has_pay': ('pay', 'payment', 'bills', 'rent'),
    'has_hungry': ('hungry', 'starving', 'hunger'),
    'has_food': ('food', 'pizza', 'eat', 'meal'),
    'has_help': ('help',),
    'has_emergency': ('emergency', 'urgent', 'desperate', 'crisis'),
}


def extract_text_features(text):
    """Extract simple length, punctuation and keyword features from text.

    Args:
        text: The raw text to summarise. Missing values (anything
            ``pd.isna`` reports as missing) and the empty string are
            accepted.

    Returns:
        A dict with one entry per name in ``_TEXT_STAT_NAMES`` followed by
        one ``has_*`` flag (0 or 1) per keyword group, always in the same
        key order. For missing or empty text every value is 0.
    """
    if pd.isna(text) or text == '':
        return dict.fromkeys(_TEXT_STAT_NAMES + tuple(_TEXT_KEYWORD_GROUPS), 0)

    # Basic text stats
    text_length = len(text)
    # Runs of '.', '!' and '?' end a sentence; fragments that are empty or
    # whitespace-only (e.g. after the final period) are not counted.
    fragments = re.split(r'[.!?]+', text)

    features = {
        'text_length': text_length,
        'word_count': len(text.split()),
        'sentence_count': sum(1 for fragment in fragments if fragment.strip()),
        'exclamation_count': text.count('!'),
        'question_count': text.count('?'),
        # text is non-empty here, so the division is always defined.
        'caps_ratio': sum(1 for char in text if char.isupper()) / text_length,
    }

    # Keywords (lowercase for matching)
    text_lower = text.lower()
    for flag_name, keywords in _TEXT_KEYWORD_GROUPS.items():
        features[flag_name] = int(any(keyword in text_lower for keyword in keywords))

    return features

In [None]:
# Build one feature frame per (data split, text column) pair. Each frame has
# one row per request and one column per key returned by
# extract_text_features.
def _featurize_column(column):
    """Run extract_text_features over every entry and stack the dicts."""
    return pd.DataFrame([extract_text_features(entry) for entry in column])


print("Extracting text features from request_text...")
text_features_train = _featurize_column(train_df['request_text'])
text_features_test = _featurize_column(test_df['request_text'])

print("Extracting text features from request_text_edit_aware...")
text_edit_features_train = _featurize_column(train_df['request_text_edit_aware'])
text_edit_features_test = _featurize_column(test_df['request_text_edit_aware'])

# Both text columns yield identically named features, so prefix the column
# names with their origin to keep them apart once they are concatenated.
text_features_train = text_features_train.add_prefix("text_")
text_features_test = text_features_test.add_prefix("text_")
text_edit_features_train = text_edit_features_train.add_prefix("text_edit_")
text_edit_features_test = text_edit_features_test.add_prefix("text_edit_")

print(f"Text features shape: {text_features_train.shape}")
print(f"Text edit features shape: {text_edit_features_train.shape}")

In [None]:
# Extract tabular features (excluding text fields, target and row id).
# The raw text columns are represented by the text feature frames instead.
excluded_columns = {
    'request_text',
    'request_text_edit_aware',
    'requester_received_pizza',
    'request_id',
}
candidate_columns = [col for col in train_df.columns if col not in excluded_columns]

# Keep a column only when it can actually be used at prediction time:
#   * it must also exist in test_df -- indexing test_df with a train-only
#     column raises KeyError;
#   * it must be numeric or bool in both frames -- LightGBM rejects pandas
#     columns of object dtype (strings, lists, ...).
# When every candidate already satisfies both conditions the selection is
# identical to the plain "all remaining train columns" list.
tabular_features = [
    col for col in candidate_columns
    if col in test_df.columns
    and pd.api.types.is_numeric_dtype(train_df[col])
    and pd.api.types.is_numeric_dtype(test_df[col])
]
dropped_columns = [col for col in candidate_columns if col not in tabular_features]

print(f"Tabular features: {len(tabular_features)}")
print(f"Tabular feature names: {tabular_features}")
if dropped_columns:
    print(f"Dropped (missing from test or non-numeric): {dropped_columns}")

# Prepare feature matrices. Every part is re-indexed to 0..n-1 so that the
# column-wise concat lines rows up by position rather than by index label.
X_train = pd.concat([
    train_df[tabular_features].reset_index(drop=True),
    text_features_train.reset_index(drop=True),
    text_edit_features_train.reset_index(drop=True),
], axis=1)

X_test = pd.concat([
    test_df[tabular_features].reset_index(drop=True),
    text_features_test.reset_index(drop=True),
    text_edit_features_test.reset_index(drop=True),
], axis=1)

y_train = train_df['requester_received_pizza'].astype(int)

print(f"Final training features shape: {X_train.shape}")
print(f"Final test features shape: {X_test.shape}")

In [None]:
# Define cross-validation strategy
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_SEED)

# Out-of-fold predictions for the training rows, and test predictions that
# are averaged over the per-fold models.
train_predictions = np.zeros(len(X_train))
test_predictions = np.zeros(len(X_test))

cv_scores = []

# The parameters are the same for every fold, so build them once.
# NOTE(review): early stopping monitors AUC (the 'metric' below) while the
# scores reported per fold are log loss -- confirm this mix is intended.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': RANDOM_SEED
}

print(f"Training LightGBM model with {n_folds}-fold CV...")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {fold + 1}/{n_folds}")

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

    # Create LightGBM datasets. Fold-specific names are used so the raw
    # `train_data` loaded from JSON is not overwritten.
    fold_train_set = lgb.Dataset(X_tr, label=y_tr)
    fold_valid_set = lgb.Dataset(X_val, label=y_val)

    # Train model: up to 1000 rounds, stopping once the validation metric
    # has not improved for 50 rounds; progress is logged every 100 rounds.
    model = lgb.train(
        params,
        fold_train_set,
        num_boost_round=1000,
        valid_sets=[fold_valid_set],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(100)
        ]
    )

    # Predict on validation set with the best iteration found by early
    # stopping, and store the result in the out-of-fold slots.
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    train_predictions[valid_idx] = val_pred

    # Calculate validation score (log loss: lower is better)
    val_score = log_loss(y_val, val_pred)
    cv_scores.append(val_score)
    print(f"Fold {fold + 1} validation log loss: {val_score:.4f}")

    # Predict on test set; each fold contributes 1/n_folds of the average.
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    test_predictions += test_pred / n_folds

# Calculate overall CV score on the pooled out-of-fold predictions, plus
# the mean and spread of the per-fold scores.
overall_cv_score = log_loss(y_train, train_predictions)
print(f"\nOverall CV log loss: {overall_cv_score:.4f}")
print(f"Mean CV log loss: {np.mean(cv_scores):.4f} \u00b1 {np.std(cv_scores):.4f}")

## Create Submission

In [None]:
# Assemble the submission frame: one predicted probability per test request.
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions,
})

# Bound the predictions to the 0-1 range.
submission = submission.assign(
    requester_received_pizza=submission['requester_received_pizza'].clip(lower=0, upper=1)
)

print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print("Prediction distribution:")
print(submission['requester_received_pizza'].describe())

# Write the submission without the DataFrame index column.
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Compare our column names (and their order) with the sample submission.
sample_submission = pd.read_csv('/home/data/sampleSubmission.csv')
ours = submission.columns.tolist()
expected = sample_submission.columns.tolist()
print(f"\nSample submission columns: {expected}")
print(f"Our submission columns: {ours}")
print(f"Columns match: {ours == expected}")

## Feature Importance Analysis

In [None]:
# Gain-based feature importance.
# Only `model` -- the booster left over from the last CV fold -- is
# inspected here; importances are NOT averaged across folds.
importance_path = '/home/code/experiments/001_feature_importance.csv'

gain_per_feature = model.feature_importance(importance_type='gain')
importance_table = pd.DataFrame({
    'feature': X_train.columns.tolist(),
    'importance': gain_per_feature,
})
final_importance = importance_table.sort_values(by='importance', ascending=False)

print("Top 20 most important features:")
print(final_importance.head(20))

# Persist the full ranking, without the DataFrame index column.
final_importance.to_csv(importance_path, index=False)
print(f"\nFeature importance saved to: {importance_path}")