# Baseline Experiment: LightGBM with Simple Features

This notebook creates a baseline model using LightGBM with simple features extracted from both text and tabular data.

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# The same constant is reused as `random_state` for the CV splitter and in the
# LightGBM parameters further down, so changing it here changes both.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)  # seeds NumPy's global (legacy) random generator

## Load Data

In [None]:
# Load the raw train/test records.
# The encoding is passed explicitly: JSON interchange is UTF-8, and without it
# open() falls back to the platform's locale encoding, which can mis-decode
# non-ASCII request text on some systems.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

# Convert to DataFrames for easier manipulation
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print(f"Training columns: {train_df.columns.tolist()}")

# Class balance of the target, shown as fractions rather than raw counts
print("Target distribution:")
print(train_df['requester_received_pizza'].value_counts(normalize=True))

## Feature Engineering

In [None]:
def extract_text_features(text):
    """Compute surface statistics and keyword flags for a single text.

    The returned dict holds character/word/sentence counts, the number of
    '!' and '?' characters, the fraction of uppercase characters, and a set
    of 0/1 ``has_*`` flags. A flag is 1 when any of its trigger strings
    occurs as a substring of the lowercased text (so e.g. 'fam' also matches
    inside longer words). Missing (NaN/None) or empty text gives all zeros.
    """
    # Flag name -> trigger substrings; insertion order fixes the output order
    keyword_groups = {
        'has_please': ('please',),
        'has_thank': ('thank', 'thanks', 'thx'),
        'has_sorry': ('sorry',),
        'has_because': ('because',),
        'has_family': ('family', 'fam'),
        'has_kids': ('kid', 'kids', 'child', 'children', 'baby', 'babies'),
        'has_work': ('work', 'job', 'employ', 'unemploy', 'laid off'),
        'has_money': ('money', 'cash', 'dollar', 'broke', 'poor', 'bills'),
        'has_pay': ('pay', 'paycheck', 'salary', 'wage'),
        'has_hungry': ('hungry', 'starv', 'hunger', 'food'),
        'has_food': ('food',),
        'has_help': ('help', 'need', 'desperate', 'urgent'),
        'has_emergency': ('emergency', 'crisis', 'urgent', 'desperate'),
    }
    stat_names = (
        'text_length',
        'word_count',
        'sentence_count',
        'exclamation_count',
        'question_count',
        'caps_ratio',
    )

    # Nothing to measure: every statistic and flag is zero
    if pd.isna(text) or text == '':
        return dict.fromkeys((*stat_names, *keyword_groups), 0)

    n_chars = len(text)
    # Sentences are the non-blank pieces between runs of '.', '!' or '?'
    non_blank_sentences = [part for part in re.split(r'[.!?]+', text) if part.strip()]
    uppercase_total = sum(ch.isupper() for ch in text)

    features = {
        'text_length': n_chars,
        'word_count': len(text.split()),
        'sentence_count': len(non_blank_sentences),
        'exclamation_count': text.count('!'),
        'question_count': text.count('?'),
        'caps_ratio': uppercase_total / n_chars if n_chars > 0 else 0,
    }

    # Keyword flags (indicators of politeness, need, etc.)
    lowered = text.lower()
    for flag, triggers in keyword_groups.items():
        features[flag] = int(any(trigger in lowered for trigger in triggers))

    return features

def extract_metadata_features(df):
    """Build the tabular (non-text) feature frame from the raw request columns.

    Most features are straight copies of a source column under a shorter
    name. Account age, comment count and post count additionally get a
    ``*_diff`` column (value at retrieval minus value at request), the two
    request timestamps get a ``timestamp_diff``, ``post_was_edited`` is cast
    to int, and the user flair is mapped to a small integer code.

    NOTE(review): the ``*_at_retrieval`` columns and the user flair are
    presumably recorded after the request was posted -- confirm they do not
    leak the target and that they are present in the data scored at
    prediction time.
    """
    features = pd.DataFrame()

    # Snapshots taken at request and at retrieval, plus the growth in between
    snapshot_pairs = (
        ('account_age', 'requester_account_age_in_days'),
        ('comments', 'requester_number_of_comments'),
        ('posts', 'requester_number_of_posts'),
    )
    for short_name, source_stem in snapshot_pairs:
        at_request = df[f'{source_stem}_at_request']
        at_retrieval = df[f'{source_stem}_at_retrieval']
        features[f'{short_name}_at_request'] = at_request
        features[f'{short_name}_at_retrieval'] = at_retrieval
        features[f'{short_name}_diff'] = at_retrieval - at_request

    # Straight copies: RAOP activity, vote totals, request stats, timestamps
    copied_columns = {
        'comments_in_raop_at_request': 'requester_number_of_comments_in_raop_at_request',
        'comments_in_raop_at_retrieval': 'requester_number_of_comments_in_raop_at_retrieval',
        'posts_in_raop_at_request': 'requester_number_of_posts_on_raop_at_request',
        'posts_in_raop_at_retrieval': 'requester_number_of_posts_on_raop_at_retrieval',
        'upvotes_minus_downvotes_at_request': 'requester_upvotes_minus_downvotes_at_request',
        'upvotes_minus_downvotes_at_retrieval': 'requester_upvotes_minus_downvotes_at_retrieval',
        'upvotes_plus_downvotes_at_request': 'requester_upvotes_plus_downvotes_at_request',
        'upvotes_plus_downvotes_at_retrieval': 'requester_upvotes_plus_downvotes_at_retrieval',
        'request_upvotes': 'number_of_upvotes_of_request_at_retrieval',
        'request_downvotes': 'number_of_downvotes_of_request_at_retrieval',
        'request_comments': 'request_number_of_comments_at_retrieval',
        'unix_timestamp': 'unix_timestamp_of_request',
        'unix_timestamp_utc': 'unix_timestamp_of_request_utc',
    }
    for feature_name, source_column in copied_columns.items():
        features[feature_name] = df[source_column]

    # Offset between the UTC and the plain request timestamp
    features['timestamp_diff'] = (
        df['unix_timestamp_of_request_utc'] - df['unix_timestamp_of_request']
    )

    # Boolean flag as 0/1
    features['post_was_edited'] = df['post_was_edited'].astype(int)

    # Flair as an integer code; values absent from the map fall back to 0
    flair_codes = {'None': 0, 'shroom': 1, 'PIF': 2}
    flair = df['requester_user_flair']
    features['user_flair'] = flair.map(flair_codes).fillna(0)

    # Remaining copies: time since first RAOP post, and subreddit count
    trailing_columns = {
        'days_since_first_raop_at_request': 'requester_days_since_first_post_on_raop_at_request',
        'days_since_first_raop_at_retrieval': 'requester_days_since_first_post_on_raop_at_retrieval',
        'number_of_subreddits_at_request': 'requester_number_of_subreddits_at_request',
    }
    for feature_name, source_column in trailing_columns.items():
        features[feature_name] = df[source_column]

    return features

def _text_feature_frame(texts, prefix):
    """Run extract_text_features on every text and prefix the resulting column names."""
    rows = [extract_text_features(text) for text in texts]
    return pd.DataFrame(rows).add_prefix(prefix)


# Text features from the raw request text (columns prefixed "text_")
print("Extracting text features from request_text...")
text_features_train = _text_feature_frame(train_df['request_text'], 'text_')
text_features_test = _text_feature_frame(test_df['request_text'], 'text_')

# Same features from the edit-aware variant (columns prefixed "text_edit_")
print("Extracting text features from request_text_edit_aware...")
text_edit_features_train = _text_feature_frame(train_df['request_text_edit_aware'], 'text_edit_')
text_edit_features_test = _text_feature_frame(test_df['request_text_edit_aware'], 'text_edit_')

print("Extracting metadata features...")
metadata_features_train = extract_metadata_features(train_df)
metadata_features_test = extract_metadata_features(test_df)

# Final design matrices: the three feature groups side by side, joined on row index
train_parts = [text_features_train, text_edit_features_train, metadata_features_train]
test_parts = [text_features_test, text_edit_features_test, metadata_features_test]
X_train = pd.concat(train_parts, axis=1)
X_test = pd.concat(test_parts, axis=1)

# Binary target as 0/1
y_train = train_df['requester_received_pizza'].astype(int)

print(f"Training features shape: {X_train.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"Feature columns: {X_train.columns.tolist()[:10]}...")  # Show first 10

## Model Training with Cross-Validation

In [None]:
# Define cross-validation strategy
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_SEED)

# The LightGBM parameters are the same for every fold, so build them once
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': RANDOM_SEED
}

# train_predictions holds out-of-fold predictions (each row is predicted by the
# one model that did not train on it); test_predictions is the fold average.
train_predictions = np.zeros(len(X_train))
test_predictions = np.zeros(len(X_test))

cv_scores = []
feature_names = X_train.columns.tolist()

print(f"Training LightGBM model with {n_folds}-fold CV...")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {fold + 1}/{n_folds}")

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

    # Create LightGBM datasets. Deliberately NOT named `train_data`: that name
    # already holds the raw training records loaded from JSON, and rebinding it
    # here would silently replace them with a lgb.Dataset.
    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_valid = lgb.Dataset(X_val, label=y_val)

    # Train with early stopping on the validation fold.
    # NOTE: the same fold both selects the stopping iteration and is scored
    # below, so the reported AUC is somewhat optimistic.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_valid],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(100)
        ]
    )

    # Predict with the best iteration found by early stopping
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # Store predictions
    train_predictions[valid_idx] = val_pred
    test_predictions += test_pred / n_folds

    # Calculate fold score
    fold_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(fold_score)
    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

    # Feature importance (total gain) for this fold
    importance = model.feature_importance(importance_type='gain')
    fold_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    }).sort_values('importance', ascending=False)

    print(f"Top 5 features in fold {fold + 1}:")
    print(fold_importance.head())
    print("-" * 50)

# Overall CV score: AUC over the pooled out-of-fold predictions, then the
# mean and spread of the per-fold AUCs
cv_score = roc_auc_score(y_train, train_predictions)
print(f"\nOverall CV AUC: {cv_score:.4f}")
print(f"Mean CV AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

## Create Submission

In [None]:
# Build the submission: one row per test request with its fold-averaged prediction
target_col = 'requester_received_pizza'
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    target_col: test_predictions
})

# Keep every prediction inside the [0, 1] range
submission[target_col] = submission[target_col].clip(0, 1)

print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print("Prediction distribution:")
print(submission[target_col].describe())

# Write the submission file
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Compare our column names (and their order) against the provided sample file
sample_submission = pd.read_csv('/home/data/sampleSubmission.csv')
sample_columns = sample_submission.columns.tolist()
our_columns = submission.columns.tolist()
print(f"\nSample submission columns: {sample_columns}")
print(f"Our submission columns: {our_columns}")
print(f"Columns match: {our_columns == sample_columns}")

## Feature Importance Analysis

In [None]:
# Rank features by total gain.
# This uses `model` as left by the training loop, i.e. the model from the last
# CV fold only -- a proxy for, not an average over, all folds.
gain_per_feature = model.feature_importance(importance_type='gain')
final_importance = pd.DataFrame({
    'feature': X_train.columns.tolist(),
    'importance': gain_per_feature
})
final_importance = final_importance.sort_values(by='importance', ascending=False)

print("Top 20 most important features:")
print(final_importance.head(20))

# Persist the full ranking alongside the experiment
importance_path = '/home/code/experiments/001_feature_importance.csv'
final_importance.to_csv(importance_path, index=False)
print(f"\nFeature importance saved to: {importance_path}")