# Baseline Experiment: LightGBM with Simple Features

This notebook implements a baseline model using LightGBM with simple text and tabular features.

**Strategy:**
- Simple text features: length, word count, presence of 'EDIT'
- Tabular features as-is with minimal preprocessing
- Stratified 5-fold CV to handle class imbalance
- LightGBM for speed and good performance with mixed data types

In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# RANDOM_SEED is reused further down as the StratifiedKFold `random_state`
# and as the LightGBM `seed` parameter, so changing it here changes all three.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Load and Explore Data

In [2]:
# Read the raw train and test JSON dumps from disk
train_path = '/home/data/train.json'
test_path = '/home/data/test.json'

with open(train_path, 'r') as train_file:
    train_data = json.load(train_file)

with open(test_path, 'r') as test_file:
    test_data = json.load(test_file)

# Quick sanity summary: record counts and the field names of the first record
n_train_records = len(train_data)
n_test_records = len(test_data)
first_record = train_data[0]

print(f"Training samples: {n_train_records}")
print(f"Test samples: {n_test_records}")
print(f"Sample keys: {list(first_record.keys())}")

Training samples: 2878
Test samples: 1162
Sample keys: ['giver_username_if_known', 'number_of_downvotes_of_request_at_retrieval', 'number_of_upvotes_of_request_at_retrieval', 'post_was_edited', 'request_id', 'request_number_of_comments_at_retrieval', 'request_text', 'request_text_edit_aware', 'request_title', 'requester_account_age_in_days_at_request', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_request', 'requester_days_since_first_post_on_raop_at_retrieval', 'requester_number_of_comments_at_request', 'requester_number_of_comments_at_retrieval', 'requester_number_of_comments_in_raop_at_request', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_number_of_posts_at_request', 'requester_number_of_posts_at_retrieval', 'requester_number_of_posts_on_raop_at_request', 'requester_number_of_posts_on_raop_at_retrieval', 'requester_number_of_subreddits_at_request', 'requester_received_pizza', 'requester_subreddits_at_request', 'requeste

In [3]:
# Wrap the raw records in DataFrames so columns can be selected by name
train_df, test_df = pd.DataFrame(train_data), pd.DataFrame(test_data)

print("Training data shape:", train_df.shape)
print("Test data shape:", test_df.shape)

# Share of each target class (normalize=True yields fractions, not counts)
print("\nTarget distribution:")
target_share = train_df['requester_received_pizza'].value_counts(normalize=True)
print(target_share)

Training data shape: (2878, 32)
Test data shape: (1162, 17)

Target distribution:
requester_received_pizza
False    0.751564
True     0.248436
Name: proportion, dtype: float64


## Feature Engineering

In [None]:
def extract_text_features(df):
    """Extract simple length/count features from the request body and title.

    The body text is read from ``request_text_edit_aware`` when that column is
    present and from ``request_text`` otherwise.  Preferring the edit-aware
    column means a frame that carries both columns and a frame that carries
    only the edit-aware one are featurised from the same text variant, instead
    of silently mixing two different sources.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``request_title`` and at least one of
        ``request_text_edit_aware`` / ``request_text``.  Missing values in
        either column are treated as empty strings.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with the columns ``text_length``,
        ``text_word_count``, ``text_has_edit``, ``title_length``,
        ``title_word_count``, ``total_text_length`` and ``total_word_count``.

    Raises
    ------
    KeyError
        If neither body-text column, or ``request_title``, is present.
    """
    body_candidates = ('request_text_edit_aware', 'request_text')
    text_col = next((col for col in body_candidates if col in df.columns), None)
    if text_col is None:
        raise KeyError(f"df has none of the body-text columns {body_candidates}")

    # Normalise once: NaN/None -> '' and everything else to str, so the string
    # operations below cannot fail on a stray non-string value.
    body = df[text_col].fillna('').astype(str)
    title = df['request_title'].fillna('').astype(str)

    features = pd.DataFrame(index=df.index)

    # Request text features
    features['text_length'] = body.str.len()
    features['text_word_count'] = body.map(lambda text: len(text.split()))
    # NOTE: case-insensitive substring match, so words that merely contain
    # "edit" (e.g. "credit") also set this flag.
    features['text_has_edit'] = body.str.contains('EDIT', case=False).astype(int)

    # Title features
    features['title_length'] = title.str.len()
    features['title_word_count'] = title.map(lambda text: len(text.split()))

    # Combined text features
    features['total_text_length'] = features['text_length'] + features['title_length']
    features['total_word_count'] = features['text_word_count'] + features['title_word_count']

    return features

def extract_tabular_features(df, fill_missing=False):
    """Extract and preprocess the numeric / binary tabular features.

    Only columns that actually exist in ``df`` are emitted by default.  The
    earlier behaviour of inventing an all-zero column for every absent feature
    let a model be fitted on real values in one frame and scored on constant
    zeros in another; that behaviour is still available via
    ``fill_missing=True``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; any subset of the known feature columns may be present.
    fill_missing : bool, default False
        When True, absent feature columns are added as constant 0 so the
        output always has the full fixed column set.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``.  Numeric columns are coerced with
        ``pd.to_numeric(errors='coerce')`` and NaN replaced by 0;
        ``post_was_edited`` is reduced to 0/1.
    """
    # Numeric features that might exist in the dataset.
    # NOTE(review): the *_at_retrieval fields are presumably captured after the
    # request outcome was known and may be absent at prediction time - confirm
    # against the data description before relying on them.
    numeric_cols = [
        'requester_account_age_in_days_at_request',
        'requester_account_age_in_days_at_retrieval',
        'requester_number_of_comments_at_request',
        'requester_number_of_comments_at_retrieval',
        'requester_number_of_posts_at_request',
        'requester_number_of_posts_at_retrieval',
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_minus_downvotes_at_retrieval',
        'requester_upvotes_plus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_retrieval',
        'number_of_upvotes_of_request_at_retrieval',
        'number_of_downvotes_of_request_at_retrieval',
        'request_number_of_comments_at_retrieval'
    ]

    features = pd.DataFrame(index=df.index)

    for col in numeric_cols:
        if col in df.columns:
            # Unparseable entries become NaN via errors='coerce', then 0
            features[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        elif fill_missing:
            features[col] = 0

    # Binary feature: collapse to 0/1.  Missing values count as "not edited";
    # any other truthy value counts as "edited" (the raw field may hold
    # non-boolean truthy values - TODO confirm against the data).
    if 'post_was_edited' in df.columns:
        features['post_was_edited'] = (
            df['post_was_edited']
            .map(lambda value: 0 if pd.isna(value) else int(bool(value)))
            .astype(int)
        )
    elif fill_missing:
        features['post_was_edited'] = 0

    return features

# Build the text and tabular feature blocks for each split
train_text_features = extract_text_features(train_df)
train_tabular_features = extract_tabular_features(train_df)
test_text_features = extract_text_features(test_df)
test_tabular_features = extract_tabular_features(test_df)

# Join the two blocks column-wise; the test frame defines the reference layout
test_features = pd.concat([test_text_features, test_tabular_features], axis=1)

# Train is re-laid-out onto test's columns so both matrices share the same
# column set and order
train_features = pd.concat(
    [train_text_features, train_tabular_features], axis=1
).reindex(columns=test_features.columns)

columns_identical = train_features.columns.tolist() == test_features.columns.tolist()
print(f"Train features shape: {train_features.shape}")
print(f"Test features shape: {test_features.shape}")
print(f"Columns match: {columns_identical}")

In [None]:
# The baseline feature set is numeric only, so there is no encoding step;
# this cell just reports the final matrix shapes.
train_shape, test_shape = train_features.shape, test_features.shape
print("No categorical features to encode in baseline model")
print(f"Final train features shape: {train_shape}")
print(f"Final test features shape: {test_shape}")

## Model Training with Cross-Validation

In [None]:
# Design matrix and integer target; any NaN left in either split becomes 0
X = train_features.fillna(0)
y = train_df['requester_received_pizza'].astype(int)
test_features = test_features.fillna(0)

print(f"Training data: {X.shape}")
class_counts = y.value_counts().to_dict()
print(f"Target distribution: {class_counts}")

# Stratified, shuffled splitter seeded for repeatable folds
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_SEED)

# Buffers for the CV loop: out-of-fold predictions on train, accumulated
# predictions on test, and the per-fold AUC values
oof_predictions = np.zeros(X.shape[0])
test_predictions = np.zeros(test_features.shape[0])
cv_scores = list()

print(f"\nStarting {n_folds}-fold stratified cross-validation...")

In [None]:
# Train LightGBM model with cross-validation

# Start from clean accumulators: `test_predictions` is built up with `+=`, so
# without this reset a second run of this cell would stack a new round of fold
# predictions on top of the previous run's totals.
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(test_features))
cv_scores = []

# Hyper-parameters are identical for every fold, so define them once
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': RANDOM_SEED
}

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), start=1):
    print(f"\nFold {fold}/{n_folds}")

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Named lgb_* on purpose: calling these `train_data` / `valid_data` would
    # overwrite the raw `train_data` records loaded from JSON earlier in the
    # notebook and break any re-run of the cells that read it.
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

    # NOTE: early stopping monitors the same fold that is scored below, so the
    # reported fold AUC is chosen at that fold's best iteration.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_valid],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(100)
        ]
    )

    # Predict with the early-stopped iteration count
    fold_valid_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    oof_predictions[valid_idx] = fold_valid_pred
    # Each fold contributes 1/n_folds, giving the fold-averaged test prediction
    test_predictions += model.predict(test_features, num_iteration=model.best_iteration) / n_folds

    # Calculate fold score
    fold_score = roc_auc_score(y_valid, fold_valid_pred)
    cv_scores.append(fold_score)
    print(f"Fold {fold} AUC: {fold_score:.4f}")

# Calculate overall CV score
overall_cv_score = roc_auc_score(y, oof_predictions)
print(f"\nOverall CV AUC: {overall_cv_score:.4f}")
# "\u00b1" is the plus-minus sign; written as an escape so it cannot be
# mangled by a file-encoding round trip again.
print(f"Mean CV AUC: {np.mean(cv_scores):.4f} \u00b1 {np.std(cv_scores):.4f}")

In [None]:
# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

print("Top 10 most important features:")
print(feature_importance.head(10))

## Generate Submission

In [None]:
# Create submission file
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

# Ensure submission format matches sample
print("Submission shape:", submission.shape)
print("\nFirst few rows:")
print(submission.head())

# Save submission
submission_path = '/home/submission/submission_001_baseline.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

In [None]:
# Verify submission format
sample_sub = pd.read_csv('/home/data/sampleSubmission.csv')
print("Sample submission shape:", sample_sub.shape)
print("Our submission shape:", submission.shape)
print("Columns match:", list(submission.columns) == list(sample_sub.columns))
print("Request IDs match:", set(submission['request_id']) == set(sample_sub['request_id']))