# Fixed Baseline: LightGBM without Leaky Features

This notebook implements a corrected baseline model that excludes features not present in the test set.

## Key Fix
- **Removed user flair features**: `requester_user_flair` is absent from the test data, so a model trained on it cannot be applied to the test set
- Using only features available in both train and test sets
- Focus on text features and metadata that generalize

## Strategy
- Engineer text features (TF-IDF, length, word count)
- Use metadata features (account age, activity metrics, vote counts)
- Apply stratified k-fold cross-validation
- Handle class imbalance with LightGBM's `scale_pos_weight` parameter

In [6]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and data-quality warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator for reproducibility.
# NOTE(review): no seed is passed in the LightGBM params of the training cell —
# confirm whether that is intended.
np.random.seed(42)

## Load Data

In [7]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded Python object.

    The file is opened explicitly as UTF-8 so that decoding does not depend on
    the platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load training data (raw parsed JSON is kept alongside the DataFrame)
train_data = _load_json('/home/data/train.json')
train_df = pd.DataFrame(train_data)
print(f"Training data shape: {train_df.shape}")
print(f"Target distribution:")
print(train_df['requester_received_pizza'].value_counts(normalize=True))

# Load test data
test_data = _load_json('/home/data/test.json')
test_df = pd.DataFrame(test_data)
print(f"\nTest data shape: {test_df.shape}")

Training data shape: (2878, 32)
Target distribution:
requester_received_pizza
False    0.751564
True     0.248436
Name: proportion, dtype: float64

Test data shape: (1162, 17)


## Feature Engineering (Fixed)

**Key Change**: Only use features available in BOTH train and test sets

In [8]:
def engineer_features(df):
    """Derive text-size, account-activity, vote and time features from ``df``.

    Only ``request_title``, the request body text, the ``*_at_request``
    requester columns and ``unix_timestamp_of_request`` are read.

    Side effect: a ``combined_text`` column (title + ' ' + body text) is
    written onto ``df`` itself.

    Returns a new DataFrame on ``df``'s index with missing values replaced by 0.
    """
    # Prefer the edit-aware body text when that column exists
    body_col = 'request_text'
    if 'request_text_edit_aware' in df.columns:
        body_col = 'request_text_edit_aware'

    # Title and body joined by a single space; missing values become empty strings
    df['combined_text'] = df['request_title'].fillna('') + ' ' + df[body_col].fillna('')
    combined = df['combined_text']

    out = pd.DataFrame(index=df.index)

    # Text size: characters, whitespace-separated tokens, title characters
    out['text_length'] = combined.str.len()
    out['word_count'] = combined.str.split().str.len()
    out['title_length'] = df['request_title'].str.len()
    # +1 in the denominator keeps the ratio finite for empty text
    out['avg_word_length'] = out['text_length'] / (out['word_count'] + 1)

    # Requester activity, copied under shorter names (insertion order = column order)
    activity_sources = {
        'account_age_days': 'requester_account_age_in_days_at_request',
        'num_comments': 'requester_number_of_comments_at_request',
        'num_posts': 'requester_number_of_posts_at_request',
        'num_subreddits': 'requester_number_of_subreddits_at_request',
    }
    for feature_name, source_col in activity_sources.items():
        out[feature_name] = df[source_col]

    # Activity ratios relative to (posts + 1)
    posts_plus_one = out['num_posts'] + 1
    out['comments_per_post'] = out['num_comments'] / posts_plus_one
    out['subreddits_per_post'] = out['num_subreddits'] / posts_plus_one

    # Vote totals at request time
    out['upvotes_minus_downvotes'] = df['requester_upvotes_minus_downvotes_at_request']
    out['upvotes_plus_downvotes'] = df['requester_upvotes_plus_downvotes_at_request']

    # Raw epoch seconds plus calendar parts derived from them
    epoch_seconds = df['unix_timestamp_of_request']
    request_time = pd.to_datetime(epoch_seconds, unit='s')
    out['timestamp'] = epoch_seconds
    out['hour_of_day'] = request_time.dt.hour
    out['day_of_week'] = request_time.dt.dayofweek
    # dayofweek 5 and 6 are Saturday and Sunday
    out['is_weekend'] = out['day_of_week'].isin([5, 6]).astype(int)

    # Any remaining missing values become 0
    return out.fillna(0)

# Build the feature tables, training frame first and test frame second.
# Each call also writes a `combined_text` column onto the frame it receives.
train_features = engineer_features(df=train_df)
test_features = engineer_features(df=test_df)

# Report the size and column names of the training feature table
print(f"Engineered features shape: {train_features.shape}")
print(f"Feature columns: {list(train_features.columns)}")

Engineered features shape: (2878, 16)
Feature columns: ['text_length', 'word_count', 'title_length', 'avg_word_length', 'account_age_days', 'num_comments', 'num_posts', 'num_subreddits', 'comments_per_post', 'subreddits_per_post', 'upvotes_minus_downvotes', 'upvotes_plus_downvotes', 'timestamp', 'hour_of_day', 'day_of_week', 'is_weekend']


## TF-IDF Features for Text

In [9]:
# TF-IDF settings gathered in one place
tfidf_settings = dict(
    max_features=2000,      # vocabulary cap (raised from 1000)
    stop_words='english',
    ngram_range=(1, 2),     # unigrams and bigrams
    min_df=3,               # lowered from 5 to keep rarer terms
    max_df=0.95,            # drop terms present in more than 95% of documents
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary on the training text only, then vectorize it
train_text = train_df['combined_text'].fillna('')
tfidf_train = vectorizer.fit_transform(train_text)

# Vectorize the test text with the vocabulary learned above
test_text = test_df['combined_text'].fillna('')
tfidf_test = vectorizer.transform(test_text)

print(f"TF-IDF features shape: {tfidf_train.shape}")

TF-IDF features shape: (2878, 2000)


## Combine Features

In [10]:
# Wrap the dense engineered features as sparse matrices so they can be stacked with TF-IDF
train_engineered = csr_matrix(train_features.values)
test_engineered = csr_matrix(test_features.values)

# Stack TF-IDF columns first and engineered columns last.
# CSR is requested explicitly: the cross-validation loop selects rows with
# X_train[train_idx], which needs an indexable format, and hstack's default
# output format is chosen by SciPy rather than guaranteed to be CSR.
X_train = hstack([tfidf_train, train_engineered], format='csr')
X_test = hstack([tfidf_test, test_engineered], format='csr')

y_train = train_df['requester_received_pizza'].values

# Train and test must share the same column layout, and every row needs a label
assert X_train.shape[1] == X_test.shape[1], "train/test feature columns differ"
assert X_train.shape[0] == len(y_train), "row count does not match label count"

print(f"Final training matrix shape: {X_train.shape}")
print(f"Final test matrix shape: {X_test.shape}")

Final training matrix shape: (2878, 2016)
Final test matrix shape: (1162, 2016)


## Model Training with Cross-Validation

In [11]:
# Stratified, shuffled folds with a fixed random_state
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Out-of-fold predictions for training rows; fold-averaged predictions for test rows
oof_predictions = np.zeros(len(train_df))
test_predictions = np.zeros(len(test_df))

# Validation AUC of each fold
cv_scores = []

# Settings shared by every fold; scale_pos_weight is added per fold inside the loop
base_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 63,  # raised from 31 for a more complex model
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'num_threads': 4
}

print(f"Training LightGBM model with {n_splits}-fold stratified CV...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"\nFold {fold + 1}/{n_splits}")

    # Row subsets for fitting and for validation
    X_tr, y_tr = X_train[train_idx], y_train[train_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]

    # Class imbalance: weight positives by the negative/positive ratio of the fitting rows
    pos_rate = y_tr.mean()
    params = {**base_params, 'scale_pos_weight': (1 - pos_rate) / pos_rate}

    train_set = lgb.Dataset(X_tr, label=y_tr)
    val_set = lgb.Dataset(X_val, label=y_val)

    # Up to 2000 rounds; stop once validation AUC has not improved for 100 rounds
    # NOTE(review): the stopping round is chosen on the same fold that is scored
    # below, so the reported AUC is probably somewhat optimistic — confirm.
    model = lgb.train(
        params,
        train_set,
        num_boost_round=2000,
        valid_sets=[val_set],
        valid_names=['val'],
        callbacks=[
            lgb.early_stopping(100),
            lgb.log_evaluation(100)
        ]
    )

    # Predict with the best iteration found by early stopping
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # Fill this fold's slot of the OOF vector; add its share to the test average
    oof_predictions[val_idx] = val_pred
    test_predictions += test_pred / n_splits

    fold_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(fold_score)

    print(f"Fold {fold + 1} AUC: {fold_score:.4f}")

# AUC over the pooled out-of-fold predictions, then mean and spread of the per-fold AUCs
overall_score = roc_auc_score(y_train, oof_predictions)
print(f"\nOverall CV AUC: {overall_score:.4f}")
print(f"Mean CV AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

Training LightGBM model with 5-fold stratified CV...

Fold 1/5
Training until validation scores don't improve for 100 rounds


[100]	val's auc: 0.655905


Early stopping, best iteration is:
[76]	val's auc: 0.665418
Fold 1 AUC: 0.6654

Fold 2/5
Training until validation scores don't improve for 100 rounds


[100]	val's auc: 0.639319


[200]	val's auc: 0.636024
Early stopping, best iteration is:
[104]	val's auc: 0.640288
Fold 2 AUC: 0.6403

Fold 3/5
Training until validation scores don't improve for 100 rounds


[100]	val's auc: 0.649381


Early stopping, best iteration is:
[36]	val's auc: 0.677934
Fold 3 AUC: 0.6779

Fold 4/5
Training until validation scores don't improve for 100 rounds


[100]	val's auc: 0.611435
Early stopping, best iteration is:
[8]	val's auc: 0.631297
Fold 4 AUC: 0.6313

Fold 5/5
Training until validation scores don't improve for 100 rounds


[100]	val's auc: 0.620419
Early stopping, best iteration is:
[11]	val's auc: 0.63366
Fold 5 AUC: 0.6337

Overall CV AUC: 0.6413
Mean CV AUC: 0.6497 (+/- 0.0186)


## Feature Importance Analysis

In [12]:
# Column names in the order of the stacked matrix: TF-IDF terms, then engineered features
feature_names = [*vectorizer.get_feature_names_out(), *train_features.columns]

# Gain importance read from `model`, the model left over from the final CV fold
gain = model.feature_importance(importance_type='gain')
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': gain})
    .sort_values('importance', ascending=False)
)

print("Top 20 most important features:")
print(importance_df.head(20))

# Count the features that this model gives positive gain
non_zero_features = importance_df.loc[importance_df['importance'].gt(0)]
print(f"\nNumber of features with non-zero importance: {len(non_zero_features)}")

Top 20 most important features:
                      feature   importance
2012                timestamp  1361.622870
2000              text_length   654.490335
2011   upvotes_plus_downvotes   481.411610
2001               word_count   465.550311
2010  upvotes_minus_downvotes   420.842627
2002             title_length   372.966048
2008        comments_per_post   325.275016
2003          avg_word_length   324.242850
1279                    pizza   285.650590
2004         account_age_days   278.281860
872                      just   277.596773
2005             num_comments   276.734030
2013              hour_of_day   273.708783
2009      subreddits_per_post   223.592561
1094                    money   198.730501
2007           num_subreddits   198.583499
2014              day_of_week   198.212170
1009                     love   172.488650
746                      help   155.886431
1807                  tonight   154.544720

Number of features with non-zero importance: 105


## Create Submission

In [13]:
# Pair each test request id with its fold-averaged prediction
target_col = 'requester_received_pizza'
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    target_col: test_predictions
})

# Keep the predictions inside [0, 1]
submission[target_col] = submission[target_col].clip(lower=0, upper=1)

print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")
print(f"Prediction distribution:")
print(submission[target_col].describe())

# Sanity check: the predictions should not collapse to a single value
unique_predictions = submission[target_col].nunique()
print(f"\nNumber of unique prediction values: {unique_predictions}")

# Write the CSV without the index column
submission.to_csv('/home/submission/submission.csv', index=False)
print("\nSubmission saved to /home/submission/submission.csv")

Submission preview:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.270787
1   t3_roiuw                  0.235612
2   t3_mjnbq                  0.436364
3   t3_t8wd1                  0.388253
4  t3_1m4zxu                  0.201804

Submission shape: (1162, 2)
Prediction distribution:
count    1162.000000
mean        0.309310
std         0.118674
min         0.087961
25%         0.216137
50%         0.293694
75%         0.391359
max         0.618710
Name: requester_received_pizza, dtype: float64

Number of unique prediction values: 1162

Submission saved to /home/submission/submission.csv
