# Experiment 004: LightGBM with Fixed Leakage

This experiment addresses the critical issues identified in exp_003:
1. **Model upgrade**: Switch from LogisticRegression to LightGBM
2. **Fix data leakage**: Move ALL feature fitting (TF-IDF and SVD) inside the CV loop
3. **Optimize dimensionality**: Reduce to 75 total SVD components (50 word + 25 char)
4. **Better regularization**: LightGBM handles high dimensionality better than logistic regression

Expected improvement: +0.03 to +0.08 AUC (target: 0.68-0.72)

In [None]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading data...")
# JSON is read explicitly as UTF-8 so the request text decodes the same way
# regardless of the platform's default locale encoding.
with open('/home/data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('/home/data/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# One row per request record
df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)

print(f"Training samples: {len(df_train)}")
print(f"Test samples: {len(df_test)}")
# Mean of the target column = share of positive examples
print(f"Positive class rate: {df_train['requester_received_pizza'].mean():.3f}")

In [None]:
# Build one text field per request: the title, a single space, then the
# edit-aware body. Missing values are treated as empty strings.
for frame in (df_train, df_test):
    frame['combined_text'] = (
        frame['request_title'].fillna('')
        + ' '
        + frame['request_text_edit_aware'].fillna('')
    )

# Text preprocessing function
# Patterns are compiled once here because preprocess_text runs per row.
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_USER_MENTION_RE = re.compile(r'/u/\w+')
_EDIT_MARKER_RE = re.compile(r'EDIT:\s*')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """Clean text while preserving important patterns.

    Removes URLs, ``/u/<name>`` mentions and ``EDIT:`` markers, then
    collapses every run of whitespace to a single space and trims the ends.

    Args:
        text: Raw text value; anything ``pd.isna`` reports as missing
            (``None``, ``NaN``) is accepted.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""

    cleaned = str(text)
    cleaned = _URL_RE.sub('', cleaned)
    cleaned = _USER_MENTION_RE.sub('', cleaned)
    cleaned = _EDIT_MARKER_RE.sub('', cleaned)
    # Collapse whitespace last: each removal above can leave two adjacent
    # spaces behind, which would otherwise survive into the cleaned text.
    cleaned = _WHITESPACE_RE.sub(' ', cleaned)

    return cleaned.strip()

# Clean the combined text of both splits with the same function
for frame in (df_train, df_test):
    frame['combined_text_clean'] = frame['combined_text'].apply(preprocess_text)

print("Text preprocessing completed")

In [None]:
# Target vector plus engineered numeric features.
# Every transform below is row-wise, so train and test get the same
# treatment by looping over both frames.
y = df_train['requester_received_pizza'].values

# Raw activity counts that each get a log1p-transformed companion column
count_features = [
    'requester_number_of_comments_at_request',
    'requester_number_of_posts_at_request',
    'requester_upvotes_plus_downvotes_at_request'
]

for frame in (df_train, df_test):
    # Log transforms for count features
    for feat in count_features:
        frame[f'{feat}_log'] = np.log1p(frame[feat])

    # Ratios (the +1 in the denominator avoids division by zero)
    frame['upvotes_per_comment'] = (
        frame['requester_upvotes_plus_downvotes_at_request']
        / (frame['requester_number_of_comments_at_request'] + 1)
    )
    frame['comments_per_post'] = (
        frame['requester_number_of_comments_at_request']
        / (frame['requester_number_of_posts_at_request'] + 1)
    )

    # Account age in years
    frame['account_age_years'] = frame['requester_account_age_in_days_at_request'] / 365.25

    # Text statistics: character count and whitespace-separated token count
    clean_text = frame['combined_text_clean']
    frame['text_length'] = clean_text.str.len()
    frame['word_count'] = clean_text.str.split().str.len()

# Columns handed to the model, in matrix column order
numeric_features = [f'{feat}_log' for feat in count_features] + [
    'upvotes_per_comment',
    'comments_per_post',
    'account_age_years',
    'text_length',
    'word_count'
]

train_numeric = df_train[numeric_features].values
test_numeric = df_test[numeric_features].values

print(f"Created {len(numeric_features)} numeric features")
print(f"Train numeric shape: {train_numeric.shape}")
print(f"Test numeric shape: {test_numeric.shape}")

In [None]:
# Factory for the text feature extractors (TF-IDF followed by truncated SVD)
def create_text_pipeline(n_word_components=50, n_char_components=25):
    """Build fresh, unfitted word-level and char-level text pipelines.

    Each pipeline is a TF-IDF vectorizer followed by a TruncatedSVD step
    (seeded with random_state=42).

    Args:
        n_word_components: Number of SVD components for the word pipeline.
        n_char_components: Number of SVD components for the char pipeline.

    Returns:
        Tuple ``(word_pipeline, char_pipeline)``.
    """
    # Word-level: unigrams and bigrams, English stop words removed
    word_tfidf = TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9,
        max_features=5000,
        sublinear_tf=True
    )
    word_svd = TruncatedSVD(n_components=n_word_components, random_state=42)

    # Character-level: 2- to 4-grams, case preserved
    char_tfidf = TfidfVectorizer(
        analyzer='char',
        lowercase=False,
        ngram_range=(2, 4),
        min_df=5,
        max_df=0.95,
        max_features=2000,
        sublinear_tf=True
    )
    char_svd = TruncatedSVD(n_components=n_char_components, random_state=42)

    word_pipeline = Pipeline(steps=[('tfidf', word_tfidf), ('svd', word_svd)])
    char_pipeline = Pipeline(steps=[('tfidf', char_tfidf), ('svd', char_svd)])

    return word_pipeline, char_pipeline

print("Text pipeline functions defined")

In [None]:
# Stratified CV setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
n_folds = skf.get_n_splits()

# Store predictions
oof_predictions = np.zeros(len(df_train))
# NOTE(review): test_predictions is allocated but never written in this cell;
# kept in case other cells read it.
test_predictions = np.zeros(len(df_test))
cv_scores = []
best_iterations = []  # early-stopped tree count of each fold's model

print(f"Starting {n_folds}-fold CV with LightGBM and fixed leakage...")
print("=" * 60)

for fold, (train_idx, val_idx) in enumerate(skf.split(df_train, y), start=1):
    print(f"\nFold {fold}/{n_folds}")
    
    # Split data
    X_train_text = df_train['combined_text_clean'].iloc[train_idx]
    X_val_text = df_train['combined_text_clean'].iloc[val_idx]
    X_train_num = train_numeric[train_idx]
    X_val_num = train_numeric[val_idx]
    
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Create and fit text pipelines (INSIDE CV LOOP - NO LEAKAGE)
    word_pipe, char_pipe = create_text_pipeline(n_word_components=50, n_char_components=25)
    
    # Fit on training data only
    X_train_word_svd = word_pipe.fit_transform(X_train_text)
    X_train_char_svd = char_pipe.fit_transform(X_train_text)
    
    # Transform validation data with the train-fitted pipelines
    X_val_word_svd = word_pipe.transform(X_val_text)
    X_val_char_svd = char_pipe.transform(X_val_text)
    
    # Combine all features (column order: word SVD, char SVD, numeric)
    X_train_combined = np.hstack([X_train_word_svd, X_train_char_svd, X_train_num])
    X_val_combined = np.hstack([X_val_word_svd, X_val_char_svd, X_val_num])
    
    print(f"  Training features shape: {X_train_combined.shape}")
    print(f"  Validation features shape: {X_val_combined.shape}")
    
    # Train LightGBM model
    model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=64,
        max_depth=7,
        min_child_samples=50,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    
    # NOTE(review): early stopping monitors the same fold that is scored
    # below, so the per-fold AUC is likely somewhat optimistic - confirm
    # whether a separate inner split is wanted.
    model.fit(
        X_train_combined, y_train,
        eval_set=[(X_val_combined, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    
    # Predict and evaluate
    val_pred = model.predict_proba(X_val_combined)[:, 1]
    fold_auc = roc_auc_score(y_val, val_pred)
    cv_scores.append(fold_auc)
    best_iterations.append(model.best_iteration_)
    oof_predictions[val_idx] = val_pred
    
    print(f"  Fold {fold} AUC: {fold_auc:.4f}")
    print(f"  Best iteration: {model.best_iteration_}")

# Every training row is in exactly one validation fold, so the out-of-fold
# vector is complete here and can be scored as a whole.
oof_auc = roc_auc_score(y, oof_predictions)

print("\n" + "=" * 60)
print(f"Overall CV AUC: {np.mean(cv_scores):.4f}")
print(f"CV scores: {cv_scores}")
print(f"Mean ± Std: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
print(f"OOF AUC (all folds pooled): {oof_auc:.4f}")
print(f"Best iterations per fold: {best_iterations}")

In [None]:
# Generate final predictions using full training data
print("Training final model on full data...")

# Create and fit text pipelines on full training data
word_pipe_full, char_pipe_full = create_text_pipeline(n_word_components=50, n_char_components=25)

train_word_svd_full = word_pipe_full.fit_transform(df_train['combined_text_clean'])
train_char_svd_full = char_pipe_full.fit_transform(df_train['combined_text_clean'])

# Test text is only transformed, never fitted on
test_word_svd_full = word_pipe_full.transform(df_test['combined_text_clean'])
test_char_svd_full = char_pipe_full.transform(df_test['combined_text_clean'])

# Combine all features (column order: word SVD, char SVD, numeric)
train_combined_full = np.hstack([train_word_svd_full, train_char_svd_full, train_numeric])
test_combined_full = np.hstack([test_word_svd_full, test_char_svd_full, test_numeric])

# Human-readable column names in the same order as the hstack above, so the
# importance report is not just bare column indices.
feature_names = (
    [f'word_svd_{i}' for i in range(train_word_svd_full.shape[1])]
    + [f'char_svd_{i}' for i in range(train_char_svd_full.shape[1])]
    + list(numeric_features)
)

print(f"Final training features shape: {train_combined_full.shape}")
print(f"Final test features shape: {test_combined_full.shape}")

# Train final model
# NOTE(review): the CV models in this notebook are fitted with
# lgb.early_stopping, but this fit has no eval_set and therefore grows all
# n_estimators=1000 trees; consider capping n_estimators near the best
# iterations observed in CV so the CV score describes the submitted model.
final_model = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=7,
    min_child_samples=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# NOTE(review): eval_metric appears to have no effect without an eval_set;
# left in place to keep the call unchanged - confirm against LightGBM docs.
final_model.fit(
    train_combined_full, y,
    eval_metric='auc'
)

# Generate final predictions (probability of the positive class)
final_predictions = final_model.predict_proba(test_combined_full)[:, 1]

print(f"Final model trained on {train_combined_full.shape[1]} features")
print(f"Final predictions shape: {final_predictions.shape}")
print("Feature importance - top 10:")
# Stable sort by importance, highest first; ties keep column order
ranked_importances = sorted(
    enumerate(final_model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True
)[:10]
for rank, (idx, imp) in enumerate(ranked_importances, start=1):
    print(f"  {rank}. {feature_names[idx]} (column {idx}): {imp:.4f}")

In [None]:
# Create submission file
import os  # cell-local import: only needed to create the output directory

submission = pd.DataFrame({
    'request_id': df_test['request_id'],
    'requester_received_pizza': final_predictions
})

# Ensure proper format
submission['requester_received_pizza'] = submission['requester_received_pizza'].astype(float)

# Save submission
submission_path = '/home/submission/submission_lightgbm_fixed_leakage.csv'
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"\nSubmission saved to {submission_path}")
print(f"Submission shape: {submission.shape}")
print(f"Prediction range: {submission['requester_received_pizza'].min():.4f} to {submission['requester_received_pizza'].max():.4f}")
print("\nSubmission preview:")
print(submission.head())