# DistilBERT + LightGBM with Proper Scaling

**Problem**: exp_006 (DistilBERT baseline) underperformed due to catastrophic scale mismatch
- Meta-features scale: ~1.38 billion
- DistilBERT embeddings scale: ~11.3
- Ratio: 122 million to 1

**Solution**: Apply RobustScaler to meta-features before concatenation

**Expected improvement**: +0.03-0.05 AUC (from 0.6312 to 0.66-0.68)

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import warnings
warnings.filterwarnings('ignore')

# Seed every random number generator used below with one shared value
RANDOM_SEED = 42
for seed_rng in (np.random.seed, torch.manual_seed):
    seed_rng(RANDOM_SEED)

print("Libraries loaded successfully")

## 1. Load Data

In [None]:
# Load data
# JSON is UTF-8 by specification, so the encoding is pinned explicitly instead
# of depending on the platform's default locale encoding.
train_path = '/home/data/train.json'
with open(train_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
train_df = pd.DataFrame(train_data)

test_path = '/home/data/test.json'
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)
test_df = pd.DataFrame(test_data)

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Target distribution: {train_df['requester_received_pizza'].mean():.4f}")

# Combine text fields: title and body joined by a single space, with missing
# values treated as empty strings. Both frames get the identical treatment.
for frame in (train_df, test_df):
    frame['combined_text'] = (
        frame['request_title'].fillna('')
        + ' '
        + frame['request_text_edit_aware'].fillna('')
    )

## 2. Extract DistilBERT Embeddings

In [None]:
# Bring up the pretrained tokenizer/encoder pair; both come from the same checkpoint
print("Loading DistilBERT...")
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertModel.from_pretrained(checkpoint)
model.eval()

# Move the encoder to the GPU when one is present, then report where it runs
gpu_available = torch.cuda.is_available()
if gpu_available:
    model = model.cuda()
print("Using GPU for DistilBERT" if gpu_available else "Using CPU for DistilBERT")

# Extract embeddings
batch_size = 16
def extract_distilbert_features(texts, max_length=256):
    """Extract [CLS] token embeddings from DistilBERT.

    Uses the module-level ``tokenizer``, ``model`` and ``batch_size``.

    Args:
        texts: Iterable of strings to embed.
        max_length: Token limit passed to the tokenizer; truncation is
            enabled, so longer inputs are cut to this length.

    Returns:
        A 2-D NumPy array with one row per input text, taken from the first
        token position of the model's last hidden state. An empty input
        returns an array of shape ``(0, model.config.dim)``.
    """
    # Materialise once so len() and slicing work for any iterable input
    texts = list(texts)
    if not texts:
        # np.vstack raises on an empty list; return a well-shaped empty result
        return np.empty((0, model.config.dim), dtype=np.float32)

    # Follow the device the model actually lives on instead of re-querying
    # CUDA availability, so inputs and weights can never end up on
    # different devices.
    device = next(model.parameters()).device

    all_features = []
    # Inference only: no gradients are needed for any batch
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            # Tokenize (pads to the longest text in the batch)
            inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Use [CLS] token embedding (first token)
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_features.append(cls_embeddings)

    return np.vstack(all_features)

# Embed both splits (this will take a few minutes)
print("Extracting DistilBERT features from training data...")
train_texts = train_df['combined_text'].tolist()
train_distilbert = extract_distilbert_features(train_texts)
print(f"Training embeddings shape: {train_distilbert.shape}")

print("Extracting DistilBERT features from test data...")
test_texts = test_df['combined_text'].tolist()
test_distilbert = extract_distilbert_features(test_texts)
print(f"Test embeddings shape: {test_distilbert.shape}")

## 3. Engineer Meta-Features (SAFE features only - NO LEAKAGE)

In [None]:
# Define SAFE meta-features (same as honest baseline)
# NOTE(review): the text-length columns below are not derived anywhere in the
# visible notebook; unless the input JSON already carries them they will be
# reported as missing and dropped - confirm this is intended.
meta_features = [
    # Text length features
    'total_text_length', 'title_word_count', 'total_word_count', 'word_count', 'text_length',

    # User activity at request time (SAFE - no leakage)
    'requester_number_of_posts_at_request',
    'requester_number_of_comments_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',

    # Account age and temporal features
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',

    # Post metadata
    'post_was_edited'
]

# Keep only features present in BOTH frames. Checking the training frame
# alone would let a train-only column through and make the test-frame
# selection below fail with a KeyError.
available_features = []
for feature in meta_features:
    missing_from = [
        split_name
        for split_name, frame in (('train', train_df), ('test', test_df))
        if feature not in frame.columns
    ]
    if missing_from:
        print(f"Warning: {feature} not found in {' and '.join(missing_from)} data")
    else:
        available_features.append(feature)

print(f"\nUsing {len(available_features)} meta-features:")
for i, f in enumerate(available_features, 1):
    print(f"{i:2d}. {f}")

# Prepare meta-features (copies, so the source frames stay untouched)
train_meta = train_df[available_features].copy()
test_meta = test_df[available_features].copy()

# Handle boolean post_was_edited.
# Going through bool first guarantees a strict 0/1 flag: a plain astype(int)
# would keep any non-boolean numeric value at full magnitude.
# NOTE(review): presumably the ~1.38 billion meta-feature scale seen in
# exp_006 came from such values in this column - confirm against the data.
# Missing values are treated as "not edited" (assumption - confirm).
if 'post_was_edited' in available_features:
    for meta_frame in (train_meta, test_meta):
        meta_frame['post_was_edited'] = (
            meta_frame['post_was_edited'].fillna(False).astype(bool).astype(int)
        )

print(f"\nTrain meta-features shape: {train_meta.shape}")
print(f"Test meta-features shape: {test_meta.shape}")

## 4. Apply Proper Scaling

In [None]:
# Scale the meta-features with RobustScaler (handles outliers better than
# StandardScaler). The scaler is fitted on the training rows only and then
# applied unchanged to both splits.
print("Applying RobustScaler to meta-features...")
scaler = RobustScaler().fit(train_meta)
train_meta_scaled = scaler.transform(train_meta)
test_meta_scaled = scaler.transform(test_meta)

# Summary statistics for each feature group, printed in the same layout
for heading, matrix in (
    ("Meta-features after scaling:", train_meta_scaled),
    ("\nDistilBERT embeddings:", train_distilbert),
):
    print(heading)
    print(f"  Train - Mean: {matrix.mean():.4f}, Std: {matrix.std():.4f}")
    print(f"  Range: [{matrix.min():.4f}, {matrix.max():.4f}]")

# Compare the largest absolute value of each group after scaling
meta_scale = np.max(np.abs(train_meta_scaled))
distilbert_scale = np.max(np.abs(train_distilbert))
scale_ratio = meta_scale / distilbert_scale

print("\n=== SCALE COMPARISON ===")
print(f"Meta-features scale: {meta_scale:.4f}")
print(f"DistilBERT scale: {distilbert_scale:.4f}")
print(f"Scale ratio (meta/distilbert): {scale_ratio:.2f}x")

# Start from the best verdict and escalate as each threshold is exceeded
verdict = "GOOD: Scale mismatch resolved"
if scale_ratio > 3:
    verdict = "CAUTION: Moderate scale mismatch, but acceptable"
if scale_ratio > 10:
    verdict = "WARNING: Still significant scale mismatch!"
print(verdict)

## 5. Combine Features

In [None]:
# Build the model inputs: scaled meta-feature columns first, then the
# DistilBERT embedding columns, joined side by side.
X_train = np.concatenate([train_meta_scaled, train_distilbert], axis=1)
X_test = np.concatenate([test_meta_scaled, test_distilbert], axis=1)
y_train = train_df['requester_received_pizza'].values

meta_width = train_meta_scaled.shape[1]
embedding_width = train_distilbert.shape[1]

print(f"Final training features shape: {X_train.shape}")
print(f"Final test features shape: {X_test.shape}")
print("Feature composition:")
print(f"  - Meta-features: {meta_width}")
print(f"  - DistilBERT embeddings: {embedding_width}")
print(f"  - Total: {X_train.shape[1]}")

## 6. Train LightGBM with Optimized Hyperparameters

In [None]:
# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Class counts for the imbalance weight; computed with a vectorised sum and
# kept as plain Python numbers so the parameter dict holds built-in types.
n_positive = int(np.sum(y_train))
n_negative = len(y_train) - n_positive

# Optimized hyperparameters for neural features
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 63,  # Increased from 31 for more complex features
    'learning_rate': 0.05,  # Reduced for better convergence
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': RANDOM_SEED,
    'max_depth': 7,  # Added to prevent overfitting
    'min_child_samples': 20,  # Increased for stability
    'scale_pos_weight': n_negative / n_positive  # Handle class imbalance
}

print("LightGBM Parameters:")
for k, v in lgb_params.items():
    print(f"  {k}: {v}")

# Cross-validation
cv_scores = []
fold_predictions = []
feature_importances = []

print("\nStarting 5-fold CV...")
for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train), 1):
    print(f"\nFold {fold}:")

    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    # Create datasets
    train_set = lgb.Dataset(X_tr, label=y_tr)
    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    # Train with early stopping.
    # The booster gets its own name: assigning it to `model` would overwrite
    # the DistilBERT encoder that extract_distilbert_features() reads from
    # the module namespace.
    # NOTE: the fold used for early stopping is also the fold scored below,
    # so the reported AUC is an optimistic estimate.
    fold_booster = lgb.train(
        lgb_params,
        train_set,
        num_boost_round=1000,
        valid_sets=[val_set],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=True),
            lgb.log_evaluation(period=0)  # Suppress iteration logs
        ]
    )

    # Predict and evaluate
    val_pred = fold_booster.predict(X_val, num_iteration=fold_booster.best_iteration)
    fold_score = roc_auc_score(y_val, val_pred)
    cv_scores.append(fold_score)

    print(f"  AUC: {fold_score:.4f}")
    print(f"  Best iteration: {fold_booster.best_iteration}")

    # Store feature importance
    importance = fold_booster.feature_importance(importance_type='gain')
    feature_importances.append(importance)

    # Store predictions for analysis
    fold_predictions.append((val_idx, val_pred))

# Overall CV score
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)
print("\n=== CV Results ===")
# The plus-minus sign was previously mis-encoded as two garbage characters
print(f"Mean AUC: {cv_mean:.4f} ± {cv_std:.4f}")
print(f"Individual folds: {[f'{s:.4f}' for s in cv_scores]}")

# Compare with previous experiments
print("\n=== Performance Comparison ===")
print("Previous TF-IDF baseline: 0.6253")
print("Previous DistilBERT (unscaled): 0.6312")
print(f"This experiment (scaled): {cv_mean:.4f}")
print(f"Improvement over TF-IDF: {cv_mean - 0.6253:.4f}")
print(f"Improvement over unscaled DistilBERT: {cv_mean - 0.6312:.4f}")

## 7. Feature Importance Analysis

In [None]:
# Average the per-fold gain importances into one vector
mean_importance = np.asarray(feature_importances).mean(axis=0)

# The first n_meta columns are meta-features, the remainder are embeddings
n_meta = train_meta_scaled.shape[1]
meta_importance, distilbert_importance = np.split(mean_importance, [n_meta])

total_gain = mean_importance.sum()
meta_gain = meta_importance.sum()
distilbert_gain = distilbert_importance.sum()

print("=== Feature Importance Analysis ===")
print(f"Total importance: {total_gain:.2f}")
print(f"Meta-features importance: {meta_gain:.2f} ({meta_gain/total_gain*100:.1f}%)")
print(f"DistilBERT importance: {distilbert_gain:.2f} ({distilbert_gain/total_gain*100:.1f}%)")

# Five strongest meta-features, highest importance first
meta_feature_names = available_features
top_meta_idx = np.argsort(meta_importance)[::-1][:5]
print("\nTop 5 Meta-Features:")
for rank, col in enumerate(top_meta_idx, 1):
    print(f"  {rank}. {meta_feature_names[col]}: {meta_importance[col]:.2f}")

# Five strongest embedding dimensions, highest importance first
top_distilbert_idx = np.argsort(distilbert_importance)[::-1][:5]
print("\nTop 5 DistilBERT Features:")
for rank, col in enumerate(top_distilbert_idx, 1):
    print(f"  {rank}. distilbert_{col}: {distilbert_importance[col]:.2f}")

## 8. Train Final Model and Generate Predictions

In [None]:
# Train the final model on all training rows, write the submission file and
# record the experiment summary as JSON.
import json
import os

print("Training final model on full training data...")
# NOTE(review): the CV loop picks its round count by early stopping, while
# this fit always runs a fixed 500 rounds - confirm that is intended.
final_model = lgb.train(
    lgb_params,
    lgb.Dataset(X_train, label=y_train),
    num_boost_round=500,  # Use reasonable number of iterations
    callbacks=[lgb.log_evaluation(period=0)]
)

# Generate predictions
test_predictions = final_model.predict(X_test)

print(f"Test predictions - Mean: {test_predictions.mean():.4f}, Std: {test_predictions.std():.4f}")
print(f"Range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")

# Create submission
submission_df = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

submission_path = '/home/code/submission_candidates/candidate_005_distilbert_scaled.csv'
# Create the target directory first so a fresh checkout does not fail on write
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_df.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Save experiment results.
# Values are converted to built-in Python types so json.dump does not depend
# on which NumPy scalar types happen to be serialisable.
importance_total = mean_importance.sum()
experiment_results = {
    'experiment_id': 'exp_007',
    'name': 'distilbert_proper_scaling',
    'cv_mean': float(cv_mean),
    'cv_std': float(cv_std),
    'cv_scores': [float(score) for score in cv_scores],
    'feature_composition': {
        'meta_features': int(n_meta),
        'distilbert_features': int(train_distilbert.shape[1]),
        'total': int(X_train.shape[1])
    },
    'meta_importance_pct': float(meta_importance.sum() / importance_total * 100),
    'distilbert_importance_pct': float(distilbert_importance.sum() / importance_total * 100),
    'scaling_method': 'RobustScaler',
    'scale_ratio_after_scaling': float(scale_ratio),
    'lgb_params': {
        key: (value.item() if isinstance(value, np.generic) else value)
        for key, value in lgb_params.items()
    }
}

results_path = '/home/code/experiments/exp_007_results.json'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(experiment_results, f, indent=2)

print(f"\nExperiment results saved to: {results_path}")