# Experiment 004: DistilBERT Feature Extraction + LightGBM

**Objective**: Upgrade from TF-IDF to DistilBERT transformer embeddings for text representation

**Expected gain**: +0.03-0.05 AUC (target: 0.65-0.68 CV)

**Key improvements**:
- DistilBERT [CLS] token embeddings (768-dim) instead of TF-IDF (5000-dim sparse)
- Captures semantic relationships and narrative structure
- 60% faster than BERT, 40% smaller (66M params), lower overfitting risk
- Maintains all proven meta-features from honest baseline

**Reference**: Based on analysis in `exploration/evolver_loop5_analysis.ipynb`

In [1]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import torch
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

print("Libraries imported successfully")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

Libraries imported successfully
PyTorch version: 2.2.0+cu118
CUDA available: True
GPU: NVIDIA A100-SXM4-80GB
GPU Memory: 85.1 GB


## Load Data

In [2]:
# Load training data
train_path = '/home/data/train.json'
with open(train_path, 'r') as f:
    train_data = json.load(f)
train_df = pd.DataFrame(train_data)

# Load test data
test_path = '/home/data/test.json'
with open(test_path, 'r') as f:
    test_data = json.load(f)
test_df = pd.DataFrame(test_data)

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Target distribution:")
print(train_df['requester_received_pizza'].value_counts(normalize=True))

Training samples: 2878
Test samples: 1162
Target distribution:
requester_received_pizza
False    0.751564
True     0.248436
Name: proportion, dtype: float64


## Install and Import Transformers

Install required libraries for transformer model

In [3]:
# Install transformers if not already available
import subprocess
import sys

try:
    import transformers
    print(f"Transformers version: {transformers.__version__}")
except ImportError:
    print("Installing transformers...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
    import transformers
    print(f"Transformers installed: {transformers.__version__}")

# Import required modules
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

print("All transformer libraries ready")

Transformers version: 4.57.3


2026-01-11 19:17:56.884632: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-11 19:17:56.908181: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-11 19:17:56.915095: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


All transformer libraries ready


## Feature Engineering - Meta Features

Extract the same meta-features that worked well in the honest baseline

In [4]:
def engineer_meta_features(df):
    """Engineer meta-features (same as honest baseline)"""
    features = pd.DataFrame(index=df.index)
    
    # 1. Text features - combine title and text for full context
    if 'request_text' in df.columns:
        text_col = 'request_text'
    else:
        text_col = 'request_text_edit_aware'
    
    features['full_text'] = df['request_title'].fillna('') + ' ' + df[text_col].fillna('')
    
    # 2. Text length features
    features['text_length'] = df[text_col].fillna('').str.len()
    features['title_length'] = df['request_title'].fillna('').str.len()
    features['total_text_length'] = features['text_length'] + features['title_length']
    
    # Word count features
    features['word_count'] = df[text_col].fillna('').str.split().str.len()
    features['title_word_count'] = df['request_title'].fillna('').str.split().str.len()
    features['total_word_count'] = features['word_count'] + features['title_word_count']
    
    # 3. User activity features (SAFE - at request time)
    features['requester_number_of_comments_at_request'] = df['requester_number_of_comments_at_request']
    features['requester_number_of_posts_at_request'] = df['requester_number_of_posts_at_request']
    features['requester_upvotes_minus_downvotes_at_request'] = df['requester_upvotes_minus_downvotes_at_request']
    features['requester_upvotes_plus_downvotes_at_request'] = df['requester_upvotes_plus_downvotes_at_request']
    
    # 4. Account age and temporal features (SAFE)
    features['requester_account_age_in_days_at_request'] = df['requester_account_age_in_days_at_request']
    features['requester_days_since_first_post_on_raop_at_request'] = df['requester_days_since_first_post_on_raop_at_request']
    
    # Hour of day feature
    features['hour_of_day'] = pd.to_datetime(df['unix_timestamp_of_request'], unit='s').dt.hour
    features['day_of_week'] = pd.to_datetime(df['unix_timestamp_of_request'], unit='s').dt.dayofweek
    
    # 5. Binary keyword indicators (from honest baseline)
    keywords = ['thanks', 'thank', 'please', 'because', 'pay', 'forward']
    for keyword in keywords:
        features[f'keyword_{keyword}'] = features['full_text'].str.contains(keyword, case=False, na=False).astype(int)
    
    # 6. Post edited indicator (SAFE - at request time)
    if 'post_was_edited' in df.columns:
        features['post_was_edited'] = df['post_was_edited'].apply(
            lambda x: 1 if (isinstance(x, bool) and x) or (not isinstance(x, bool) and pd.notna(x)) else 0
        )
    else:
        features['post_was_edited'] = 0
    
    return features

# Engineer features for train and test
print("Engineering meta-features for training data...")
train_meta_features = engineer_meta_features(train_df)

print("Engineering meta-features for test data...")
test_meta_features = engineer_meta_features(test_df)

print(f"Train meta-features shape: {train_meta_features.shape}")
print(f"Test meta-features shape: {test_meta_features.shape}")

Engineering meta-features for training data...
Engineering meta-features for test data...
Train meta-features shape: (2878, 22)
Test meta-features shape: (1162, 22)


## DistilBERT Feature Extraction

Extract [CLS] token embeddings from DistilBERT for semantic text representation

In [7]:
class TextDataset(Dataset):
    """Dataset for processing text with DistilBERT"""
    def __init__(self, texts, tokenizer, max_length=256):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

def extract_distilbert_features(texts, tokenizer, model, device, batch_size=16, max_length=256):
    """Extract [CLS] embeddings from DistilBERT"""
    dataset = TextDataset(texts, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    all_features = []
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            # Get DistilBERT outputs
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            
            # Extract [CLS] token embeddings (first token)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_features.append(cls_embeddings)
            
            if batch_idx % 50 == 0:
                print(f"  Processed {batch_idx * batch_size}/{len(texts)} texts")
    
    # Concatenate all batches
    all_features = np.vstack(all_features)
    print(f"  Completed: {all_features.shape}")
    
    return all_features

In [8]:
# Initialize DistilBERT tokenizer and model
print("Loading DistilBERT tokenizer and model...")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.eval()

print(f"DistilBERT model loaded successfully")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Using device: {device}")

# Extract features for training and test data
print("\nExtracting DistilBERT features for training data...")
train_texts = train_meta_features['full_text'].tolist()
train_distilbert_features = extract_distilbert_features(
    train_texts, tokenizer, model, device, 
    batch_size=16, max_length=256
)

print(f"Training DistilBERT features shape: {train_distilbert_features.shape}")

print("\nExtracting DistilBERT features for test data...")
test_texts = test_meta_features['full_text'].tolist()
test_distilbert_features = extract_distilbert_features(
    test_texts, tokenizer, model, device,
    batch_size=16, max_length=256
)

print(f"Test DistilBERT features shape: {test_distilbert_features.shape}")

Loading DistilBERT tokenizer and model...


DistilBERT model loaded successfully
Model parameters: 66,362,880
Using device: cuda

Extracting DistilBERT features for training data...
  Processed 0/2878 texts


  Processed 800/2878 texts


  Processed 1600/2878 texts


  Processed 2400/2878 texts


  Completed: (2878, 768)
Training DistilBERT features shape: (2878, 768)

Extracting DistilBERT features for test data...
  Processed 0/1162 texts


  Processed 800/1162 texts


  Completed: (1162, 768)
Test DistilBERT features shape: (1162, 768)


## Combine Features

Concatenate DistilBERT embeddings with meta-features

In [9]:
# Convert DistilBERT features to DataFrames with proper column names
train_distilbert_df = pd.DataFrame(
    train_distilbert_features, 
    columns=[f'distilbert_{i}' for i in range(train_distilbert_features.shape[1])],
    index=train_meta_features.index
)

test_distilbert_df = pd.DataFrame(
    test_distilbert_features,
    columns=[f'distilbert_{i}' for i in range(test_distilbert_features.shape[1])],
    index=test_meta_features.index
)

# Get meta-feature columns (excluding full_text)
meta_feature_cols = [col for col in train_meta_features.columns if col != 'full_text']

# Combine features
train_meta_only = train_meta_features[meta_feature_cols]
test_meta_only = test_meta_features[meta_feature_cols]

train_features_combined = pd.concat([train_meta_only, train_distilbert_df], axis=1)
test_features_combined = pd.concat([test_meta_only, test_distilbert_df], axis=1)

print(f"Combined feature shapes:")
print(f"Train: {train_features_combined.shape}")
print(f"Test: {test_features_combined.shape}")
print(f"Meta-features: {len(meta_feature_cols)}")
print(f"DistilBERT features: {train_distilbert_features.shape[1]}")

Combined feature shapes:
Train: (2878, 789)
Test: (1162, 789)
Meta-features: 21
DistilBERT features: 768


## Model Training with Cross-Validation

Train LightGBM on combined features (DistilBERT + meta-features)

In [10]:
# Prepare data for training
X = train_features_combined
y = train_df['requester_received_pizza'].astype(int)

# Stratified K-Fold to handle class imbalance
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

# Store predictions for ensembling
train_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(test_features_combined))
fold_scores = []

print(f"Starting {n_splits}-fold cross-validation with DistilBERT + meta-features...")
print(f"Total features: {len(X.columns)} (meta: {len(meta_feature_cols)}, DistilBERT: {train_distilbert_features.shape[1]})")

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}/{n_splits}")
    
    # Split data
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    # Calculate scale_pos_weight for handling class imbalance
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    print(f"  Scale pos weight: {scale_pos_weight:.2f}")
    
    # Train LightGBM model
    model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_SEED,
        scale_pos_weight=scale_pos_weight,
        n_jobs=-1
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        callbacks=[
            lgb.early_stopping(50),
            lgb.log_evaluation(0)
        ]
    )
    
    # Predict on validation set
    valid_pred = model.predict_proba(X_valid)[:, 1]
    fold_auc = roc_auc_score(y_valid, valid_pred)
    fold_scores.append(fold_auc)
    
    # Store out-of-fold predictions
    train_predictions[valid_idx] = valid_pred
    
    # Predict on test set
    test_pred = model.predict_proba(test_features_combined)[:, 1]
    test_predictions += test_pred / n_splits
    
    print(f"  Fold {fold + 1} AUC: {fold_auc:.4f}")

# Calculate overall CV score
cv_score = roc_auc_score(y, train_predictions)
print(f"\n{'='*50}")
print(f"CROSS-VALIDATION RESULTS")
print(f"{'='*50}")
print(f"Fold scores: {[f'{score:.4f}' for score in fold_scores]}")
print(f"Mean AUC: {np.mean(fold_scores):.4f}")
print(f"Std AUC: {np.std(fold_scores):.4f}")
print(f"OOF AUC: {cv_score:.4f}")
print(f"Improvement over TF-IDF baseline: {cv_score - 0.6253:.4f}")
print(f"{'='*50}")

Starting 5-fold cross-validation with DistilBERT + meta-features...
Total features: 789 (meta: 21, DistilBERT: 768)
Fold 1/5
  Scale pos weight: 3.02
[LightGBM] [Info] Number of positive: 572, number of negative: 1730


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198366


[LightGBM] [Info] Number of data points in the train set: 2302, number of used features: 789


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248480 -> initscore=-1.106738
[LightGBM] [Info] Start training from score -1.106738
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[5]	valid_0's auc: 0.627174	valid_0's binary_logloss: 0.551594
  Fold 1 AUC: 0.6272
Fold 2/5
  Scale pos weight: 3.02
[LightGBM] [Info] Number of positive: 572, number of negative: 1730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018496 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198358
[LightGBM] [Info] Number of data points in the train set: 2302, number of used features: 789


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248480 -> initscore=-1.106738
[LightGBM] [Info] Start training from score -1.106738
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[5]	valid_0's auc: 0.615247	valid_0's binary_logloss: 0.553539
  Fold 2 AUC: 0.6152
Fold 3/5
  Scale pos weight: 3.02
[LightGBM] [Info] Number of positive: 572, number of negative: 1730
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198366
[LightGBM] [Info] Number of data points in the train set: 2302, number of used features: 789
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248480 -> initscore=-1.106738
[LightGBM] [Info] Start training from score -1.106738


Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[9]	valid_0's auc: 0.657617	valid_0's binary_logloss: 0.547322
  Fold 3 AUC: 0.6576
Fold 4/5
  Scale pos weight: 3.03
[LightGBM] [Info] Number of positive: 572, number of negative: 1731
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016907 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198357
[LightGBM] [Info] Number of data points in the train set: 2303, number of used features: 789
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248372 -> initscore=-1.107316
[LightGBM] [Info] Start training from score -1.107316


Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[118]	valid_0's auc: 0.657828	valid_0's binary_logloss: 0.54546
  Fold 4 AUC: 0.6578
Fold 5/5
  Scale pos weight: 3.03
[LightGBM] [Info] Number of positive: 572, number of negative: 1731
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017000 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198359
[LightGBM] [Info] Number of data points in the train set: 2303, number of used features: 789


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248372 -> initscore=-1.107316
[LightGBM] [Info] Start training from score -1.107316
Training until validation scores don't improve for 50 rounds


Early stopping, best iteration is:
[3]	valid_0's auc: 0.598024	valid_0's binary_logloss: 0.554871
  Fold 5 AUC: 0.5980

CROSS-VALIDATION RESULTS
Fold scores: ['0.6272', '0.6152', '0.6576', '0.6578', '0.5980']
Mean AUC: 0.6312
Std AUC: 0.0236
OOF AUC: 0.6231
Improvement over TF-IDF baseline: -0.0022


## Feature Importance Analysis

Compare importance of DistilBERT vs meta-features

In [11]:
# Get feature importance from the last fold model
feature_importance = model.booster_.feature_importance(importance_type='gain')
feature_names = X.columns.tolist()

# Create importance dataframe
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("Top 20 Most Important Features:")
print(importance_df.head(20))

# Analyze feature types
distilbert_features = importance_df[importance_df['feature'].str.startswith('distilbert_')]
meta_features = importance_df[~importance_df['feature'].str.startswith('distilbert_')]

print(f"\nFeature type summary:")
print(f"DistilBERT features: {len(distilbert_features)} features, total importance: {distilbert_features['importance'].sum():.2f}")
print(f"Meta features: {len(meta_features)} features, total importance: {meta_features['importance'].sum():.2f}")

# Check top meta-features
top_meta = meta_features.head(10)
print(f"\nTop 10 Meta Features:")
print(top_meta)

Top 20 Most Important Features:
                                               feature  importance
0                                          text_length  367.057404
8         requester_upvotes_minus_downvotes_at_request  270.540901
369                                     distilbert_348  137.043400
377                                     distilbert_356  105.990799
550                                     distilbert_529   92.859898
766                                     distilbert_745   82.963799
411                                     distilbert_390   76.864098
290                                     distilbert_269   76.505800
457                                     distilbert_436   73.598797
277                                     distilbert_256   65.429701
280                                     distilbert_259   58.305302
33                                       distilbert_12   57.306999
5                                     total_word_count   54.409698
11   requester_days_since_firs

## Create Submission

In [12]:
# Create submission file
submission = pd.DataFrame({
    'request_id': test_df['request_id'],
    'requester_received_pizza': test_predictions
})

print("Submission shape:", submission.shape)
print("\nFirst few rows:")
print(submission.head())

# Save submission
submission_path = '/home/submission/submission_004_distilbert_baseline.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to: {submission_path}")

# Check distribution of predictions
print(f"\nPrediction distribution:")
print(f"Mean: {submission['requester_received_pizza'].mean():.4f}")
print(f"Std: {submission['requester_received_pizza'].std():.4f}")
print(f"Min: {submission['requester_received_pizza'].min():.4f}")
print(f"Max: {submission['requester_received_pizza'].max():.4f}")

# Compare to target distribution
target_mean = train_df['requester_received_pizza'].mean()
print(f"\nTarget distribution in training:")
print(f"Mean (positive rate): {target_mean:.4f}")
print(f"Our prediction mean: {submission['requester_received_pizza'].mean():.4f}")
print(f"Difference: {abs(submission['requester_received_pizza'].mean() - target_mean):.4f}")

Submission shape: (1162, 2)

First few rows:
  request_id  requester_received_pizza
0  t3_1aw5zf                  0.358209
1   t3_roiuw                  0.303894
2   t3_mjnbq                  0.316830
3   t3_t8wd1                  0.214725
4  t3_1m4zxu                  0.269134

Submission saved to: /home/submission/submission_004_distilbert_baseline.csv

Prediction distribution:
Mean: 0.2809
Std: 0.0594
Min: 0.1719
Max: 0.4606

Target distribution in training:
Mean (positive rate): 0.2484
Our prediction mean: 0.2809
Difference: 0.0324
