# BERT with Proper Q&A Token Type IDs

This experiment implements the evaluator's top priority: proper text processing with separate Q&A inputs and token_type_ids.

Key improvements from exp_006:
1. Use token_type_ids to explicitly distinguish question (0) from answer (1)
2. Format: [CLS] question_title question_body [SEP] answer [SEP] (title and body are concatenated into a single question segment, with no [SEP] between them)
3. Strategic token allocation: 26/260/210 split (title/question/answer)
4. Keep all other improvements: 10 epochs, gradual unfreezing, class weights, multi-sample dropout

Expected improvement: +0.02-0.03 (0.38-0.40 CV)

Based on winning solution and evaluator feedback.

In [None]:
import json
import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from scipy.stats import spearmanr
from sklearn.model_selection import GroupKFold
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
warnings.filterwarnings('ignore')

# Report whether a CUDA device is visible and, if so, which one.
cuda_ok = torch.cuda.is_available()
print(f"GPU available: {cuda_ok}")
if cuda_ok:
    gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {gpu_mem_gb:.1f} GB")

# Read the competition CSVs.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Targets are the columns that exist only in train (the id column excluded).
test_columns = set(test.columns)
target_cols = [
    name for name in train.columns
    if name != 'qa_id' and name not in test_columns
]
for summary_line in (
    f"Training samples: {len(train)}",
    f"Test samples: {len(test)}",
    f"Target columns: {len(target_cols)}",
):
    print(summary_line)

# Character lengths per text field, used to sanity-check the token budget.
# Each entry is (printed label, source text column, derived length column).
length_specs = [
    ('Title', 'question_title', 'title_len'),
    ('Body', 'question_body', 'body_len'),
    ('Answer', 'answer', 'answer_len'),
]
for _, text_col, len_col in length_specs:
    train[len_col] = train[text_col].astype(str).apply(len)

print("\nText length statistics (characters):")
for label, _, len_col in length_specs:
    print(f"{label} - 95th percentile: {train[len_col].quantile(0.95)}")

# Winning solution used 26/260/210 token split
# This covers 95%+ of titles, 85%+ of bodies, 80%+ of answers
MAX_TITLE_TOKENS = 26
MAX_BODY_TOKENS = 260
MAX_ANSWER_TOKENS = 210
MAX_TOTAL_TOKENS = 512

print(f"\nToken allocation strategy:")
for label, budget in (
    ('Title', MAX_TITLE_TOKENS),
    ('Body', MAX_BODY_TOKENS),
    ('Answer', MAX_ANSWER_TOKENS),
    ('Total', MAX_TOTAL_TOKENS),
):
    print(f"{label}: {budget} tokens")

In [None]:
# Configuration
class Config:
    """Hyperparameters for the experiment, stored as class-level constants."""

    # --- encoder / input ---
    MODEL_NAME: str = 'bert-base-uncased'
    MAX_LEN: int = 512

    # --- optimisation ---
    BATCH_SIZE: int = 8
    EPOCHS: int = 10
    ENCODER_LR: float = 2e-5   # learning rate for the pretrained encoder
    HEAD_LR: float = 1e-3      # learning rate for the classification head

    # --- cross-validation / reproducibility ---
    N_FOLDS: int = 5
    SEED: int = 42

    # --- head architecture / regularisation ---
    DROPOUT: float = 0.2
    HIDDEN_DIM: int = 768

    # --- schedule / stability ---
    WARMUP_RATIO: float = 0.1   # fraction of total steps used for LR warm-up
    GRADIENT_CLIP: float = 1.0  # max gradient norm passed to clip_grad_norm_


# Single shared settings object read by the rest of the notebook.
config = Config()

# Set random seeds
def set_seed(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch's CPU
    RNG; when CUDA is available the RNGs of all CUDA devices are seeded too.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python's own RNG was previously left unseeded, so any code path that
    # used `random` would differ between runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    
# Apply the configured seed once, before the tokenizer, datasets and models
# below are created.
set_seed(config.SEED)

In [None]:
# Initialize tokenizer
# Loaded once at module level from config.MODEL_NAME and handed to every
# QuestDataset created in the training loop.
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
print(f"Tokenizer loaded: {config.MODEL_NAME}")

# Custom Dataset with PROPER token_type_ids for Q&A
class QuestDataset(Dataset):
    """Encode one Q&A row as fixed-length BERT inputs with segment ids.

    Layout actually produced for each row::

        [CLS] title body [SEP] answer [SEP] [PAD] ...

    ``[CLS]``, title, body and the first ``[SEP]`` get ``token_type_ids == 0``
    (question segment); the answer and its closing ``[SEP]`` get
    ``token_type_ids == 1``. Padding positions carry ``token_type_ids == 0``
    and ``attention_mask == 0``.

    NOTE(review): title and body are concatenated without a separator, although
    other comments in this notebook describe a ``[SEP]`` between them -
    confirm which layout is intended before changing it, since it alters the
    model input.

    Args:
        df: DataFrame with ``question_title``, ``question_body`` and
            ``answer`` columns. Its index is reset so that positional ``idx``
            lines up with ``targets``.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            max_length=..., truncation=...)`` plus ``cls_token_id``,
            ``sep_token_id`` and ``pad_token_id``.
        targets: Optional array-like indexed by row position. When given,
            each item also carries a float32 ``targets`` tensor.
        max_len: Final length of every returned sequence (after padding or
            truncation).
        max_title_tokens: Token budget for the title. ``None`` falls back to
            the module-level ``MAX_TITLE_TOKENS``.
        max_body_tokens: Token budget for the body. ``None`` falls back to
            the module-level ``MAX_BODY_TOKENS``.
        max_answer_tokens: Token budget for the answer. ``None`` falls back
            to the module-level ``MAX_ANSWER_TOKENS``.
    """

    def __init__(self, df, tokenizer, targets=None, max_len=512,
                 max_title_tokens=None, max_body_tokens=None,
                 max_answer_tokens=None):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.targets = targets
        self.max_len = max_len
        # Module constants stay the defaults; explicit arguments override them.
        self.max_title_tokens = (
            MAX_TITLE_TOKENS if max_title_tokens is None else max_title_tokens
        )
        self.max_body_tokens = (
            MAX_BODY_TOKENS if max_body_tokens is None else max_body_tokens
        )
        self.max_answer_tokens = (
            MAX_ANSWER_TOKENS if max_answer_tokens is None else max_answer_tokens
        )

    def __len__(self):
        return len(self.df)

    def _encode_field(self, text, budget):
        """Tokenize one text field without special tokens, capped at ``budget``."""
        return self.tokenizer.encode(
            str(text),
            add_special_tokens=False,
            max_length=budget,
            truncation=True
        )

    def __getitem__(self, idx):
        """Return the tensors for row ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``
            (all ``torch.long``, length ``max_len``) and, when targets were
            supplied, ``targets`` (``torch.float32``).
        """
        row = self.df.iloc[idx]
        # Always use the tokenizer given to this dataset, never a module
        # global, so the ids match the tokenizer that produced the tokens.
        tok = self.tokenizer

        # Tokenize each part separately to control the per-field allocation.
        title_tokens = self._encode_field(row['question_title'], self.max_title_tokens)
        body_tokens = self._encode_field(row['question_body'], self.max_body_tokens)
        answer_tokens = self._encode_field(row['answer'], self.max_answer_tokens)

        # Question segment: [CLS] title body [SEP]; answer segment: answer [SEP].
        question_ids = [tok.cls_token_id] + title_tokens + body_tokens + [tok.sep_token_id]
        answer_ids = answer_tokens + [tok.sep_token_id]

        input_ids = question_ids + answer_ids
        token_type_ids = [0] * len(question_ids) + [1] * len(answer_ids)

        # Only reachable when max_len is smaller than the combined budget:
        # cut the tail and force the sequence to end with a segment-1 [SEP].
        if len(input_ids) > self.max_len:
            input_ids = input_ids[:self.max_len]
            token_type_ids = token_type_ids[:self.max_len]
            input_ids[-1] = tok.sep_token_id
            token_type_ids[-1] = 1

        # Real tokens are attended to; padding added below is masked out.
        attention_mask = [1] * len(input_ids)

        # Pad to max length
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + [tok.pad_token_id] * padding_length
        attention_mask = attention_mask + [0] * padding_length
        token_type_ids = token_type_ids + [0] * padding_length

        item = {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

        if self.targets is not None:
            item['targets'] = torch.tensor(self.targets[idx], dtype=torch.float32)

        return item

# Model definition with token_type_ids support
class QuestModel(nn.Module):
    """Pretrained encoder followed by an MLP head with multi-sample dropout.

    The first-position ([CLS]) vector of the encoder's last hidden state is
    passed through five independent dropout layers; the shared head scores
    each dropped copy and the five outputs are averaged.

    Args:
        model_name: Name handed to ``AutoModel.from_pretrained``.
        num_targets: Number of output values per example.
        hidden_dim: Width of the encoder output and of the head's hidden layer.
        dropout: Dropout probability used in the five samplers and in the head.
    """

    def __init__(self, model_name, num_targets, hidden_dim=768, dropout=0.2):
        super().__init__()

        # Pretrained encoder
        self.bert = AutoModel.from_pretrained(model_name)

        # Five dropout samplers for multi-sample dropout
        samplers = [nn.Dropout(dropout) for _ in range(5)]
        self.dropouts = nn.ModuleList(samplers)

        # Two-layer head: hidden_dim -> hidden_dim -> num_targets
        head_layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_targets),
        ]
        self.classifier = nn.Sequential(*head_layers)

        # Re-initialise only the head; the encoder keeps its pretrained weights.
        self._init_weights()

    def _init_weights(self):
        """Give every Linear layer of the head N(0, 0.02) weights and zero bias."""
        linear_layers = (
            layer for layer in self.classifier.modules()
            if isinstance(layer, nn.Linear)
        )
        for layer in linear_layers:
            nn.init.normal_(layer.weight, mean=0.0, std=0.02)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the head output averaged over the dropout samples.

        All three arguments are forwarded unchanged to the encoder as keyword
        arguments of the same name.
        """
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )

        # First position of the final layer, i.e. the [CLS] representation.
        cls_vector = encoded.last_hidden_state[:, 0, :]

        # One head pass per dropout sampler, then the mean across samples.
        per_sample = [self.classifier(sampler(cls_vector)) for sampler in self.dropouts]
        return torch.stack(per_sample).mean(dim=0)

print("Model architecture defined with token_type_ids support")

In [None]:
# Target matrix, one column per entry of target_cols.
targets = train[target_cols].values

# Per-target loss weights: the reciprocal of each target's mean value, with a
# small epsilon so an all-zero column cannot divide by zero.
target_means = train[target_cols].mean()
class_weights = torch.tensor(
    [1.0 / (target_means[name] + 1e-6) for name in target_cols],
    dtype=torch.float32,
)
print(f"Class weights calculated for {len(class_weights)} targets")
print(f"Weight range: {class_weights.min():.2f} to {class_weights.max():.2f}")

# Rows sharing a question_title always land in the same fold, which keeps the
# same question out of both sides of a split.
groups = train['question_title'].values
gkf = GroupKFold(n_splits=config.N_FOLDS)

# Filled in by the training loop: out-of-fold predictions and per-fold scores.
oof_predictions = np.zeros((len(train), len(target_cols)))
fold_scores = []

print(f"\nStarting {config.N_FOLDS}-fold GroupKFold training...")
print(f"Groups (unique questions): {len(np.unique(groups))}")

In [None]:
# Training function with weighted BCE loss, gradient clipping and per-step LR scheduling
def train_epoch(model, train_loader, optimizer, scheduler, device, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Every batch performs a full update: forward, weighted BCE-with-logits
    loss, backward, gradient-norm clipping, optimizer step, scheduler step.
    There is no gradient accumulation.

    Reads the module-level ``class_weights`` (one weight per target, broadcast
    across the batch so each target column's loss is rescaled) and
    ``config.GRADIENT_CLIP``.

    Args:
        model: Called as ``model(input_ids, attention_mask, token_type_ids)``.
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``targets`` entries.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors and loss weights are moved to.
        epoch: Epoch number, used only in the progress-bar label.

    Returns:
        Mean of the per-batch loss values.

    Raises:
        ZeroDivisionError: If ``train_loader`` has length zero.
    """
    model.train()
    total_loss = 0.0

    # The loss module and its weights do not change between batches, so build
    # them once instead of re-creating and re-copying them every step.
    loss_fn = nn.BCEWithLogitsLoss(weight=class_weights.to(device))

    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch} [Train]')

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['targets'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        # Backward pass
        loss.backward()

        # Clip the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.GRADIENT_CLIP)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        batch_loss = loss.item()
        total_loss += batch_loss

        # Update progress bar
        progress_bar.set_postfix({'loss': f'{batch_loss:.4f}'})

    return total_loss / len(train_loader)

# Validation function
def validate_epoch(model, val_loader, device):
    """Run ``model`` over ``val_loader`` in eval mode without gradients.

    Args:
        model: Called as ``model(input_ids, attention_mask, token_type_ids)``.
        val_loader: Iterable of batches with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``targets`` entries.
        device: Device the batch tensors are moved to.

    Returns:
        ``(predictions, targets)`` as NumPy arrays concatenated over all
        batches; predictions are the sigmoid of the model outputs.
    """
    model.eval()
    pred_chunks, target_chunks = [], []
    batch_keys = ('input_ids', 'attention_mask', 'token_type_ids', 'targets')

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validation'):
            on_device = {key: batch[key].to(device) for key in batch_keys}

            logits = model(
                on_device['input_ids'],
                on_device['attention_mask'],
                on_device['token_type_ids'],
            )

            # Collect on the CPU so results from all batches can be joined.
            pred_chunks.append(torch.sigmoid(logits).cpu().numpy())
            target_chunks.append(on_device['targets'].cpu().numpy())

    return np.concatenate(pred_chunks), np.concatenate(target_chunks)

In [None]:
# Main training loop
import gc

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Every fold's best checkpoint is written here. Create the directory up front
# so the first torch.save cannot fail on a missing path.
checkpoint_dir = '/home/code/experiments/004_bert_token_type_ids'
os.makedirs(checkpoint_dir, exist_ok=True)


def mean_column_spearman(y_true, y_pred):
    """Return the mean Spearman correlation across target columns.

    Columns whose correlation is NaN are left out of the mean. A column for
    which ``spearmanr`` raises ``ValueError`` counts as 0.0. Returns 0.0 when
    no column contributes a score.

    Args:
        y_true: 2-D array of ground-truth values, one column per target.
        y_pred: 2-D array of predictions with the same layout.
    """
    scores = []
    for col in range(y_true.shape[1]):
        try:
            score = spearmanr(y_true[:, col], y_pred[:, col]).correlation
        except ValueError:
            scores.append(0.0)
            continue
        if not np.isnan(score):
            scores.append(score)
    return float(np.mean(scores)) if scores else 0.0


for fold, (train_idx, val_idx) in enumerate(gkf.split(train, groups=groups)):
    print(f"\n{'='*60}")
    print(f"Fold {fold + 1}/{config.N_FOLDS}")
    print(f"{'='*60}")

    checkpoint_path = os.path.join(checkpoint_dir, f'best_model_fold_{fold}.pth')

    # Create fold data
    train_fold = train.iloc[train_idx].reset_index(drop=True)
    val_fold = train.iloc[val_idx].reset_index(drop=True)

    train_targets_fold = targets[train_idx]
    val_targets_fold = targets[val_idx]

    # Create datasets
    train_dataset = QuestDataset(train_fold, tokenizer, train_targets_fold, config.MAX_LEN)
    val_dataset = QuestDataset(val_fold, tokenizer, val_targets_fold, config.MAX_LEN)

    # Create dataloaders (validation order must stay fixed so predictions
    # line up with val_idx when written into oof_predictions).
    train_loader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=2)

    # A fresh model per fold, so no weights carry over between folds.
    model = QuestModel(config.MODEL_NAME, len(target_cols), config.HIDDEN_DIM, config.DROPOUT)
    model.to(device)

    # Optimizer with different learning rates for encoder and head
    optimizer = torch.optim.AdamW([
        {'params': model.bert.parameters(), 'lr': config.ENCODER_LR},
        {'params': model.classifier.parameters(), 'lr': config.HEAD_LR}
    ])

    # Linear warm-up then linear decay, sized for the full epoch budget; the
    # scheduler is stepped once per batch inside train_epoch.
    total_steps = len(train_loader) * config.EPOCHS
    warmup_steps = int(total_steps * config.WARMUP_RATIO)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Start below any reachable score so the first epoch always writes a
    # checkpoint. Starting at 0 left no file to load (or a stale one from an
    # earlier run) whenever every epoch scored <= 0.
    best_score = -np.inf
    patience_counter = 0
    patience = 2  # Early stopping patience

    for epoch in range(config.EPOCHS):
        # Training
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, epoch)

        # Validation
        val_predictions, val_targets_epoch = validate_epoch(model, val_loader, device)
        mean_score = mean_column_spearman(val_targets_epoch, val_predictions)

        print(f"Epoch {epoch + 1}/{config.EPOCHS} - Train Loss: {train_loss:.4f} - Val Score: {mean_score:.4f}")

        # Early stopping: keep the best checkpoint, stop after `patience`
        # consecutive epochs without improvement.
        if mean_score > best_score:
            best_score = mean_score
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered after epoch {epoch + 1}")
                break

    # Load best model and get final predictions
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    val_predictions, _ = validate_epoch(model, val_loader, device)

    # Store OOF predictions
    oof_predictions[val_idx] = val_predictions

    # Final fold score, computed with the same metric as the per-epoch score.
    fold_score = mean_column_spearman(val_targets_fold, val_predictions)
    fold_scores.append(fold_score)

    print(f"Fold {fold + 1} Score: {fold_score:.4f}")

    # Free the fold's objects before the next model is built.
    del model, optimizer, scheduler, train_loader, val_loader
    gc.collect()
    torch.cuda.empty_cache()

# Calculate overall CV score
print(f"\n{'='*60}")
print(f"Cross-Validation Results")
print(f"{'='*60}")
for i, score in enumerate(fold_scores):
    print(f"Fold {i + 1}: {score:.4f}")

mean_cv = np.mean(fold_scores)
std_cv = np.std(fold_scores)
# The plus-minus sign is written as an escape so a re-encoded source file
# cannot garble it.
print(f"\nMean CV Score: {mean_cv:.4f} \u00b1 {std_cv:.4f}")

# Make sure the output directory exists before anything is written to it.
results_dir = '/home/code/experiments/004_bert_token_type_ids'
os.makedirs(results_dir, exist_ok=True)

# Save OOF predictions
np.save(os.path.join(results_dir, 'oof_predictions.npy'), oof_predictions)

# The settings are class attributes, so `config.__dict__` (instance attributes
# only) is empty. Read them from the class, let any instance-level overrides
# win, and skip dunder entries and callables.
config_snapshot = {
    name: value
    for name, value in {**vars(type(config)), **vars(config)}.items()
    if not name.startswith('_') and not callable(value)
}

# Save results
# NOTE(review): experiment_id says exp_007 while the output directory is
# numbered 004 - confirm which label is intended.
results = {
    'experiment_id': 'exp_007',
    'model_type': 'bert_token_type_ids',
    # Plain floats keep the JSON output independent of NumPy scalar types.
    'cv_score': float(mean_cv),
    'cv_std': float(std_cv),
    'fold_scores': [float(score) for score in fold_scores],
    'config': config_snapshot
}

results_path = os.path.join(results_dir, 'results.json')
with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\nResults saved to {results_path}")