# Fixed BERT Architecture for Google QUEST Challenge

This notebook implements the CORRECT BERT architecture, based on the winning solution:
- SINGLE BERT encoder (not separate Q/A encoders)
- Format: [CLS] question_title [SEP] question_body [SEP] answer [SEP]
- token_type_ids to distinguish question (0) from answer (1)
- Dynamic token allocation instead of fixed split
- 8-10 epochs with learning rate warm-up
- Gradual unfreezing of BERT layers
- Class imbalance handling with weighted BCE loss

Expected score: 0.35-0.40 (vs current 0.2106)

In [1]:
import random
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from scipy.stats import spearmanr
from sklearn.model_selection import GroupKFold
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
warnings.filterwarnings('ignore')

# Report which accelerator (if any) this kernel can see
gpu_available = torch.cuda.is_available()
print(f"GPU available: {gpu_available}")
if gpu_available:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"GPU Memory: {gpu_memory_gb:.1f} GB")

# Read the train and test CSVs
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

for split_label, split_frame in (("Train", train), ("Test", test)):
    print(f"{split_label} shape: {split_frame.shape}")

# Targets are the train-only columns (anything the test frame lacks), never 'qa_id'
target_cols = []
for column in train.columns:
    if column == 'qa_id' or column in test.columns:
        continue
    target_cols.append(column)

print(f"Number of target columns: {len(target_cols)}")
print(f"Target columns: {target_cols[:5]}...")

GPU available: True
GPU: NVIDIA A100-SXM4-80GB
GPU Memory: 85.1 GB


Train shape: (6079, 41)
Test shape: (476, 11)
Number of target columns: 30
Target columns: ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking']...


In [2]:
# Configuration
class Config:
    """Run settings read by the dataset, model and training cells of this notebook."""

    # --- Encoder / tokenisation ---
    MODEL_NAME = 'bert-base-uncased'
    HIDDEN_DIM = 768    # input width of the classification head
    MAX_LEN = 512       # BERT max length
    DROPOUT = 0.2

    # --- Optimisation ---
    EPOCHS = 10         # Increased from 3 (critical for fine-tuning)
    BATCH_SIZE = 8      # Increased from 4
    HEAD_LR = 1e-3      # Classification head learning rate
    ENCODER_LR = 2e-5   # BERT encoder learning rate
    WARMUP_RATIO = 0.1  # 10% warm-up
    GRADIENT_CLIP = 1.0

    # --- Cross-validation / reproducibility ---
    N_FOLDS = 5
    SEED = 42


config = Config()

# Set random seeds
def set_seed(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU, plus every CUDA device when CUDA is available), so that
    re-running with the same ``seed`` starts all of them from the same state.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    
# Apply the configured seed here, before the later cells build models or shuffle batches
set_seed(config.SEED)

In [3]:
# Initialize tokenizer
# It is loaded from config.MODEL_NAME, the same checkpoint name that is later
# passed to QuestModel for its encoder.
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
print(f"Tokenizer loaded: {config.MODEL_NAME}")

# Custom Dataset with SINGLE encoder and dynamic token allocation
class QuestDataset(Dataset):
    """Rows of a QUEST dataframe tokenised as one question/answer sentence pair.

    The question segment is ``question_title <sep> question_body`` and the
    answer is passed to the tokenizer as the second segment. With a BERT-style
    tokenizer this is intended to produce
    ``[CLS] question_title [SEP] question_body [SEP] answer [SEP]`` with
    ``token_type_ids`` separating question from answer.
    NOTE(review): this relies on the tokenizer mapping its own ``sep_token``
    string inside raw text to the separator id — confirm if the checkpoint
    is ever swapped for a non-BERT tokenizer.

    Args:
        df: DataFrame with ``question_title``, ``question_body`` and ``answer``
            columns. Its index is reset so items are addressed positionally.
        tokenizer: Callable Hugging Face-style tokenizer exposing ``sep_token``.
        targets: Optional array-like indexed by row position; when given, each
            item also carries a float32 ``targets`` tensor.
        max_len: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``tokenizer`` is None (items could never be encoded).
    """

    def __init__(self, df, tokenizer, targets=None, max_len=512):
        # Fail at construction time instead of on the first batch, which may
        # only be requested after a long training run.
        if tokenizer is None:
            raise ValueError("QuestDataset requires a tokenizer; got None")
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.targets = targets
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a stripped string; missing values become ''."""
        if pd.isna(value):
            return ''
        return str(value).strip()

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        title = self._as_text(row['question_title'])
        body = self._as_text(row['question_body'])
        answer = self._as_text(row['answer'])

        # First segment: title and body, split by an explicit separator token.
        question = f"{title} {self.tokenizer.sep_token} {body}"

        # Encode as a (question, answer) pair. 'longest_first' trims whichever
        # segment is currently longer, so the token budget is shared
        # dynamically instead of through a fixed split.
        encoding = self.tokenizer(
            question,
            answer,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation='longest_first',
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch axis of size 1; drop only
        # that axis so the DataLoader can stack items itself.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'token_type_ids': encoding['token_type_ids'].squeeze(0),
        }

        if self.targets is not None:
            item['targets'] = torch.tensor(self.targets[idx], dtype=torch.float32)

        return item

# Model definition with SINGLE BERT encoder (not separate Q/A)
class QuestModel(nn.Module):
    """Single pretrained encoder with a multi-sample-dropout regression head.

    Question and answer are encoded together by one encoder; the hidden state
    at position 0 of the final layer is passed through five independent dropout
    masks, each followed by the shared classification head, and the five
    outputs are averaged.

    Args:
        model_name: Checkpoint name handed to ``AutoModel.from_pretrained``.
        num_targets: Number of output logits per example.
        hidden_dim: Width of the encoder's hidden state (head input size).
        dropout: Dropout probability used for every dropout layer here.
    """

    def __init__(self, model_name, num_targets, hidden_dim=768, dropout=0.2):
        super().__init__()

        # Load pretrained BERT - SINGLE encoder for both Q&A
        self.bert = AutoModel.from_pretrained(model_name)

        # Multi-sample dropout layers (applied to CLS token)
        self.dropouts = nn.ModuleList([nn.Dropout(dropout) for _ in range(5)])

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_targets)
        )

        # Initialize weights for head
        self._init_weights()

    def _init_weights(self):
        """Initialise every Linear layer of the head: N(0, 0.02) weights, zero bias."""
        for module in self.classifier.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return raw logits of shape ``[batch, num_targets]``.

        Args:
            input_ids: Token ids, ``[batch, seq_len]``.
            attention_mask: Mask matching ``input_ids``.
            token_type_ids: Optional segment ids matching ``input_ids``. They
                are forwarded to the encoder only when provided, so existing
                two-argument calls behave exactly as before.
        """
        encoder_inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }
        if token_type_ids is not None:
            encoder_inputs['token_type_ids'] = token_type_ids

        outputs = self.bert(**encoder_inputs)

        # Use CLS token from final layer
        cls_output = outputs.last_hidden_state[:, 0, :]  # [batch, hidden_dim]

        # Multi-sample dropout: one head pass per dropout mask, then average
        sample_logits = [self.classifier(dropout(cls_output)) for dropout in self.dropouts]
        return torch.stack(sample_logits).mean(dim=0)

print("Model architecture defined - SINGLE encoder with cross-attention")

Tokenizer loaded: bert-base-uncased
Model architecture defined - SINGLE encoder with cross-attention


In [4]:
# Calculate class weights for handling imbalance
train_targets = train[target_cols].values

# Per-target loss weight = 1 / (mean label + epsilon); epsilon guards against
# division by zero for a target whose mean is 0.
epsilon = 1e-6
target_means = train_targets.mean(axis=0)
target_weights = 1.0 / (target_means + epsilon)

# Normalize weights to have mean 1
# NOTE(review): with this scheme targets whose mean is close to 1 end up with
# very small weights relative to rare targets — confirm that is intended.
target_weights = target_weights / target_weights.mean()

print("Target imbalance analysis:")
print(f"Mean target values range from {target_means.min():.4f} to {target_means.max():.4f}")
print(f"Class weights range from {target_weights.min():.2f} to {target_weights.max():.2f}")

# Show most imbalanced targets (mean label below 0.05 or above 0.95)
imbalanced_mask = (target_means < 0.05) | (target_means > 0.95)
print(f"\nHighly imbalanced targets ({imbalanced_mask.sum()} targets):")
for i, target in enumerate(target_cols):
    if imbalanced_mask[i]:
        print(f"  {target}: mean={target_means[i]:.4f}, weight={target_weights[i]:.2f}")

# Prepare datasets
# Both datasets must share the same tokenizer: the test set has no targets,
# but it is still tokenised item by item when predictions are made.
train_dataset = QuestDataset(train, tokenizer, train_targets, config.MAX_LEN)
test_dataset = QuestDataset(test, tokenizer, None, config.MAX_LEN)

print(f"\nTrain dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# GroupKFold to prevent leakage from duplicate questions:
# rows sharing a question_title always land in the same fold.
gkf = GroupKFold(n_splits=config.N_FOLDS)
groups = train['question_title'].values

print(f"Using GroupKFold with {config.N_FOLDS} folds")
print(f"Number of unique question titles: {len(set(groups))}")

Target imbalance analysis:
Mean target values range from 0.0008 to 0.9686
Class weights range from 0.02 to 21.68

Highly imbalanced targets (7 targets):
  question_not_really_a_question: mean=0.0045, weight=3.99
  question_type_compare: mean=0.0381, weight=0.47
  question_type_consequence: mean=0.0100, weight=1.78
  question_type_definition: mean=0.0308, weight=0.58
  question_type_spelling: mean=0.0008, weight=21.68
  answer_plausible: mean=0.9601, weight=0.02
  answer_relevance: mean=0.9686, weight=0.02

Train dataset size: 6079
Test dataset size: 476
Using GroupKFold with 5 folds
Number of unique question titles: 3583


In [None]:
# Training and evaluation functions
def train_epoch(model, dataloader, optimizer, scheduler, device, target_weights):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        dataloader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``targets`` entries.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors and loss weights are moved to.
        target_weights: Per-target weights for the BCE-with-logits loss
            (array-like or tensor, one value per target).

    Returns:
        Mean of the per-batch losses as a Python float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()

    # Weighted BCE loss to handle class imbalance. The criterion is built once
    # per epoch (not per batch) and the weights are cast to float32 so they
    # match the dtype of the float32 targets used in the loss.
    loss_weights = torch.as_tensor(target_weights, dtype=torch.float32, device=device)
    loss_fn = nn.BCEWithLogitsLoss(weight=loss_weights)

    total_loss = 0.0
    num_batches = 0

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, targets)

        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.GRADIENT_CLIP)

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_epoch received a dataloader with no batches")

    return total_loss / num_batches

def evaluate_model(model, dataloader, device):
    """Predict on every batch of ``dataloader`` without tracking gradients.

    The model is switched to eval mode, its logits are passed through a
    sigmoid, and predictions and targets from all batches are concatenated
    along the first axis.

    Returns:
        ``(predictions, targets)`` as two NumPy arrays.
    """
    model.eval()
    prediction_chunks, target_chunks = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            batch_targets = batch['targets'].to(device)

            probabilities = torch.sigmoid(model(ids, mask))

            prediction_chunks.append(probabilities.cpu().numpy())
            target_chunks.append(batch_targets.cpu().numpy())

    return (
        np.concatenate(prediction_chunks, axis=0),
        np.concatenate(target_chunks, axis=0),
    )

def calculate_spearman(targets, predictions):
    """Mean of the per-column Spearman correlations between targets and predictions.

    A column scores 0.0 when its predictions are (near-)constant or when the
    correlation comes back as NaN.

    Returns:
        ``(mean_score, scores)`` where ``scores`` is the list of per-column values.
    """
    def _column_score(col):
        column_predictions = predictions[:, col]
        # Constant predictions have no defined rank correlation; score them 0
        if np.std(column_predictions) < 1e-6:
            return 0.0
        rho = spearmanr(targets[:, col], column_predictions)[0]
        return 0.0 if np.isnan(rho) else rho

    scores = [_column_score(col) for col in range(targets.shape[1])]
    return np.mean(scores), scores

def gradual_unfreeze(model, epoch):
    """Set which ``model.bert`` parameters are trainable for the given epoch.

    Schedule (0-based epochs):
        * epochs below 2: the whole encoder is frozen, only the head trains
        * epochs 2-3: the last 2 encoder layers are trainable
        * epochs 4-5: the last 4 encoder layers are trainable
        * epoch 6 and later: every encoder parameter is trainable

    The trainable set is derived from ``epoch`` alone, so the result does not
    depend on the function having been called for every earlier epoch (for
    example after resuming a run). Parameters outside ``model.bert`` are never
    touched.

    Args:
        model: Module exposing the encoder as ``model.bert``; for epochs 2-5
            the encoder must also expose its layers as ``bert.encoder.layer``.
        epoch: 0-based epoch index about to be trained.
    """
    if epoch >= 6:
        # Unfreeze all layers
        for param in model.bert.parameters():
            param.requires_grad = True
        return

    if epoch >= 4:
        num_top_layers = 4
    elif epoch >= 2:
        num_top_layers = 2
    else:
        num_top_layers = 0

    # Start from a fully frozen encoder, then re-enable the top layers
    for param in model.bert.parameters():
        param.requires_grad = False

    if num_top_layers:
        for param in model.bert.encoder.layer[-num_top_layers:].parameters():
            param.requires_grad = True

print("Training functions defined with class imbalance handling and gradual unfreezing")

In [None]:
# Cross-validation training
# For every GroupKFold split: train a fresh QuestModel, track the best
# validation Spearman, keep that epoch's out-of-fold predictions and weights,
# then add the best-epoch model's test predictions to the fold average.
fold_scores = []
oof_predictions = np.zeros((len(train), len(target_cols)))
test_predictions = np.zeros((len(test), len(target_cols)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# The test loader does not depend on the fold, so it is built once
test_loader = DataLoader(test_dataset, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=0)

for fold, (train_idx, val_idx) in enumerate(gkf.split(train, groups=groups)):
    print(f"\n{'='*60}")
    print(f"Fold {fold + 1}/{config.N_FOLDS}")
    print(f"{'='*60}")

    # Create fold datasets
    fold_train_dataset = torch.utils.data.Subset(train_dataset, train_idx)
    fold_val_dataset = torch.utils.data.Subset(train_dataset, val_idx)

    # Create dataloaders
    train_loader = DataLoader(fold_train_dataset, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(fold_val_dataset, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=0)

    # Initialize model
    model = QuestModel(config.MODEL_NAME, len(target_cols), config.HIDDEN_DIM, config.DROPOUT)
    model.to(device)

    # Separate parameters for different learning rates
    bert_params = list(model.bert.parameters())
    head_params = list(model.classifier.parameters()) + list(model.dropouts.parameters())

    # Optimizer with different learning rates
    optimizer = AdamW([
        {'params': head_params, 'lr': config.HEAD_LR},
        {'params': bert_params, 'lr': config.ENCODER_LR},
    ])

    # Scheduler with warm-up (stepped once per batch inside train_epoch)
    total_steps = len(train_loader) * config.EPOCHS
    warmup_steps = int(total_steps * config.WARMUP_RATIO)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

    # Training loop
    # best_score starts at -inf so the first epoch always records OOF
    # predictions and a weight snapshot, even if its Spearman is <= 0.
    best_score = float('-inf')
    best_state = None
    patience_counter = 0
    patience = 3  # Increased patience

    for epoch in range(config.EPOCHS):
        print(f"\nEpoch {epoch + 1}/{config.EPOCHS}")

        # Gradual unfreezing
        gradual_unfreeze(model, epoch)

        # Train
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, target_weights)
        print(f"Train Loss: {train_loss:.4f}")

        # Evaluate
        val_predictions, val_targets = evaluate_model(model, val_loader, device)

        # Calculate Spearman correlation
        mean_score, target_scores = calculate_spearman(val_targets, val_predictions)
        print(f"Validation Spearman: {mean_score:.4f}")

        # Save best model
        if mean_score > best_score:
            best_score = mean_score
            patience_counter = 0
            # Save OOF predictions
            oof_predictions[val_idx] = val_predictions
            # Snapshot the weights on the CPU so the test predictions below
            # come from this epoch rather than from whichever epoch ran last.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            print(f"New best score: {best_score:.4f}")
        else:
            patience_counter += 1
            print(f"No improvement for {patience_counter} epochs")

        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    fold_scores.append(best_score)
    print(f"Fold {fold + 1} Best Score: {best_score:.4f}")

    # Restore the best-epoch weights before predicting on the test set
    if best_state is not None:
        model.load_state_dict(best_state)

    # Predict on test set
    fold_test_predictions = []
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            predictions = torch.sigmoid(outputs)
            fold_test_predictions.append(predictions.cpu().numpy())

    fold_test_predictions = np.concatenate(fold_test_predictions, axis=0)
    test_predictions += fold_test_predictions / config.N_FOLDS

    # Release this fold's model and optimizer state before the next fold allocates its own
    del model, optimizer, scheduler, best_state
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print(f"\n{'='*60}")
print(f"Cross-validation completed")
print(f"{'='*60}")
print(f"Mean CV Score: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
print(f"Individual fold scores: {[f'{score:.4f}' for score in fold_scores]}")

In [None]:
# Score the out-of-fold predictions against the full set of training targets
overall_score, target_scores = calculate_spearman(train_targets, oof_predictions)
print(f"\nOverall Spearman Correlation: {overall_score:.4f}")

# Rank target indices from highest to lowest Spearman, then list both ends
sorted_indices = np.argsort(target_scores)[::-1]
for heading, ranked_indices in (
    ("\nTop 10 performing targets:", sorted_indices[:10]),
    ("\nBottom 10 performing targets:", sorted_indices[-10:]),
):
    print(heading)
    for i in ranked_indices:
        print(f"  {target_cols[i]}: {target_scores[i]:.4f}")

# Build the submission: qa_id followed by one prediction column per target
submission = pd.DataFrame({
    'qa_id': test['qa_id'],
    **{target: test_predictions[:, i] for i, target in enumerate(target_cols)},
})

# Keep every prediction inside the [0, 1] range
submission[target_cols] = submission[target_cols].clip(lower=0, upper=1)

print(f"\nSubmission shape: {submission.shape}")
print(f"Submission columns: {submission.columns.tolist()}")

# Write the submission file
submission.to_csv('/home/submission/submission.csv', index=False)
print("\nSubmission saved to /home/submission/submission.csv")