# BERT Baseline for Google QUEST Challenge

This notebook implements a BERT-based model following the winning solution architecture.
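- Encode the question (question_title and question_body joined into one text) and the answer as two separate BERT inputs
- Token budget derived from max_sequence_length=500 (26/260/210 split): up to 26+260=286 tokens for the question and 210 for the answer, special tokens included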
- GroupKFold with question_title groups to prevent leakage
- Multi-Sample Dropout for better generalization

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import GroupKFold
from scipy.stats import spearmanr
import warnings
warnings.filterwarnings('ignore')

# Report which accelerator (if any) PyTorch can see before any work starts
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only device index 0 is inspected
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Read both competition CSVs, then echo their dimensions
train, test = (
    pd.read_csv('/home/data/train.csv'),
    pd.read_csv('/home/data/test.csv'),
)
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Target columns are the ones present in train but absent from test
# ('qa_id' is excluded explicitly as well)
target_cols = [
    col
    for col in train.columns
    if not (col in test.columns or col == 'qa_id')
]
print(f"Number of target columns: {len(target_cols)}")
print(f"Target columns: {target_cols[:5]}...")

GPU available: True
GPU: NVIDIA A100-SXM4-80GB
GPU Memory: 85.1 GB


Train shape: (6079, 41)
Test shape: (476, 11)
Number of target columns: 30
Target columns: ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking']...


In [2]:
# Configuration
class Config:
    """Central collection of the notebook's hyperparameters (plain class attributes)."""

    # --- Pretrained checkpoint ---
    MODEL_NAME = 'bert-base-uncased'
    HIDDEN_DIM = 768

    # --- Sequence length budget (tokens) ---
    MAX_LEN = 500
    TITLE_MAX_LEN = 26
    QUESTION_MAX_LEN = 260
    ANSWER_MAX_LEN = 210

    # --- Optimisation ---
    BATCH_SIZE = 4  # kept small because the sequences are long
    EPOCHS = 3
    LEARNING_RATE = 2e-5
    HEAD_LEARNING_RATE = 1e-3
    DROPOUT = 0.2

    # --- Cross-validation / reproducibility ---
    N_FOLDS = 5
    SEED = 42


# Shared instance read by the rest of the notebook
config = Config()

# Set random seeds
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    The original version left Python's built-in ``random`` module unseeded,
    so any code path drawing from it stayed non-reproducible.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: the notebook's import cell does not bring in ``random``
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device explicitly; silently ignored when CUDA is absent
    torch.cuda.manual_seed_all(seed)
    
set_seed(config.SEED)

In [3]:
# Load the pretrained tokenizer for the checkpoint named in config.MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
# Progress message echoing which checkpoint name was used
print(f"Tokenizer loaded: {config.MODEL_NAME}")

# Custom Dataset
class QuestDataset(Dataset):
    """Dataset that tokenizes one question/answer pair per item, on access.

    Two independent sequences are encoded with the supplied tokenizer: the
    question (``question_title`` and ``question_body`` joined by a space) and
    the ``answer``. Both are padded/truncated to a fixed length.

    Args:
        df: DataFrame with ``question_title``, ``question_body`` and
            ``answer`` columns. Its index is reset so that positional lookups
            line up with ``targets``.
        tokenizer: Callable tokenizer invoked as ``tokenizer(text, ...)`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        targets: Optional array-like indexed by row position; when given, each
            item also carries a float32 ``targets`` tensor.
        max_len: Stored for backward compatibility only; it has never been
            used for truncation.
        question_max_len: Token length of the question sequence. ``None``
            falls back to ``config.TITLE_MAX_LEN + config.QUESTION_MAX_LEN``.
        answer_max_len: Token length of the answer sequence. ``None`` falls
            back to ``config.ANSWER_MAX_LEN``.
    """

    def __init__(self, df, tokenizer, targets=None, max_len=500, *,
                 question_max_len=None, answer_max_len=None):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.targets = targets
        self.max_len = max_len
        self.question_max_len = question_max_len
        self.answer_max_len = answer_max_len

    def __len__(self):
        return len(self.df)

    def _sequence_lengths(self):
        """Return ``(question_len, answer_len)``, reading the global config lazily."""
        question_len = self.question_max_len
        if question_len is None:
            question_len = config.TITLE_MAX_LEN + config.QUESTION_MAX_LEN
        answer_len = self.answer_max_len
        if answer_len is None:
            answer_len = config.ANSWER_MAX_LEN
        return question_len, answer_len

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` ids; return (ids, mask)."""
        # Calling the tokenizer directly replaces the legacy ``encode_plus``
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading axis of size 1; drop only that
        # axis (a bare squeeze() would also collapse a length-1 sequence)
        return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Title and body form a single question text
        question_text = (row['question_title'] + ' ' + row['question_body']).strip()
        answer_text = row['answer'].strip()

        question_len, answer_len = self._sequence_lengths()
        question_ids, question_mask = self._encode(question_text, question_len)
        answer_ids, answer_mask = self._encode(answer_text, answer_len)

        item = {
            'question_input_ids': question_ids,
            'question_attention_mask': question_mask,
            'answer_input_ids': answer_ids,
            'answer_attention_mask': answer_mask,
        }

        if self.targets is not None:
            item['targets'] = torch.tensor(self.targets[idx], dtype=torch.float32)

        return item

# Model definition with Multi-Sample Dropout
class QuestModel(nn.Module):
    """Shared-encoder model with a multi-sample-dropout regression head.

    The same pretrained encoder (``self.bert``) embeds the question and the
    answer separately; the two first-token (CLS) vectors are concatenated and
    fed to a small MLP head. The head is evaluated once per dropout module
    and the resulting logits are averaged.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        num_targets: Number of output logits.
        hidden_dim: Size of one encoder CLS vector; the head's input width is
            ``2 * hidden_dim``.
        dropout: Dropout probability for the multi-sample dropouts and the
            dropout inside the head.
        num_dropout_samples: How many dropout modules to average over
            (previously hard-coded to 5).
        freeze_encoder: When True (the default, matching the previous
            behaviour) all encoder parameters start with
            ``requires_grad=False``. Use ``set_encoder_trainable`` to change
            this later.

    Raises:
        ValueError: If ``num_dropout_samples`` is less than 1.
    """

    def __init__(self, model_name, num_targets, hidden_dim=768, dropout=0.2,
                 num_dropout_samples=5, freeze_encoder=True):
        super().__init__()

        if num_dropout_samples < 1:
            raise ValueError("num_dropout_samples must be at least 1")

        # Load pretrained encoder
        self.bert = AutoModel.from_pretrained(model_name)

        # The encoder starts frozen unless the caller opts out
        self.set_encoder_trainable(not freeze_encoder)

        # Multi-sample dropout layers
        self.dropouts = nn.ModuleList(
            [nn.Dropout(dropout) for _ in range(num_dropout_samples)]
        )

        # Head over the concatenated [question_cls, answer_cls] vector
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_targets)
        )

        # Initialize weights
        self._init_weights()

    def set_encoder_trainable(self, trainable=True):
        """Enable or disable gradients for every encoder parameter.

        Parameters toggled here still have to be handed to the optimizer by
        the caller in order to be updated.
        """
        for param in self.bert.parameters():
            param.requires_grad = trainable

    def _init_weights(self):
        """Re-initialise the head's linear layers: N(0, 0.02) weights, zero biases."""
        for module in self.classifier.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def _encode_cls(self, input_ids, attention_mask):
        """Run the encoder and return the first-token (CLS) vector, [batch, hidden_dim]."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return encoded.last_hidden_state[:, 0, :]

    def forward(self, question_input_ids, question_attention_mask, answer_input_ids, answer_attention_mask):
        """Return averaged logits of shape [batch, num_targets] (no sigmoid applied)."""
        question_cls = self._encode_cls(question_input_ids, question_attention_mask)
        answer_cls = self._encode_cls(answer_input_ids, answer_attention_mask)

        # Concatenate question and answer representations: [batch, hidden_dim * 2]
        combined = torch.cat([question_cls, answer_cls], dim=1)

        # Multi-sample dropout: one head evaluation per dropout module
        sample_logits = [self.classifier(dropout(combined)) for dropout in self.dropouts]

        # Average predictions from all dropout samples
        return torch.stack(sample_logits).mean(dim=0)

print("Model architecture defined")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer loaded: bert-base-uncased
Model architecture defined


In [4]:
# Label matrix for the training rows; the test split has no labels
train_targets = train.loc[:, target_cols].values
test_targets = None

# Wrap both splits in the tokenizing dataset
train_dataset = QuestDataset(
    df=train,
    tokenizer=tokenizer,
    targets=train_targets,
    max_len=config.MAX_LEN,
)
test_dataset = QuestDataset(
    df=test,
    tokenizer=tokenizer,
    targets=test_targets,
    max_len=config.MAX_LEN,
)
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Rows sharing a question title are grouped so that they stay in the same
# fold, which prevents leakage from duplicate questions
groups = train['question_title'].values
gkf = GroupKFold(n_splits=config.N_FOLDS)
print(f"Using GroupKFold with {config.N_FOLDS} folds")
print(f"Number of unique question titles: {len(set(groups))}")

Train dataset size: 6079
Test dataset size: 476
Using GroupKFold with 5 folds
Number of unique question titles: 3583


In [5]:
# Training and evaluation functions
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``dataloader``; return the mean batch loss.

    Args:
        model: Module called as ``model(question_input_ids,
            question_attention_mask, answer_input_ids, answer_attention_mask)``
            and returning logits.
        dataloader: Iterable of dict batches holding the four input tensors
            plus ``targets``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, or ``None``
            to skip scheduling.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch BCE-with-logits losses as a float, or ``nan``
        when the dataloader yields no batches (previously a ZeroDivisionError).
    """
    model.train()

    # Built once instead of once per batch
    criterion = nn.BCEWithLogitsLoss()
    input_keys = (
        'question_input_ids',
        'question_attention_mask',
        'answer_input_ids',
        'answer_attention_mask',
    )

    running_loss = 0.0
    num_batches = 0

    for batch in dataloader:
        optimizer.zero_grad()

        inputs = [batch[key].to(device) for key in input_keys]
        targets = batch['targets'].to(device)

        outputs = model(*inputs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # A mean over zero batches is undefined; report that instead of crashing
        return float('nan')
    return running_loss / num_batches

def evaluate_model(model, dataloader, device):
    """Run inference over ``dataloader`` and collect sigmoid predictions.

    Args:
        model: Module called as ``model(question_input_ids,
            question_attention_mask, answer_input_ids, answer_attention_mask)``
            and returning logits.
        dataloader: Iterable of dict batches with the four input tensors and,
            optionally, ``targets``.
        device: Device the input tensors are moved to.

    Returns:
        ``(predictions, targets)`` as NumPy arrays concatenated along axis 0.
        ``targets`` is ``None`` when no batch carried a ``targets`` entry, so
        the function also works on unlabeled data.

    Raises:
        ValueError: From ``np.concatenate`` if the dataloader yields no batches.
    """
    model.eval()

    input_keys = (
        'question_input_ids',
        'question_attention_mask',
        'answer_input_ids',
        'answer_attention_mask',
    )
    prediction_chunks = []
    target_chunks = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = [batch[key].to(device) for key in input_keys]
            logits = model(*inputs)
            prediction_chunks.append(torch.sigmoid(logits).cpu().numpy())

            if 'targets' in batch:
                # Targets are only collected, never fed to the model, so they
                # do not need a round-trip through the device
                target_chunks.append(batch['targets'].cpu().numpy())

    predictions = np.concatenate(prediction_chunks, axis=0)
    targets = np.concatenate(target_chunks, axis=0) if target_chunks else None

    return predictions, targets

def calculate_spearman(targets, predictions):
    """Calculate the mean column-wise Spearman correlation.

    ``spearmanr`` returns NaN for a column in which either input is constant.
    Such columns are kept as NaN in the per-column list but are left out of
    the mean, so one degenerate column no longer turns the whole score into
    NaN.

    Args:
        targets: 2-D array of true values, one column per target.
        predictions: 2-D array of predicted values with matching columns.

    Returns:
        ``(mean_score, scores)`` where ``scores`` is the list of per-column
        correlations (NaN where undefined) and ``mean_score`` is the mean of
        the defined ones, or NaN if none is defined.
    """
    scores = []
    for true_col, pred_col in zip(np.asarray(targets).T, np.asarray(predictions).T):
        corr, _ = spearmanr(true_col, pred_col)
        scores.append(float(corr))

    defined = [score for score in scores if not np.isnan(score)]
    mean_score = np.mean(defined) if defined else float('nan')
    return mean_score, scores

print("Training functions defined")

Training functions defined


In [None]:
# Cross-validation training
#
# For every GroupKFold split: build a fresh model, train it for up to
# config.EPOCHS epochs with early stopping on the validation Spearman score,
# keep the out-of-fold predictions and weights of the best epoch, then use
# those best weights to predict the test set. Test predictions are averaged
# over the folds.
fold_scores = []
oof_predictions = np.zeros((len(train), len(target_cols)))
test_predictions = np.zeros((len(test), len(target_cols)))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# The test loader does not depend on the fold, so build it once
test_loader = DataLoader(test_dataset, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=0)

for fold, (train_idx, val_idx) in enumerate(gkf.split(train, groups=groups)):
    print(f"\n{'='*50}")
    print(f"Fold {fold + 1}/{config.N_FOLDS}")
    print(f"{'='*50}")

    # Create fold datasets
    fold_train_dataset = torch.utils.data.Subset(train_dataset, train_idx)
    fold_val_dataset = torch.utils.data.Subset(train_dataset, val_idx)

    # Create dataloaders
    train_loader = DataLoader(fold_train_dataset, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(fold_val_dataset, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=0)

    # Initialize model
    model = QuestModel(config.MODEL_NAME, len(target_cols), config.HIDDEN_DIM, config.DROPOUT)
    model.to(device)

    # QuestModel freezes the encoder on construction and nothing ever unfroze
    # it, so only the head was being trained and config.LEARNING_RATE was
    # unused. Enable encoder gradients so it is fine-tuned as intended.
    for param in model.bert.parameters():
        param.requires_grad = True

    # Optimizer with different learning rates for encoder and head
    encoder_params = list(model.bert.parameters())
    head_params = list(model.classifier.parameters()) + list(model.dropouts.parameters())

    optimizer = AdamW([
        {'params': encoder_params, 'lr': config.LEARNING_RATE},
        {'params': head_params, 'lr': config.HEAD_LEARNING_RATE},
    ])

    # Scheduler: linear warmup over the first 10% of steps, then linear decay
    total_steps = len(train_loader) * config.EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=total_steps//10, num_training_steps=total_steps)

    # Training loop
    # Start below any reachable score: Spearman can be <= 0, and a start
    # value of 0 would then never record OOF predictions for the fold.
    best_score = -np.inf
    best_state = None
    patience_counter = 0
    patience = 2

    for epoch in range(config.EPOCHS):
        print(f"\nEpoch {epoch + 1}/{config.EPOCHS}")

        # Train
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
        print(f"Train Loss: {train_loss:.4f}")

        # Evaluate
        val_predictions, val_targets = evaluate_model(model, val_loader, device)

        # Calculate Spearman correlation
        mean_score, target_scores = calculate_spearman(val_targets, val_predictions)
        print(f"Validation Spearman: {mean_score:.4f}")

        # The first epoch is always recorded; a NaN best is replaced by
        # whatever comes next so it cannot block later improvements.
        improved = best_state is None or np.isnan(best_score) or mean_score > best_score

        if improved:
            best_score = mean_score
            patience_counter = 0
            # Save OOF predictions
            oof_predictions[val_idx] = val_predictions
            # Snapshot the weights on the CPU so the test predictions below
            # come from this epoch rather than from the last one trained
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            print(f"New best score: {best_score:.4f}")
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    fold_scores.append(best_score)
    print(f"Fold {fold + 1} Best Score: {best_score:.4f}")

    # Restore the best-epoch weights before predicting on the test set
    if best_state is not None:
        model.load_state_dict(best_state)

    # Predict on test set
    fold_test_predictions = []
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            question_input_ids = batch['question_input_ids'].to(device)
            question_attention_mask = batch['question_attention_mask'].to(device)
            answer_input_ids = batch['answer_input_ids'].to(device)
            answer_attention_mask = batch['answer_attention_mask'].to(device)

            outputs = model(question_input_ids, question_attention_mask, answer_input_ids, answer_attention_mask)
            predictions = torch.sigmoid(outputs)
            fold_test_predictions.append(predictions.cpu().numpy())

    fold_test_predictions = np.concatenate(fold_test_predictions, axis=0)
    test_predictions += fold_test_predictions / config.N_FOLDS

    # Release this fold's model before the next fold allocates a new one
    del model, optimizer, scheduler, best_state
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print(f"\n{'='*50}")
print(f"Cross-validation completed")
print(f"{'='*50}")
print(f"Mean CV Score: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
print(f"Individual fold scores: {[f'{score:.4f}' for score in fold_scores]}")

In [None]:
# Score the out-of-fold predictions against the training targets, then write
# the fold-averaged test predictions to the submission CSV.
from pathlib import Path  # local to this cell: only needed for the output path

overall_score, target_scores = calculate_spearman(train_targets, oof_predictions)
print(f"\nOverall Spearman Correlation: {overall_score:.4f}")
print(f"\nPer-target Spearman correlations:")
for target, score in zip(target_cols, target_scores):
    print(f"{target}: {score:.4f}")

# Create submission: qa_id first, then one column per target, with the
# predictions clipped to the [0, 1] range
submission = pd.DataFrame(np.clip(test_predictions, 0, 1), columns=target_cols)
submission.insert(0, 'qa_id', test['qa_id'].values)

print(f"\nSubmission shape: {submission.shape}")
print(f"Submission columns: {submission.columns.tolist()}")

# Save submission. Create the output directory first so that a missing
# folder cannot discard the results at the very end of the run.
submission_path = Path('/home/submission/submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print("\nSubmission saved to /home/submission/submission.csv")