Import the required packages, set up the environment, and load the training data

In [None]:
# Code Block 1: Environment Setup and Data Loading

import warnings
warnings.filterwarnings('ignore', message='.*overflowing tokens.*')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification

# (Optional) Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator (default 42).

    Returns:
        None. The CUDA generators are seeded only when CUDA is available.
    """
    import random

    # CPU-side generators, seeded in the order: random, NumPy, torch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    # Seed the generator of every visible CUDA device.
    torch.cuda.manual_seed_all(seed)

# Seed every RNG up front so reruns of the notebook are comparable.
set_seed(seed=42)

# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Local directory holding the pretrained BERT checkpoint; the tokenizer is
# loaded from the same location.
model_path = '/kaggle/input/bert-base-uncased/pytorch/default/1/bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_path)

# Read the training CSV and report its (rows, columns) shape.
train_data = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')
print("Training data shape:", train_data.shape)


In [None]:
# Code Block 2: Preprocessing Training Data (Batch Tokenization)

def preprocess_dataset(df, tokenizer, max_length=512):
    """Batch-tokenize the ``response_a`` and ``response_b`` columns of ``df``.

    Missing values (``None``/NaN) are replaced by the empty string and every
    remaining value is converted with ``str`` before tokenization, so a stray
    float or null in a response column cannot reach the tokenizer as a
    non-string.

    Args:
        df: DataFrame containing ``response_a`` and ``response_b`` columns.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(texts, padding=..., truncation=..., max_length=...,
            return_tensors='pt')`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to (default 512).

    Returns:
        dict with keys ``input_ids_a``, ``attention_mask_a``, ``input_ids_b``
        and ``attention_mask_b`` holding the tokenizer outputs for each column.

    Raises:
        KeyError: If either response column is absent from ``df``.
    """
    def _encode(column):
        # Normalize the column to a plain list of strings, then tokenize it
        # in one batch with fixed-length padding and truncation.
        texts = df[column].fillna('').astype(str).tolist()
        return tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )

    encoding_a = _encode('response_a')
    encoding_b = _encode('response_b')

    return {
        'input_ids_a': encoding_a['input_ids'],
        'attention_mask_a': encoding_a['attention_mask'],
        'input_ids_b': encoding_b['input_ids'],
        'attention_mask_b': encoding_b['attention_mask']
    }

# Tokenize both response columns of the training frame in one pass each.
print("Preprocessing training data...")
preprocessed_train = preprocess_dataset(df=train_data, tokenizer=tokenizer, max_length=512)
print(
    "Tokenized training data shapes:",
    preprocessed_train['input_ids_a'].shape,
    preprocessed_train['input_ids_b'].shape,
)

# Label tensor with one column per outcome, in the order
# winner_model_a, winner_model_b, winner_tie (these column names are assumed
# to exist in train.csv). Stored as int64 so the columns can later be used as
# class indices.
labels = torch.tensor(
    train_data[['winner_model_a', 'winner_model_b', 'winner_tie']].to_numpy(),
    dtype=torch.long,
)


In [None]:
# Code Block 3: Define Dataset and DataLoader

class ChatbotDataset(Dataset):
    """Map-style dataset pairing tokenized responses A/B with their labels.

    Args:
        preprocessed_data: Mapping with the keys ``input_ids_a``,
            ``attention_mask_a``, ``input_ids_b`` and ``attention_mask_b``;
            each value must be indexable by sample position.
        labels: Indexable collection of per-sample labels; its length defines
            the dataset length.
    """

    def __init__(self, preprocessed_data, labels):
        # Token ids and attention mask for response A ...
        self.input_ids_a = preprocessed_data['input_ids_a']
        self.attention_mask_a = preprocessed_data['attention_mask_a']
        # ... and for response B.
        self.input_ids_b = preprocessed_data['input_ids_b']
        self.attention_mask_b = preprocessed_data['attention_mask_b']
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of the four inputs plus ``'label'``."""
        feature_names = (
            'input_ids_a',
            'attention_mask_a',
            'input_ids_b',
            'attention_mask_b',
        )
        sample = {name: getattr(self, name)[idx] for name in feature_names}
        sample['label'] = self.labels[idx]
        return sample

# Create the training dataset and DataLoader.
# Batches of 16 samples are drawn in shuffled order; loading uses 4 worker
# processes and pinned host memory (pin_memory=True).
train_dataset = ChatbotDataset(preprocessed_train, labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)


In [None]:
# Code Block 4: Define the Multi-Task BERT Model

class MultiTaskBERTModel(nn.Module):
    """One BERT sequence classifier applied separately to two responses.

    A single ``BertForSequenceClassification`` instance is stored in
    ``self.bert`` and reused for both inputs, so response A and response B are
    scored with the same weights.

    Args:
        model_path: Location passed to ``from_pretrained`` to load the model.
        num_labels: Size of the classification output (default 3).
    """

    def __init__(self, model_path, num_labels=3):
        super().__init__()
        # ignore_mismatched_sizes=True is forwarded to from_pretrained so that
        # loading tolerates size differences between checkpoint and model.
        self.bert = BertForSequenceClassification.from_pretrained(
            model_path,
            num_labels=num_labels,
            ignore_mismatched_sizes=True,
        )

    def forward(self, input_ids_a, attention_mask_a, input_ids_b, attention_mask_b):
        """Score both responses and return ``(logits_a, logits_b)``.

        Response A is run through the encoder first, then response B; each
        result is the ``.logits`` attribute of the corresponding model output.
        """
        logits_a = self.bert(
            input_ids=input_ids_a,
            attention_mask=attention_mask_a,
        ).logits
        logits_b = self.bert(
            input_ids=input_ids_b,
            attention_mask=attention_mask_b,
        ).logits
        return logits_a, logits_b

# Initialize the model with num_labels=3 and move it to the selected device.
# As the last expression of the cell, model.to(device) also makes the notebook
# display the returned module.
model = MultiTaskBERTModel(model_path, num_labels=3)
model.to(device)


In [None]:
# Code Block 5: Training Setup and Loop

# Define optimizer and learning rate scheduler.
# StepLR is stepped once per epoch below, so the learning rate is multiplied
# by 0.1 after every 2 epochs.
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)
loss_fn = nn.CrossEntropyLoss()

# Set up mixed precision training. CUDA autocast and gradient scaling are only
# enabled when the model actually runs on a GPU; on the CPU fallback both are
# explicit no-ops, so the loop runs in full precision.
use_amp = device.type == "cuda"
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

epochs = 3
print("Start training...")

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()

        # Move batch data to the selected device. The loader was created with
        # pin_memory=True, so non_blocking=True lets the host-to-GPU copies
        # overlap with other work.
        input_ids_a = batch['input_ids_a'].to(device, non_blocking=True)
        attention_mask_a = batch['attention_mask_a'].to(device, non_blocking=True)
        input_ids_b = batch['input_ids_b'].to(device, non_blocking=True)
        attention_mask_b = batch['attention_mask_b'].to(device, non_blocking=True)
        labels_batch = batch['label'].to(device, non_blocking=True)

        # Forward pass (mixed precision when running on CUDA)
        with torch.cuda.amp.autocast(enabled=use_amp):
            logits_a, logits_b = model(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
            # Each label column is used as the class index for one loss term.
            # NOTE(review): presumably the columns are 0/1 indicators, so only
            # classes 0 and 1 of the 3-way logits are ever targets - confirm
            # this is the intended formulation.
            loss_a = loss_fn(logits_a, labels_batch[:, 0])   # For winner_model_a
            loss_b = loss_fn(logits_b, labels_batch[:, 1])   # For winner_model_b
            # For tie prediction, use the average of logits_a and logits_b
            loss_tie = loss_fn((logits_a + logits_b) / 2, labels_batch[:, 2])
            loss = loss_a + loss_b + loss_tie

        # Backward pass and optimization. With the scaler disabled these calls
        # reduce to a plain backward() / optimizer.step().
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    scheduler.step()
    # Guard the division so an empty loader reports 0.0 instead of raising.
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")


In [None]:
# Code Block 6: Inference and Submission File Generation

# Load test data (ensure test.csv contains 'id', 'response_a', 'response_b').
# 'response_a'/'response_b' are tokenized below and 'id' is copied into the
# submission file; the printed shape is (rows, columns).
test_data = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
print("Test data shape:", test_data.shape)

def preprocess_test_dataset(df, tokenizer, max_length=512):
    """Batch-tokenize the ``response_a`` and ``response_b`` columns of the test frame.

    Missing values (``None``/NaN) are replaced by the empty string and every
    remaining value is converted with ``str`` before tokenization, so a null
    or non-string response cannot reach the tokenizer as a non-string.

    Args:
        df: DataFrame containing ``response_a`` and ``response_b`` columns.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(texts, padding=..., truncation=..., max_length=...,
            return_tensors='pt')`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to (default 512).

    Returns:
        dict with keys ``input_ids_a``, ``attention_mask_a``, ``input_ids_b``
        and ``attention_mask_b`` holding the tokenizer outputs for each column.

    Raises:
        KeyError: If either response column is absent from ``df``.
    """
    def _encode(column):
        # Normalize the column to a plain list of strings, then tokenize it
        # in one batch with fixed-length padding and truncation.
        texts = df[column].fillna('').astype(str).tolist()
        return tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )

    encoding_a = _encode('response_a')
    encoding_b = _encode('response_b')

    return {
        'input_ids_a': encoding_a['input_ids'],
        'attention_mask_a': encoding_a['attention_mask'],
        'input_ids_b': encoding_b['input_ids'],
        'attention_mask_b': encoding_b['attention_mask']
    }

# Tokenize both response columns of the test frame with the same tokenizer and
# max_length (512) that were used for the training data.
print("Preprocessing test data...")
preprocessed_test = preprocess_test_dataset(test_data, tokenizer, max_length=512)

# Define test Dataset and DataLoader
class ChatbotTestDataset(Dataset):
    """Map-style dataset over tokenized test responses (no labels).

    Args:
        preprocessed_data: Mapping with the keys ``input_ids_a``,
            ``attention_mask_a``, ``input_ids_b`` and ``attention_mask_b``.
            ``input_ids_a`` must expose ``.size(0)``, which defines the
            dataset length.
    """

    def __init__(self, preprocessed_data):
        # Token ids and attention mask for response A ...
        self.input_ids_a = preprocessed_data['input_ids_a']
        self.attention_mask_a = preprocessed_data['attention_mask_a']
        # ... and for response B.
        self.input_ids_b = preprocessed_data['input_ids_b']
        self.attention_mask_b = preprocessed_data['attention_mask_b']

    def __len__(self):
        """Number of samples, read from the first dimension of ``input_ids_a``."""
        return self.input_ids_a.size(0)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of the four input entries."""
        feature_names = (
            'input_ids_a',
            'attention_mask_a',
            'input_ids_b',
            'attention_mask_b',
        )
        return {name: getattr(self, name)[idx] for name in feature_names}

# Wrap the tokenized test data; shuffle=False keeps predictions in the same
# row order as test_data so they line up with the 'id' column below.
test_dataset = ChatbotTestDataset(preprocessed_test)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)

# Perform inference and generate predictions
model.eval()  # Set model to evaluation mode

predictions_a = []
predictions_b = []
predictions_tie = []

with torch.no_grad():
    for batch in test_loader:
        # Move data to the selected device (non-blocking copies are possible
        # because the loader pins host memory).
        input_ids_a = batch['input_ids_a'].to(device, non_blocking=True)
        attention_mask_a = batch['attention_mask_a'].to(device, non_blocking=True)
        input_ids_b = batch['input_ids_b'].to(device, non_blocking=True)
        attention_mask_b = batch['attention_mask_b'].to(device, non_blocking=True)

        logits_a, logits_b = model(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)

        # Compute softmax probabilities
        probs_a = F.softmax(logits_a, dim=-1)  # [batch, 3]
        probs_b = F.softmax(logits_b, dim=-1)
        probs_tie = F.softmax((logits_a + logits_b) / 2, dim=-1)

        # Class index 1 is the "positive" class for each head, matching the
        # training targets taken from the individual label columns. No
        # detach() is needed inside torch.no_grad().
        predictions_a.append(probs_a[:, 1].cpu())
        predictions_b.append(probs_b[:, 1].cpu())
        predictions_tie.append(probs_tie[:, 1].cpu())

# Concatenate predictions from all batches
predictions_a = torch.cat(predictions_a)
predictions_b = torch.cat(predictions_b)
predictions_tie = torch.cat(predictions_tie)

# The three scores come from three independent softmaxes, so a row does not
# sum to 1. Renormalize each row so the submission holds a proper probability
# distribution over the outcomes.
# NOTE(review): assumes A-wins / B-wins / tie are mutually exclusive and that
# the metric expects a distribution - confirm against the evaluation rules.
outcome_probs = torch.stack([predictions_a, predictions_b, predictions_tie], dim=1)
outcome_probs = outcome_probs / outcome_probs.sum(dim=1, keepdim=True).clamp_min(1e-12)
predictions_a, predictions_b, predictions_tie = outcome_probs.unbind(dim=1)

# Construct the submission DataFrame
submission = pd.DataFrame({
    'id': test_data['id'],
    'winner_model_a': predictions_a.numpy(),
    'winner_model_b': predictions_b.numpy(),
    'winner_tie': predictions_tie.numpy()
})

submission.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")
