# In [None]:
#############################################################
# Code Block 1: Environment Setup and Data Loading
#############################################################

import warnings
warnings.filterwarnings('ignore', message='.*overflowing tokens.*')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification

# Set random seed for reproducibility
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator (defaults to 42).
    """
    import random as py_random

    # CPU-side generators: stdlib, NumPy, then PyTorch.
    for seed_fn in (py_random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # The CUDA generators are only touched when a GPU is reported available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed all generators before any model or data work happens.
set_seed(42)

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Load the pre-trained BERT tokenizer from the local model directory.
model_path = "/kaggle/input/bert-base-uncased/pytorch/default/1/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_path)

# Load training and test data as DataFrames.
train_data = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")
test_data = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")

# Report the (rows, columns) shape of each table.
print("Training data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

#############################################################
# Code Block 2: Data Preprocessing
#############################################################

def tokenize_texts(df, tokenizer, max_length=512):
    """
    Encode both response columns of *df* with the same tokenizer settings.

    Args:
        df: DataFrame providing the "response_a" and "response_b" columns.
        tokenizer: Callable tokenizer; it is invoked once per column with
            that column's values as a list.
        max_length: Length passed to the tokenizer for padding/truncation.

    Returns:
        Dict with the tokenizer's "input_ids" and "attention_mask" outputs
        stored under "input_ids_a"/"attention_mask_a" for response A and
        "input_ids_b"/"attention_mask_b" for response B.
    """
    # Both responses are padded to max_length, truncated, and returned as
    # PyTorch tensors.
    shared_kwargs = dict(
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # (source column, output key for ids, output key for mask)
    column_plan = (
        ("response_a", "input_ids_a", "attention_mask_a"),
        ("response_b", "input_ids_b", "attention_mask_b"),
    )

    tokenized = {}
    for column, ids_key, mask_key in column_plan:
        encoded = tokenizer(df[column].tolist(), **shared_kwargs)
        tokenized[ids_key] = encoded["input_ids"]
        tokenized[mask_key] = encoded["attention_mask"]
    return tokenized

# Tokenize both response columns of the training table up front.
print("Preprocessing training data...")
preprocessed_train = tokenize_texts(df=train_data, tokenizer=tokenizer)

# Extract labels: one float32 row per example, in the column order below.
label_columns = ["winner_model_a", "winner_model_b", "winner_tie"]
labels = torch.tensor(train_data[label_columns].values, dtype=torch.float32)

#############################################################
# Code Block 3: Define Dataset and DataLoader
#############################################################

class ChatbotDataset(Dataset):
    """
    Dataset over pre-tokenized response A / response B pairs.

    Every item is a dict with "input_ids_a", "attention_mask_a",
    "input_ids_b", "attention_mask_b" and "label". When no labels are given,
    an all-zero placeholder with 3 columns is stored instead, so items of an
    unlabeled dataset still carry a "label" entry.
    """

    def __init__(self, preprocessed_data, labels=None):
        """
        Args:
            preprocessed_data: Mapping with the keys "input_ids_a",
                "attention_mask_a", "input_ids_b" and "attention_mask_b".
            labels: Optional per-example labels; replaced by zeros of shape
                (number of examples, 3) when omitted.
        """
        self.input_ids_a = preprocessed_data["input_ids_a"]
        self.attention_mask_a = preprocessed_data["attention_mask_a"]
        self.input_ids_b = preprocessed_data["input_ids_b"]
        self.attention_mask_b = preprocessed_data["attention_mask_b"]

        if labels is None:
            # Placeholder so that label indexing never hits a None.
            labels = torch.zeros(len(self.input_ids_a), 3)
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the response A input ids."""
        return len(self.input_ids_a)

    def __getitem__(self, idx):
        """Return the fields of example *idx* as a dict."""
        sample = {
            "input_ids_a": self.input_ids_a[idx],
            "attention_mask_a": self.attention_mask_a[idx],
            "input_ids_b": self.input_ids_b[idx],
            "attention_mask_b": self.attention_mask_b[idx],
        }
        if self.labels is None:
            return sample
        sample["label"] = self.labels[idx]
        return sample

# Training loader: shuffled batches of 32, 4 worker processes, pinned memory.
train_dataset = ChatbotDataset(preprocessed_data=preprocessed_train, labels=labels)
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

#############################################################
# Code Block 4: Define the Multi-Task BERT Model
#############################################################

class MultiTaskBERTModel(nn.Module):
    """
    BERT sequence classifier applied separately to response A and response B.

    A single `BertForSequenceClassification` instance scores both inputs; its
    classification head is replaced by a freshly initialised linear layer.
    """

    def __init__(self, model_path, num_labels=3):
        """
        Args:
            model_path: Location passed to `from_pretrained`.
            num_labels: Output size of the classification head.
        """
        super().__init__()
        backbone = BertForSequenceClassification.from_pretrained(
            model_path,
            num_labels=num_labels,
            ignore_mismatched_sizes=True,
        )

        # Replace the head: Xavier-uniform weights and a zero bias.
        head = nn.Linear(backbone.config.hidden_size, num_labels)
        nn.init.xavier_uniform_(head.weight)
        nn.init.zeros_(head.bias)
        backbone.classifier = head

        self.bert = backbone

    def forward(self, input_ids_a, attention_mask_a, input_ids_b, attention_mask_b):
        """Return the tuple (logits for response A, logits for response B)."""
        encoded_pairs = (
            (input_ids_a, attention_mask_a),
            (input_ids_b, attention_mask_b),
        )
        # Response A is scored first, then response B, by the same network.
        output_a, output_b = (
            self.bert(input_ids=ids, attention_mask=mask).logits
            for ids, mask in encoded_pairs
        )
        return output_a, output_b

# Build the model with a 3-way head and move it onto the selected device.
model = MultiTaskBERTModel(model_path=model_path, num_labels=3).to(device)

#############################################################
# Code Block 5: Generate Submission File
#############################################################

# Model predictions
print("Generating predictions...")
test_preprocessed = tokenize_texts(df=test_data, tokenizer=tokenizer)
test_dataset = ChatbotDataset(preprocessed_data=test_preprocessed, labels=None)
test_loader = DataLoader(test_dataset, batch_size=32)

# Only batch entries whose key contains one of these tags reach the model;
# every other entry (such as "label") is left out.
model_input_tags = ("input_ids", "attention_mask")

logits_a, logits_b = [], []
model.eval()
with torch.no_grad():
    for raw_batch in test_loader:
        model_inputs = {
            key: tensor.to(device)
            for key, tensor in raw_batch.items()
            if any(tag in key for tag in model_input_tags)
        }
        scores_a, scores_b = model(**model_inputs)
        # Keep the per-batch results on the CPU until they are concatenated.
        logits_a.append(scores_a.cpu())
        logits_b.append(scores_b.cpu())

# Join the per-batch outputs along the example dimension.
logits_a = torch.cat(logits_a, dim=0)
logits_b = torch.cat(logits_b, dim=0)
# The tie score is a constant zero, shaped like one column of the A logits.
logits_tie = torch.zeros_like(logits_a[:, 0])

# Normalize scores: softmax across (column 0 of A, column 0 of B, tie).
logits = torch.stack((logits_a[:, 0], logits_b[:, 0], logits_tie), dim=1)
probs = F.softmax(logits, dim=1).numpy()

# Create submission DataFrame: the ids, then one probability column per outcome.
submission_columns = ("winner_model_a", "winner_model_b", "winner_tie")
submission_df = pd.DataFrame({
    "id": test_data["id"].values,
    **{name: probs[:, position] for position, name in enumerate(submission_columns)},
})

# Save the submission file without the DataFrame index.
submission_df.to_csv("submission.csv", index=False)
print("Submission file generated: submission.csv")

