# Imports

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import ast
import json
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Load model

In [30]:
from transformers import AutoModel

# Pull the pretrained DeBERTa-v3 backbone so its architecture can be displayed in the next cell.
model = AutoModel.from_pretrained(
    "microsoft/deberta-v3-base",
    num_labels=3,
    return_dict=True,
)

In [31]:
model

DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(128100, 768, padding_idx=0)
    (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
    (dropout): StableDropout()
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0-11): 12 x DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=768, out_features=768, bias=True)
            (key_proj): Linear(in_features=768, out_features=768, bias=True)
            (value_proj): Linear(in_features=768, out_features=768, bias=True)
            (pos_dropout): StableDropout()
            (dropout): StableDropout()
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            (dropout): StableDropout()
          )
        )
        (intermedia

In [None]:
!nvidia-smi

In [4]:
# Read the three competition CSV files (train, test, sample submission) in one pass.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/kaggle/lmsys-chatbot-arena/train.csv',
        '/content/drive/MyDrive/kaggle/lmsys-chatbot-arena/test.csv',
        '/content/drive/MyDrive/kaggle/lmsys-chatbot-arena/sample_submission.csv',
    )
)

# Data preprocessing

In [5]:
# The 'prompt' column holds a stringified list; turn each cell into a real Python list.
for frame in (train_df, test_df):
    frame['prompt'] = frame['prompt'].apply(ast.literal_eval)

In [6]:
# Both response columns are JSON-encoded strings; decode them for train and test alike.
for frame in (train_df, test_df):
    for response_column in ('response_a', 'response_b'):
        frame[response_column] = frame[response_column].apply(json.loads)

In [7]:
from transformers import AutoTokenizer
# Tokenizer for the DeBERTa-v3 base checkpoint.
checkpoint_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [8]:
def prepare_data(row):
    """Build and tokenize the two transcripts (prompt + response A / response B) of a row.

    Prompts are paired positionally with the responses of each side. Missing
    values are rendered as empty strings and the per-turn strings are joined
    with " [SEP] ".

    Args:
        row: mapping-like object exposing 'prompt', 'response_a' and
            'response_b' as parallel sequences.

    Returns:
        Tuple ``(tokens_a, tokens_b)`` of tokenizer outputs, each requested
        with max_length=512, truncation, max-length padding and PyTorch tensors.
    """
    def _text(value):
        # Missing entries become an empty string in the transcript.
        return value if pd.notna(value) else ''

    prompts = row['prompt']
    turns_a = [
        f"prompt: {_text(p)} [RESPONSE_A] {_text(r)}"
        for p, r in zip(prompts, row['response_a'])
    ]
    turns_b = [
        f"prompt: {_text(p)} [RESPONSE_B] {_text(r)}"
        for p, r in zip(prompts, row['response_b'])
    ]
    chat_a = " [SEP] ".join(turns_a)
    chat_b = " [SEP] ".join(turns_b)

    # Same tokenizer settings for both sides.
    encode_kwargs = dict(max_length=512, truncation=True, padding="max_length", return_tensors='pt')
    return tokenizer(chat_a, **encode_kwargs), tokenizer(chat_b, **encode_kwargs)

In [9]:
def remove_surrogates(text_list):
    """Return a copy of ``text_list`` in which every entry can be encoded as UTF-8.

    ``None`` entries become empty strings. Split surrogate pairs are merged
    into the single code point they represent; surrogates that cannot be
    paired are dropped.

    Args:
        text_list: iterable of ``str`` or ``None``.

    Returns:
        A new list of ``str`` with the same length as the input.
    """
    cleaned_list = []
    for text in text_list:
        if text is None:
            cleaned_list.append("")
            continue
        try:
            # Round-trip through UTF-16 so a high/low surrogate pair stored as two
            # code points is recombined into one real character.
            text = text.encode('utf-16', 'surrogatepass').decode('utf-16')
            text = text.encode('utf-8', 'strict').decode('utf-8')
        except UnicodeError:
            # A lone surrogate makes the UTF-16 *decode* fail with UnicodeDecodeError,
            # not UnicodeEncodeError; catch their common base so the fallback runs.
            # Drop whatever cannot be represented in UTF-8.
            text = text.encode('utf-8', 'ignore').decode('utf-8')
        cleaned_list.append(text)
    return cleaned_list

In [10]:
# Strip unencodable surrogate characters from every text column of the training frame.
for text_column in ('prompt', 'response_a', 'response_b'):
    train_df[text_column] = train_df[text_column].apply(remove_surrogates)

In [11]:
# Strip unencodable surrogate characters from every text column of the test frame.
for text_column in ('prompt', 'response_a', 'response_b'):
    test_df[text_column] = test_df[text_column].apply(remove_surrogates)

In [12]:
# Tokenize each training row; the returned pair is expanded into two new columns.
train_df[['tokens_a', 'tokens_b']] = train_df.apply(prepare_data, axis=1, result_type='expand')

In [14]:
# Tokenize each test row; the returned pair is expanded into two new columns.
test_df[['tokens_a', 'tokens_b']] = test_df.apply(prepare_data, axis=1, result_type='expand')

In [15]:
def label(row):
    """Convert a row's one-hot winner columns into a class index.

    Returns 0 when 'winner_model_a' is 1, 1 when 'winner_model_b' is 1,
    2 when 'winner_tie' is 1, checked in that order. Returns None when none
    of the three columns equals 1.
    """
    winner_columns = ('winner_model_a', 'winner_model_b', 'winner_tie')
    for class_index, column in enumerate(winner_columns):
        if row[column] == 1:
            return class_index
    return None

# Collapse the three winner columns into one integer class per training row.
label_series = train_df.apply(label, axis=1)
train_df['label'] = label_series

# Modelling

Create custom dataset

In [16]:
from torch.utils.data import Dataset, DataLoader
import torch

class TextComparisonDataset(Dataset):
    """Dataset pairing the tokenized A-side and B-side of a comparison with its label.

    Args:
        tokens_a: indexable collection of tokenizer outputs for side A; each
            item must expose 'input_ids' and 'attention_mask' tensors.
        tokens_b: same as ``tokens_a`` for side B.
        labels: indexable collection of integer class labels, indexed by the
            same positions as the token collections.
    """

    def __init__(self, tokens_a, tokens_b, labels):
        self.tokens_a = tokens_a
        self.tokens_b = tokens_b
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors.

        Only the leading axis is squeezed: a bare ``squeeze()`` would also
        collapse a length-1 sequence axis into a 0-d tensor, which can no
        longer be stacked with other examples.
        """
        side_a = self.tokens_a[idx]
        side_b = self.tokens_b[idx]
        return {
            'input_ids_a': side_a['input_ids'].squeeze(0),
            'attention_mask_a': side_a['attention_mask'].squeeze(0),
            'input_ids_b': side_b['input_ids'].squeeze(0),
            'attention_mask_b': side_b['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

Initial split: 70% for training, 30% for combined validation + testing

In [17]:

# Keep each row's A/B token objects together so one split call shuffles them in lockstep.
token_pairs = list(zip(train_df['tokens_a'], train_df['tokens_b']))

# Stage 1: 70% train, 30% held back for validation + testing.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    token_pairs, train_df['label'], test_size=0.30, random_state=42
)

# Stage 2: cut the held-back 30% in half -> validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.50, random_state=42
)

# Unzip the (tokens_a, tokens_b) pairs back into per-side tuples.
tokens_a_train, tokens_b_train = zip(*X_train)
tokens_a_val, tokens_b_val = zip(*X_val)
tokens_a_test, tokens_b_test = zip(*X_test)

# Renumber the label Series from 0 so positional lookups line up with the token tuples.
y_train, y_val, y_test = (
    split_labels.reset_index(drop=True) for split_labels in (y_train, y_val, y_test)
)

train_dataset = TextComparisonDataset(tokens_a_train, tokens_b_train, y_train)
val_dataset = TextComparisonDataset(tokens_a_val, tokens_b_val, y_val)
test_dataset = TextComparisonDataset(tokens_a_test, tokens_b_test, y_test)

Create model

In [36]:
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class ComparisonModel(nn.Module):
    """Scores two tokenized inputs with one shared classifier and compares them.

    The same 3-label sequence classifier is run on side A and on side B; the
    two 3-way logit vectors are concatenated (6 values) and mapped to the
    final 3 logits by a linear layer.
    """

    def __init__(self):
        super().__init__()

        # One set of weights, applied to both sides in forward().
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "microsoft/deberta-v3-base",
            num_labels=3,
            return_dict=True,
        )
        self.comparator = nn.Linear(6, 3)

    def forward(self, input_ids_a, attention_mask_a, input_ids_b, attention_mask_b):
        """Return the comparator's logits for a batch of (A, B) input pairs."""
        sides = (
            (input_ids_a, attention_mask_a),
            (input_ids_b, attention_mask_b),
        )
        per_side_logits = [
            self.model(input_ids=ids, attention_mask=mask).logits
            for ids, mask in sides
        ]
        # Concatenate along the feature axis: A-logits followed by B-logits.
        return self.comparator(torch.cat(per_side_logits, dim=1))

In [50]:
from torch.optim import Adam
from torch.utils.data import DataLoader

# Only the training loader shuffles; the held-out test loader yields one example per batch.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

2000

In [51]:
# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the comparison network with its optimizer, then move the weights to the device.
model = ComparisonModel()
optimizer = Adam(model.parameters(), lr=2e-5)
model.to(device)

print('x')

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


x


In [52]:
# Loss plus loop settings: number of epochs, and how many batches pass between progress reports.
loss_fn = nn.CrossEntropyLoss()
num_epochs, report_every = 1, 50

In [53]:
import logging

# Send INFO-and-above records to a log file, one timestamped line per message.
logging.basicConfig(
    filename='training_log.log',
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s',
)

In [47]:
len(train_loader)

10059

In [49]:
len(val_loader)

2156

Training using PyTorch

In [56]:
for epoch in range(num_epochs):
    epoch_no = epoch + 1
    print(f"Epoch {epoch_no}/{num_epochs}")
    logging.info(f"Epoch {epoch_no}/{num_epochs} started.")

    # ---- Training pass: weights are updated after every batch ----
    model.train()
    train_loss = 0
    for step, batch in enumerate(train_loader, start=1):
        input_ids_a = batch['input_ids_a'].to(device)
        attention_mask_a = batch['attention_mask_a'].to(device)
        input_ids_b = batch['input_ids_b'].to(device)
        attention_mask_b = batch['attention_mask_b'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients from the previous step before the new forward/backward.
        optimizer.zero_grad()
        outputs = model(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        train_loss += batch_loss

        if step % report_every == 0:
            progress = f'Epoch {epoch_no}, Batch {step}, Intermediate Training Loss: {batch_loss:.4f}'
            print(progress)
            logging.info(progress)

    average_train_loss = train_loss / len(train_loader)
    print(f"End of Epoch Training Loss: {average_train_loss:.4f}")
    logging.info(f"End of Epoch {epoch_no} Training Loss: {average_train_loss:.4f}")

    # ---- Validation pass: eval mode, no gradient tracking, no weight updates ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for step, batch in enumerate(val_loader, start=1):
            input_ids_a = batch['input_ids_a'].to(device)
            attention_mask_a = batch['attention_mask_a'].to(device)
            input_ids_b = batch['input_ids_b'].to(device)
            attention_mask_b = batch['attention_mask_b'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
            loss = loss_fn(outputs, labels)
            batch_loss = loss.item()
            val_loss += batch_loss

            if step % report_every == 0:
                progress = f'Epoch {epoch_no}, Batch {step}, Intermediate Validation Loss: {batch_loss:.4f}'
                print(progress)
                logging.info(progress)

    average_val_loss = val_loss / len(val_loader)
    print(f"End of Epoch Validation Loss: {average_val_loss:.4f}")
    logging.info(f"End of Epoch {epoch_no} Validation Loss: {average_val_loss:.4f}")

Epoch 1/1
Epoch 1, Batch 50, Intermediate Training Loss: 1.0646
Epoch 1, Batch 100, Intermediate Training Loss: 1.1026
Epoch 1, Batch 150, Intermediate Training Loss: 1.0943
Epoch 1, Batch 200, Intermediate Training Loss: 1.0945
Epoch 1, Batch 250, Intermediate Training Loss: 1.1220
Epoch 1, Batch 300, Intermediate Training Loss: 1.1109
Epoch 1, Batch 350, Intermediate Training Loss: 1.0536
Epoch 1, Batch 400, Intermediate Training Loss: 1.1127
Epoch 1, Batch 450, Intermediate Training Loss: 1.1251
Epoch 1, Batch 500, Intermediate Training Loss: 1.1114
Epoch 1, Batch 550, Intermediate Training Loss: 1.0714
Epoch 1, Batch 600, Intermediate Training Loss: 1.1491
Epoch 1, Batch 650, Intermediate Training Loss: 1.0470
Epoch 1, Batch 700, Intermediate Training Loss: 1.0297
Epoch 1, Batch 750, Intermediate Training Loss: 1.1881
Epoch 1, Batch 800, Intermediate Training Loss: 1.0660
Epoch 1, Batch 850, Intermediate Training Loss: 1.1198
Epoch 1, Batch 900, Intermediate Training Loss: 1.1286
E

In [57]:
# Serialize the whole model object (not just its state dict) to a path relative to the working directory.
checkpoint_path = 'kaggle/lmsys-chatbot-arena/deberta-full.pth'
torch.save(model, checkpoint_path)

In [None]:
# Score the held-out split: collect per-class probabilities and the matching true labels.
model.eval()
probability_rows = []
label_values = []

with torch.no_grad():
    for batch in test_loader:
        input_ids_a = batch['input_ids_a'].to(device)
        attention_mask_a = batch['attention_mask_a'].to(device)
        input_ids_b = batch['input_ids_b'].to(device)
        attention_mask_b = batch['attention_mask_b'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
        # Softmax over the class axis turns the logits into probabilities.
        probabilities = torch.softmax(outputs, dim=1)
        probability_rows.append(probabilities.cpu().numpy())
        label_values.extend(labels.cpu().numpy())

# Stack the per-batch arrays into one array with a row per example.
predictions = np.vstack(probability_rows)
true_labels = np.array(label_values)

In [None]:
from sklearn.metrics import log_loss

In [None]:
# Pass the class list explicitly: otherwise log_loss infers the classes from
# true_labels, and a split that happens to miss a class would no longer line
# up with the three probability columns.
loss = log_loss(true_labels, predictions, labels=[0, 1, 2])
print(f'Log Loss on Test Dataset: {loss}')

Log Loss on Test Dataset: 1.0967135634220906


In [None]:
# Wrap the competition test rows in the same dataset class used for training.
tokens_a, tokens_b = (test_df[column].tolist() for column in ('tokens_a', 'tokens_b'))

# The dataset class requires labels, so supply zeros as placeholders.
placeholder_labels = [0] * len(tokens_b)
test_dataset = TextComparisonDataset(tokens_a, tokens_b, placeholder_labels)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
# Predict class probabilities for the competition test rows.
# Set eval mode here instead of relying on an earlier cell having done it,
# so this cell gives the same result if run straight after training.
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        # batch['labels'] holds the dummy zeros given to the dataset; it is not read here.
        input_ids_a = batch['input_ids_a'].to(device)
        attention_mask_a = batch['attention_mask_a'].to(device)
        input_ids_b = batch['input_ids_b'].to(device)
        attention_mask_b = batch['attention_mask_b'].to(device)

        outputs = model(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
        probabilities = torch.nn.functional.softmax(outputs, dim=1)
        # extend() adds one probability row per example in the batch.
        predictions.extend(probabilities.cpu().numpy())

In [None]:
# Probability columns follow the class indices assigned by label():
# 0 -> winner_model_a, 1 -> winner_model_b, 2 -> winner_tie.
# The tie column is named 'winner_tie', matching the column in the training
# data (it was previously written as 'winner_model_tie').
probability_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
predictions_df = pd.DataFrame(predictions, columns=probability_columns)

# Add the 'id' column from test_df; rows are in the same order as the predictions.
predictions_df['id'] = test_df['id'].values

# Put 'id' first, followed by the three probability columns.
predictions_df = predictions_df[['id'] + probability_columns]

In [None]:
# Write the submission file without the DataFrame index column.
submission_path = 'submission.csv'
predictions_df.to_csv(submission_path, index=False)