In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import ast
import json
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [17]:
import torch
from torch import nn

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from transformers import LongformerTokenizer, LongformerForSequenceClassification

In [18]:
# !zip -r /kaggle/working/deberta-v3-base-tokenizer.zip /kaggle/working/deberta-v3-base-tokenizer

In [19]:
# Read the three competition CSVs in one pass; the order of the paths matches
# the order of the names on the left-hand side.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lmsys-chatbot-arena/train.csv',
        '/kaggle/input/lmsys-chatbot-arena/test.csv',
        '/kaggle/input/lmsys-chatbot-arena/sample_submission.csv',
    )
)

In [20]:
# Checkpoint file that is later read with torch.load to obtain the model.
model_path = '/kaggle/input/deberta-base/pytorch/deberta/1/deberta-full.pth'

# Local directory passed to AutoTokenizer.from_pretrained to load the tokenizer.
tokenizer_path = '/kaggle/input/deberta-tokenizer/kaggle/working/deberta-v3-base-tokenizer'

In [22]:
# Load the tokenizer from the local path in tokenizer_path; prepare_data uses
# this module-level `tokenizer` to encode both conversation strings.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [23]:
class ComparisonModel(nn.Module):
    """Pairwise comparator built on a single shared sequence classifier.

    Input A and input B are each run through the same backbone, which emits
    3 logits per input. The two logit vectors are concatenated (6 values) and
    mapped by a linear layer to the final 3 logits.
    """

    def __init__(self):
        super().__init__()
        # A single backbone instance serves both sides, so A and B share weights.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "microsoft/deberta-v3-base",
            num_labels=3,
            return_dict=True,
        )
        # 3 logits from A + 3 logits from B -> 3 output logits.
        self.comparator = nn.Linear(6, 3)

    def forward(self, input_ids_a, attention_mask_a, input_ids_b, attention_mask_b):
        """Score a batch of (A, B) input pairs.

        Args:
            input_ids_a, attention_mask_a: encoded inputs for side A.
            input_ids_b, attention_mask_b: encoded inputs for side B.

        Returns:
            Output of the comparator layer applied to the A and B logits
            concatenated along dim 1 (A first, then B).
        """
        encoded_sides = (
            (input_ids_a, attention_mask_a),
            (input_ids_b, attention_mask_b),
        )
        # Same backbone for each side; only the `.logits` field of its output is used.
        per_side_logits = [
            self.model(input_ids=ids, attention_mask=mask).logits
            for ids, mask in encoded_sides
        ]
        return self.comparator(torch.cat(per_side_logits, dim=1))

In [24]:
# NOTE(security): the checkpoint is a fully pickled model object, so loading it
# executes arbitrary pickle code -- only load files from a trusted source.
# - weights_only=False is stated explicitly because a whole-object pickle cannot
#   be read in weights-only mode (the default on recent PyTorch releases).
# - map_location="cpu" lets a checkpoint that was saved from a GPU load on a
#   machine without CUDA; the model is moved to the target device afterwards.
# The ComparisonModel class must already be defined for unpickling to succeed.
model = torch.load(model_path, map_location="cpu", weights_only=False)
# Switch to inference mode (disables dropout); as the last expression this also
# displays the module structure.
model.eval()

ComparisonModel(
  (model): DebertaV2ForSequenceClassification(
    (deberta): DebertaV2Model(
      (embeddings): DebertaV2Embeddings(
        (word_embeddings): Embedding(128100, 768, padding_idx=0)
        (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
        (dropout): StableDropout()
      )
      (encoder): DebertaV2Encoder(
        (layer): ModuleList(
          (0-11): 12 x DebertaV2Layer(
            (attention): DebertaV2Attention(
              (self): DisentangledSelfAttention(
                (query_proj): Linear(in_features=768, out_features=768, bias=True)
                (key_proj): Linear(in_features=768, out_features=768, bias=True)
                (value_proj): Linear(in_features=768, out_features=768, bias=True)
                (pos_dropout): StableDropout()
                (dropout): StableDropout()
              )
              (output): DebertaV2SelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
        

In [25]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Move the model's parameters to that device (the returned module is displayed).
model.to(device)

ComparisonModel(
  (model): DebertaV2ForSequenceClassification(
    (deberta): DebertaV2Model(
      (embeddings): DebertaV2Embeddings(
        (word_embeddings): Embedding(128100, 768, padding_idx=0)
        (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
        (dropout): StableDropout()
      )
      (encoder): DebertaV2Encoder(
        (layer): ModuleList(
          (0-11): 12 x DebertaV2Layer(
            (attention): DebertaV2Attention(
              (self): DisentangledSelfAttention(
                (query_proj): Linear(in_features=768, out_features=768, bias=True)
                (key_proj): Linear(in_features=768, out_features=768, bias=True)
                (value_proj): Linear(in_features=768, out_features=768, bias=True)
                (pos_dropout): StableDropout()
                (dropout): StableDropout()
              )
              (output): DebertaV2SelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
        

In [26]:
# Parse the prompt column with json.loads, the same parser used for the
# response columns (assumes prompts are JSON-encoded like the responses --
# TODO confirm against the raw CSV). ast.literal_eval only understands Python
# literals and raises on JSON-only tokens such as null/true/false; json.loads
# turns null into None, which remove_surrogates later replaces with "".
test_df['prompt'] = test_df['prompt'].apply(json.loads)

In [27]:
# Decode the JSON-encoded response columns into Python objects
# (presumably one list of turns per row -- prepare_data zips them with the prompts).
test_df['response_a'] = test_df['response_a'].apply(json.loads)
test_df['response_b'] = test_df['response_b'].apply(json.loads)

In [28]:
def prepare_data(row):
    """Build and tokenize the two conversation strings for one row.

    Each (prompt, response) pair is rendered as
    ``prompt: <prompt> [RESPONSE_X] <response>`` and the rendered turns are
    joined with ``" [SEP] "``. Missing prompts/responses are rendered as empty
    strings.

    Args:
        row: mapping with 'prompt', 'response_a' and 'response_b' entries, each
            an iterable of turns.

    Returns:
        Tuple ``(tokens_a, tokens_b)`` as produced by the module-level
        ``tokenizer`` (padded/truncated to 512 tokens, PyTorch tensors).
    """
    def _or_blank(value):
        # Missing values (None/NaN) are rendered as an empty string.
        return value if pd.notna(value) else ''

    turns_a = []
    for question, answer in zip(row['prompt'], row['response_a']):
        turns_a.append(f"prompt: {_or_blank(question)} [RESPONSE_A] {_or_blank(answer)}")

    turns_b = []
    for question, answer in zip(row['prompt'], row['response_b']):
        turns_b.append(f"prompt: {_or_blank(question)} [RESPONSE_B] {_or_blank(answer)}")

    chat_a = " [SEP] ".join(turns_a)
    chat_b = " [SEP] ".join(turns_b)

    # Identical tokenizer settings for both sides.
    encode_options = dict(max_length=512, truncation=True, padding="max_length", return_tensors='pt')
    return tokenizer(chat_a, **encode_options), tokenizer(chat_b, **encode_options)

In [29]:
def remove_surrogates(text_list):
    """Return a copy of ``text_list`` with surrogate code points cleaned up.

    For every string in the list:
      * a valid high+low surrogate pair is merged into the single character it
        encodes (e.g. ``'\\ud83d\\ude00'`` becomes the one-character emoji);
      * an unpaired (lone) surrogate, which cannot be encoded as UTF-8, is
        dropped.
    ``None`` entries are replaced by the empty string.

    Args:
        text_list: iterable of ``str`` or ``None`` values.

    Returns:
        A new list of ``str`` with the same length and order as the input.
    """
    cleaned_list = []
    for text in text_list:
        if text is None:
            cleaned_list.append("")
            continue
        # Round-trip through UTF-16:
        #  - encoding with 'surrogatepass' writes surrogate code points out as
        #    raw UTF-16 code units instead of raising;
        #  - decoding then re-joins adjacent high/low units into one character,
        #    and 'ignore' skips any unit that is left unpaired.
        # A strict decode here would raise UnicodeDecodeError on a lone
        # surrogate, which an `except UnicodeEncodeError` clause never catches.
        text = text.encode('utf-16', 'surrogatepass').decode('utf-16', 'ignore')
        cleaned_list.append(text)
    return cleaned_list

In [30]:
# Run remove_surrogates over the list stored in each row of the three text
# columns; it returns a cleaned list per row (None entries become "").
test_df['prompt'] = test_df['prompt'].apply(remove_surrogates)
test_df['response_a'] = test_df['response_a'].apply(remove_surrogates)
test_df['response_b'] = test_df['response_b'].apply(remove_surrogates)

In [31]:
# Tokenize every row; result_type='expand' spreads the (tokens_a, tokens_b)
# tuple returned by prepare_data across the two new columns.
test_df[['tokens_a', 'tokens_b']] = test_df.apply(prepare_data, axis=1, result_type='expand')

In [32]:
from torch.utils.data import Dataset, DataLoader
import torch

class TextComparisonDataset(Dataset):
    """Map-style dataset that pairs tokenized side-A and side-B inputs.

    Args:
        tokens_a: sequence of tokenizer outputs for side A; each item is
            indexed with 'input_ids' and 'attention_mask'.
        tokens_b: same as ``tokens_a`` for side B.
        labels: optional sequence of integer class labels aligned with the
            token sequences. Leave as ``None`` for inference; items then carry
            no 'labels' entry.
    """

    def __init__(self, tokens_a, tokens_b, labels=None):
        self.tokens_a = tokens_a
        self.tokens_b = tokens_b
        self.labels = labels

    def __len__(self):
        # Without labels, the number of side-A samples defines the length.
        if self.labels is None:
            return len(self.tokens_a)
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors for sample ``idx`` as a dict.

        Only the leading dimension is squeezed (the batch axis the tokenizer
        adds when called on a single text with return_tensors='pt'). A bare
        ``squeeze()`` would also collapse a length-1 sequence axis into a 0-d
        tensor.
        """
        item = {
            'input_ids_a': self.tokens_a[idx]['input_ids'].squeeze(0),
            'attention_mask_a': self.tokens_a[idx]['attention_mask'].squeeze(0),
            'input_ids_b': self.tokens_b[idx]['input_ids'].squeeze(0),
            'attention_mask_b': self.tokens_b[idx]['attention_mask'].squeeze(0),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [33]:
# Pull the per-row tokenizer outputs out of the DataFrame as plain lists.
tokens_a, tokens_b = (test_df[name].tolist() for name in ('tokens_a', 'tokens_b'))

# The test set has no ground truth, so every sample gets a placeholder label of 0.
test_dataset = TextComparisonDataset(tokens_a, tokens_b, labels=[0] * len(tokens_b))
# One sample per batch, in the original row order.
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

In [34]:
predictions = []

# Inference only: no gradients are tracked inside this block.
with torch.no_grad():
    for batch in test_loader:
        # The labels in each batch are placeholder zeros; discard them.
        batch.pop('labels', None)

        # Move the four input tensors to the model's device, in the positional
        # order expected by ComparisonModel.forward.
        inputs = [
            batch[key].to(device)
            for key in ('input_ids_a', 'attention_mask_a', 'input_ids_b', 'attention_mask_b')
        ]

        logits = model(*inputs)
        # Turn the logits into per-row class probabilities.
        probs = torch.softmax(logits, dim=1)
        # extend() adds one probability vector per sample in the batch.
        predictions.extend(probs.cpu().numpy())

In [35]:
# One row per test sample, one column per predicted class probability.
predictions_df = pd.DataFrame(
    predictions,
    columns=['winner_model_a', 'winner_model_b', 'winner_model_tie'],
)

# Put the test ids in front so the column order is
# id, winner_model_a, winner_model_b, winner_model_tie.
predictions_df.insert(0, 'id', test_df['id'].values)

In [36]:
# Write the predictions to submission.csv in the working directory, without
# the DataFrame index column.
predictions_df.to_csv('submission.csv', index=False)

In [None]:
# I tried Longformer, but there was not enough memory to run it.

In [None]:
## TODO: try using two GPUs to allow a larger context window.