In [17]:
import pandas as pd

train_main_df = pd.read_parquet("data/train.parquet", engine="pyarrow")
train_main_df.head()

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,00007cff95d7f7974642a785aca248b0f26e60d3312fac...,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a,o1-preview,reka-core-20240904,Slovak
1,00010ed04b536f56ebe43eef1100c13906abea12bf9855...,You will be given a piece of news. Analyze it ...,Let's break down the news and analyze it accor...,"```json\n{\n ""contains_orgs"": true,\n ""orgs""...",model_a,gemma-2-27b-it,gemini-1.5-flash-002,Russian
2,0003800d510e38803efba5ceaec122bc66408fe367b0be...,"Dört basamaklı, rakamları birbirinden ve sıfır...","Bu soruyu çözmek için, verilen koşulları adım ...",Bu problemi adım adım çözelim:\n\n1) ABCD - DC...,model_a,gpt-4-0125-preview,claude-3-5-sonnet-20240620,Turkish
3,00072026c68f5418ef2da238394e418ce72a534b9b22d5...,"현재 추천된 탑 3 종목인 Cabaletta Bio (CABA), Rocket Ph...","죄송하지만 저는 금융 조언을 제공할 수 없습니다. 저는 AI 모델이며, 투자 결정에...",현재 추천된 탑 3 종목에 순위를 매기기 위해서는 여러 가지 요소들을 고려해야 합니...,model_b,gemma-2-2b-it,llama-3.1-nemotron-70b-instruct,English
4,0007ce7cf6bc1b5a8f8a4669b854fb12030863c970d9dc...,Please be boring,"Alright, I'll be as boring as possible.\n\nTod...","Understood. Here is a straightforward, unadorn...",model_a,reka-flash-20240722,grok-2-2024-08-13,English


In [19]:
required_columns = ["prompt", "response_a", "response_b", "winner"]
train_df = train_main_df[required_columns]
train_df.head()

Unnamed: 0,prompt,response_a,response_b,winner
0,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a
1,You will be given a piece of news. Analyze it ...,Let's break down the news and analyze it accor...,"```json\n{\n ""contains_orgs"": true,\n ""orgs""...",model_a
2,"Dört basamaklı, rakamları birbirinden ve sıfır...","Bu soruyu çözmek için, verilen koşulları adım ...",Bu problemi adım adım çözelim:\n\n1) ABCD - DC...,model_a
3,"현재 추천된 탑 3 종목인 Cabaletta Bio (CABA), Rocket Ph...","죄송하지만 저는 금융 조언을 제공할 수 없습니다. 저는 AI 모델이며, 투자 결정에...",현재 추천된 탑 3 종목에 순위를 매기기 위해서는 여러 가지 요소들을 고려해야 합니...,model_b
4,Please be boring,"Alright, I'll be as boring as possible.\n\nTod...","Understood. Here is a straightforward, unadorn...",model_a


In [21]:
from sklearn.model_selection import train_test_split

train_frame, validation_frame = train_test_split(train_df, random_state=2024, test_size=0.25)

In [37]:
corpus = list()
for idx in range(len(train_frame)):
    corpus.append(train_frame.iloc[idx]["prompt"])
    corpus.append(train_frame.iloc[idx]["response_a"])
    corpus.append(train_frame.iloc[idx]["response_b"])

In [43]:
from tqdm.auto import tqdm

vocabulary = set()
token_lens = list()


for idx, sentence in tqdm(enumerate(corpus)):
    sentence = sentence.replace("\n", " ")
    tokens = sentence.split(" ")
    token_lens.append(len(tokens))
    for token in tokens:
        if token != " " or token != "":
            vocabulary.add(token)

108987it [00:05, 18903.23it/s]


In [44]:
word_to_idx = {
    word:idx for idx, word in enumerate(vocabulary)
}
len(word_to_idx)

2061647

In [45]:
import pickle

with open("vocabulary.pkl", "wb") as f:
    pickle.dump(word_to_idx, f)

In [48]:
import numpy as np

np.max(token_lens)

51214

In [92]:
from torch.utils.data import Dataset
import torch


class ResponseDataset(Dataset):
    def __init__(self, df, word_to_idx=word_to_idx, max_len=2048, pad_token="[PAD]", oov_token="[OOV]"):
        self.df = df
        self.word_to_idx = word_to_idx
        self.max_len = max_len
        self.pad_token = pad_token
        self.oov_token = oov_token
        
        # add pad and oov token
        self.word_to_idx[pad_token] = len(word_to_idx)
        self.word_to_idx[oov_token] = len(word_to_idx)
        
        # label dict
        self.label_dict = {
            "model_a": 0,
            "model_b": 1
        }
        
    def __len__(self):
        return len(self.df)
    
    def __encode(self, text):
        encoded = torch.ones(self.max_len, dtype=torch.long) * \
            self.word_to_idx.get(self.pad_token)
        
        text = text.replace("\n", " ")
        tokens = text.split(" ")
        # limit to max len
        tokens = tokens[:self.max_len]
        
        for idx, token in enumerate(tokens):
            word_idx = self.word_to_idx.get(token, self.word_to_idx.get(self.oov_token))
            encoded[idx] = word_idx
            
        return encoded
    
    def __getitem__(self, idx):
        prompt = self.__encode(self.df.iloc[idx]["prompt"])
        response_a = self.__encode(self.df.iloc[idx]["response_a"])
        response_b = self.__encode(self.df.iloc[idx]["response_b"])
        
        label = self.df.iloc[idx]["winner"]
        label = self.label_dict.get(label)
        
        return {
            "prompt": prompt,
            "positive": response_a if label == 0 else response_b,
            "negative": response_b if label == 0 else response_a,
        }
        
        
ds = ResponseDataset(train_frame)
ds[0]

{'prompt': tensor([ 347465,  857093, 1080773,  ..., 2061648, 2061648, 2061648]),
 'positive': tensor([1518887,  226638,  617963,  ..., 2061648, 2061648, 2061648]),
 'negative': tensor([1198971,  602493,       0,  ..., 2061648, 2061648, 2061648])}

In [93]:
trainset = ResponseDataset(train_frame)
valset = ResponseDataset(validation_frame)

In [94]:
from torch.utils.data import DataLoader

bs = 128
train_loader = DataLoader(trainset, batch_size=bs, shuffle=True) 
val_loader = DataLoader(valset, batch_size=bs, shuffle=False) 

In [95]:
for batch in train_loader:
    print(batch)
    break

{'prompt': tensor([[1145386,  816394,  724135,  ..., 2061648, 2061648, 2061648],
        [1562672, 2015393, 1812222,  ..., 2061648, 2061648, 2061648],
        [ 106649, 1128516, 1582120,  ..., 2061648, 2061648, 2061648],
        ...,
        [1698758,  683475,  627324,  ..., 2061648, 2061648, 2061648],
        [1094831, 1455095, 1177346,  ..., 2061648, 2061648, 2061648],
        [ 802773,  801709,  877170,  ..., 2061648, 2061648, 2061648]]), 'positive': tensor([[ 231902,  632335, 1842316,  ..., 2061648, 2061648, 2061648],
        [1345012,  612782,       0,  ..., 2061648, 2061648, 2061648],
        [ 580417, 1464334,  875782,  ..., 2061648, 2061648, 2061648],
        ...,
        [ 138431, 1162813,   69453,  ..., 2061648, 2061648, 2061648],
        [ 378806, 1045391,  664294,  ..., 2061648, 2061648, 2061648],
        [ 337284, 1613688,       0,  ..., 2061648, 2061648, 2061648]]), 'negative': tensor([[ 231902,  632335, 1842316,  ..., 2061648, 2061648, 2061648],
        [1772813,  545400

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import lightning.pytorch as L
import torch.optim as optim

class Classifier(L.LightningModule):
    def __init__(
        self,
        input_dim,
        hidden_dim,
        embedding_dim,
        embedding_size,
        dropout,
        lr,
        batch_size
    ):
        super().__init__()
        
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.embedding_size = embedding_size
        self.dropout = dropout
        self.lr = lr
        self.batch_size = batch_size
        
        self.save_hyperparameters()
        
        # modules
        self.embedding = nn.Embedding(
            self.embedding_size,
            self.embedding_dim
        )
        
        self.mlp = nn.Sequential(
            nn.Linear(self.embedding_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dim, hidden_dim // 2),
            nn.Tanh()
        )
        
        # criterion 
        self.criterion = nn.TripletMarginLoss()
        
    def forward(self, prompt, positive, negative):
        prompt = self.embedding(prompt)
        positive = self.embedding(positive)
        negative = self.embedding(negative)
        
        prompt = self.mlp(prompt)
        positive = self.mlp(positive)
        negative = self.mlp(negative)
        
        return prompt, positive, negative
    

    


# model = Classifier(2048, 256, 512, len(word_to_idx) + 2, 0.1, 1e-3, bs)
# with torch.no_grad():
#     for batch in train_loader:
#         out = model(**batch)
#         p, a, b = out
        
#         loss = F.triplet_margin_loss(p, a, b)
#         print(loss)
        
#         break

tensor(1.0841)
