In [None]:
import pandas as pd
import json
import torch
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
import gc


# Select the compute device: CUDA when it is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# On a CUDA machine, release cached allocator blocks and print the memory report.
if torch.cuda.is_available():
    with torch.cuda.device(device):
        torch.cuda.empty_cache()
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

# Data Preprocessing

In [None]:
# Load the raw training table; later cells read its 'prompt', 'response_a',
# 'response_b' and 'winner_*' columns.
train_raw = pd.read_csv("./data/train.csv")
# Preview the first rows as the cell output.
train_raw.head()

In [None]:
# an access token is needed if we use models like Mistral
def get_access_token() -> str:
    """Read the access token from ``token.txt``.

    The file is opened relative to the current working directory and
    surrounding whitespace (including a trailing newline) is stripped.

    Returns:
        The token string.

    Raises:
        OSError: if ``token.txt`` cannot be opened (e.g. FileNotFoundError).
    """
    with open('token.txt', 'r') as f:
        # The token is a secret: do not print it, otherwise it ends up in the
        # saved notebook output.
        return f.read().strip()

In [None]:
# Decode the three JSON-encoded text columns and cast the three label columns
# to float32, keeping the original column order.
train_proc = pd.concat(
    [train_raw[col].apply(json.loads) for col in ('prompt', 'response_a', 'response_b')]
    + [train_raw[col].astype(np.float32) for col in ('winner_model_a', 'winner_model_b', 'winner_tie')],
    axis=1,
)
train_proc.head()

In [None]:
# For each text column, print the largest and the smallest len() of its cells
# (presumably the number of turns per conversation — TODO confirm).
print(train_proc['prompt'].apply(len).max(), train_proc['prompt'].apply(len).min())
print(train_proc['response_a'].apply(len).max(), train_proc['response_a'].apply(len).min())
print(train_proc['response_b'].apply(len).max(), train_proc['response_b'].apply(len).min())
# Cell output: True only if prompt, response_a and response_b have the same
# len() in every row.
all(train_proc['response_b'].apply(len)==train_proc['response_a'].apply(len)) and all(train_proc['response_b'].apply(len)==train_proc['prompt'].apply(len))

In [None]:
# Expand the three list columns together so that every list element gets its
# own row; the winner_* columns are repeated on each resulting row.
train = train_proc.explode(['prompt', 'response_a', 'response_b'])
train.head()

In [None]:
# Count missing responses on each side after the explode.
train['response_a'].isna().sum(), train['response_b'].isna().sum()

In [None]:
# Replace every missing value with an empty string, then re-check that no
# missing responses remain (both counts are the cell output).
train = train.fillna("")
(
    train['response_a'].isna().sum(),
    train['response_b'].isna().sum(),
)

In [None]:
# optimizations are possible by moving computation to GPU and using flash attention
# we just use cpu
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

# Build "<prompt><sep_token><response>" for each response side.
# Column-wise concatenation gives the same strings as a row-wise
# DataFrame.apply(axis=1) without calling a Python lambda per row.
train['combined_a'] = train['prompt'] + tokenizer.sep_token + train['response_a']
train['combined_b'] = train['prompt'] + tokenizer.sep_token + train['response_b']

In [None]:
# Keep only the combined texts and the labels, then persist the frame so the
# training section can start from the pickle file.
train = train.drop(columns=['prompt', 'response_a', 'response_b'])
train.to_pickle("./data/train_final.pkl")
# Drop the preprocessing frames and ask the garbage collector to reclaim them.
del train_raw, train_proc, train
gc.collect()

# Training

In [None]:
#pd.set_option('display.max_colwidth', None)
# Reload the preprocessed frame written at the end of the preprocessing section.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by this notebook.
train = pd.read_pickle("./data/train_final.pkl")
train.head()

In [None]:
def create_torch_dataloader(df, tokenizer, batch_size):
    """Build a DataLoader that tokenizes both prompt/response texts on access.

    Args:
        df: DataFrame with the columns 'combined_a', 'combined_b',
            'winner_model_a', 'winner_model_b' and 'winner_tie'.
        tokenizer: Hugging Face tokenizer; it is called on lists of strings.
        batch_size: Number of examples per batch.

    Returns:
        A ``torch.utils.data.DataLoader`` whose batches are dicts with the keys
        'input_ids_a', 'attention_mask_a', 'input_ids_b', 'attention_mask_b',
        'winner_a', 'winner_b' and 'tie'.
    """
    def encode(batch):
        # tokenizer.__call__ will give ids
        resultA = tokenizer(batch['combined_a'], return_tensors="pt", padding='max_length', truncation=True)
        resultB = tokenizer(batch['combined_b'], return_tensors="pt", padding='max_length', truncation=True)
        result = {
            'input_ids_a': resultA["input_ids"],
            'attention_mask_a': resultA["attention_mask"],
            'input_ids_b': resultB["input_ids"],
            'attention_mask_b': resultB["attention_mask"],
            # Emit the labels as float32 tensors: plain Python floats would be
            # turned into float64 tensors by the DataLoader's default collate.
            'winner_a': torch.tensor(batch['winner_model_a'], dtype=torch.float32),
            'winner_b': torch.tensor(batch['winner_model_b'], dtype=torch.float32),
            'tie': torch.tensor(batch['winner_tie'], dtype=torch.float32),
        }
        return result

    dataset = Dataset.from_pandas(df[['combined_a', 'combined_b', 'winner_model_a', 'winner_model_b', 'winner_tie']])
    # tokenization is only applied when examples are accessed
    dataset_tf = dataset.with_transform(encode)
    # The former `dataset_tf.with_format(type='torch')` call was removed:
    # with_format returns a new dataset and its result was discarded, so it had
    # no effect. `encode` already returns torch tensors.
    return torch.utils.data.DataLoader(dataset_tf, batch_size=batch_size)


In [None]:
# Load the tokenizer again here, so this cell does not depend on the tokenizer
# created in the preprocessing section.
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

# DataLoader over the reloaded training frame with a batch size of 2.
dl = create_torch_dataloader(train, tokenizer, 2)

In [None]:
class MyModel(nn.Module):
    """Pairwise preference classifier on top of one shared DistilBERT encoder.

    Both token sequences are encoded with the same encoder, the hidden states
    at position 0 (the CLS token) are concatenated, and a linear layer maps
    them to 3 logits. The training loop stacks its targets in the order
    (winner_a, winner_b, tie).
    """

    def __init__(self):
        super().__init__()
        # models are stored in /home/<user>/.cache/huggingface
        # find /home/<user> -type d -name huggingface
        # batch_size x sequence_length x embedding_size
        self.llm = AutoModel.from_pretrained("distilbert/distilbert-base-uncased")
        # Read the embedding width from the encoder config instead of
        # hard-coding 768.
        hidden_size = self.llm.config.hidden_size
        # Dropout regularises the pooled features. It used to sit after the
        # final Linear layer, where it randomly zeroed and rescaled the logits.
        self.dropout = nn.Dropout(0.1)
        # CLS token: batch_size x 1 x hidden_size (per sequence)
        # The Linear layer stays at index 0 of this Sequential, so its
        # state_dict keys are unchanged.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(2 * hidden_size, 3),
        )

    def forward(self, input_a, attention_a, input_b, attention_b):
        """Return logits of shape (batch_size, 3) for a batch of sequence pairs.

        Args:
            input_a, input_b: token id tensors for the two sequences.
            attention_a, attention_b: the matching attention masks.
        """
        hidden_state_a = self.llm(input_a, attention_mask=attention_a).last_hidden_state
        hidden_state_b = self.llm(input_b, attention_mask=attention_b).last_hidden_state
        # Keep only position 0 of each sequence and join both along the feature axis.
        concat_state = torch.concat((hidden_state_a[:,0,:], hidden_state_b[:,0,:]), 1)
        out = self.linear_relu_stack(self.dropout(concat_state))
        return out

In [None]:
#https://huggingface.co/distilbert/distilbert-base-uncased/discussions/11
# DistilBERT with ~66M params should use about 0.5GB, plus the FC layer
model = MyModel()

In [None]:
# memory consumption:
# number of params in model: DistilBERT with ~66M params should use about 0.5GB, plus the FC layer
# x2 for autograd nodes
# batch tensor: <batch_size>*512*768*8*4 (12MB per sample)
# + 600MiB for bootstrapping GPU use for pytorch: https://stackoverflow.com/questions/62547072/why-does-pytorch-use-so-much-gpu-memory-to-store-tensors
def training(epochs: int, dataloader, model, lr: float = 1e-3, log_every: int = 1):
    """Train `model` with AdamW and cross-entropy on soft (probability) targets.

    Args:
        epochs: Number of passes over `dataloader`.
        dataloader: Iterable of dict batches with the tensor entries
            'input_ids_a', 'attention_mask_a', 'input_ids_b',
            'attention_mask_b', 'winner_a', 'winner_b' and 'tie'.
        model: Module called as
            model(input_ids_a, attention_mask_a, input_ids_b, attention_mask_b)
            that returns logits with 3 classes.
        lr: AdamW learning rate (1e-3 is the AdamW default).
        log_every: Print the loss every `log_every` batches; 0 disables logging.

    Returns:
        A list with one float loss value per processed batch.
    """
    loss_fn = nn.CrossEntropyLoss(reduction="mean")
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    plot_pts = []
    model.to(device)
    model.train()
    # `device` is a torch.device, which never compares equal to the string
    # 'cpu'; decide by its `.type` instead.
    on_cuda = torch.device(device).type == 'cuda'
    input_keys = ('input_ids_a', 'attention_mask_a', 'input_ids_b', 'attention_mask_b')
    for i in range(epochs):
        for j, batch in enumerate(dataloader):
            # Tensor.to() returns a new tensor, so keep the moved copies.
            inputs = {key: batch[key].to(device) for key in input_keys}
            output = model(inputs['input_ids_a'], inputs['attention_mask_a'], inputs['input_ids_b'], inputs['attention_mask_b'])
            # Probability targets, class order (a, b, tie); match the dtype of
            # the logits (the default collate yields float64 for Python floats).
            target = torch.stack((batch["winner_a"], batch["winner_b"], batch["tie"]), 1).to(device=device, dtype=output.dtype)
            loss = loss_fn(output, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_value = loss.item()
            plot_pts.append(loss_value)
            if log_every and j % log_every == 0:
                print(f"epoch {i} - batch {j} - loss: {loss_value}")
            # Drop the references to this batch's tensors before the next
            # iteration to keep peak memory low.
            del batch, inputs, target, output, loss
            gc.collect()
            if on_cuda:
                with torch.cuda.device(device):
                    torch.cuda.empty_cache()
    return plot_pts

In [None]:
# Train for a single epoch; plot_pts receives one loss value per batch.
plot_pts = training(1, dl, model)

In [None]:
print(plot_pts)
# Loss curve: one point per training batch, labelled so the figure can be read
# on its own.
fig, ax = plt.subplots()
ax.plot(plot_pts)
ax.set(title="Training loss", xlabel="batch", ylabel="cross-entropy loss")
plt.show()

In [None]:
# Drop the model reference, run the garbage collector and release PyTorch's
# cached CUDA memory.
del model
gc.collect()
torch.cuda.empty_cache()