### 0. Imports

In [None]:
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
import emoji
import torch
import re
from transformers import BertForSequenceClassification, BertConfig,BertTokenizer, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
import random
from scipy.special import softmax

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

### 1. Dataset Load

In [None]:
# Single place to change the competition input folder (e.g. when running
# outside Kaggle); the three CSVs below are read relative to it.
DATA_DIR = "/kaggle/input/llm-classification-finetuning"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
submission_template = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

### 2. EDA

In [None]:
#train_df['model_a'].value_counts()

In [None]:
#train_df['model_b'].value_counts()

In [None]:
#train_df.iloc[:,[6,7,8]].sum(axis=0)

In [None]:
# train_df = train_df.iloc[:10]
# test_df = test_df.iloc[:1000]

### 3. Preprocessing

In [None]:
# Cache of stopword sets keyed by language, filled lazily on first use so the
# NLTK corpus is read once instead of once per cleaned string.
_STOPWORDS_CACHE = {}


def clean_text(text):
    """Normalise a raw text field before tokenization.

    Steps, in order: lower-case, strip HTML tags, strip URLs, strip ASCII
    punctuation, drop English stopwords, and replace emojis with their
    textual names.

    Parameters
    ----------
    text : str or None or float
        Raw text. ``None`` and NaN (missing values) are treated as empty
        text; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The cleaned text; may be the empty string.
    """
    # Missing values would otherwise crash on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    #1. case folding
    text = text.lower()
    #2. remove html tags
    text = re.sub(r"<.*?>", "", text)
    #3. remove URLs
    text = re.sub(r"https?:\/\/\S+|www\.\S+", "", text)
    #4. remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    #5. remove stopwords (set membership is O(1); the list lookup was O(n) per word)
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    stopword_set = _STOPWORDS_CACHE['english']
    text = " ".join(word for word in text.split() if word not in stopword_set)
    #6. handle emojis
    text = emoji.demojize(text)

    return text

In [None]:
def replace_emptystring(text):
    """Return the placeholder "NA" for an empty string, otherwise ``text`` unchanged."""
    return "NA" if text == '' else text

In [None]:
#One hot encoded
#train_df['label'] = [[train_df['winner_model_a'][i], train_df['winner_model_b'][i], train_df['winner_tie'][i]] for i in range(0, len(train_df))]

In [None]:
#Class indices: 0 = model A wins, 1 = model B wins, 2 = everything else (tie)
# Vectorised so it does not depend on the frame having a 0..n-1 index
# (label-based `[i]` lookups break after filtering/shuffling rows).
train_df['label'] = np.where(
    train_df['winner_model_a'] == 1,
    0,
    np.where(train_df['winner_model_b'] == 1, 1, 2),
)

In [None]:
# Run the text cleaner over every text column of both splits
# (train first, then test; prompt, response_a, response_b within each).
for frame in (train_df, test_df):
    for column in ('prompt', 'response_a', 'response_b'):
        frame[column] = frame[column].apply(clean_text)

In [None]:
# Cleaning can leave a field empty; swap those for the "NA" placeholder
# in every text column of both splits.
for frame in (train_df, test_df):
    for column in ('prompt', 'response_a', 'response_b'):
        frame[column] = frame[column].apply(replace_emptystring)

In [None]:
# Sanity check: count how many training prompts are still empty strings.
train_df['prompt'].eq('').value_counts()

### 4. LLM Finetune
#### 4.1 Tokenizer

In [None]:
# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

#### 4.2 Configure maxlength of encoder
Setting the maximum length too high wastes memory, while setting it too low cuts off important context.

In [None]:
# maxlength_encoded_prompt = 0
# maxlength_encoded_response1 = 0
# maxlength_encoded_response2 = 0
# for prompt, response1, response2 in zip(train_df['prompt'], train_df['response_a'] ,train_df['response_b']):
#     encoded_prompt = tokenizer.encode(prompt, add_special_tokens=True)
#     encoded_response1 = tokenizer.encode(response1, add_special_tokens=True)
#     encoded_response2 = tokenizer.encode(response2, add_special_tokens=True)
#     maxlength_encoded_prompt = max(maxlength_encoded_prompt, len(encoded_prompt))
#     maxlength_encoded_response1 = max(maxlength_encoded_response1, len(encoded_response1))
#     maxlength_encoded_response2 = max(maxlength_encoded_response2, len(encoded_response2))


# print(maxlength_encoded_prompt)
# print(maxlength_encoded_response1)
# print(maxlength_encoded_response2)

Maximum vs. average encoded length (in BERT subtokens): <br>
4477, 53 - prompt <br>
8552, 181 - response1 <br>
9657, 183 - response2 <br>
Since BERT accepts at most 512 subtokens per input, we can: <br>
1. take the first 512 subtokens <br>
2. take the middle 512 subtokens of context <br>
3. split the text into chunks of 512 subtokens, classify each chunk and recombine the results <br>
4. maybe use the median length as the max_length <br>
https://stackoverflow.com/questions/58636587/how-can-i-use-bert-for-long-text-classification


In [None]:
# avglength_encoded_prompt = 0
# avglength_encoded_response1 = 0
# avglength_encoded_response2 = 0
# for prompt, response1, response2 in zip(train_df['prompt'], train_df['response_a'] ,train_df['response_b']):
#     encoded_prompt = tokenizer.encode(prompt, add_special_tokens=True)
#     encoded_response1 = tokenizer.encode(response1, add_special_tokens=True)
#     encoded_response2 = tokenizer.encode(response2, add_special_tokens=True)
#     avglength_encoded_prompt += len(encoded_prompt)
#     avglength_encoded_response1 += len(encoded_response1)
#     avglength_encoded_response2 += len(encoded_response2)

# print(avglength_encoded_prompt / len(train_df))
# print(avglength_encoded_response1 / len(train_df))
# print(avglength_encoded_response2 / len(train_df))

In [None]:
# print(avglength_encoded_prompt / len(train_df))
# print(avglength_encoded_response1 / len(train_df))
# print(avglength_encoded_response2 / len(train_df))

#### 4.3 Encoding train dataset

In [None]:
# Build one fixed-length BERT input per training row with the layout
#   [CLS] prompt(100) [SEP] response_a(204) [SEP] response_b(204) [SEP]
# i.e. 1 + 100 + 1 + 204 + 1 + 204 + 1 = 512 positions.
# Each segment is truncated/padded to its own budget, so padding can sit
# between segments; the attention mask marks those positions with 0.
input_encodings = []
attention_masks = []

# Special-token ids come from the tokenizer instead of being hard-coded.
cls_id = torch.tensor([tokenizer.cls_token_id])
sep_id = torch.tensor([tokenizer.sep_token_id])
attend = torch.tensor([1])  # special tokens are always attended to

for prompt, response1, response2 in zip(train_df['prompt'], train_df['response_a'], train_df['response_b']):
    id_parts = [cls_id]
    mask_parts = [attend]
    # The third segment must be response2 (the earlier version tokenized
    # the prompt again, so the model never saw response B).
    for segment_text, segment_length in ((prompt, 100), (response1, 204), (response2, 204)):
        encoded = tokenizer(
            segment_text,
            add_special_tokens=False,
            max_length=segment_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        id_parts += [encoded['input_ids'][0], sep_id]
        mask_parts += [encoded['attention_mask'][0], attend]

    input_encodings.append(torch.cat(id_parts, dim=0))
    attention_masks.append(torch.cat(mask_parts, dim=0))

# Labels are converted positionally (to_numpy) so a non-default DataFrame
# index cannot interfere, and are int64 as the classification loss expects.
dataset = TensorDataset(
    torch.stack(input_encodings),
    torch.stack(attention_masks),
    torch.tensor(train_df['label'].to_numpy(), dtype=torch.long))

In [None]:
# Encode the test rows with the same layout used for training:
#   [CLS] prompt(100) [SEP] response_a(204) [SEP] response_b(204) [SEP]
# i.e. 1 + 100 + 1 + 204 + 1 + 204 + 1 = 512 positions.
test_input_encodings = []
test_attention_masks = []

# Special-token ids come from the tokenizer instead of being hard-coded.
test_cls_id = torch.tensor([tokenizer.cls_token_id])
test_sep_id = torch.tensor([tokenizer.sep_token_id])
test_attend = torch.tensor([1])  # special tokens are always attended to

for prompt, response1, response2 in zip(test_df['prompt'], test_df['response_a'], test_df['response_b']):
    id_parts = [test_cls_id]
    mask_parts = [test_attend]
    # The third segment must be response2 (the earlier version tokenized
    # the prompt again, so response B never reached the model).
    for segment_text, segment_length in ((prompt, 100), (response1, 204), (response2, 204)):
        encoded = tokenizer(
            segment_text,
            add_special_tokens=False,
            max_length=segment_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        id_parts += [encoded['input_ids'][0], test_sep_id]
        mask_parts += [encoded['attention_mask'][0], test_attend]

    test_input_encodings.append(torch.cat(id_parts, dim=0))
    test_attention_masks.append(torch.cat(mask_parts, dim=0))

# The row id travels with each example so predictions can be matched back
# to rows; converted positionally so the DataFrame index is irrelevant.
test_dataset = TensorDataset(
    torch.stack(test_input_encodings),
    torch.stack(test_attention_masks),
    torch.tensor(test_df['id'].to_numpy()))

#### 4.4 Train-Val Split

In [None]:
# 80/20 train/validation split. A dedicated, fixed-seed generator makes the
# split reproducible regardless of when the global seeds are set.
n_train = int(0.8 * len(dataset))
n_val = len(dataset) - n_train
train_dataset, val_dataset = random_split(
    dataset,
    [n_train, n_val],
    generator=torch.Generator().manual_seed(42),
)

#### 4.5 Dataloader

In [None]:
# Only the training split is shuffled; validation and test are read in
# dataset order. All three loaders use batches of 32.
train_dataloader = DataLoader(train_dataset, batch_size=32, sampler=RandomSampler(train_dataset))

val_dataloader = DataLoader(val_dataset, batch_size=32, sampler=SequentialSampler(val_dataset))

test_dataloader = DataLoader(test_dataset, batch_size=32, sampler=SequentialSampler(test_dataset))

#### 4.6 Load Bert

In [None]:
# Pretrained BERT encoder with a 3-way classification head
# (the three classes used for `label`), moved to the selected device.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
).to(device)

In [None]:
# Adam's default learning rate (1e-3) is far too large for fine-tuning a
# pretrained transformer and tends to destroy the pretrained weights.
# AdamW with a small learning rate is the usual BERT fine-tuning setup.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

#### 4.6 Finetuning Bert - Training loop

In [None]:
# Seed every random number generator in use (Python, NumPy, PyTorch CPU
# and all CUDA devices) with the same value.
seed_val = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
def get_accuracy(predictions, ground_truths):
    """Fraction of rows whose highest-scoring class equals the true class.

    predictions   -- 2-D array of per-class scores, one row per example
    ground_truths -- 1-D array of true class indices, same length
    """
    predicted_classes = np.asarray(predictions).argmax(axis=1)
    hits = (predicted_classes == ground_truths)
    return hits.sum() / len(ground_truths)

In [None]:
# Fine-tune for 2 epochs. After each epoch the model is evaluated on the
# validation split and the whole model object is saved to 'bert-model'
# whenever validation accuracy improves.
best_eval_accuracy = 0.0
for epoch in range(0, 2):
    #train
    model.train()
    train_loss = 0.0
    for batch in train_dataloader:
        batch_input = batch[0].to(device)
        batch_attentionmask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        output = model(batch_input,
                       token_type_ids = None,
                       attention_mask = batch_attentionmask,
                       labels = batch_labels)

        loss = output.loss
        train_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm to 1.0 to avoid exploding updates.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    print(f"Epoch : {epoch} | Training loss : {train_loss / len(train_dataloader)}")

    #val
    model.eval()
    # Accumulate per-sample totals so a short final batch is not
    # over-weighted (a mean of per-batch means is biased in that case).
    val_loss_sum = 0.0
    val_correct = 0
    val_seen = 0
    for batch in val_dataloader:
        batch_input = batch[0].to(device)
        batch_attentionmask = batch[1].to(device)
        batch_labels = batch[2].to(device)
        with torch.no_grad():
            output = model(batch_input,
                       token_type_ids = None,
                       attention_mask = batch_attentionmask,
                       labels = batch_labels)
        batch_size = batch_labels.size(0)
        # output.loss is the batch mean, so scale it back to a batch sum.
        val_loss_sum += output.loss.item() * batch_size
        predicted = output.logits.argmax(dim=1)
        val_correct += (predicted == batch_labels).sum().item()
        val_seen += batch_size
    val_loss = val_loss_sum / val_seen
    val_accuracy = val_correct / val_seen
    print(f"Val loss : {val_loss} | Val accuracy : {val_accuracy}")
    if (val_accuracy > best_eval_accuracy):
        torch.save(model, 'bert-model')
        best_eval_accuracy = val_accuracy

In [None]:
# Reload the checkpoint written to 'bert-model' by the training loop, which
# saved the whole model object (not just a state dict).
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects; only load checkpoint files you created yourself.
model = torch.load('bert-model', weights_only=False)

#### 4.7 Get predictions on test set

In [None]:
# Run the model over the test set and build `predictions`, an (n_rows, 4)
# float64 array whose columns are: id, P(class 0), P(class 1), P(class 2).
model.eval()
id_chunks = []
prob_chunks = []
for batch in test_dataloader:
    batch_input = batch[0].to(device)
    batch_attentionmask = batch[1].to(device)
    with torch.no_grad():
        output = model(batch_input,
                    token_type_ids = None,
                    attention_mask = batch_attentionmask)
    logits = output.logits.detach().cpu().numpy()
    # Row-wise softmax turns logits into class probabilities.
    prob_chunks.append(softmax(logits, axis=1).astype(np.float64))
    # Ids are kept in float64, which represents integers exactly up to 2**53;
    # mixing them into a float32 tensor would corrupt ids above 2**24.
    id_chunks.append(batch[2].numpy().astype(np.float64).reshape(-1, 1))

# Batches are collected in lists and concatenated once. The earlier
# `predictions == None` check on an ndarray raised from the second batch on.
if prob_chunks:
    predictions = np.concatenate(
        [np.concatenate(id_chunks, axis=0), np.concatenate(prob_chunks, axis=0)],
        axis=1)
else:
    predictions = np.empty((0, 4), dtype=np.float64)

In [None]:
# Assemble the submission file. `predictions` stores ids as floats, so they
# are rounded and cast back to integers; otherwise the CSV would contain
# ids such as "123.0" instead of "123".
submission = pd.DataFrame({
    'id' : np.rint(predictions[:,0]).astype(np.int64),
    'winner_model_a' : predictions[:,1],
    'winner_model_b' : predictions[:,2],
    'winner_tie' : predictions[:,3]
})
submission.to_csv('submission.csv', index=False)
submission.head()
submission.head()