### 0. Imports

In [1]:
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
import emoji
import torch
import re
from transformers import BertForSequenceClassification, BertConfig,BertTokenizer, get_linear_schedule_with_warmup, BertModel, AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
import random
import torch.nn as nn
from scipy.special import softmax

2025-07-05 06:14:14.627685: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751696054.811666      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751696054.865865      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

### 1. Dataset Load

In [3]:
train_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")
test_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")
submission_template = pd.read_csv("/kaggle/input/llm-classification-finetuning/sample_submission.csv")

### 2. EDA

Check Exp1 notebook for data analysis

### 3. Preprocessing

In [4]:
def clean_text(text):
    #1. case folding
    text = text.lower()
    #handling non-words
    #text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) 
    #2. remove html tags
    text = re.sub(r"<.*?>", "", text)
    #3. remove URLs
    text = re.sub(r"https?:\/\/\S+|www\.\S+", "", text)
    #4. remove punctutation
    text = text.translate(str.maketrans('', '', string.punctuation))
    #5. remove stopwords
    stopword_list = stopwords.words('english')
    text = [word for word in text.split() if word not in stopword_list]
    text = " ".join(text)
    #6. handle emojis
    text = emoji.demojize(text)

    return text

In [5]:
def replace_emptystring(text):
    if (text == ''):
        return "NA"
    return text

In [6]:
#One hot encoded
#train_df['label'] = [[train_df['winner_model_a'][i], train_df['winner_model_b'][i], train_df['winner_tie'][i]] for i in range(0, len(train_df))]

In [7]:
#Class indices
train_df['label'] = [0 if train_df['winner_model_a'][i]==1 else(1 if train_df['winner_model_b'][i]==1 else 2) for i in range(0, len(train_df))]

In [8]:
train_df['prompt'] = train_df['prompt'].apply(lambda x: clean_text(x))
train_df['response_a'] = train_df['response_a'].apply(lambda x: clean_text(x))
train_df['response_b'] = train_df['response_b'].apply(lambda x: clean_text(x))

test_df['prompt'] = test_df['prompt'].apply(lambda x: clean_text(x))
test_df['response_a'] = test_df['response_a'].apply(lambda x: clean_text(x))
test_df['response_b'] = test_df['response_b'].apply(lambda x: clean_text(x))

In [9]:
train_df['prompt'] = train_df['prompt'].apply(lambda x: replace_emptystring(x))
train_df['response_a'] = train_df['response_a'].apply(lambda x: replace_emptystring(x))
train_df['response_b'] = train_df['response_b'].apply(lambda x: replace_emptystring(x))

test_df['prompt'] = test_df['prompt'].apply(lambda x: replace_emptystring(x))
test_df['response_a'] = test_df['response_a'].apply(lambda x: replace_emptystring(x))
test_df['response_b'] = test_df['response_b'].apply(lambda x: replace_emptystring(x))

### 4. LLM Finetune
#### 4.1 Tokenizer

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

#### 4.2 Encoding inputs

I create separate embeddings for Prompt-response1 and Prompt-response2 and then concatenate them. This is based on the fact that response1 and response2 only depend on given prompt, not on each other. A classification layer is added on top and I train the classification layer specifically, using bert just as an embedding layer.
#### Difference from exp2 : </br>
1. less batch_size due to memory constraints </br>
2. linear scheduler for learning rate

#### 4.3 Encoding train dataset

In [11]:
input_encodings  = [[], []]
attention_masks = [[], []]

for prompt, response1, response2 in zip(train_df['prompt'], train_df['response_a'], train_df['response_b']):
    prompt_response1_encoded_dict = tokenizer(
        prompt,
        response1,
        add_special_tokens = True,
        max_length = 512,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt',
        truncation=True
    )

    prompt_response2_encoded_dict = tokenizer(
        prompt,
        response2,
        add_special_tokens = True,
        max_length = 512,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt',
        truncation=True
    )
    input_encodings[0].append(prompt_response1_encoded_dict['input_ids'])
    attention_masks[0].append(prompt_response1_encoded_dict['attention_mask'])
    input_encodings[1].append(prompt_response2_encoded_dict['input_ids'])
    attention_masks[1].append(prompt_response2_encoded_dict['attention_mask'])
    
dataset = TensorDataset(
    torch.cat(input_encodings[0], dim=0), 
    torch.cat(attention_masks[0], dim=0),
    torch.cat(input_encodings[1], dim=0), 
    torch.cat(attention_masks[1], dim=0),
    torch.tensor(train_df['label']))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [12]:
test_input_encodings  = [[], []]
test_attention_masks = [[], []]

for prompt, response1, response2 in zip(test_df['prompt'], test_df['response_a'], test_df['response_b']):
    test_prompt_response1_encoded_dict = tokenizer(
        prompt,
        response1,
        add_special_tokens = True,
        max_length = 512,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt',
        truncation=True
    )

    test_prompt_response2_encoded_dict = tokenizer(
        prompt,
        response2,
        add_special_tokens = True,
        max_length = 512,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt',
        truncation=True
    )
    test_input_encodings[0].append(test_prompt_response1_encoded_dict['input_ids'])
    test_attention_masks[0].append(test_prompt_response1_encoded_dict['attention_mask'])
    test_input_encodings[1].append(test_prompt_response2_encoded_dict['input_ids'])
    test_attention_masks[1].append(test_prompt_response2_encoded_dict['attention_mask'])
    
test_dataset = TensorDataset(
    torch.cat(test_input_encodings[0], dim=0), 
    torch.cat(test_attention_masks[0], dim=0),
    torch.cat(test_input_encodings[1], dim=0), 
    torch.cat(test_attention_masks[1], dim=0),
    torch.tensor(test_df['id']))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


#### 4.4 Train-Val Split

In [13]:
train_dataset, val_dataset = random_split(dataset, [int(0.8*len(dataset)), len(dataset)-int(0.8*len(dataset))])

#### 4.5 Dataloader

In [14]:
train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = 16
)

val_dataloader = DataLoader(
    val_dataset,
    sampler = SequentialSampler(val_dataset),
    batch_size = 16
)

test_dataloader = DataLoader(
    test_dataset,
    sampler = SequentialSampler(test_dataset),
    batch_size = 16
)

#### 4.6 Create bert classifier

In [15]:
class BertClassifier(nn.Module):
    def __init__(self, num_labels=3, pretrained_model_name='bert-base-uncased'):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.classifier_head = nn.Linear(self.bert.config.hidden_size*2, num_labels)
        for param in self.bert.parameters():
            param.requires_grad = False

    def forward(self, input1, attentionmask1, input2, attentionmask2):
        output1 = self.bert(
            input1, 
            token_type_ids = None, 
            attention_mask = attentionmask1
        )
        embedding1 = output1.last_hidden_state[:,0,:]
        
        output2 = self.bert(
            input2, 
            token_type_ids = None, 
            attention_mask = attentionmask2
        )
        embedding2 = output2.last_hidden_state[:,0,:]

        final_embedding = torch.cat((embedding1, embedding2), dim=1)

        out = self.classifier_head(final_embedding)
        #out = self.softmax(out) nn.CrossEntropyLoss() expects logits not probabilities!!!
        
        return out

In [16]:
model = BertClassifier()
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [17]:
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5, eps=1e-8)
criterion = nn.CrossEntropyLoss() # expects logits instead of prediction probabilites & class indices instead of one hot encoded targets!!!
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*3) #3=#epochs

#### 4.6 Finetuning Bert - Training loop

In [18]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [19]:
def get_accuracy(predictions, ground_truths):
    pred = np.argmax(predictions, axis=1)
    return np.sum(pred==ground_truths) / len(ground_truths)

In [20]:
best_eval_accuracy = 0.0
for epoch in range(0, 3):
    #train
    model.train()
    train_loss = 0.0
    for (step, batch) in enumerate(train_dataloader):
        batch_input1 = batch[0].to(device)
        batch_attentionmask1 = batch[1].to(device)
        batch_input2 = batch[2].to(device)
        batch_attentionmask2 = batch[3].to(device)
        batch_labels = batch[4].to(device)

        optimizer.zero_grad()
        output = model(batch_input1, 
                       batch_attentionmask1,
                       batch_input2,
                       batch_attentionmask2)
        
        loss = criterion(output, batch_labels)
        train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    print(f"Epoch : {epoch} | Training loss : {train_loss / len(train_dataloader)}")

    #val
    model.eval()
    val_loss = 0.0
    val_accuracy = 0.0
    for (step, batch) in enumerate(val_dataloader):
        batch_input1 = batch[0].to(device)
        batch_attentionmask1 = batch[1].to(device)
        batch_input2 = batch[2].to(device)
        batch_attentionmask2 = batch[3].to(device)
        batch_labels = batch[4].to(device)
        
        with torch.no_grad():
            output = model(batch_input1, 
                       batch_attentionmask1,
                       batch_input2,
                       batch_attentionmask2)
        
        loss = criterion(output, batch_labels)
        val_loss += loss.item()
        logits = output.detach().cpu().numpy()
        ground_truths = batch_labels.to('cpu').numpy()
        val_accuracy += get_accuracy(logits, ground_truths)
    print(f"Val loss : {val_loss / len(val_dataloader)} | Val accuracy : {val_accuracy / len(val_dataloader)}")
    if ((val_accuracy / len(val_dataloader)) > best_eval_accuracy):
        torch.save(model, 'bert-model')
        best_eval_accuracy = val_accuracy / len(val_dataloader)

Epoch : 0 | Training loss : 1.0871060572320916
Val loss : 1.0750037067788698 | Val accuracy : 0.4260257301808067
Epoch : 1 | Training loss : 1.0750144589627544
Val loss : 1.0681439211206079 | Val accuracy : 0.4400208623087622
Epoch : 2 | Training loss : 1.071449458785744
Val loss : 1.0669063422543945 | Val accuracy : 0.4416724617524339


In [21]:
model = torch.load('bert-model', weights_only=False)

#### 4.7 Get predictions on test set

In [22]:
model.eval()
predictions = None
for (step, batch) in enumerate(test_dataloader):
    batch_input1 = batch[0].to(device)
    batch_attentionmask1 = batch[1].to(device)
    batch_input2 = batch[2].to(device)
    batch_attentionmask2 = batch[3].to(device)
    batch_id = batch[4].reshape(-1,1)
    
    with torch.no_grad():
        output = model(batch_input1, 
                    batch_attentionmask1,
                    batch_input2,
                    batch_attentionmask2)
    logits = output.detach().cpu().numpy()
    pred = torch.tensor(softmax(logits, axis=1))
    pred = torch.cat([batch_id, pred], dim=1).numpy()
    if (predictions==None):
        predictions = pred
    else:
        predictions = np.concatenate((predictions, pred), axis=0)

In [23]:
submission = pd.DataFrame({
    'id' : predictions[:,0],
    'winner_model_a' : predictions[:,1],
    'winner_model_b' : predictions[:,2],
    'winner_tie' : predictions[:,3]
})
submission.to_csv('submission.csv', index=False)