### 0. Imports

In [1]:
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
import emoji
import torch
import re
from transformers import BertForSequenceClassification, BertConfig,BertTokenizer, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
import random
from scipy.special import softmax

2025-07-04 17:52:25.257201: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751651545.444974      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751651545.497375      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

### 1. Dataset Load

In [3]:
train_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")
test_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")
submission_template = pd.read_csv("/kaggle/input/llm-classification-finetuning/sample_submission.csv")

### 2. Univariate EDA

In [4]:
train_df['model_a'].value_counts()

model_a
gpt-4-1106-preview          3678
gpt-3.5-turbo-0613          3553
gpt-4-0613                  3099
claude-2.1                  2859
gpt-4-0314                  2087
                            ... 
falcon-180b-chat             145
openchat-3.5-0106            108
qwen1.5-7b-chat              106
qwen1.5-4b-chat              100
mistral-7b-instruct-v0.2      54
Name: count, Length: 64, dtype: int64

In [5]:
train_df['model_b'].value_counts()

model_b
gpt-4-1106-preview          3709
gpt-3.5-turbo-0613          3530
gpt-4-0613                  3066
claude-2.1                  2724
claude-instant-1            2051
                            ... 
falcon-180b-chat             141
openchat-3.5-0106            136
qwen1.5-7b-chat              102
qwen1.5-4b-chat              100
mistral-7b-instruct-v0.2      46
Name: count, Length: 64, dtype: int64

In [6]:
train_df.iloc[:,[6,7,8]].sum(axis=0)

winner_model_a    20064
winner_model_b    19652
winner_tie        17761
dtype: int64

### 3. Preprocessing + Encoding labels

In [7]:
def clean_text(text):
    #1. case folding
    text = text.lower()
    #2. remove html tags
    text = re.sub(r"<.*?>", "", text)
    #3. remove URLs
    text = re.sub(r"https?:\/\/\S+|www\.\S+", "", text)
    #4. remove punctutation
    text = text.translate(str.maketrans('', '', string.punctuation))
    #5. remove stopwords
    stopword_list = stopwords.words('english')
    text = [word for word in text.split() if word not in stopword_list]
    text = " ".join(text)
    #6. handle emojis
    text = emoji.demojize(text)

    return text

In [8]:
def replace_emptystring(text):
    if (text == ''):
        return "NA"
    return text

In [9]:
#One hot encoded
#train_df['label'] = [[train_df['winner_model_a'][i], train_df['winner_model_b'][i], train_df['winner_tie'][i]] for i in range(0, len(train_df))]

In [10]:
#Class indices
train_df['label'] = [0 if train_df['winner_model_a'][i]==1 else(1 if train_df['winner_model_b'][i]==1 else 2) for i in range(0, len(train_df))]

In [11]:
train_df['prompt'] = train_df['prompt'].apply(lambda x: clean_text(x))
train_df['response_a'] = train_df['response_a'].apply(lambda x: clean_text(x))
train_df['response_b'] = train_df['response_b'].apply(lambda x: clean_text(x))

test_df['prompt'] = test_df['prompt'].apply(lambda x: clean_text(x))
test_df['response_a'] = test_df['response_a'].apply(lambda x: clean_text(x))
test_df['response_b'] = test_df['response_b'].apply(lambda x: clean_text(x))

In [12]:
train_df['prompt'] = train_df['prompt'].apply(lambda x: replace_emptystring(x))
train_df['response_a'] = train_df['response_a'].apply(lambda x: replace_emptystring(x))
train_df['response_b'] = train_df['response_b'].apply(lambda x: replace_emptystring(x))

test_df['prompt'] = test_df['prompt'].apply(lambda x: replace_emptystring(x))
test_df['response_a'] = test_df['response_a'].apply(lambda x: replace_emptystring(x))
test_df['response_b'] = test_df['response_b'].apply(lambda x: replace_emptystring(x))

In [13]:
(train_df['prompt']=='').value_counts()

prompt
False    57477
Name: count, dtype: int64

### 4. LLM Finetuning
#### 4.1 Load Tokenizer

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

#### 4.2 Configure maxlength of encoder
Setting maxlength of encoder too high wastes memory, while setting it too low cuts off important context

In [15]:
#MAX LENGTH OF EACH INPUT
maxlength_encoded_prompt = 0
maxlength_encoded_response1 = 0
maxlength_encoded_response2 = 0
for prompt, response1, response2 in zip(train_df['prompt'], train_df['response_a'] ,train_df['response_b']):
    encoded_prompt = tokenizer.encode(prompt, add_special_tokens=True)
    encoded_response1 = tokenizer.encode(response1, add_special_tokens=True)
    encoded_response2 = tokenizer.encode(response2, add_special_tokens=True)
    maxlength_encoded_prompt = max(maxlength_encoded_prompt, len(encoded_prompt))
    maxlength_encoded_response1 = max(maxlength_encoded_response1, len(encoded_response1))
    maxlength_encoded_response2 = max(maxlength_encoded_response2, len(encoded_response2))


print(maxlength_encoded_prompt)
print(maxlength_encoded_response1)
print(maxlength_encoded_response2)

Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors


4477
8552
9657


In [16]:
#AVERAGE LENGTH OF EACH INPUT
avglength_encoded_prompt = 0
avglength_encoded_response1 = 0
avglength_encoded_response2 = 0
for prompt, response1, response2 in zip(train_df['prompt'], train_df['response_a'] ,train_df['response_b']):
    encoded_prompt = tokenizer.encode(prompt, add_special_tokens=True)
    encoded_response1 = tokenizer.encode(response1, add_special_tokens=True)
    encoded_response2 = tokenizer.encode(response2, add_special_tokens=True)
    avglength_encoded_prompt += len(encoded_prompt)
    avglength_encoded_response1 += len(encoded_response1)
    avglength_encoded_response2 += len(encoded_response2)

print(avglength_encoded_prompt / len(train_df))
print(avglength_encoded_response1 / len(train_df))
print(avglength_encoded_response2 / len(train_df))

52.93219896654314
181.41056422569028
182.86933903996382


In [17]:
#MEDIAN LENGTH OF EACH INPUT
lengths_encoded_prompt = []
lengths_encoded_response1 = []
lengths_encoded_response2 = []
for prompt, response1, response2 in zip(train_df['prompt'], train_df['response_a'] ,train_df['response_b']):
    encoded_prompt = tokenizer.encode(prompt, add_special_tokens=True)
    encoded_response1 = tokenizer.encode(response1, add_special_tokens=True)
    encoded_response2 = tokenizer.encode(response2, add_special_tokens=True)
    lengths_encoded_prompt.append(len(encoded_prompt))
    lengths_encoded_response1.append(len(encoded_response1))
    lengths_encoded_response2.append(len(encoded_response2))

lengths_encoded_prompt.sort()
lengths_encoded_response1.sort()
lengths_encoded_response2.sort()

lengths_encoded_prompt = np.array(lengths_encoded_prompt)
lengths_encoded_response1 = np.array(lengths_encoded_response1)
lengths_encoded_response2 = np.array(lengths_encoded_response2)
print(lengths_encoded_prompt[len(lengths_encoded_prompt)//2])
print(lengths_encoded_response1[len(lengths_encoded_response1)//2])
print(lengths_encoded_response2[len(lengths_encoded_response2)//2])

#900 prompts exceed 512 size [out of 57K], 2700 response1s and 2731 response2s => we can ignore these cases where only prompt context fits in the max_length
print((lengths_encoded_prompt > 512).sum())
print((lengths_encoded_response1 > 512).sum())
print((lengths_encoded_response2 > 512).sum())

14
134
135
900
2700
2731


#### Conclusion:
Maximum length, average length, median length comparison for each input: </br>

| | Maximum length | Average length | Median length |
| ------ | ------ | ------ | ------ |
| Prompt | 4477 | 53 | 14 |
| Response1 | 8552 | 181 | 134 |
| Response2 | 9657 | 183 | 135 |

Since bert is limited to max #subtokens to be 512, we keep first 100 tokens reserved for prompt, followed by 204 tokens for response1 and response2 each. This is done to ensure that some context for all 3 inputs - prompt, response1 and response2 - is available to bert to take a decision and the context is not overpowered by a single input. The division is done based on average and median values of each input.


#### 4.3 Encoding train dataset

In [18]:
input_encodings  = []
attention_masks = []

for prompt, response1, response2 in zip(train_df['prompt'], train_df['response_a'], train_df['response_b']):
    prompt_encoded_dict = tokenizer(
        prompt,
        add_special_tokens = False,
        max_length = 100,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt',
        truncation=True
    )

    response1_encoded_dict = tokenizer(
        response1,
        add_special_tokens = False,
        max_length = 204,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt',
        truncation=True
    )

    response2_encoded_dict = tokenizer(
        prompt,
        add_special_tokens = False,
        max_length = 204,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt',
        truncation=True
    )

    input_id = torch.cat(
        [torch.tensor([101]), 
         prompt_encoded_dict['input_ids'][0], 
         torch.tensor([102]), 
         response1_encoded_dict['input_ids'][0], 
         torch.tensor([102]),
         response2_encoded_dict['input_ids'][0],
         torch.tensor([102])], dim=0)
    attention_mask = torch.cat(
        [torch.tensor([1]), 
         prompt_encoded_dict['attention_mask'][0], 
         torch.tensor([1]), 
         response1_encoded_dict['attention_mask'][0], 
         torch.tensor([1]),
         response2_encoded_dict['attention_mask'][0],
         torch.tensor([1])], dim=0)
    input_encodings.append(input_id)
    attention_masks.append(attention_mask)
    
dataset = TensorDataset(
    torch.stack(input_encodings), 
    torch.stack(attention_masks),
    torch.tensor(train_df['label']))

In [19]:
test_input_encodings  = []
test_attention_masks = []

for prompt, response1, response2 in zip(test_df['prompt'], test_df['response_a'], test_df['response_b']):
    prompt_encoded_dict = tokenizer(
        prompt,
        add_special_tokens = False,
        max_length = 100,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt',
        truncation=True
    )

    response1_encoded_dict = tokenizer(
        response1,
        add_special_tokens = False,
        max_length = 204,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt',
        truncation=True
    )

    response2_encoded_dict = tokenizer(
        prompt,
        add_special_tokens = False,
        max_length = 204,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt',
        truncation=True
    )

    test_input_id = torch.cat(
        [torch.tensor([101]), 
         prompt_encoded_dict['input_ids'][0], 
         torch.tensor([102]), 
         response1_encoded_dict['input_ids'][0], 
         torch.tensor([102]),
         response2_encoded_dict['input_ids'][0],
         torch.tensor([102])], dim=0)
    test_attention_mask = torch.cat(
        [torch.tensor([1]), 
         prompt_encoded_dict['attention_mask'][0], 
         torch.tensor([1]), 
         response1_encoded_dict['attention_mask'][0], 
         torch.tensor([1]),
         response2_encoded_dict['attention_mask'][0],
         torch.tensor([1])], dim=0)
    test_input_encodings.append(test_input_id)
    test_attention_masks.append(test_attention_mask)
    
test_dataset = TensorDataset(
    torch.stack(test_input_encodings), 
    torch.stack(test_attention_masks),
    torch.tensor(test_df['id']))

#### 4.4 Train-Val Split

In [20]:
train_dataset, val_dataset = random_split(dataset, [int(0.8*len(dataset)), len(dataset)-int(0.8*len(dataset))])

#### 4.5 Prepare Dataloader

In [21]:
train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = 32
)

val_dataloader = DataLoader(
    val_dataset,
    sampler = SequentialSampler(val_dataset),
    batch_size = 32
)

test_dataloader = DataLoader(
    test_dataset,
    sampler = SequentialSampler(test_dataset),
    batch_size = 32
)

#### 4.6 Define model + optimizer

In [22]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False
)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
optimizer = torch.optim.Adam(model.parameters())

#### 4.7 Training loop

In [24]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [25]:
def get_accuracy(predictions, ground_truths):
    pred = np.argmax(predictions, axis=1)
    return np.sum(pred==ground_truths) / len(ground_truths)

In [26]:
best_eval_accuracy = 0.0
for epoch in range(0, 3):
    #train
    model.train()
    train_loss = 0.0
    for (step, batch) in enumerate(train_dataloader):
        batch_input = batch[0].to(device)
        batch_attentionmask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        optimizer.zero_grad()
        output = model(batch_input, 
                       token_type_ids = None, 
                       attention_mask = batch_attentionmask, 
                       labels = batch_labels)
        
        loss = output.loss
        train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    print(f"Epoch : {epoch} | Training loss : {train_loss / len(train_dataloader)}")

    #val
    model.eval()
    val_loss = 0.0
    val_accuracy = 0.0
    for (step, batch) in enumerate(val_dataloader):
        batch_input = batch[0].to(device)
        batch_attentionmask = batch[1].to(device)
        batch_labels = batch[2].to(device)
        with torch.no_grad():
            output = model(batch_input, 
                       token_type_ids = None, 
                       attention_mask = batch_attentionmask, 
                       labels = batch_labels)
        loss = output.loss
        val_loss += loss.item()
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        ground_truths = batch_labels.to('cpu').numpy()
        val_accuracy += get_accuracy(logits, ground_truths)
    print(f"Val loss : {val_loss / len(val_dataloader)} | Val accuracy : {val_accuracy / len(val_dataloader)}")
    if ((val_accuracy / len(val_dataloader)) > best_eval_accuracy):
        torch.save(model, 'bert-model')
        best_eval_accuracy = val_accuracy / len(val_dataloader)

Epoch : 0 | Training loss : 1.1092377686218495
Val loss : 1.1156544884045918 | Val accuracy : 0.3451388888888889
Epoch : 1 | Training loss : 1.0998592019993643
Val loss : 1.0984774175617429 | Val accuracy : 0.35260416666666666
Epoch : 2 | Training loss : 1.0988804048851455
Val loss : 1.0969997240437401 | Val accuracy : 0.35260416666666666


In [27]:
model = torch.load('bert-model', weights_only=False)

#### 4.7 Get predictions on test set

In [28]:
model.eval()
predictions = None
for (step, batch) in enumerate(test_dataloader):
    batch_input = batch[0].to(device)
    batch_attentionmask = batch[1].to(device)
    batch_id = batch[2].reshape(-1,1)
    with torch.no_grad():
        output = model(batch_input, 
                    token_type_ids = None, 
                    attention_mask = batch_attentionmask)
    logits = output.logits
    logits = logits.detach().cpu().numpy()
    pred = torch.tensor(softmax(logits, axis=1))
    pred = torch.cat([batch_id, pred], dim=1).numpy()
    if (predictions==None):
        predictions = pred
    else:
        predictions = np.concatenate((predictions, pred), axis=0)

In [29]:
submission = pd.DataFrame({
    'id' : predictions[:,0],
    'winner_model_a' : predictions[:,1],
    'winner_model_b' : predictions[:,2],
    'winner_tie' : predictions[:,3]
})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060.0,0.373595,0.315166,0.311239
1,211333.0,0.373595,0.315166,0.311239
2,1233961.0,0.373595,0.315166,0.311239
