In [None]:
import os, math, numpy as np
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
# !pip install transformers datasets accelerate

!pip install accelerate --no-index --find-links=file:///kaggle/input/packages-llm-classification/accelerate
!pip install transformers --no-index --find-links=file:///kaggle/input/packages-llm-classification/transformers
!pip install datasets --no-index --find-links=file:///kaggle/input/packages-llm-classification/datasets

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import random
from transformers import AutoTokenizer
### Now, we have the datasets done, onto the actual cool stuff
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Read test.csv from the Kaggle input directory into a DataFrame.
test_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")

# splits the x and y data
def x_y_split(input_df, train_test = "test"):
    """Select the model-input columns and, for training data, the label columns.

    Args:
        input_df: DataFrame holding 'prompt', 'response_a' and 'response_b'
            (plus the three 'winner_*' columns when ``train_test == "train"``).
        train_test: "train" to also return the labels; any other value
            returns the inputs only.

    Returns:
        ``(X, Y)`` when ``train_test == "train"``, otherwise just ``X``.
    """
    feature_cols = ['prompt', 'response_a', 'response_b']
    label_cols = ['winner_model_a', 'winner_model_b', 'winner_tie']

    features = input_df[feature_cols]
    if train_test != "train":
        return features
    return (features, input_df[label_cols])

# Test data carries no labels, so only the input columns come back.
X_test = x_y_split(test_df, train_test="test")

In [None]:
### Custom Dataset class
class TestDatasetSingleBertModel(Dataset):
    """Test-time dataset packing prompt, response A and response B into one sequence.

    Every row of ``X`` is tokenized once in ``__init__``; ``__getitem__`` only
    wraps the cached id/mask lists in ``torch.long`` tensors.
    """

    def __init__(self, X, tokenizer, MAX_LENGTH=None, split_ratios=[1/3,1/3,1/3], LOGS = False):
        """Tokenize all rows of ``X`` up front.

        Args:
            X: DataFrame with 'prompt', 'response_a' and 'response_b' columns.
            tokenizer: callable returning a mapping with 'input_ids'; must also
                provide ``build_inputs_with_special_tokens``, ``pad_token_id``
                and ``model_max_length``.
            MAX_LENGTH: length of every produced sequence; defaults to
                ``tokenizer.model_max_length``.
            split_ratios: share of the token budget for the prompt (index 0)
                and response A (index 1); response B receives the remainder.
            LOGS: when true, print the share that response B ends up with.
        """
        # Keep the tokenizer on the instance so tokenization uses the object
        # passed in here rather than whatever global happens to be named
        # `tokenizer` in the notebook.
        self.tokenizer = tokenizer

        if MAX_LENGTH is None:
            MAX_LENGTH = tokenizer.model_max_length

        self.tokenized_inputs, self.attention_masks = self.tokenize_and_truncate(X, MAX_LENGTH, split_ratios, LOGS)

    def __len__(self):
        """Number of tokenized rows."""
        return len(self.tokenized_inputs)

    def __getitem__(self, idx):
        """Return the ``idx``-th row as a dict of ``torch.long`` tensors."""
        item = {
            'input_ids': torch.tensor(self.tokenized_inputs[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
        }
        return item

    def tokenize_and_truncate(self, X, MAX_LENGTH, split_ratios, LOGS = False):
        """Tokenize, truncate, concatenate and pad every row of ``X``.

        Args:
            X: DataFrame with 'prompt', 'response_a' and 'response_b' columns.
            MAX_LENGTH: target length of each output sequence.
            split_ratios: budget shares; indices 0 and 1 are used to size the
                prompt and response A, response B gets what is left.
            LOGS: when true, print the response B share once.

        Returns:
            ``(tokenized_inputs, attention_masks)``: two parallel lists with
            one list of ints per row.
        """
        tokenizer = self.tokenizer

        # Three positions are held back for special tokens; how many are
        # actually inserted is decided by build_inputs_with_special_tokens.
        total_available = MAX_LENGTH - 3

        # The per-segment budgets do not depend on the row, so compute them once.
        prompt_max = int(total_available * split_ratios[0])
        response_a_max = int(total_available * split_ratios[1])
        response_b_max = total_available - (prompt_max + response_a_max)

        if LOGS:
            # Logged once: the value is identical for every row.
            print(f"Calculated response_b_max ratio {response_b_max/MAX_LENGTH} vs. split_ratios {split_ratios[2]}")

        tokenized_inputs = []
        attention_masks = []

        for _, row in X.iterrows():
            # Tokenize each part on its own, without special tokens or padding.
            prompt_tokens = tokenizer(row['prompt'], add_special_tokens=False, truncation=True, max_length=MAX_LENGTH)['input_ids']
            response_a_tokens = tokenizer(row['response_a'], add_special_tokens=False, truncation=True, max_length=MAX_LENGTH)['input_ids']
            response_b_tokens = tokenizer(row['response_b'], add_special_tokens=False, truncation=True, max_length=MAX_LENGTH)['input_ids']

            # Cut every part down to its budget and join them.
            input_ids = (
                prompt_tokens[:prompt_max]
                + response_a_tokens[:response_a_max]
                + response_b_tokens[:response_b_max]
            )

            # Let the tokenizer wrap the joined ids with its special tokens.
            input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)

            # 1 marks real tokens, 0 marks padding.
            attention_mask = [1] * len(input_ids)

            # Right-pad up to MAX_LENGTH (no-op when already long enough).
            pad_count = MAX_LENGTH - len(input_ids)
            if pad_count > 0:
                input_ids = input_ids + [tokenizer.pad_token_id] * pad_count
                attention_mask = attention_mask + [0] * pad_count

            tokenized_inputs.append(input_ids)
            attention_masks.append(attention_mask)

        return tokenized_inputs, attention_masks

In [None]:
import torch
import re
### Custom Dataset class
class TestDatasetSingleBertModelV2(Dataset):
    """Lazily tokenized test set: [CLS] prompt [SEP] response A [SEP] response B.

    Text is cleaned once in ``__init__``; tokenization, truncation and padding
    happen per item in ``__getitem__``.
    """

    def __init__(self, X, tokenizer, MAX_LENGTH=None, split_ratios=[1/3,1/3,1/3], LOGS=False):
        """Store the settings and clean every cell of ``X``.

        Args:
            X: DataFrame with 'prompt', 'response_a' and 'response_b' columns.
            tokenizer: callable returning a mapping with 'input_ids'; must
                expose ``cls_token_id``, ``sep_token_id`` and
                ``model_max_length`` (``pad_token_id`` is used when present).
            MAX_LENGTH: length of every produced sequence; defaults to
                ``tokenizer.model_max_length``.
            split_ratios: indices 1 and 2 size response A and response B as a
                share of ``MAX_LENGTH``; the prompt gets the remainder.
            LOGS: when true, print the segment lengths on every item access.
        """
        self.split_ratios = split_ratios
        self.tokenizer = tokenizer

        self.LOGS = LOGS

        self.cls_token = [tokenizer.cls_token_id]  # [CLS]
        self.sep_token = [tokenizer.sep_token_id]  # [SEP]

        # Pad with the tokenizer's own pad id; fall back to 0 when it has none.
        pad_id = getattr(tokenizer, "pad_token_id", None)
        self.pad_token = [0 if pad_id is None else pad_id]

        # Always set MAX_LENGTH, whether it was passed in or defaulted.
        self.MAX_LENGTH = tokenizer.model_max_length if MAX_LENGTH is None else MAX_LENGTH

        # DataFrame.map only exists on newer pandas; older releases call the
        # same element-wise operation applymap.
        elementwise = getattr(X, "map", None) or X.applymap
        self.X_processed = elementwise(self.process)

    def process(self, text):
        """Clean one cell of text before tokenization.

        Missing values (None / NaN) become an empty string and other
        non-string values are converted with ``str``.
        """
        if not isinstance(text, str):
            is_missing = text is None or (isinstance(text, float) and text != text)
            text = "" if is_missing else str(text)

        text = text.encode('utf-8').decode()
        text = text.replace("\\n", " ")  # Replace literal backslash-n artifacts
        text = re.sub(r"\s+", " ", text).strip()  # Normalize extra spaces
        text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)  # Remove markdown bold formatting (**text** → text)
        text = text.replace("[", "").replace("]", "")  # Remove square brackets

        return text

    def __len__(self):
        """Number of rows."""
        return len(self.X_processed)

    def __getitem__(self, idx):
        """Build the padded id/mask tensors for row ``idx``.

        Returns:
            dict with 'input_ids' and 'attention_mask', both ``torch.long``
            tensors of length ``MAX_LENGTH``.
        """
        row = self.X_processed.iloc[idx]

        response_b_length = round(self.split_ratios[2]*self.MAX_LENGTH)
        response_a_length = round(self.split_ratios[1]*self.MAX_LENGTH)

        prompt_length = self.MAX_LENGTH - response_b_length - response_a_length

        if self.LOGS:
            print(f"prompt_length: {prompt_length}, response_a_length: {response_a_length}, response_b_length: {response_b_length}")

        prompt_tokens = self.tokenizer(row['prompt'], add_special_tokens=False, truncation=True, max_length=prompt_length)['input_ids']
        response_a_tokens = self.tokenizer(row['response_a'], add_special_tokens=False, truncation=True, max_length=response_a_length)['input_ids']
        response_b_tokens = self.tokenizer(row['response_b'], add_special_tokens=False, truncation=True, max_length=response_b_length)['input_ids']

        # The three budgets add up to MAX_LENGTH, so dropping one token from
        # each segment frees the slots needed for [CLS] and the two [SEP]s.
        if len(prompt_tokens) + len(response_a_tokens) + len(response_b_tokens) > self.MAX_LENGTH-3:
            prompt_tokens = prompt_tokens[:-1]
            response_a_tokens = response_a_tokens[:-1]
            response_b_tokens = response_b_tokens[:-1]

        total_tokens = self.cls_token + prompt_tokens

        # A separator is only emitted after a non-empty segment.
        if len(prompt_tokens) > 0:
            total_tokens = total_tokens + self.sep_token + response_a_tokens
        else:
            total_tokens = total_tokens + response_a_tokens

        if len(response_a_tokens) > 0:
            total_tokens = total_tokens + self.sep_token + response_b_tokens
        else:
            total_tokens = total_tokens + response_b_tokens

        # Safety net for degenerate ratios: never exceed MAX_LENGTH, otherwise
        # items of different length could not be batched together.
        total_tokens = total_tokens[:self.MAX_LENGTH]

        pad_count = self.MAX_LENGTH - len(total_tokens)
        attention_mask = [1]*len(total_tokens) + [0]*pad_count
        total_tokens = total_tokens + self.pad_token*pad_count

        item = {
            'input_ids': torch.tensor(total_tokens, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
        }
        return item

In [None]:
import torch
import re
### Custom Dataset class
class TrainDatasetParallel(Dataset):
    """Dataset yielding two sequences per row: prompt+response A and prompt+response B.

    Text is cleaned once in ``__init__``; tokenization, truncation and padding
    happen per item in ``__getitem__``. No labels are produced.
    """

    def __init__(self, X, tokenizer, MAX_LENGTH=None, split_ratios=[1/3,1/3], LOGS=False):
        """Store the settings and clean every cell of ``X``.

        Args:
            X: DataFrame with 'prompt', 'response_a' and 'response_b' columns.
            tokenizer: callable returning a mapping with 'input_ids'; must
                expose ``cls_token_id``, ``sep_token_id`` and
                ``model_max_length`` (``pad_token_id`` is used when present).
            MAX_LENGTH: length of every produced sequence; defaults to
                ``tokenizer.model_max_length``.
            split_ratios: index 1 sizes the response as a share of
                ``MAX_LENGTH``; the prompt gets the remainder.
            LOGS: when true, print the segment lengths on every item access.
        """
        self.split_ratios = split_ratios

        self.LOGS = LOGS
        self.tokenizer = tokenizer

        self.cls_token = [self.tokenizer.cls_token_id]  # [CLS]
        self.sep_token = [self.tokenizer.sep_token_id]  # [SEP]

        # Pad with the tokenizer's own pad id; fall back to 0 when it has none.
        pad_id = getattr(tokenizer, "pad_token_id", None)
        self.pad_token = [0 if pad_id is None else pad_id]

        # Always set MAX_LENGTH, whether it was passed in or defaulted.
        self.MAX_LENGTH = tokenizer.model_max_length if MAX_LENGTH is None else MAX_LENGTH

        # DataFrame.map only exists on newer pandas; older releases call the
        # same element-wise operation applymap.
        elementwise = getattr(X, "map", None) or X.applymap
        self.X_processed = elementwise(self.process)

    def process(self, text):
        """Clean one cell of text before tokenization.

        Missing values (None / NaN) become an empty string and other
        non-string values are converted with ``str``.
        """
        if not isinstance(text, str):
            is_missing = text is None or (isinstance(text, float) and text != text)
            text = "" if is_missing else str(text)

        text = text.encode('utf-8').decode()
        text = text.replace("\\n", " ")  # Replace literal backslash-n artifacts
        text = re.sub(r"\s+", " ", text).strip()  # Normalize extra spaces
        text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)  # Remove markdown bold formatting (**text** → text)
        text = text.replace("[", "").replace("]", "")  # Remove square brackets

        return text

    def build_tokens(self,prompt_tokens, response_tokens):
        """Join one prompt/response pair into a padded sequence.

        Args:
            prompt_tokens: token ids of the prompt (not modified).
            response_tokens: token ids of one response (not modified).

        Returns:
            ``(total_tokens, attention_mask)``: two lists of ints, each
            ``MAX_LENGTH`` long.
        """
        # The two budgets add up to MAX_LENGTH, so dropping one token from
        # each segment frees the slots needed for [CLS] and [SEP].
        if len(prompt_tokens) + len(response_tokens) > self.MAX_LENGTH-2:
            prompt_tokens = prompt_tokens[:-1]
            response_tokens = response_tokens[:-1]

        total_tokens = self.cls_token + prompt_tokens

        # The separator is only emitted after a non-empty prompt.
        if len(prompt_tokens) > 0:
            total_tokens = total_tokens + self.sep_token + response_tokens
        else:
            total_tokens = total_tokens + response_tokens

        # Safety net for degenerate ratios: never exceed MAX_LENGTH, otherwise
        # items of different length could not be batched together.
        total_tokens = total_tokens[:self.MAX_LENGTH]

        pad_count = self.MAX_LENGTH - len(total_tokens)
        attention_mask = [1]*len(total_tokens) + [0]*pad_count
        total_tokens = total_tokens + self.pad_token*pad_count

        return total_tokens, attention_mask

    def __len__(self):
        """Number of rows."""
        return len(self.X_processed)

    def __getitem__(self, idx):
        """Build both padded sequences for row ``idx``.

        Returns:
            dict with 'input_ids_a', 'attention_mask_a', 'input_ids_b' and
            'attention_mask_b', all ``torch.long`` tensors of length
            ``MAX_LENGTH``.
        """
        row = self.X_processed.iloc[idx]

        response_length = round(self.split_ratios[1]*self.MAX_LENGTH)
        prompt_length = self.MAX_LENGTH - response_length

        if self.LOGS:
            print(f"prompt_length: {prompt_length}, response_length: {response_length}")

        prompt_tokens = self.tokenizer(row['prompt'], add_special_tokens=False, truncation=True, max_length=prompt_length)['input_ids']
        response_a_tokens = self.tokenizer(row['response_a'], add_special_tokens=False, truncation=True, max_length=response_length)['input_ids']
        response_b_tokens = self.tokenizer(row['response_b'], add_special_tokens=False, truncation=True, max_length=response_length)['input_ids']

        # The same prompt tokens are paired with each response in turn.
        input_ids_a, attention_mask_a = self.build_tokens(prompt_tokens, response_a_tokens)
        input_ids_b, attention_mask_b = self.build_tokens(prompt_tokens, response_b_tokens)

        item = {
            'input_ids_a': torch.tensor(input_ids_a, dtype=torch.long),
            'attention_mask_a': torch.tensor(attention_mask_a, dtype=torch.long),
            'input_ids_b': torch.tensor(input_ids_b, dtype=torch.long),
            'attention_mask_b': torch.tensor(attention_mask_b, dtype=torch.long),
        }
        return item

In [None]:
from transformers import AutoModel, PreTrainedModel, AutoConfig
import torch.nn as nn

class ParallelTransformer(PreTrainedModel):
    """Classifier that runs one shared encoder over two inputs and merges them.

    Each input is encoded separately, the first-position hidden states are
    concatenated, and a single linear layer maps the result to ``num_labels``
    logits.
    """

    def __init__(self, model_name="distilbert/distilbert-base-uncased", num_labels=3):
        """Load the encoder named by ``model_name`` and attach the linear head."""
        encoder_config = AutoConfig.from_pretrained(model_name)
        super().__init__(encoder_config)

        # Headless encoder, reused for both inputs.
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)

        # The head consumes two first-position vectors placed side by side.
        self.classifier = nn.Linear(2 * encoder_config.hidden_size, num_labels)

    def _first_position_state(self, input_ids, attention_mask):
        """Run the encoder and return the hidden state at sequence position 0."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return encoded.last_hidden_state[:, 0, :]

    def forward(self, input_ids_a, attention_mask_a, input_ids_b, attention_mask_b, labels=None):
        """Score a pair of inputs.

        Returns:
            ``{"logits": ...}`` without labels, or ``{"loss": ..., "logits": ...}``
            when ``labels`` is given; the loss is cross-entropy reshaped to a
            1-element tensor.
        """
        pooled_a = self._first_position_state(input_ids_a, attention_mask_a)
        pooled_b = self._first_position_state(input_ids_b, attention_mask_b)

        # Concatenate along the feature axis, then regularise and classify.
        features = self.dropout(torch.cat([pooled_a, pooled_b], dim=1))
        logits = self.classifier(features)

        if labels is None:
            return {"logits": logits}

        loss = nn.CrossEntropyLoss()(logits, labels).unsqueeze(0)
        return {"loss": loss, "logits": logits}

In [None]:
class FineTuningBertFromLLMFineTune(PreTrainedModel):
    """Single-sequence classifier: encoder, dropout and a 3-way linear head."""

    def __init__(self, model_path):
        """Load config and encoder weights from ``model_path`` and add the head."""
        config = AutoConfig.from_pretrained(model_path)

        super().__init__(config)
        self.hidden_size = config.hidden_size
        self.dropout = nn.Dropout(0.1)

        self.bert = AutoModel.from_pretrained(model_path)  # Encoder without a head

        self.classifier = nn.Linear(self.hidden_size, 3)
        self.freeze__first_nlayers = 0

    def _encoder_layers(self):
        """Return the encoder's sequence of transformer layers.

        The layer list is looked up at ``encoder.layer`` first (the path the
        original code used) and then at ``transformer.layer``, which is where
        DistilBERT-style encoders keep it.

        Raises:
            AttributeError: if neither attribute path exists on ``self.bert``.
        """
        for container_name in ("encoder", "transformer"):
            container = getattr(self.bert, container_name, None)
            layers = getattr(container, "layer", None)
            if layers is not None:
                return layers
        raise AttributeError(
            f"{type(self.bert).__name__} exposes neither 'encoder.layer' nor 'transformer.layer'"
        )

    def freeze_my_layers(self, freeze__first_nlayers):
        """Disable gradients for the first ``freeze__first_nlayers`` encoder layers.

        The count is remembered on the instance. Asking for more layers than
        the encoder has raises IndexError.
        """
        self.freeze__first_nlayers = freeze__first_nlayers
        if freeze__first_nlayers > 0:
            print(f"Freezing {freeze__first_nlayers} layers")
            layers = self._encoder_layers()
            for i in range(freeze__first_nlayers):
                for param in layers[i].parameters():
                    param.requires_grad = False

    def as_dict(self):
        """Summarise the model settings as a plain dict."""
        return {
            "hidden_size" : self.hidden_size,
            "parallel_model_settings" : self.bert.config.to_dict(),
            "freeze__first_nlayers" : self.freeze__first_nlayers}

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify one batch.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: mask passed to the encoder.
            labels: optional targets; tensors with more than one dimension are
                reduced with ``argmax(dim=1)`` before the loss is computed.

        Returns:
            ``{"logits": ...}`` without labels, or ``{"loss": ..., "logits": ...}``
            with cross-entropy loss when ``labels`` is given.

        Raises:
            ValueError: if logits and labels disagree on the batch size.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0, :]  # Hidden state at position 0

        pooled = self.dropout(pooled)
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:

            if labels.dim() > 1:
                labels = labels.argmax(dim=1)
            if logits.shape[0] != labels.shape[0]:
                raise ValueError(f"Mismatch in logits ({logits.shape}) and labels ({labels.shape}) batch size")

            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
encoder_model_name = "/kaggle/input/distilbert-parallel-llm-distil/saved_model/distilbert-parallel-llm-distil-frozen3-epoc2" # alternatives: "distilbert/distilbert-base-uncased", "google-bert/bert-base-uncased"
method = "parallel_bert" # one of: single_bert, fine_tuned_single, parallel_bert

# "singe_bert" is the historical misspelling; it stays accepted so existing
# settings keep working alongside the corrected "single_bert".
single_bert_methods = ("singe_bert", "single_bert")
known_methods = single_bert_methods + ("fine_tuned_single", "parallel_bert")

# Fail early with a clear message instead of a NameError on `model` below.
if method not in known_methods:
    raise ValueError(f"Unknown method {method!r}; expected one of {known_methods}")

tokenizer = AutoTokenizer.from_pretrained(encoder_model_name)

# NOTE(review): torch.load unpickles the file, so only point it at trusted
# checkpoints. map_location="cpu" lets a GPU-saved checkpoint load on any
# runtime; load_state_dict then copies the values into the model's own tensors.
checkpoint_path = encoder_model_name + "/pytorch_model.bin"
state_dict = torch.load(checkpoint_path, map_location="cpu")

if method in single_bert_methods:
    model = AutoModelForSequenceClassification.from_pretrained(encoder_model_name, num_labels=3)
    result = model.load_state_dict(state_dict) # Crucial
    test_dataset = TestDatasetSingleBertModelV2(X_test, tokenizer, split_ratios=[0.1, 0.45,0.45], LOGS=False)

elif method == 'fine_tuned_single':
    model = FineTuningBertFromLLMFineTune(encoder_model_name)
    # strict=False: mismatched keys are reported in `result` instead of raising.
    result = model.load_state_dict(state_dict, strict=False) # Crucial
    test_dataset = TestDatasetSingleBertModelV2(X_test, tokenizer, split_ratios=[0.1, 0.45,0.45], LOGS=False)

else:  # method == "parallel_bert"
    model = ParallelTransformer(encoder_model_name)
    # strict=False: mismatched keys are reported in `result` instead of raising.
    result = model.load_state_dict(state_dict, strict=False) # Crucial
    test_dataset = TrainDatasetParallel(X_test, tokenizer, split_ratios=[0.1, 0.9], LOGS=False)

# The weights now live in the model; drop the extra copy.
del state_dict

print(type(model))
print(result)
# print(train_val_dataset[0].keys())

In [None]:
from transformers import TrainingArguments, Trainer
import torch.nn.functional as F

# Inference-only Trainer configuration.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,  # Important for inference speed
    # Mixed precision needs a GPU; only request it when CUDA is present so the
    # notebook still runs on a CPU-only session.
    fp16=torch.cuda.is_available(),
    report_to="none",  # No logging to external services
)

trainer = Trainer(
    model=model,  # The model loaded above
    args=training_args
)

# Run the model over the whole test dataset; the logits are read from
# `predictions.predictions` further down.
predictions = trainer.predict(test_dataset)

In [None]:
# trainer.save_model("./kaggle/working/bertmodel")

In [None]:
# Turn the raw prediction scores into per-row probabilities over the last axis.
probabilities = torch.tensor(predictions.predictions).softmax(dim=-1)

In [None]:
sub = pd.read_csv("/kaggle/input/llm-classification-finetuning/sample_submission.csv")

prob_columns = ["winner_model_a", "winner_model_b", "winner_tie"]

# Hand pandas a plain NumPy array rather than a torch tensor, and check the
# shape first so a row/column mismatch is reported clearly.
prob_values = probabilities.detach().cpu().numpy()
if prob_values.shape != (len(sub), len(prob_columns)):
    raise ValueError(
        f"Predictions have shape {prob_values.shape}, expected {(len(sub), len(prob_columns))}"
    )

# NOTE(review): rows are matched by position — this assumes the sample
# submission lists rows in the same order as test.csv; confirm.
sub[prob_columns] = prob_values

sub.to_csv("submission.csv",index=False)
sub.head()