In [None]:
# Offline install (--no-index: never contact PyPI) of the bitsandbytes 0.45.0
# wheel shipped in the attached Kaggle input directory.
!pip install --no-index --find-links=file:///kaggle/input/bitsandbytes/ /kaggle/input/bitsandbytes/bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl


In [None]:
# List the contents of the relative directory `gemma_model`.
!ls gemma_model


In [None]:
# Offline install of peft, resolved only against the local wheel directory.
!pip install --no-index --find-links=file:///kaggle/input/libraries/transformer_peft_accelrator_bite/ peft


In [None]:
# List the local wheel directory used by the surrounding install cells.
# NOTE(review): no leading `!` here; presumably relies on IPython automagic -- confirm.
ls /kaggle/input/libraries/transformer_peft_accelrator_bite

In [None]:
# Offline install of transformers from the same local wheel directory.
!pip install --no-index --find-links=file:///kaggle/input/libraries/transformer_peft_accelrator_bite/ transformers


In [None]:
# Offline install of tqdm from the same local wheel directory.
!pip install --no-index --find-links=file:///kaggle/input/libraries/transformer_peft_accelrator_bite/ tqdm


In [None]:
# Offline install of accelerate from the same local wheel directory.
!pip install --no-index --find-links=file:///kaggle/input/libraries/transformer_peft_accelrator_bite/ accelerate


In [None]:
# Upgrade (-U) transformers, peft, accelerate and bitsandbytes from a second
# local wheel directory.
# NOTE(review): this may replace the versions installed by the cells above --
# confirm which versions are actually intended.
!pip install transformers peft accelerate bitsandbytes -U --no-index --find-links /kaggle/input/lmsys-wheel-files

In [None]:
import os
import copy
from dataclasses import dataclass

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel

from sklearn.metrics import log_loss, accuracy_score

In [None]:
from dataclasses import dataclass

# "unsloth/gemma-2-2b-it-bnb-4bit"

# "/kaggle/input/gemma-2-2b-it-unsloth-bnb-4bit-namm/transformers/default/1"
@dataclass
class Config:
    """Paths and tokenization settings for this inference notebook.

    Every attribute is an annotated dataclass field, so each one can be
    overridden at construction time, e.g. ``Config(lora_dir="/other/adapter")``.
    ``max_length`` is declared first so that a positional ``Config(512)`` keeps
    meaning ``max_length=512``.
    """

    # Maximum number of tokens per encoded example.
    max_length: int = 1024
    # Directory of the base Gemma-2 checkpoint passed to `from_pretrained`.
    gemma_dir: str = "/kaggle/input/gemma-2-2b-it-unsloth-bnb-4bit-namm/transformers/default/1"
    # Directory of the LoRA adapter passed to `PeftModel.from_pretrained`.
    lora_dir: str = '/kaggle/input/checkpoint9000/output/checkpoint-8000'


# Instantiate the Config class
config = Config()

In [None]:
# Load the tokenizer stored alongside the base model checkpoint.
tokenizer = GemmaTokenizerFast.from_pretrained(config.gemma_dir)
tokenizer.add_eos_token = True  # We'll add <eos> at the end
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"


In [None]:
from transformers import BitsAndBytesConfig


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Request 4-bit bitsandbytes loading. `quantization_method` is not a
# BitsAndBytesConfig constructor option, so the stray keyword that used to be
# passed here had no effect and has been dropped.
quant_config = BitsAndBytesConfig(load_in_4bit=True)

# Base sequence-classification model with a 2-way head; device_map="auto"
# lets accelerate decide where the weights are placed.
model = Gemma2ForSequenceClassification.from_pretrained(
    config.gemma_dir,
    num_labels=2,
    quantization_config=quant_config,
    device_map="auto",
)

# Attach the LoRA adapter weights on top of the base model.
model = PeftModel.from_pretrained(model, config.lora_dir)


In [None]:
import pandas as pd

# Load the parquet file
df = pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet')

# Build the Hugging Face dataset directly from the DataFrame. The previous
# parquet -> CSV -> Dataset.from_csv detour was lossy: the CSV was written with
# an escapechar the reader was never told about, and read_csv's default NA
# parsing turns literal texts such as "NA" or "None" into missing values.
# preserve_index=False keeps the pandas index from being added as a column.
ds = Dataset.from_pandas(df, preserve_index=False)

In [None]:
# Quick sanity check: show the first rows of the loaded test DataFrame.
print(df.head())

In [None]:
# ds = ds.select(torch.arange(10000))  


In [None]:
from transformers import PreTrainedTokenizerBase

class CustomTokenizer:
    """Batch encoder that turns prompt/response pairs into fixed-length model inputs.

    Each row is rendered into one text (instruction, prompt, response A,
    response B), then tokenized with truncation and padding to ``max_length``.
    Instances are callable on a dict of column lists, so they can be used as a
    batched mapping function.
    """

    def __init__(self, tokenizer: "PreTrainedTokenizerBase", max_length: int = 1024) -> None:
        """
        Args:
            tokenizer (PreTrainedTokenizerBase): Callable tokenizer; invoked as
                ``tokenizer(text, max_length=..., truncation=True, padding="max_length")``
                and expected to return ``input_ids`` and ``attention_mask``.
            max_length (int): Maximum token length for the combined input.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Kept for backward compatibility only: nothing in this class reads it,
        # so no per-section token budget is actually enforced. Truncation is
        # applied to the combined text as a whole.
        self.max_per_section = max_length // 3
        self.instruction = "<instruction>: Compare two responses and decide which one answers the prompt better."

    def __call__(self, batch: dict) -> dict:
        """
        Tokenizes a batch of rows.

        Two modes, selected by whether the batch has a "winner" column:

        * Labeled (training/validation): returns "input_ids", "attention_mask"
          and "labels" (0 for "model_a", 1 for "model_b"). Rows with any other
          winner value, or rows that raise while being processed, are skipped,
          so the output may be shorter than the input.
        * Unlabeled (inference): returns only "input_ids" and "attention_mask",
          with exactly one output row per input row so results stay aligned
          with the other columns (e.g. "id"). A row that cannot be processed is
          encoded with empty text fields instead of being dropped.

        Args:
            batch (dict): Column lists "prompt", "response_a", "response_b" and,
                optionally, "winner".

        Returns:
            dict: Lists of encoded rows as described above.
        """
        has_labels = "winner" in batch
        processed_data = {"input_ids": [], "attention_mask": []}
        if has_labels:
            processed_data["labels"] = []

        for i in range(len(batch["prompt"])):
            try:
                tokenized = self._encode_row(
                    batch["prompt"][i],
                    batch["response_a"][i],
                    batch["response_b"][i],
                )
                label = self._label_for(batch["winner"][i]) if has_labels else None
            except Exception as e:
                if has_labels:
                    # Best-effort in labeled mode: report and drop the row.
                    print(f"Skipping row {i} due to error: {e}")
                    continue
                # Dropping a row at inference time would shift every later
                # prediction onto the wrong id, so emit a placeholder instead.
                print(f"Row {i} could not be processed ({e}); encoding it with empty text fields")
                tokenized = self._encode_row(None, None, None)
                label = None

            if has_labels:
                if label is None:
                    continue  # Skip rows with invalid winner labels
                processed_data["labels"].append(label)

            processed_data["input_ids"].append(tokenized["input_ids"])
            processed_data["attention_mask"].append(tokenized["attention_mask"])

        return processed_data

    def _encode_row(self, prompt, response_a, response_b):
        """Render one row into the combined text layout and tokenize it."""
        combined_text = (
            self.instruction + "\n\n" +
            "<prompt>: " + self.process_text(prompt) + "\n\n" +
            "<response_a>: " + self.process_text(response_a) + "\n\n" +
            "<response_b>: " + self.process_text(response_b)
        )
        return self.tokenizer(
            combined_text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length"
        )

    @staticmethod
    def _label_for(winner):
        """Map a winner value to its class index, or None when it is not recognised."""
        if winner == "model_a":
            return 0
        if winner == "model_b":
            return 1
        return None

    @staticmethod
    def process_text(text: str) -> str:
        """
        Cleans one text field.

        ``None`` becomes the empty string. Otherwise every occurrence of the
        substring "null" is removed (including inside longer words) and
        leading/trailing whitespace is stripped.

        Args:
            text (str): Input text to process.

        Returns:
            str: Cleaned text.
        """
        if text is None:
            return ""
        return text.replace("null", "").strip()

# Build the batch encoder with the shared tokenizer and configured max length.
encode = CustomTokenizer(tokenizer, max_length=config.max_length)
# Apply the encoder batch-wise; note that `ds` is rebound to the mapped dataset.
ds = ds.map(encode, batched=True)


In [None]:
# Persist the encoded dataset to the working directory, then load it straight
# back from that same location and rebind `ds` to the reloaded copy.
ds.save_to_disk('/kaggle/working/preprocessed_dataset')
ds = Dataset.load_from_disk('/kaggle/working/preprocessed_dataset')


In [None]:
# Show the device of the model's first parameter.
print(next(model.parameters()).device)


In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm  # Import tqdm for progress bar

# The model was loaded with device_map="auto" and bitsandbytes quantization, so
# its weights have already been placed. It is deliberately NOT moved with
# `.to(device)`: relocating a quantized, dispatched model is unnecessary and
# would collapse the weights onto one device if they were spread over several.
# `device` is kept for moving input tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Define a PyTorch Dataset
class CustomDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over the columns 'id', 'input_ids' and 'attention_mask'.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    notebook binds the bare name ``Dataset`` to both the Hugging Face and the
    PyTorch class at different points.

    Args:
        ds: Column-indexable container (``ds['column']`` returns a sequence of
            per-row values) providing the three columns above.
    """

    def __init__(self, ds):
        self.ds = ds
        # Read every column exactly once. Indexing `ds[column]` again for each
        # item re-read the whole column per lookup, which is what made
        # iteration slow.
        self._ids = ds['id']
        self._input_ids = ds['input_ids']
        self._attention_mask = ds['attention_mask']

    def __len__(self):
        """Number of rows, taken from the 'id' column."""
        return len(self._ids)

    def __getitem__(self, idx):
        """Return row `idx` as a dict; the two token columns are converted to tensors."""
        return {
            'id': self._ids[idx],
            'input_ids': torch.tensor(self._input_ids[idx]),
            'attention_mask': torch.tensor(self._attention_mask[idx]),
        }


# THIS was causing bottleneck


# Create DataLoader
# batch_size = 32  # Adjust based on your GPU memory
# dataset = CustomDataset(ds)
# dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=4,shuffle=False,
#                         persistent_workers=True,  pin_memory=torch.cuda.is_available
# )
# results = []

# #No gradient computation
# with torch.no_grad():
#     for batch in tqdm(dataloader, desc="Processing Batches"):
#         # Prepare batch inputs
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         example_ids = batch['id']
        
#         # Forward pass
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#         predictions = torch.argmax(outputs['logits'], dim=1)  # Get batch predictions
        
#         # Map predictions to labels
#         for example_id, prediction in zip(example_ids, predictions.cpu().numpy()):
#             winner = "model_a" if prediction == 0 else "model_b"
#             results.append({'id': example_id, 'winner': winner})

# # Convert results to DataFrame and save as CSV
# df = pd.DataFrame(results)
# df.to_csv('submission.csv', index=False)


In [None]:
import pandas as pd
import torch
from tqdm import tqdm  

batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Keep the full encoded test set in host memory; only the batch currently being
# scored is copied to the accelerator, so device memory use does not grow with
# the size of the dataset.
input_ids = torch.tensor(ds['input_ids'])
attention_mask = torch.tensor(ds['attention_mask'])
ids = ds['id']

results = []

model.eval()
with torch.no_grad():  # inference only, no gradients needed
    for start_idx in tqdm(range(0, len(ids), batch_size), desc="Processing Batches"):
        end_idx = start_idx + batch_size
        batch_input_ids = input_ids[start_idx:end_idx].to(device)
        batch_attention_mask = attention_mask[start_idx:end_idx].to(device)
        batch_ids = ids[start_idx:end_idx]

        # Forward pass
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        # Index of the larger logit per row: 0 -> "model_a", anything else -> "model_b".
        predictions = torch.argmax(outputs['logits'], dim=1).cpu().tolist()

        # Map predictions to labels
        for example_id, prediction in zip(batch_ids, predictions):
            winner = "model_a" if prediction == 0 else "model_b"
            results.append({'id': example_id, 'winner': winner})

# Save results to CSV. Columns are given explicitly so the file still has its
# `id,winner` header when there are no rows.
df = pd.DataFrame(results, columns=['id', 'winner'])
df.to_csv('submission.csv', index=False)
