In [1]:
import numpy as np
import pandas as pd

In [2]:
#!pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117
#!pip install -q -U -i https://pypi.org/simple/ bitsandbytes
#!pip install -q -U transformers=="4.40.0"
#!pip install -q -U accelerate
#!pip install -q -U datasets
#!pip install -q -U trl
#!pip install -q -U peft
#!pip install -q -U tensorboard

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging,
                          DataCollatorWithPadding,
                          EarlyStoppingCallback
)

from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [5]:
print(f"pytorch version {torch.__version__}")

pytorch version 2.3.1


In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"working on {device}")

working on cuda:0


In [7]:
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [9]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [10]:
torch_dtype

torch.bfloat16

# Data Preparation for Fine-tuning

In [11]:
import pandas as pd

In [12]:
submission = pd.read_csv(open('../input/lmsys-chatbot-arena/sample_submission.csv'))
test = pd.read_csv(open('../input/lmsys-chatbot-arena/test.csv'))
train = pd.read_csv("../input/lmsys-chatbot-arena/train.csv")

In [13]:
result = []
for index, row in train.iterrows():
    if(row['winner_model_a']==1): result.append('winner_model_a')
    elif(row['winner_model_b']==1): result.append('winner_model_b')
    elif(row['winner_tie']==1): result.append('winner_tie')
train.insert(6, "Preference", result, True)
train

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,Preference,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",winner_model_a,1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",winner_model_b,0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",winner_tie,0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",winner_model_a,1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",winner_model_b,0,1,0
...,...,...,...,...,...,...,...,...,...,...
57472,4294656694,gpt-4-0613,claude-1,"[""A simple mnemonic for \u03c0:\n\""How I wish ...","[""Sure, let's break it down:\n\n1. \""How\"" has...","[""Here is how that mnemonic represents the dig...",winner_model_a,1,0,0
57473,4294692063,claude-2.0,llama-2-13b-chat,"[""In python, implement a naive Bayes with gaus...","[""Here is an implementation of a naive Bayes c...","[""Sure! Here's an implementation of a naive Ba...",winner_model_a,1,0,0
57474,4294710549,claude-1,alpaca-13b,"[""is it unethical to work on building weapons?...","[""Working on weapons technology raises some et...","[""It depends on the context. Weapons can be us...",winner_model_a,1,0,0
57475,4294899228,palm-2,tulu-2-dpo-70b,"[""If a bait contains 0,0025% bromadiolon then ...","[""Bromadiolone is a rodenticide which is most ...","[""As an AI language model, I do not promote or...",winner_model_b,0,1,0


In [14]:
train_size = int(train.shape[0]*0.6)
test_val_size = train.shape[0]-train_size
train_, test_val  = train_test_split(train, train_size=train_size,
                                    test_size=test_val_size,
                                    random_state=42)

test_, val_ = train_test_split(test_val, train_size=int(test_val_size/2),
                                    test_size=int(test_val_size/2),
                                    random_state=42)

In [15]:
def generate_prompt(data_point):
    return f"""
            Analyze the prompt and responses(response de_a, response_b) from two chatbots(model_a, model_b).
            Then predict the human preference of those responses- if it is "winner_model_a", "winner_model_b" or
            "winner_tie". Return the answer as the correspoding preference label "winner_model_a", "winner_model_b" or
            "winner_tie".
            ----------------------------------------------------------------------------------------------------------
            prompt: {data_point["prompt"]}
            ----------------------------------------------------------------------------------------------------------
            model_a: {data_point["model_a"]}
            response_a: {data_point["response_a"]}
            ----------------------------------------------------------------------------------------------------------
            model_b: {data_point["model_b"]}
            response_b: {data_point["response_b"]}
            ----------------------------------------------------------------------------------------------------------
            Preference= {data_point["Preference"]} """.strip()

def generate_test_prompt(data_point):
    return f"""
            Analyze the prompt and responses(response_a, response_b) from two chatbots(model_a, model_b).
            Then predict the human preference of those responses- if it is "winner_model_a", "winner_model_b" or
            "winner_tie". Return the answer as the correspoding preference label "winner_model_a", "winner_model_b" or
            "winner_tie".
            ----------------------------------------------------------------------------------------------------------
            prompt: {data_point["prompt"]}
            ----------------------------------------------------------------------------------------------------------
            model_a: {data_point["model_a"]}
            response_a: {data_point["response_a"]}
            ----------------------------------------------------------------------------------------------------------
            model_b: {data_point["model_b"]}
            response_b: {data_point["response_b"]}
            ----------------------------------------------------------------------------------------------------------
            Preference: """.strip()
            

In [16]:
X_train = pd.DataFrame(train_.apply(generate_prompt, axis=1), columns=["text"])
X_eval = pd.DataFrame(val_.apply(generate_prompt, axis=1),  columns=["text"])
X_test = pd.DataFrame(test_.apply(generate_test_prompt, axis=1), columns=["text"])

y_true = test_.Preference

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [17]:
def evaluate(y_true, y_pred):
    labels = ['winner_model_a', 'winner_model_b', 'winner_tie']
    mapping = {'none':0, 'winner_model_a': 1, 'winner_model_b': 2, 'winner_tie': 3}
    def map_func(x):
        return mapping.get(x, 1)
    
    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) 
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [18]:
# model_name = "../input/llama-3/transformers/8b-chat-hf/1"
# model_name = "/kaggle/input/deberta_v3/keras/deberta_v3_extra_small_en/2"
model_name = "../8b-chat-hf/1"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

max_seq_length = 512
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=max_seq_length)
tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
def predict(test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens = 1, 
                        temperature = 0.0,
                       )
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("=")[-1]
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        elif "neutral" in answer:
            y_pred.append("neutral")
        else:
            y_pred.append("none")
    return y_pred

In [20]:
output_dir="trained_weigths"

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=4,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

In [21]:
data_collator = DataCollatorWithPadding(tokenizer)

In [22]:
training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=1,                       # number of training epochs
    per_device_train_batch_size=32,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,             # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=500,                           # Save checkpoint every 500 steps
    logging_steps=50,                         # log every 10 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch" , 
    save_strategy="epoch", 
    # save checkpoint every epoch
    save_total_limit=3,                       # Keep only the 2 most recent checkpoints
    load_best_model_at_end=True,              # Load the best model at the end of training
    dataloader_num_workers=4,  # Number of workers for data loading
    dataloader_pin_memory=True,  # Pin memory for faster data transfer to GPU
)

In [23]:
# Early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

In [24]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    packing=False,
    callbacks=[early_stopping],              # Include early stopping callback
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

Map:   0%|          | 0/34486 [00:00<?, ? examples/s]

Map:   0%|          | 0/11495 [00:00<?, ? examples/s]

In [25]:
import torch
import os


import gc

torch.cuda.empty_cache()



gc.collect()

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use the first GPU
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Prevent tokenizer parallelism issues
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"  # Optimize memory allocation
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'  # Allow TF to incrementally allocate GPU memory (if used)


In [26]:
# Train model
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.9163,0.89529


TrainOutput(global_step=134, training_loss=0.9803051877377639, metrics={'train_runtime': 17392.8655, 'train_samples_per_second': 1.983, 'train_steps_per_second': 0.008, 'total_flos': 7.919881404190556e+17, 'train_loss': 0.9803051877377639, 'epoch': 0.9944341372912802})

In [27]:
# Save trained model and tokenizer
trainer.save_model()
tokenizer.save_pretrained(output_dir)

('trained_weigths\\tokenizer_config.json',
 'trained_weigths\\special_tokens_map.json',
 'trained_weigths\\tokenizer.json')

In [28]:
%load_ext tensorboard
%tensorboard --logdir logs/runs

In [29]:
y_pred = predict(test, model, tokenizer)
evaluate(y_true, y_pred)

100%|██████████████████████████████████████████████████████████████████████████| 11495/11495 [5:56:08<00:00,  1.86s/it]

Accuracy: 0.033
Accuracy for label 1: 0.093
Accuracy for label 2: 0.000
Accuracy for label 3: 0.000

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.37      0.09      0.15      4097
           2       0.00      0.00      0.00      3885
           3       0.00      0.00      0.00      3513

    accuracy                           0.03     11495
   macro avg       0.09      0.02      0.04     11495
weighted avg       0.13      0.03      0.05     11495


Confusion Matrix:
[[   0    0    0]
 [3714  383    0]
 [3527  358    0]]



