### Finetuning LLM
Model: **Phi2**\
Datset: **SNLI**\
Technique: **QLORA PEFT**

##### Importing libraries

In [1]:
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModelForSequenceClassification

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from transformers import BitsAndBytesConfig
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

import wandb
from tqdm.auto import tqdm
import time
import os
import re
import accelerate
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

tqdm.pandas()

2024-11-02 00:23:11.282578: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-02 00:23:11.282636: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-02 00:23:11.283513: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-02 00:23:11.289870: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msachinsharma[0m ([33miiitd-sachin[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

##### Setup variables

In [4]:
model_id = "microsoft/phi-2"

In [59]:
CHECKPOINTS_DIR_PATH = "../checkpoints/"
DATA_DIR_PATH = "../data/"

os.makedirs(CHECKPOINTS_DIR_PATH, exist_ok=True)
os.makedirs(DATA_DIR_PATH, exist_ok=True)

In [6]:
project = "LLM-A3-PEFT"
base_model_name = "phi2"
run_name = f'QLoRa-{base_model_name}-SNLI'
peft_output_dir = CHECKPOINTS_DIR_PATH + run_name
peft_output_dir

'../checkpoints/QLoRa-phi2-SNLI'

##### Load dataset

In [7]:
def load_dataset_and_split(name, train_steps=550, train_len=1000, val_steps=100, val_len=100, test_steps=100, test_len=100):
	dataset = load_dataset(name)
	train_dataset = dataset["train"].select(range(0, len(dataset["train"]), train_steps)).take(train_len)
	val_dataset = dataset["validation"].select(range(0, len(dataset["validation"]), val_steps)).take(val_len)
	test_dataset = dataset["test"].select(range(0, len(dataset["test"]), test_steps)).take(test_len)

	# train_dataset map all -1 labels to 0
	train_dataset = train_dataset.map(lambda e: {"label": 1 if e["label"] == -1 else e["label"]})
	val_dataset = val_dataset.map(lambda e: {"label": 1 if e["label"] == -1 else e["label"]})
	test_dataset = test_dataset.map(lambda e: {"label": 1 if e["label"] == -1 else e["label"]})
	
	return train_dataset, val_dataset, test_dataset

In [8]:
train_dataset, val_dataset, test_dataset = load_dataset_and_split("stanfordnlp/snli")

##### Tokenize and preprocess dataset

In [9]:
def init_tokenizer(model_id):
	tokenizer = AutoTokenizer.from_pretrained(
		model_id,
		use_fast=True,
	)
	tokenizer.pad_token = tokenizer.eos_token
	tokenizer.padding_side = "left"
	return tokenizer

In [10]:
tokenizer = init_tokenizer(model_id)

In [11]:
def tokenize(batch):
	prompt_template = "Premise: {}\nHypothesis: {}\nDoes the Hypothesis follows the Premise?".strip()
	inputs = [prompt_template.format(premise, hypothesis) for premise, hypothesis in zip(batch['premise'], batch['hypothesis'])]
	labels = batch['label']
	tokenized_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
	tokenized_inputs["labels"] = labels
	return tokenized_inputs

In [12]:
def get_tokenized_dataset(dataset, columns_to_remove=["premise", "hypothesis", "label"]):
	tokenized_dataset = dataset.map(tokenize, batched=True)
	tokenized_dataset = tokenized_dataset.remove_columns(columns_to_remove)
	tokenized_dataset.set_format("torch")
	return tokenized_dataset

In [13]:
tokenized_train_dataset = get_tokenized_dataset(train_dataset)
tokenized_val_dataset = get_tokenized_dataset(val_dataset)
tokenized_test_dataset = get_tokenized_dataset(test_dataset)

##### Helper functions

In [14]:
def display_model_size(model):
	total_params = sum(p.numel() for p in model.parameters())
	dtype = model.parameters().__next__().dtype
	total_size_bytes = total_params * dtype.itemsize
	total_size_gb = total_size_bytes / (1024 ** 3)
	print(f"Model size = {total_size_gb:.2f} GB")

In [15]:
def display_trainable_size(model):
	trainable_params = 0
	all_param = 0
	for _, param in model.named_parameters():
		all_param += param.numel()
		if param.requires_grad:
			trainable_params += param.numel()
	
	dtype = model.parameters().__next__().dtype
	total_size_bytes = all_param * dtype.itemsize
	trainable_size_bytes = trainable_params * dtype.itemsize
	total_size_gb = total_size_bytes / (1024 ** 3)
	trainable_size_gb = trainable_size_bytes / (1024 ** 3)

	print(
		f"# of total params: {all_param} || # of trainable params: {trainable_params} || trainable %: {(100 * trainable_params / all_param):.2f}%"
	)
	print(f"Total size = {total_size_gb:.2f} GB || Trainable size = {trainable_size_gb:.2f} GB")

In [16]:
def get_gpu_utilization(return_bytes=False):
	nvmlInit()
	handle = nvmlDeviceGetHandleByIndex(0)
	info = nvmlDeviceGetMemoryInfo(handle)
	print(f"GPU memory occupied: {info.used/(1024**3):.2f} GB.")
	if return_bytes:
		return info.used

In [17]:
def get_inference(model, dataset):
	model.eval()
	predictions, labels = [], []

	for i in tqdm(range(len(dataset)), desc="Inference", total=len(dataset)):
		input_ids = dataset[i]["input_ids"].unsqueeze(0).to(model.device)
		attention_mask = dataset[i]["attention_mask"].unsqueeze(0).to(model.device)
		label = dataset[i]["labels"].item()

		with torch.no_grad():
			outputs = model(input_ids=input_ids, attention_mask=attention_mask)
			predicted_class_id = torch.argmax(outputs.logits, dim=-1).item()

		predictions.append(predicted_class_id)
		labels.append(label)

	return pd.Series(predictions), pd.Series(labels)

In [18]:
def evaluate_prediction(preds, labels):
	accuracy = accuracy_score(labels, preds)
	f1 = f1_score(labels, preds, average='weighted')
	return {'accuracy': accuracy, 'f1': f1}

##### Base Model

In [19]:
def init_phi2_model(model_id):
	quantization_config = BitsAndBytesConfig(
		load_in_4bit=True,
		bnb_4bit_use_double_quant=True,
		bnb_4bit_quant_type='nf4',  # Can be 'nf4' or 'fp4'
		bnb_4bit_compute_dtype=torch.float16
	)

	model = AutoModelForSequenceClassification.from_pretrained(
		model_id,
		num_labels=3,
		torch_dtype=torch.float16,
		low_cpu_mem_usage=True,
		trust_remote_code=True,
		quantization_config=quantization_config,
		device_map='auto',
		# device_map={"": 0}
	)
	return model

In [20]:
get_gpu_utilization()

GPU memory occupied: 0.36 GB.


In [21]:
base_model = init_phi2_model(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
display_trainable_size(base_model)

# of total params: 1390277120 || # of trainable params: 131248640 || trainable %: 9.44%
Total size = 2.59 GB || Trainable size = 0.24 GB


In [23]:
base_model_preds, labels = get_inference(base_model, tokenized_test_dataset)

Inference:   0%|          | 0/100 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [24]:
base_model_preds.value_counts()

2    97
0     3
Name: count, dtype: int64

In [25]:
base_model_metrics = evaluate_prediction(base_model_preds, labels)
base_model_metrics

{'accuracy': 0.39, 'f1': 0.2494326725905673}

##### Setup QLORA

In [26]:
def get_lora_model(base_model):
	lora_config = LoraConfig(
		task_type=TaskType.SEQ_CLS,
		r=32,
		lora_alpha=64,
		target_modules='all-linear',
		lora_dropout=0.05,
	)
	
	base_model = prepare_model_for_kbit_training(base_model)
	base_model.gradient_checkpointing_enable()

	lora_model = get_peft_model(base_model, lora_config)

	for name, param in lora_model.named_parameters():
		if 'lora' not in name:
			param.requires_grad = False

	return lora_model

In [27]:
gpu_util_before_ft = get_gpu_utilization(return_bytes=True)

GPU memory occupied: 2.39 GB.


In [28]:
model = get_lora_model(base_model)

In [29]:
display_trainable_size(model)

# of total params: 1437470720 || # of trainable params: 47185920 || trainable %: 3.28%
Total size = 5.35 GB || Trainable size = 0.18 GB


In [30]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): PhiForSequenceClassification(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2560, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lor

In [31]:
model.device

device(type='cuda', index=0)

##### Finetuning

In [32]:
peft_training_args = TrainingArguments(
	num_train_epochs=5,
	output_dir = peft_output_dir,
	per_device_train_batch_size=32,
	per_device_eval_batch_size=32,  
	learning_rate=0.0001,
	weight_decay=0.001,
    optim="paged_adamw_8bit",
	logging_dir=CHECKPOINTS_DIR_PATH + "logs/",
	logging_strategy="epoch",
	save_strategy="epoch",
	eval_strategy="epoch",
	fp16=True,
	overwrite_output_dir = 'True',
)

In [33]:
model.config.use_cache = False
model.config.pad_token_id = tokenizer.pad_token_id

peft_trainer = Trainer(
	model=model,
	train_dataset=tokenized_train_dataset,
	eval_dataset=tokenized_val_dataset,
	args=peft_training_args,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [34]:
start_time = time.time()

peft_trainer.train()

end_time = time.time()
time_elapsed = end_time - start_time
print(f"Training time: {time_elapsed:.2f} seconds")



  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
1,1.0494,0.591291
2,0.477,0.373745
3,0.3156,0.502573
4,0.2463,0.495568
5,0.1809,0.545783


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Training time: 722.81 seconds


In [51]:
peft_model_preds, labels = get_inference(model, tokenized_test_dataset)

Inference:   0%|          | 0/100 [00:00<?, ?it/s]

In [36]:
evaluate_prediction(peft_model_preds, labels)

{'accuracy': 0.88, 'f1': 0.878125}

In [37]:
gpu_util_after_ft = get_gpu_utilization(return_bytes=True)

GPU memory occupied: 8.83 GB.


In [38]:
gpu_util_diff = (gpu_util_after_ft - gpu_util_before_ft) / (1024 ** 3)
print(f"GPU memory consumed during fine-tuning: {gpu_util_diff:.2f} GB")

GPU memory consumed during fine-tuning: 6.43 GB


##### Inference

In [39]:
del base_model, model, tokenizer

In [40]:
tokenizer = init_tokenizer(model_id)

Base Model Testing

In [41]:
base_model = init_phi2_model(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
display_trainable_size(base_model)

# of total params: 1390277120 || # of trainable params: 131248640 || trainable %: 9.44%
Total size = 2.59 GB || Trainable size = 0.24 GB


In [45]:
base_model_preds, labels = get_inference(base_model, tokenized_test_dataset)
base_model_metrics = evaluate_prediction(base_model_preds, labels)
print("Base Model Test Metrics:")
print(f'Accuracy: {base_model_metrics["accuracy"]:.2f} || F1: {base_model_metrics["f1"]:.2f}')

Inference:   0%|          | 0/100 [00:00<?, ?it/s]

Base Model Test Metrics:
Accuracy: 0.33 || F1: 0.20


Failure Test Cases

In [46]:
test_case_examples = test_dataset.select(range(0, 5))
tokenized_test_case_examples = get_tokenized_dataset(test_case_examples)
base_model_test_case_preds, labels = get_inference(base_model, tokenized_test_case_examples)

Inference:   0%|          | 0/5 [00:00<?, ?it/s]

In [47]:
base_model_test_case_failures = test_case_examples.to_pandas()[base_model_test_case_preds != labels]
base_model_test_case_failures['predicted_label'] = base_model_test_case_preds.iloc[base_model_test_case_failures.index]
base_model_test_case_failures

Unnamed: 0,premise,hypothesis,label,predicted_label
0,This church choir sings to the masses as they ...,The church has cracks in the ceiling.,1,0
2,Two men climbing on a wooden scaffold.,Two sad men climbing on a wooden scaffold.,1,0
3,"A man in a black shirt, in a commercial kitche...","A man in a black shirt, in a commercial kitche...",1,0
4,a woman in a black shirt looking at a bicycle.,A woman dressed in black shops for a bicycle.,1,0


In [60]:
base_model_test_case_failures.to_csv(DATA_DIR_PATH + "base_model_test_case_failures.csv", index=False)

Finetuned Model Testing

In [48]:
def find_latest_checkpoint(output_dir):
	checkpoint_files = os.listdir(output_dir)
	latest_checkpoint = max(checkpoint_files, key=lambda x: int(re.findall(r'\d+', x)[0]))
	return latest_checkpoint

In [49]:
latest_checkpoint = find_latest_checkpoint(peft_output_dir)
latest_checkpoint

'checkpoint-160'

In [50]:
model = PeftModelForSequenceClassification.from_pretrained(base_model, peft_output_dir + '/' + latest_checkpoint, is_trainable=False, dtype=torch.float16)

In [52]:
display_trainable_size(model)

# of total params: 1437470720 || # of trainable params: 7680 || trainable %: 0.00%
Total size = 2.68 GB || Trainable size = 0.00 GB


In [53]:
model_preds, labels = get_inference(model, tokenized_test_dataset)
model_metrics = evaluate_prediction(model_preds, labels)
print("PEFT Finetuned Model Test Metrics:")
print(f'Accuracy: {model_metrics["accuracy"]:.2f} || F1: {model_metrics["f1"]:.2f}')

Inference:   0%|          | 0/100 [00:00<?, ?it/s]

PEFT Finetuned Model Test Metrics:
Accuracy: 0.88 || F1: 0.88


In [54]:
model_test_case_preds, labels = get_inference(model, tokenized_test_case_examples)
evaluate_prediction(model_test_case_preds, labels)

Inference:   0%|          | 0/5 [00:00<?, ?it/s]

{'accuracy': 0.8, 'f1': 0.8857142857142858}

In [72]:
model_test_case_failures = test_case_examples.to_pandas()[model_test_case_preds != labels]
model_test_case_failures['predicted_label'] = model_test_case_preds.iloc[model_test_case_failures.index]
model_test_case_misses = model_test_case_failures[model_test_case_failures['premise'].isin(base_model_test_case_failures['premise'])]
model_test_case_misses

Unnamed: 0,premise,hypothesis,label,predicted_label
0,This church choir sings to the masses as they ...,The church has cracks in the ceiling.,1,2


In [73]:
model_test_case_misses.to_csv(DATA_DIR_PATH + "model_test_case_misses.csv", index=False)

In [70]:
model_test_case_success = test_case_examples.to_pandas()[model_test_case_preds == labels]
model_test_case_success['predicted_label'] = model_test_case_preds.iloc[model_test_case_success.index]
model_test_case_improvs = model_test_case_success[model_test_case_success['premise'].isin(base_model_test_case_failures['premise'])]
model_test_case_improvs

Unnamed: 0,premise,hypothesis,label,predicted_label
2,Two men climbing on a wooden scaffold.,Two sad men climbing on a wooden scaffold.,1,1
3,"A man in a black shirt, in a commercial kitche...","A man in a black shirt, in a commercial kitche...",1,1
4,a woman in a black shirt looking at a bicycle.,A woman dressed in black shops for a bicycle.,1,1


In [71]:
model_test_case_improvs.to_csv(DATA_DIR_PATH + "model_test_case_improvs.csv", index=False)