# Finetuning LLAMA using PEFT

An article on PEFT: https://www.leewayhertz.com/parameter-efficient-fine-tuning/

#Hyperparameters to be tuned:

1. Learning rate
2. Batch Size
3. Number of epochs
4. Weight decay

In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

import os, torch, logging
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m225.3/244.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m95.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [2]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
data = load_dataset("ShreeyaVenneti/avg_selfprom")['train'].train_test_split(train_size=0.90, test_size=0.10)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/788k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# Dataset
training_data = data['train']
validation_data = data['test']

In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['overallComments', 'interview'],
        num_rows: 299
    })
    test: Dataset({
        features: ['overallComments', 'interview'],
        num_rows: 34
    })
})

In [6]:
# Model and tokenizer names
base_model_name = "NousResearch/Llama-2-7b-chat-hf"

#Llama-2-Chat, is an optimized version of llama - 2 for dialogue use cases

refined_model = "llama-2-7b-ShreeyaVenneti-enhanced"

In [7]:
# Tokenizer

#initializes a tokenizer named llama_tokenizer. It uses the AutoTokenizer.from_pretrained() method from the Hugging Face Transformers library
#trust_remote_code=True is a parameter that indicates to trust remote code when loading the tokenizer
#the library may need to download resources or configurations from a remote server like model weights, tokenizer configurations, and related files.
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

#The pad_token is used to pad sequences to a uniform length during tokenization.
#By setting it to the eos_token, we are ensuring that the padding tokens will be added to the end of sequences.
#we are essentially specifying that when sequences need to be padded, the eos_token should be used as the padding token.
llama_tokenizer.pad_token = llama_tokenizer.eos_token

#specifies that padding tokens should be added to the right side of the input sequence.
llama_tokenizer.padding_side = "right"  # Fix for fp16

Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [8]:
# num_added_toks = llama_tokenizer.add_tokens(['<selfprom>', '<csr>', '<opcon>', '<opcom>', '<operfimp>', '<operfhir>', '<operfanx>', '<operfconf>'])
num_added_toks = llama_tokenizer.add_tokens(['<selfprom>'])

In [9]:
# Quantization Config

#Quantization is a technique used to reduce the memory and computational requirements of deep neural networks
#It represents weights and activations with fewer bits

#load_in_4bit is a parameter that, when set to True, indicates that the model's weights should be loaded in 4-bit quantized format.
#Quantizing weights to 4 bits can significantly reduce memory usage compared to full-precision weights.

#bnb_4bit_quant_type specifies the type of 4-bit quantization to be used.
#In this case, it's set to "nf4," which appears to be a specific quantization method.

#bnb_4bit_compute_dtype sets the data type used for intermediate computations during 4-bit quantization.
#It's set to torch.float16, which is a 16-bit floating-point data type

#bnb_4bit_use_double_quant is a boolean parameter.
#When set to False, it means that double quantization (quantizing weights twice) is not used in this configuration
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

In [10]:
# Model

#AutoModelForCausalLM is a class specifically designed for causal language modeling tasks
#The model generates text one token at a time, taking into account the context.

#device map specifies the device mapping for the model.
#it indicates that the model should be placed on a specific device (a GPU) with device index 0.
#The empty string "" corresponds to the default model device.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

In [11]:
#use_cache set to False, the model won't use cached hidden states from previous computations.
#Caching can be useful for optimizing text generation, but it consumes memory.
#By setting it to False, we are indicating that you don't want to use cache, which might save memory but potentially slow down generation tasks
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

In [12]:
base_model.resize_token_embeddings(len(llama_tokenizer))

Embedding(32002, 4096)

In [13]:
# LoRA Config

#LORA or Low rank adaptation is a method of parameter efficiient finetuning
#PEFT balances computational efficiency and task performance
#This makes it feasible to fine-tune even the largest LLMs without compromising on quality.

#LoRA is a technique designed to efficiently fine-tune pre-trained language models
#It injects trainable low-rank matrices into each layer of the Transformer architecture.
#By using low-rank matrices instead of fine-tuning all parameters,
#LoRA significantly reduces the number of trainable parameters, making it much more memory-efficient and computationally cheaper.

peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

In [14]:
# Training Params

#gradient_accumulation_steps determines how many batches are accumulated before performing a gradient update.
#In this case, gradients are updated after every batch (1 accumulation step).

#optim specifies the optimization algorithm to use during training.
#"paged_adamw_32bit: "AdamW" is a variant of the Adam optimizer, and "32bit" suggests the use of 32-bit precision.

#save_steps controls how often model checkpoints are saved during training. In this case, a checkpoint will be saved every 25 steps.

#logging_steps determines how often training logs and metrics are logged. In this case, logs will be generated and recorded every 25 steps.

#weight_decay is a regularization term added to the loss function during training to prevent overfitting. It's set to 0.001.

#fp16=False and bf16=False
#These parameters control whether to use mixed-precision training with 16-bit floating-point (fp16) or 16-bit bfloat16 (bf16) precision.
#In this case, both are set to False, indicating the use of full-precision training.

#max_grad_norm is used to clip gradients during training to prevent them from becoming too large.
#It sets a threshold (0.3 in this case) for gradient values.

#warmup_ratio specifies the ratio of warmup steps to the total number of training steps.
#Warmup is a technique that gradually increases the learning rate at the beginning of training to help stabilize training.
train_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
)

In [15]:
# Trainer

#hugginface's supervised fine tuning trainer
#Supervised fine-tuning, involves adapting a pre-trained Language Model (LLM) to a specific downstream task using labeled data.
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_parameters,
    dataset_text_field="overallComments",
    tokenizer=llama_tokenizer,
    args=train_params
)



Map:   0%|          | 0/299 [00:00<?, ? examples/s]

In [None]:
# Training
fine_tuning.train()

# Save Model
fine_tuning.model.save_pretrained(refined_model)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,3.1436
50,1.7314
75,1.4004
100,1.2131


In [None]:
from tqdm import tqdm

In [None]:
import re

In [None]:
generated_texts = []

for i in tqdm(range(len(validation_data))):
  text = validation_data['interview'][i]
  text = f"<s>[INST] {text} [/INST]"
  inputs = llama_tokenizer(text, return_tensors="pt").input_ids.to("cuda")
  outputs = base_model.generate(inputs, max_new_tokens=150, do_sample=False).to("cuda")
  generated_text = llama_tokenizer.decode(outputs[0], skip_special_tokens=True)

  generated_texts.append({"Transcript": validation_data['interview'][i], "Actual Text": validation_data['overallComments'][i], "Generated Text": generated_text})

In [None]:
len(generated_texts)

In [None]:
import csv

In [None]:
output_file = "LLAMA_avgselfprom_90-10_10_2e-4_0.001.csv"
with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["Row Number", "Actual Text", "Generated Text"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for i, generated_row in enumerate(generated_texts):
        writer.writerow({"Row Number": i + 1, "Actual Text": generated_row["Actual Text"], "Generated Text": generated_row["Generated Text"]})

print(f"Results saved in {output_file}")