<a href="https://colab.research.google.com/github/ShrayonTarafdar/Kaggle-Notebooks/blob/main/Llama2_7B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Fine-tuning Llama 2 7B on a personal dataset**

In [1]:
#Step 1: Installations


!pip install peft # needed as it has LoRA in it- helps to fine-tune only some low rank matrices and not all
!pip install accelerate # to optimize hardware utilisation, billion parameter model so this is needed to be as efficient as possible
# it distributes things among various gpus
!pip install bitsandBytes # For 4 bit quantization
!pip install transformers # main library, we will get autotokenizers and other things for SFT training
!pip install datasets # load_dataset model, for dataset preprocessing etc

# The pip dependency resolver error does not matter

Collecting bitsandBytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandBytes
Successfully installed bitsandBytes-0.45.5
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Do

In [2]:
# Step 2: Check the GPUs available
# How many gpus I have, how can i distribute, for college clusters adn company clusters etc

!pip install GPUtil



In [3]:
import torch
import GPUtil
import os

# Pin GPU enumeration and visibility BEFORE the first CUDA query: these variables
# are read when CUDA is initialised, so setting them after
# torch.cuda.is_available() may have no effect.
# PCI_BUS_ID numbers the GPUs by their PCI bus location; only GPU 0 is exposed,
# so it is addressed as cuda:0.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Print the per-GPU load / memory utilisation table.
GPUtil.showUtilization()

# Bind `device` in both branches so it is always defined after this cell.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU is not availabel, will be using CPU")

| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
GPU is available


In [5]:
import torch
import transformers
# transformers gives us AutoTokenizer to tokenize the input and AutoModelForCausalLM to load the model; BitsAndBytesConfig configures the bitsandbytes quantization applied when the model is loaded
# LlamaTokenizer: the tokenizer class for Llama models; other model families have their own tokenizers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer
# to use hugging face models, I need to get authorized first, for which I need notebook_login
from huggingface_hub import notebook_login
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Cosmetic only: when running on Colab (detected through the COLAB_GPU
# environment variable) switch on the custom widget manager for nicer output.
if os.environ.get("COLAB_GPU") is not None:
    from google.colab import output
    output.enable_custom_widget_manager()

In [7]:
# Log in to the Hugging Face Hub so that later from_pretrained calls can
# download the model with the stored token.
# On Colab (COLAB_GPU is set in the environment) the CLI prompt is used;
# elsewhere the notebook_login() widget is shown.
if "COLAB_GPU" in os.environ:
  !huggingface-cli login
else:
  notebook_login()



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
The token `Llama2 7b` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Llama2 7b`


In [8]:
# Hub id of the checkpoint that will be fine-tuned.
base_model_id = "meta-llama/Llama-2-7b-chat-hf"

# Quantization settings handed to from_pretrained:
#   load_in_4bit              - load the weights in 4-bit precision
#   bnb_4bit_quant_type       - "nf4" 4-bit data type
#   bnb_4bit_use_double_quant - additionally enable double (nested) quantization
#   bnb_4bit_compute_dtype    - computations run in bfloat16
# The aim is a much smaller memory footprint without a large loss in accuracy.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Download (if needed) and load the base model with the quantization config applied.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [9]:
# Get the dataset
# u can get the dataset from your got also like this easily
!git clone https://github.com/poloclub/Fine-tuning-LLMs.git

Cloning into 'Fine-tuning-LLMs'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 47 (delta 14), reused 29 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (47/47), 9.34 MiB | 18.15 MiB/s, done.
Resolving deltas: 100% (14/14), done.


In [3]:
# Build the training split from two plain-text files of the cloned repository.
# The paths are relative to the working directory the repository was cloned into
# (on Colab that is /content), so the cell also works outside Colab.
train_dataset = load_dataset(
    "text",
    data_files={
        "train": [
            "Fine-tuning-LLMs/data/hawaii_wf_4.txt",
            "Fine-tuning-LLMs/data/hawaii_wf_2.txt",
        ]
    },
    split="train",
)

NameError: name 'load_dataset' is not defined

In [2]:
# Show the text of the second training example (index 1).
train_dataset[1]["text"]

NameError: name 'train_dataset' is not defined

In [None]:
# After the dataset:
#   1. get the tokenizer
#   2. set up the training arguments

# add_eos_token=True appends the end-of-sequence token to every example so the
# model can learn where a sample ends (well-formed, terminating outputs).
# use_fast=False and trust_remote_code=True are kept from the original call.
tokenizer = LlamaTokenizer.from_pretrained(base_model_id, use_fast=False, trust_remote_code=True, add_eos_token=True)

# Llama tokenizers ship without a pad token, so one must be chosen for batching.
# Do NOT reuse EOS for padding: DataCollatorForLanguageModeling(mlm=False) sets the
# label of every pad-token id to -100, which would also mask the EOS tokens added
# above and the model would never be trained to emit them. Use the <unk> token
# instead and only fall back to EOS if the tokenizer has no <unk>.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token

In [None]:
# Tokenize the "text" field of every training example, one example at a time;
# the result is a plain Python list with one tokenizer output per example.
tokenized_train_dataset = [tokenizer(example["text"]) for example in train_dataset]

In [None]:
# Inspect the tokenizer output for the example at index 1.
tokenized_train_dataset[1]

In [None]:
# Inspect the tokenizer output for the example at index 2.
tokenized_train_dataset[2]

In [None]:
# Show which string the tokenizer uses as its end-of-sequence token.
tokenizer.eos_token

In [None]:

# Turn on gradient checkpointing, then let PEFT prepare the quantized model
# for k-bit training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Layers that receive LoRA adapters: the attention projections followed by the
# MLP projections.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# LoRA hyper-parameters. r=8 is chosen as a balance; per the original author a
# smaller rank means lower expressiveness.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

# Wrap the base model with the LoRA adapters described by `config`.
model = get_peft_model(model, config)

In [None]:
# Training setup.
#   max_steps                   - the author recommends at least 1500 steps
#   gradient_accumulation_steps - gradients of 2 consecutive steps are accumulated
#                                 and the model is updated once afterwards
#                                 (around 2 is the usual choice)
#   num_train_epochs            - 3, i.e. see the entire dataset three times
#   optim                       - paged 8-bit AdamW optimizer
# NOTE(review): per the TrainingArguments documentation a positive max_steps
# overrides num_train_epochs, and save_steps is only used with
# save_strategy="steps" - confirm which combination is intended.
training_args = transformers.TrainingArguments(
    output_dir="./finetunedModel",
    logging_dir="./log",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    num_train_epochs=3,
    max_steps=1500,
    bf16=False,
    optim="paged_adamw_8bit",
    save_strategy="epoch",
    save_steps=50,
    logging_steps=10,
)

# The collator pads the sequences of a batch to a common length; mlm=False selects
# causal (next-token) language modeling instead of masked language modeling.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    data_collator=causal_lm_collator,
)

# Disable the generation cache on the model config before training starts.
model.config.use_cache = False
trainer.train()

# Checkpoints are written below output_dir ("./finetunedModel").
# NOTE(review): the original comment says everything ends up in wandb - confirm
# whether Weights & Biases reporting is actually configured.

In [None]:
# Inference is written out completely here so that it can be moved into a standalone test.py,
# i.e. it shows how to pass a prompt to the model and then collect the output.


import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, LlamaTokenizer
from peft import PeftModel

# Hub id of the base checkpoint that the fine-tuned adapter belongs to.
base_model_id = "meta-llama/Llama-2-7b-chat-hf"

# 4-bit NF4 quantization with double quantization and bfloat16 compute,
# the same values as the training-time quantization config in this notebook.
nf4Config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# For inference the tokenizer must NOT append EOS to the prompt: a prompt that
# already ends with the end-of-sequence token tells the model the text is finished.
tokenizer = LlamaTokenizer.from_pretrained(base_model_id, use_fast=False, trust_remote_code=True, add_eos_token=False)

# Load the quantized base model; device_map="auto" lets the library place it on the
# available device(s). `token=True` uses the stored Hugging Face login token and
# replaces the deprecated `use_auth_token=True`.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=nf4Config,
    device_map="auto",
    trust_remote_code=True,
    token=True,
)


In [None]:
# Tokenizer used for prompts at inference time. add_eos_token is disabled here:
# appending the end-of-sequence token to a prompt signals that the text is already
# complete, which degrades what generate() produces afterwards.
tokenizer = LlamaTokenizer.from_pretrained(
    base_model_id,
    use_fast=False,
    trust_remote_code=True,
    add_eos_token=False,
)

# Attach the LoRA adapter weights saved in the given training checkpoint to the base model.
modelFinetuned = PeftModel.from_pretrained(base_model, "finetunedModel/checkpoint-20")

In [None]:
# Question to ask and the prompt that wraps it.
user_question = "When did Hawaii wildfires start?"
eval_prompt = f"Question: {user_question} Just answer this question accurately and concisely.\n"

# Tokenize the prompt into PyTorch tensors and move them to the GPU.
promptTokenized = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Evaluation mode, and no gradient tracking while generating.
modelFinetuned.eval()
with torch.no_grad():
    generated_ids = modelFinetuned.generate(**promptTokenized, max_new_tokens=1024)
    # Decode the first (only) generated sequence, dropping special tokens.
    answer_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(answer_text)
    torch.cuda.empty_cache()