<a href="https://colab.research.google.com/github/Umesh94kr/Finetuning-LLMs/blob/main/Finetuning_from_Hugging_Face/Loading_open_source_LLM_in_Quantized_form.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import math

In [2]:
from google.colab import userdata
from huggingface_hub import login
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed
from peft import LoraConfig, PeftModel
from datasets import load_dataset, Dataset, DatasetDict
from datetime import datetime
import matplotlib.pyplot as plt

In [3]:
## lets take 4 different models
LLAMA_3_1 = "meta-llama/Meta-Llama-3.1-8B"
QWEN_2_5 = "Qwen/Qwen2.5-7B"
GEMMA_2 = "google/gemma-2-9b"
PHI_3 = "microsoft/Phi-3-medium-4k-instruct"

In [4]:
BASE_MODEL = LLAMA_3_1

HF_USER = "ed-donner"
DATASET_NAME = f"{HF_USER}/pricer-data"
MAX_SEQUENCE_LENGTH = 182
QUANT_4_BIT = True

In [5]:
# Log in to huggingface
hf_token = userdata.get('Hugging_face_API')
login(hf_token, add_to_git_credential=True)

In [6]:
# function to investigate tokenizer for different models

def check_tokenizer(model_name):
  print(f"Investigating tokenizer for model : {model_name}")
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
  for number in [0, 1, 10, 999, 1000]:
    # without special tokens
    without_special_tokens = tokenizer.encode(str(number), add_special_tokens=False)
    # with special tokens
    with_special_tokens = tokenizer.encode(str(number), add_special_tokens=True)
    print(f"The tokens for {number} >>> wihtout special tokens : {without_special_tokens} >>> with special tokens : {with_special_tokens}")

In [7]:
check_tokenizer(BASE_MODEL)

Investigating tokenizer for model : meta-llama/Meta-Llama-3.1-8B


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

The tokens for 0 >>> wihtout special tokens : [15] >>> with special tokens : [128000, 15]
The tokens for 1 >>> wihtout special tokens : [16] >>> with special tokens : [128000, 16]
The tokens for 10 >>> wihtout special tokens : [605] >>> with special tokens : [128000, 605]
The tokens for 999 >>> wihtout special tokens : [5500] >>> with special tokens : [128000, 5500]
The tokens for 1000 >>> wihtout special tokens : [1041, 15] >>> with special tokens : [128000, 1041, 15]


In [8]:
check_tokenizer(QWEN_2_5)

Investigating tokenizer for model : Qwen/Qwen2.5-7B


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

The tokens for 0 >>> wihtout special tokens : [15] >>> with special tokens : [15]
The tokens for 1 >>> wihtout special tokens : [16] >>> with special tokens : [16]
The tokens for 10 >>> wihtout special tokens : [16, 15] >>> with special tokens : [16, 15]
The tokens for 999 >>> wihtout special tokens : [24, 24, 24] >>> with special tokens : [24, 24, 24]
The tokens for 1000 >>> wihtout special tokens : [16, 15, 15, 15] >>> with special tokens : [16, 15, 15, 15]


In [9]:
check_tokenizer(GEMMA_2)

Investigating tokenizer for model : google/gemma-2-9b


tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

The tokens for 0 >>> wihtout special tokens : [235276] >>> with special tokens : [2, 235276]
The tokens for 1 >>> wihtout special tokens : [235274] >>> with special tokens : [2, 235274]
The tokens for 10 >>> wihtout special tokens : [235274, 235276] >>> with special tokens : [2, 235274, 235276]
The tokens for 999 >>> wihtout special tokens : [235315, 235315, 235315] >>> with special tokens : [2, 235315, 235315, 235315]
The tokens for 1000 >>> wihtout special tokens : [235274, 235276, 235276, 235276] >>> with special tokens : [2, 235274, 235276, 235276, 235276]


In [10]:
check_tokenizer(PHI_3)

Investigating tokenizer for model : microsoft/Phi-3-medium-4k-instruct
The tokens for 0 >>> wihtout special tokens : [29871, 29900] >>> with special tokens : [29871, 29900]
The tokens for 1 >>> wihtout special tokens : [29871, 29896] >>> with special tokens : [29871, 29896]
The tokens for 10 >>> wihtout special tokens : [29871, 29896, 29900] >>> with special tokens : [29871, 29896, 29900]
The tokens for 999 >>> wihtout special tokens : [29871, 29929, 29929, 29929] >>> with special tokens : [29871, 29929, 29929, 29929]
The tokens for 1000 >>> wihtout special tokens : [29871, 29896, 29900, 29900, 29900] >>> with special tokens : [29871, 29896, 29900, 29900, 29900]


### **Load the Data**

In [11]:
dataset = load_dataset(DATASET_NAME)

In [12]:
print(dataset.shape)

{'train': (400000, 2), 'test': (2000, 2)}


In [13]:
from pprint import pprint
train_data, test_data = dataset['train'], dataset['test']

pprint(f"First sample data : \n\n{train_data[0]}")

('First sample data : \n'
 '\n'
 "{'text': 'How much does this cost to the nearest dollar?\\n\\nDelphi FG0166 "
 'Fuel Pump Module\\nDelphi brings 80 years of OE Heritage into each Delphi '
 'pump, ensuring quality and fitment for each Delphi part. Part is validated, '
 'tested and matched to the right vehicle application Delphi brings 80 years '
 'of OE Heritage into each Delphi assembly, ensuring quality and fitment for '
 'each Delphi part Always be sure to check and clean fuel tank to avoid '
 'unnecessary returns Rigorous OE-testing ensures the pump can withstand '
 'extreme temperatures Brand Delphi, Fit Type Vehicle Specific Fit, Dimensions '
 'LxWxH 19.7 x 7.7 x 5.1 inches, Weight 2.2 Pounds, Auto Part Position '
 'Unknown, Operation Mode Mechanical, Manufacturer Delphi, Model FUEL PUMP, '
 "Dimensions 19.7\\n\\nPrice is $227.00', 'price': 226.95}")


### **Prepare our Base LLama model for evaluation**

Loading base model in 4bit quantization

In [5]:
pip install -U bitsandbytes



In [15]:
## pick the right quantization
from transformers import BitsAndBytesConfig

if QUANT_4_BIT:
  quant_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
      bnb_4bit_quant_type='nf4'
  )
else:
  quant_config = BitsAndBytesConfig(
      load_in_8bit=True,
      bnb_8bit_compute_dtype=torch.bfloat16
  )

In [16]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto"
)

base_model.generation_config.pad_token_id = tokenizer.pad_token_id


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [17]:
print(f"Memory footprint : {base_model.get_memory_footprint() / 1e9:.1f} GB")

Memory footprint : 5.6 GB


In [18]:
def extract_price(s):
  if "Price is $" in s:
    contents = s.split("Price is $")[1]
    contents = contents.replace(',', '').replace('$', '')
    match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
    return float(match.group()) if match else 0
  return 0

In [19]:
extract_price("Price is $999 for this good.")

999.0

In [20]:
# model prediction / inference
def model_prediction(prompt):
  set_seed(42)
  inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
  print(inputs.shape)
  attention_mask = torch.ones(inputs.shape, device="cuda")
  outputs = base_model.generate(inputs, max_new_tokens=4, attention_mask=attention_mask, num_return_sequences=1)
  response = tokenizer.decode(outputs[0])
  return extract_price(response)

In [21]:
model_prediction(test_data[0]['text'])

torch.Size([1, 175])


1800.0

---
---
---
---

## **Setting Up the Training requirements**

In [1]:
pip install trl

Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.25.1-py3-none-any.whl (465 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.25.1


In [2]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [3]:
import os
import re
import math

from google.colab import userdata
from huggingface_hub import login

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed

from peft import LoraConfig, PeftModel

from datasets import load_dataset, Dataset, DatasetDict
from datetime import datetime
import matplotlib.pyplot as plt

#### **Hyperparameters of LoRA and Q-LoRA**
- r (rank)
- alpha (scaling factor)
- dropout
- quantization (4 bit, 8 bit)
- Traget weights (attention, feed forward)

#### **Hyperparameters of Training process**
- Epochs
- Batch size
- Learning rate
- optimizer
- Gradient Accumulation

In [4]:
## Constants
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
PROJECT_NAME = "Finetuning-llama"
HF_USER = "UKR10"


## Data
DATASET_NAME = f"ed-donner/pricer-data"
MAX_SEQUENCE_LENGTH = 182

## Run name
RUN_NAME = f"{datetime.now():%Y-%m-%d_%H.%M.%S}"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"

## Hyperparameters for Q-LoRA
LORA_R = 8
LORA_ALPHA = 4
TARGET_MODULES = ['q_proj', 'v_proj', 'k_proj', 'o_proj']
LORA_DROPOUT = 0.1
QUANT_4_BIT = True

## Hyperparameters for Training
EPOCHS = 2
BATCH_SIZE = 16
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 1e-4
# Warmup (0 → 300 steps)
# LR increases: 0 → 2e-4
# Scheduler (300 → 3000 steps)
# LR gradually decreases from 2e-4 → near 0
LR_SCHEDULER = "cosine" # After warmup finishes, the learning rate should NOT stay constant.
WARMUP_RATIO = 0.03 # percentage of total training steps during which the learning rate slowly increases from 0 → base LR.
OPTIMIZER = "paged_adamw_32bit"

STEPS = 50
SAVE_STEPS = 5000
LOG_TO_WANDB = True

In [5]:
# Log in to huggingface
hf_token = userdata.get('Hugging_face_API')
login(hf_token, add_to_git_credential=True)

In [6]:
# Log in to weights and biases platform
import wandb
wandb_api_key = userdata.get('Weights_n_biases')
os.environ['WANDB_API_KEY'] = wandb_api_key

wandb.login()

# configure weights and biases
os.environ['WANDB_PROJECT'] = PROJECT_NAME
os.environ['WANDB_LOG_MODEL'] = "end" if LOG_TO_WANDB else "false"
os.environ['WANDB_WATCH'] = "gradients"

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Currently logged in as: [33mumeshkumar94628[0m ([33mumeshkumar94628-ai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
dataset = load_dataset(DATASET_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/416 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/914k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/400000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
dataset.shape

{'train': (400000, 2), 'test': (2000, 2)}

In [22]:
train_data = dataset['train'].select(range(5000))
test_data = dataset['test'].select(range(500))

from pprint import pprint
pprint(test_data[0])

{'price': 374.41,
 'text': 'How much does this cost to the nearest dollar?\n'
         '\n'
         'OEM AC Compressor w/A/C Repair Kit For Ford F150 F-150 V8 & Lincoln '
         'Mark LT 2007 2008 - BuyAutoParts NEW\n'
         "As one of the world's largest automotive parts suppliers, our parts "
         'are trusted every day by mechanics and vehicle owners worldwide. '
         'This A/C Compressor and Components Kit is manufactured and tested to '
         'the strictest OE standards for unparalleled performance. Built for '
         'trouble-free ownership and 100% visually inspected and quality '
         'tested, this A/C Compressor and Components Kit is backed by our 100% '
         'satisfaction guarantee. Guaranteed Exact Fit for easy installation '
         '100% BRAND NEW, premium ISO/TS 16949 quality - tested to meet or '
         'exceed OEM specifications Engineered for superior durability, backed '
         'by industry-leading unlimited-mileage warranty Included in

In [10]:
if LOG_TO_WANDB:
  wandb.init(project=PROJECT_NAME, name=RUN_NAME)

In [11]:
# USE 4 BIT quantization

if QUANT_4_BIT:
  quant_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
      bnb_4bit_quant_type="nf4"
  )
else:
  quant_config = BitsAndBytesConfig(
      load_in_8bit=True,
      bnb_8bit_compute_dtype=torch.bfloat16,
  )

In [12]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto"
)

base_model.generation_config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [17]:
print(f"Memory footprint : {base_model.get_memory_footprint() / 1e9:.1f} GB")

Memory footprint : 5.6 GB


#### **Data Collator**

We only want LLM to predict tokens after "Price is $",



In [18]:
import trl

from trl import SFTConfig, SFTTrainer

def formatting_func(example):
    return f"Price is ${example['price']}"

#### **Lets setup training configuration**

- One to be Lora config
- Other should be SFT config

In [24]:
lora_parameters = LoraConfig(
    lora_alpha = LORA_ALPHA,
    lora_dropout = LORA_DROPOUT,
    r = LORA_R,
    target_modules=TARGET_MODULES,
    bias="none",
    task_type="CAUSAL_LM"
)

train_parameters = SFTConfig(
    output_dir=PROJECT_RUN_NAME,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=1,
    eval_strategy='no',
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    optim=OPTIMIZER,
    save_steps=SAVE_STEPS,
    save_total_limit=10,
    logging_steps=STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=WARMUP_RATIO,
    group_by_length=True,
    lr_scheduler_type=LR_SCHEDULER,
    report_to="wandb" if LOG_TO_WANDB else None,
    run_name=RUN_NAME,
    max_length=MAX_SEQUENCE_LENGTH,
    dataset_text_field='text',
    save_strategy='steps',
    hub_strategy='every_save',
    push_to_hub=True,
    hub_model_id=HUB_MODEL_NAME,
    hub_private_repo=True
)

In [25]:
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_data,
    peft_config=lora_parameters,
    processing_class=tokenizer,
    args=train_parameters,
    formatting_func=formatting_func
)



Applying formatting function to train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [26]:
trainer.train()

# Push our finetuned model to huggingface
trainer.model.push_to_hub(PROJECT_RUN_NAME, private=True)
print(f"Save to the Hub : {PROJECT_RUN_NAME}")

  return fn(*args, **kwargs)


Step,Training Loss
50,3.5322
100,2.1962
150,1.2816
200,1.195
250,1.18
300,1.1791
350,1.1924
400,1.1882
450,1.2036
500,1.1898


README.md:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors: 100%|##########| 27.3MB / 27.3MB            

No files have been modified since last commit. Skipping to prevent empty commit.


Save to the Hub : Finetuning-llama-2025-11-29_19.40.11
