<a href="https://colab.research.google.com/github/Umesh94kr/Finetuning-LLMs/blob/main/Finetuning_from_Hugging_Face/Loading_open_source_LLM_in_Quantized_form.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import math

In [2]:
from google.colab import userdata
from huggingface_hub import login
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed
from peft import LoraConfig, PeftModel
from datasets import load_dataset, Dataset, DatasetDict
from datetime import datetime
import matplotlib.pyplot as plt

In [3]:
## lets take 4 different models
LLAMA_3_1 = "meta-llama/Meta-Llama-3.1-8B"
QWEN_2_5 = "Qwen/Qwen2.5-7B"
GEMMA_2 = "google/gemma-2-9b"
PHI_3 = "microsoft/Phi-3-medium-4k-instruct"

In [4]:
BASE_MODEL = LLAMA_3_1

HF_USER = "ed-donner"
DATASET_NAME = f"{HF_USER}/pricer-data"
MAX_SEQUENCE_LENGTH = 182
QUANT_4_BIT = True

In [5]:
# Log in to huggingface
hf_token = userdata.get('Hugging_face_API')
login(hf_token, add_to_git_credential=True)

In [6]:
# function to investigate tokenizer for different models

def check_tokenizer(model_name):
  print(f"Investigating tokenizer for model : {model_name}")
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
  for number in [0, 1, 10, 999, 1000]:
    # without special tokens
    without_special_tokens = tokenizer.encode(str(number), add_special_tokens=False)
    # with special tokens
    with_special_tokens = tokenizer.encode(str(number), add_special_tokens=True)
    print(f"The tokens for {number} >>> wihtout special tokens : {without_special_tokens} >>> with special tokens : {with_special_tokens}")

In [7]:
check_tokenizer(BASE_MODEL)

Investigating tokenizer for model : meta-llama/Meta-Llama-3.1-8B


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


The tokens for 0 >>> wihtout special tokens : [15] >>> with special tokens : [128000, 15]
The tokens for 1 >>> wihtout special tokens : [16] >>> with special tokens : [128000, 16]
The tokens for 10 >>> wihtout special tokens : [605] >>> with special tokens : [128000, 605]
The tokens for 999 >>> wihtout special tokens : [5500] >>> with special tokens : [128000, 5500]
The tokens for 1000 >>> wihtout special tokens : [1041, 15] >>> with special tokens : [128000, 1041, 15]


In [8]:
check_tokenizer(QWEN_2_5)

Investigating tokenizer for model : Qwen/Qwen2.5-7B
The tokens for 0 >>> wihtout special tokens : [15] >>> with special tokens : [15]
The tokens for 1 >>> wihtout special tokens : [16] >>> with special tokens : [16]
The tokens for 10 >>> wihtout special tokens : [16, 15] >>> with special tokens : [16, 15]
The tokens for 999 >>> wihtout special tokens : [24, 24, 24] >>> with special tokens : [24, 24, 24]
The tokens for 1000 >>> wihtout special tokens : [16, 15, 15, 15] >>> with special tokens : [16, 15, 15, 15]


In [9]:
check_tokenizer(GEMMA_2)

Investigating tokenizer for model : google/gemma-2-9b
The tokens for 0 >>> wihtout special tokens : [235276] >>> with special tokens : [2, 235276]
The tokens for 1 >>> wihtout special tokens : [235274] >>> with special tokens : [2, 235274]
The tokens for 10 >>> wihtout special tokens : [235274, 235276] >>> with special tokens : [2, 235274, 235276]
The tokens for 999 >>> wihtout special tokens : [235315, 235315, 235315] >>> with special tokens : [2, 235315, 235315, 235315]
The tokens for 1000 >>> wihtout special tokens : [235274, 235276, 235276, 235276] >>> with special tokens : [2, 235274, 235276, 235276, 235276]


In [10]:
check_tokenizer(PHI_3)

Investigating tokenizer for model : microsoft/Phi-3-medium-4k-instruct
The tokens for 0 >>> wihtout special tokens : [29871, 29900] >>> with special tokens : [29871, 29900]
The tokens for 1 >>> wihtout special tokens : [29871, 29896] >>> with special tokens : [29871, 29896]
The tokens for 10 >>> wihtout special tokens : [29871, 29896, 29900] >>> with special tokens : [29871, 29896, 29900]
The tokens for 999 >>> wihtout special tokens : [29871, 29929, 29929, 29929] >>> with special tokens : [29871, 29929, 29929, 29929]
The tokens for 1000 >>> wihtout special tokens : [29871, 29896, 29900, 29900, 29900] >>> with special tokens : [29871, 29896, 29900, 29900, 29900]


### **Load the Data**

In [11]:
dataset = load_dataset(DATASET_NAME)

In [12]:
print(dataset.shape)

{'train': (400000, 2), 'test': (2000, 2)}


In [13]:
from pprint import pprint
train_data, test_data = dataset['train'], dataset['test']

pprint(f"First sample data : \n\n{train_data[0]}")

('First sample data : \n'
 '\n'
 "{'text': 'How much does this cost to the nearest dollar?\\n\\nDelphi FG0166 "
 'Fuel Pump Module\\nDelphi brings 80 years of OE Heritage into each Delphi '
 'pump, ensuring quality and fitment for each Delphi part. Part is validated, '
 'tested and matched to the right vehicle application Delphi brings 80 years '
 'of OE Heritage into each Delphi assembly, ensuring quality and fitment for '
 'each Delphi part Always be sure to check and clean fuel tank to avoid '
 'unnecessary returns Rigorous OE-testing ensures the pump can withstand '
 'extreme temperatures Brand Delphi, Fit Type Vehicle Specific Fit, Dimensions '
 'LxWxH 19.7 x 7.7 x 5.1 inches, Weight 2.2 Pounds, Auto Part Position '
 'Unknown, Operation Mode Mechanical, Manufacturer Delphi, Model FUEL PUMP, '
 "Dimensions 19.7\\n\\nPrice is $227.00', 'price': 226.95}")


### **Prepare our Base LLama model for evaluation**

Loading base model in 4bit quantization

In [14]:
pip install -U bitsandbytes



In [15]:
## pick the right quantization
from transformers import BitsAndBytesConfig

if QUANT_4_BIT:
  quant_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
      bnb_4bit_quant_type='nf4'
  )
else:
  quant_config = BitsAndBytesConfig(
      load_in_8bit=True,
      bnb_8bit_compute_dtype=torch.bfloat16
  )

In [16]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto"
)

base_model.generation_config.pad_token_id = tokenizer.pad_token_id


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [17]:
print(f"Memory footprint : {base_model.get_memory_footprint() / 1e9:.1f} GB")

Memory footprint : 5.6 GB


In [18]:
def extract_price(s):
  if "Price is $" in s:
    contents = s.split("Price is $")[1]
    contents = contents.replace(',', '').replace('$', '')
    match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
    return float(match.group()) if match else 0
  return 0

In [19]:
extract_price("Price is $999 for this good.")

999.0

In [20]:
# model prediction / inference
def model_prediction(prompt):
  set_seed(42)
  inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
  print(inputs.shape)
  attention_mask = torch.ones(inputs.shape, device="cuda")
  outputs = base_model.generate(inputs, max_new_tokens=4, attention_mask=attention_mask, num_return_sequences=1)
  response = tokenizer.decode(outputs[0])
  return extract_price(response)

In [21]:
model_prediction(test_data[0]['text'])

torch.Size([1, 175])


1800.0