In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
!pip install transformers accelerate optimum auto-gptq --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m399.9/399.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

# Quantization

There is actually no need to quantize a pretrained model yourself: all sorts of pre-quantized models already exist on Hugging Face, so we could simply browse the Hub and select one. Nonetheless, I will give a short description of how to quantize a model.

https://huggingface.co/docs/transformers/main_classes/quantization

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch

In [4]:
# Hugging Face Hub id of the checkpoint that the loading and quantization cells use.
model_id = "aisquared/dlite-v2-355m"

In [None]:
# A tokenizer holds no tensors, so it takes no device placement argument.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the checkpoint and let accelerate decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

# Sampling temperature is a generation setting rather than a model-loading option,
# so it is set on the generation config that `generate()` reads its defaults from.
model.generation_config.temperature = 0.5

tokenizer_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/735M [00:00<?, ?B/s]

In [None]:
# Calculate the size of the model parameters in bytes
model_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())

# Convert the size to gigabytes
model_size_gb = model_size_bytes / (1024**3)

print(f"Model size: {model_size_gb:.2f} GB")

Model size: 1.32 GB


In [None]:
# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Calculate the GPU memory usage
torch.cuda.empty_cache()
allocated_memory = torch.cuda.memory_allocated(device)

# Print the size in GB
size_gb = allocated_memory / (1024 ** 3)
print(f"Model size on GPU: {size_gb:.5f} GB")

Model size on GPU: 1.34528 GB


As you can see, the model's size is 1.32 GB for 355 million parameters at float32 precision. Let's verify this mathematically as well.

In [None]:
# Count the scalar weights across all parameter tensors, then display the total.
total_model_parameters = sum(map(lambda tensor: tensor.numel(), model.parameters()))
total_model_parameters

354826240

In [None]:
# Get the data type of the first parameter
first_param_dtype = next(model.parameters()).dtype

print(f"Data Type for Parameters: {first_param_dtype}")

Data Type for Parameters: torch.float32


We know that there are 354826240 parameters with float32 precision, so we can compute the model size as

    (number_of_parameters * 4) / (1024 ** 3)

We multiply by 4 because each 32-bit floating-point parameter takes 4 bytes. The product is in bytes, so we divide by 1024**3 to get the final value in GB.

In [None]:
# float32 stores each parameter in 4 bytes; dividing by 1024 ** 3 converts bytes to GB.
float32_bytes = total_model_parameters * 4
float32_bytes / 1024 ** 3

1.3218307495117188

<b>Hence, the size of our model is 1.32 GB, with approximately 355 million parameters in float32 precision.</b> Let's also ask the model a question so that we can compare the results before and after quantization.

In order to make this model answer our question, we need to set up a pipeline. This can be done by downloading the instruct_pipeline.py file from Hugging Face. After downloading the file, we instantiate the InstructionTextGenerationPipeline class with our model and tokenizer, which sets up the question-answering pipeline.

In [5]:
from huggingface_hub import hf_hub_download

# Fetch the pipeline helper script from the Hub; the call evaluates to the local
# path of the cached file, which is shown as the cell output.
hf_hub_download(
    filename="instruct_pipeline.py",
    repo_id="aisquared/dlite-v2-1_5b",
)

instruct_pipeline.py:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

'/root/.cache/huggingface/hub/models--aisquared--dlite-v2-1_5b/snapshots/97440ff1b6ef749423758e3495cdce1b5e68ee92/instruct_pipeline.py'

In [6]:
cd /root/.cache/huggingface/hub/models--aisquared--dlite-v2-1_5b/snapshots/97440ff1b6ef749423758e3495cdce1b5e68ee92

/root/.cache/huggingface/hub/models--aisquared--dlite-v2-1_5b/snapshots/97440ff1b6ef749423758e3495cdce1b5e68ee92


In [7]:
import os

# List the working directory (the snapshot folder entered by the preceding cell)
# rather than repeating its absolute path, so the cell survives a snapshot-hash change.
os.listdir(os.getcwd())

['instruct_pipeline.py']

In [8]:
from instruct_pipeline import InstructionTextGenerationPipeline

In [None]:
# Wrap the model and its tokenizer in the instruction-following generation pipeline.
generate_text = InstructionTextGenerationPipeline(
    tokenizer=tokenizer,
    model=model,
)

In [None]:
# Prompt sent to the model through the pipeline.
question = "Who is Donald Trump?"

In [None]:
# Run the prompt through the pipeline; the generated text is displayed as the cell output.
generate_text(question)

'Donald Trump is a businessman, real estate mogul, and reality TV star. He has been married to Marla Maples since 1987 and has three children. He has owned and operated several properties in New York City, including Trump Tower, Trump Plaza Hotel & Tower, and Trump SoHo. He has also run for president on three occasions, winning the Republican nomination in 2016, defeating the Democratic nominee, Hillary Clinton, in the general election. He was born on July 4, 1946.'

We will perform 4-bit quantization of the dlite-v2-355m model, which has roughly 355 million parameters.

In [None]:
# Display the checkpoint id to confirm which model is about to be quantized.
model_id

'aisquared/dlite-v2-355m'

In [None]:
# Reload the tokenizer for the checkpoint; it is handed to the GPTQ configuration below.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit GPTQ settings, using the "c4" dataset for calibration.
quantization_config = GPTQConfig(
    bits=4,
    dataset="c4",
    tokenizer=tokenizer,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Passing the GPTQ configuration makes from_pretrained quantize the weights while loading.
model1 = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="cuda",
)



Downloading readme:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1694 > 1024). Running this sequence through the model will result in indexing errors


Quantizing transformer.h blocks :   0%|          | 0/24 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/4 [00:00<?, ?it/s]

<b>As we saw, the original 355-million-parameter model occupies about 1.34 GB on the GPU. Let's first push the quantized model to Hugging Face and see the results.</b>

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget inside the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the quantized weights and the tokenizer to the same Hub repository.
hub_repo_name = "dlite-v2-355m-bi4tQuantization"
model1.push_to_hub(hub_repo_name)
tokenizer.push_to_hub(hub_repo_name)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Sujan42024/dlite-v2-355m-bi4tQuantization/commit/053d748bdb1d1e4413f1b6429d4c44963a9241e5', commit_message='Upload tokenizer', commit_description='', oid='053d748bdb1d1e4413f1b6429d4c44963a9241e5', pr_url=None, pr_revision=None, pr_num=None)

# Re-retrieving the quantized model from huggingface

In [9]:
# Hub repository that holds the 4-bit checkpoint pushed earlier in this notebook.
quantized_model_id = "Sujan42024/dlite-v2-355m-bi4tQuantization"

tokenizer = AutoTokenizer.from_pretrained(quantized_model_id)
model_quantized = AutoModelForCausalLM.from_pretrained(quantized_model_id, device_map="cuda")

# Sampling temperature is a generation setting rather than a model-loading option,
# so it is set on the generation config that `generate()` reads its defaults from.
model_quantized.generation_config.temperature = 0.5

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [10]:
# Build the same instruction pipeline, this time around the quantized model.
quantized_generate_text = InstructionTextGenerationPipeline(
    tokenizer=tokenizer,
    model=model_quantized,
)

In [12]:
# Query the quantized model; the prompt additionally asks for a one-sentence answer.
quantized_generate_text("Who is Donald Trump? Give me one sentence answer with less than 30 words.")

'Donald Trump is a successful businessman and reality TV star. He has been married to his wife for over 40 years. He is a billionaire and has been in the public eye for decades. He is a Republican presidential candidate for president. He has made controversial comments about women, immigrants, and other groups. He has called for a ban on Muslims entering the United States. He has called for a temporary ban on all Muslims entering the United States. He has called for a temporary ban on Muslims entering the United States. He has called for a temporary ban on Muslims entering the United States. He has called for a temporary ban on Muslims entering the United States. He has called for a temporary ban on Muslims entering the United States. He has called for a temporary ban on Muslims entering the United States. He has called for a temporary ban on Muslims entering the United States. He has called for a temporary ban on Muslims entering the United States. He has called for a temporary ban on

<b>As you can see, we can use a quantized model to make predictions.</b>