Quantization

In [None]:
# Install/upgrade the libraries used below. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel.
%pip install -U transformers bitsandbytes accelerate

Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014

In [None]:
# NOTE(review): bitsandbytes is already upgraded by the first install cell, so
# this cell looks redundant — confirm before removing.
%pip install -U bitsandbytes



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM , BitsAndBytesConfig
import torch

In [None]:
# Hub checkpoint shared by every variant loaded in this cell
model_id = "AdamLucek/Orpo-Llama-3.2-1B-15k"

# Reference model: loaded without any quantization config, then moved to the GPU
base_model = AutoModelForCausalLM.from_pretrained(model_id)
base_model = base_model.to("cuda")

# 8-bit variant (bitsandbytes)
bnb_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config_8bit,
    low_cpu_mem_usage=True,
)

# 4-bit variant: NF4 quantization type, double quantization, float16 compute dtype
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config_4bit,
    low_cpu_mem_usage=True,
)

# One tokenizer serves all three models (same checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Report the memory footprint of each variant
print(f"Base model size: {base_model.get_memory_footprint():,} bytes\n")
print(f"INT8 model size: {model_8bit.get_memory_footprint():,} bytes\n")
print(f"4Bit model size: {model_4bit.get_memory_footprint():,} bytes")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/909 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/419 [00:00<?, ?B/s]

Base model size: 4,943,274,112 bytes

INT8 model size: 1,498,558,592 bytes

4Bit model size: 1,012,019,328 bytes


**INTERPRETATION**

| Version      | Size (bytes)  | Size (approx) | Notes                                                  |
| ------------ | ------------- | ------------- | ------------------------------------------------------ |
| Base (FP32)  | 4,943,274,112 | \~4.94 GB     | Full-precision model loaded into GPU                   |
| 8-bit (INT8) | 1,498,558,592 | \~1.50 GB     | \~70% memory reduction with little loss in quality     |
| 4-bit (NF4)  | 1,012,019,328 | \~1.01 GB     | \~80% memory reduction, best for constrained hardware  |


**Comparing the first layer of each version of the model**

In [None]:
# Display the module tree of the unquantized model (layer types and dimensions)
base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128258, 2048, padding_idx=128257)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-0

In [None]:
# q_proj weight matrix of decoder layer 0 in the unquantized model
base_weights = base_model.model.layers[0].self_attn.q_proj.weight.data

print("original weights :", base_weights, sep="\n")
print("Shape: ", base_weights.shape, "\n")
print("-" * 50, "\n")

original weights :
tensor([[-0.0183,  0.0071,  0.0218,  ..., -0.0070, -0.0089,  0.0149],
        [ 0.0112,  0.0593,  0.0630,  ..., -0.0334, -0.0147,  0.0058],
        [ 0.0182,  0.0141,  0.0362,  ..., -0.0432, -0.0388, -0.0233],
        ...,
        [ 0.0305,  0.0289,  0.0801,  ..., -0.0768, -0.0311, -0.0333],
        [ 0.0242, -0.0325,  0.0369,  ..., -0.0124, -0.0268, -0.0150],
        [-0.0264, -0.0498, -0.0210,  ...,  0.0602,  0.0130, -0.0008]],
       device='cuda:0')
Shape:  torch.Size([2048, 2048]) 

-------------------------------------------------- 



In [None]:
# Grab the same tensor (decoder layer 0, q_proj weight) from both quantized models
weights_8bit = model_8bit.model.layers[0].self_attn.q_proj.weight.data
weights_4bit = model_4bit.model.layers[0].self_attn.q_proj.weight.data

# 8-bit model
print("INT8 weights:", weights_8bit, sep="\n")
print("Shape: ", weights_8bit.shape, "\n")

print("-" * 50, "\n")

# 4-bit model. NOTE(review): the stored tensor is a flat uint8 column whose length
# is 2048*2048/2, which is consistent with two 4-bit values packed per byte — confirm
# against the bitsandbytes documentation.
print("4Bit weights:", weights_4bit, sep="\n")
print("Shape: ", weights_4bit.shape, "\n")

INT8 weights:
tensor([[-49,  19,  59,  ..., -19, -24,  40],
        [ 13,  69,  73,  ..., -39, -17,   7],
        [ 17,  13,  34,  ..., -40, -36, -22],
        ...,
        [ 16,  15,  43,  ..., -41, -17, -18],
        [ 36, -48,  54,  ..., -18, -39, -22],
        [-14, -27, -11,  ...,  32,   7,   0]], device='cuda:0',
       dtype=torch.int8)
Shape:  torch.Size([2048, 2048]) 

-------------------------------------------------- 

4Bit weights:
tensor([[ 41],
        [213],
        [ 65],
        ...,
        [ 92],
        [ 75],
        [135]], device='cuda:0', dtype=torch.uint8)
Shape:  torch.Size([2097152, 1]) 



**Testing generation**


Running the same prompt through each model to compare their outputs.

In [None]:
# Function for generating a chat response with any of the loaded models
def generate_response(model, message, tokenizer, max_new_tokens=256,
                      temperature=0.7, top_k=50, top_p=0.95):
  """Sample a reply to a single user message and return the decoded text.

  Args:
    model: Object exposing ``generate(**inputs, ...)``. Its ``device``
      attribute, when present, decides where the inputs are placed.
    message: Content of the single user turn.
    tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``
      and ``decode``.
    max_new_tokens: Upper bound on the number of generated tokens.
    temperature: Sampling temperature passed to ``generate``.
    top_k: Top-k cutoff passed to ``generate``.
    top_p: Nucleus-sampling cutoff passed to ``generate``.

  Returns:
    The decoded first output sequence with special tokens skipped. The
    decoded string still contains the prompt part (role names and the user
    message) ahead of the generated reply.
  """
  # Wrap the message as a one-turn conversation and render the chat template
  messages = [{"role": "user", "content": message}]
  prompt = tokenizer.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )

  # Put the inputs on the same device as the model instead of assuming "cuda";
  # fall back to "cuda" when the model does not expose a device.
  device = getattr(model, "device", "cuda")
  inputs = tokenizer(prompt, return_tensors="pt").to(device)

  # Sampling is enabled, so the output varies between calls unless a seed is set
  outputs = model.generate(
      **inputs,
      max_new_tokens=max_new_tokens,
      temperature=temperature,
      do_sample=True,
      top_k=top_k,
      top_p=top_p,
  )

  # Decode
  return tokenizer.decode(outputs[0], skip_special_tokens=True)

prompt = "what is large language model"

# generate_response samples (do_sample=True), so without a seed every run gives
# different text. Re-seeding before each call makes the cell reproducible and
# starts every model from the same RNG state.
SEED = 42

torch.manual_seed(SEED)
base_response = generate_response(base_model , prompt , tokenizer)
print("Base Model Response:\n")
print(base_response)
print("-" * 50)


torch.manual_seed(SEED)
int8_response = generate_response(model_8bit , prompt , tokenizer)
print("INT8 Model Response:\n")
print(int8_response)
print("-" * 50)


torch.manual_seed(SEED)
bit4_response = generate_response(model_4bit , prompt , tokenizer)
print("4Bit Model Response:\n")
print(bit4_response)

Base Model Response:

user
what is large language model
assistant
To answer your question, a large language model (LLM) is a type of artificial intelligence (AI) system that uses machine learning algorithms to understand and generate human-like text. These models are trained on large amounts of data, usually through a process called "language modeling," which involves predicting the next word in a sequence of words based on previous words. LLMs can be used for a variety of tasks, such as translation, summarization, question-answering, and more.
--------------------------------------------------
INT8 Model Response:

user
what is large language model
assistant
A large language model (LLM) is a type of artificial intelligence (AI) system that is able to understand and generate human language. It is capable of learning from data, processing it, and then using that information to generate new language. LLMs are typically trained on large amounts of data, which is then used to create a mode

## Calculating Perplexity

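Perplexity measures how well a model predicts the next token; it is the exponential of the average per-token negative log-likelihood:  
- Lower perplexity = the model assigns higher probability to the text (it is less "surprised")
- Higher perplexity = the model is more uncertain/confused

Note: below, each model is scored on its own sampled response, so the three values are computed on different texts and are not a like-for-like comparison.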

In [None]:
def calculate_perplexity(model, text):
    """Return the perplexity ``model`` assigns to ``text``.

    The text is tokenized with the notebook-level ``tokenizer``, fed to the
    model with the input ids doubling as labels, and the returned loss is
    exponentiated.

    Args:
        model: Callable as ``model(input_ids, labels=...)`` returning an
            object with a ``loss`` attribute. Its ``device`` attribute, when
            present, decides where the inputs are placed.
        text: String to score.

    Returns:
        A tensor holding ``exp(loss)``.

    Raises:
        ValueError: If ``text`` tokenizes to fewer than two tokens, because
            there is then no next-token target to score.
    """
    # Place the inputs on the model's device instead of assuming "cuda";
    # fall back to "cuda" when the model does not expose a device.
    device = getattr(model, "device", "cuda")
    encodings = tokenizer(text, return_tensors='pt').to(device)

    # The labels are a copy of the inputs
    input_ids = encodings.input_ids
    if input_ids.shape[-1] < 2:
        raise ValueError(
            "calculate_perplexity needs at least two tokens to score; "
            f"got {input_ids.shape[-1]}"
        )
    target_ids = input_ids.clone()

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

    # Loss reported by the model for these labels
    neg_log_likelihood = outputs.loss

    # Perplexity is the exponential of that loss
    ppl = torch.exp(neg_log_likelihood)

    return ppl

In [None]:
def _response_only(decoded, marker="\nassistant\n"):
    """Return the text after the assistant marker of a decoded transcript.

    Replaces a hard-coded character offset, which only fits one specific
    prompt length. If the marker is absent the whole string is returned.
    """
    _, found, reply = decoded.partition(marker)
    return reply.lstrip() if found else decoded


# NOTE(review): each model is scored on its own sampled response, so the three
# numbers are computed on different texts.
base_perplexity = calculate_perplexity(base_model, _response_only(base_response))
print(f"Base Model Perplexity:  {base_perplexity.item():.2f}", "\n")
int8_perplexity = calculate_perplexity(model_8bit, _response_only(int8_response))
print(f"INT8 Model Perplexity:  {int8_perplexity.item():.2f}", "\n")
bit4_perplexity = calculate_perplexity(model_4bit, _response_only(bit4_response))
print(f"4Bit Model Perplexity:  {bit4_perplexity.item():.2f}")

Base Model Perplexity:  3.09 

INT8 Model Perplexity:  3.04 

4Bit Model Perplexity:  2.94


**Quantizing and Converting to GGUF Format**

In [None]:
# Start from a clean checkout: remove any previous clone, fetch llama.cpp,
# and make the repository the working directory for the following cells.
%cd /content
!rm -rf llama.cpp
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp


/content
Cloning into 'llama.cpp'...
remote: Enumerating objects: 56279, done.[K
remote: Counting objects: 100% (433/433), done.[K
remote: Compressing objects: 100% (225/225), done.[K
remote: Total 56279 (delta 358), reused 208 (delta 208), pack-reused 55846 (from 2)[K
Receiving objects: 100% (56279/56279), 133.25 MiB | 21.56 MiB/s, done.
Resolving deltas: 100% (41090/41090), done.
/content/llama.cpp


In [None]:
# Create the out-of-source build directory (-p: no error if it already exists,
# so the cell can be re-run) and switch into it.
!mkdir -p build
%cd build


/content/llama.cpp/build


In [None]:
# Configure the CMake build from inside build/.
# NOTE(review): LLAMA_BUILD_QUANTIZE does not appear to be an option that
# llama.cpp defines — confirm against its CMakeLists.txt / build docs.
!cmake .. -DLLAMA_BUILD_QUANTIZE=ON


-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found Git: /usr/bin/git (found version "2.34.1")
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found version "4.5")
-- x86 detected
-- Adding CPU backend variant ggml-cpu: -march=native 
-- ggml version: 0.0.58

In [None]:

# Compile llama.cpp in the current build directory
!cmake --build . --config Release


[  0%] [32mBuilding C object ggml/src/CMakeFiles/ggml-base.dir/ggml.c.o[0m
[  0%] [32mBuilding CXX object ggml/src/CMakeFiles/ggml-base.dir/ggml.cpp.o[0m
[  1%] [32mBuilding C object ggml/src/CMakeFiles/ggml-base.dir/ggml-alloc.c.o[0m
[  1%] [32mBuilding CXX object ggml/src/CMakeFiles/ggml-base.dir/ggml-backend.cpp.o[0m
[  2%] [32mBuilding CXX object ggml/src/CMakeFiles/ggml-base.dir/ggml-opt.cpp.o[0m
[  2%] [32mBuilding CXX object ggml/src/CMakeFiles/ggml-base.dir/ggml-threading.cpp.o[0m
[  2%] [32mBuilding C object ggml/src/CMakeFiles/ggml-base.dir/ggml-quants.c.o[0m
[  3%] [32mBuilding CXX object ggml/src/CMakeFiles/ggml-base.dir/gguf.cpp.o[0m
[  3%] [32m[1mLinking CXX shared library ../../bin/libggml-base.so[0m
[  3%] Built target ggml-base
[  4%] [32mBuilding C object ggml/src/CMakeFiles/ggml-cpu.dir/ggml-cpu/ggml-cpu.c.o[0m
[  4%] [32mBuilding CXX object ggml/src/CMakeFiles/ggml-cpu.dir/ggml-cpu/ggml-cpu.cpp.o[0m
[  4%] [32mBuilding CXX object ggml/src/CMa

In [None]:
# Locate the quantization tool among the built binaries. Matching on
# '*quantize*' finds it whether or not the name carries a prefix; searching for
# the exact name `quantize` returned nothing.
!find /content/llama.cpp/build/bin -type f -name '*quantize*'


In [None]:
# Quantize the F16 GGUF export to Q4_K_M.
# Current llama.cpp builds name this tool `llama-quantize`; the unprefixed
# `build/bin/quantize` path does not exist.
# NOTE(review): no cell visible here creates the F16 .gguf input — presumably it
# comes from llama.cpp's convert_hf_to_gguf.py; confirm the file exists first.
!/content/llama.cpp/build/bin/llama-quantize \
/content/Orpo-Llama-3.2-1B-15k/Llama-3.2-1B-15k-F16.gguf \
/content/Orpo-Llama-3.2-1B-15k/Llama-3.2-1B-15k-Q4_K_M.gguf \
Q4_K_M


/bin/bash: line 1: /content/llama.cpp/build/bin/quantize: No such file or directory
