In [1]:
!pip install -q --upgrade bitsandbytes accelerate
!pip install -U bitsandbytes



In [2]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gc

login(userdata.get('HF_TOKEN'),add_to_git_credential=True)


In [3]:
#instruct models and 1 reasoning model

#Llama requires being approved
LLAMA1 = 'meta-llama/Llama-3.1-8B-Instruct'

LLAMA2 = 'meta-llama/Llama-3.2-1B-Instruct'

PHI = "microsoft/Phi-4-mini-instruct"
QWEN = "Qwen/Qwen3-4B-Instruct-2507"

In [4]:
messages = [
    {"role": "user", "content":"Tell a joke about Lord of The Rings"}
]

Quantization is a technique used to reduce the memory footprint and computational cost of large language models (LLMs) by representing their weights and activations with lower precision data types, such as 8-bit integers (INT8) or 4-bit integers (INT4), instead of the standard 32-bit floating-point numbers (FP32).

If you have a continuous range of numbers (like real numbers between 0 and 100), and you decide to only represent them with integers (0, 1, 2, ... 100), you've 'quantized' the data. You've limited the possible values to a finite, discrete set.

In [5]:
#Quantization Config

quant_conf = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [6]:
#Tokenizer

tokenizer = AutoTokenizer.from_pretrained(LLAMA2)
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")

In [7]:
inputs

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1544,   4723,    220,   2366,     20,    271, 128009, 128006,
            882, 128007,    271,  41551,    264,  22380,    922,  10425,    315,
            578,  47380, 128009]], device='cuda:0')

In [8]:
#The model
model = AutoModelForCausalLM.from_pretrained(
    LLAMA2,
    quantization_config=quant_conf,
    device_map="auto")

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [9]:
memory = model.get_memory_footprint() / 1e6
print(f"Memory footprint: {memory:.2f} MB")

Memory footprint: 1012.01 MB


In [10]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm

In [11]:
#running the model

outputs = model.generate(inputs, max_new_tokens=80)
outputs[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


tensor([128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
            25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
           220,   1544,   4723,    220,   2366,     20,    271, 128009, 128006,
           882, 128007,    271,  41551,    264,  22380,    922,  10425,    315,
           578,  47380, 128009, 128006,  78191, 128007,    271,  10445,   1550,
         39421,   3181,    596,   5687,    733,    311,  15419,   1980,  18433,
           433,   1047,    264,   2763,    315,    330,  11029,      1,    323,
           330,   1402,      1,    311,    990,   1555,     13, 128009],
       device='cuda:0')

In [12]:
tokenizer.decode(outputs[0])

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 27 Nov 2025\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTell a joke about Lord of The Rings<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhy did Gandalf\'s staff go to therapy?\n\nBecause it had a lot of "fire" and "ember" to work through.<|eot_id|>'

In [13]:
#Clean up memory

del model, inputs, tokenizer, outputs
gc.collect()
torch.cuda.empty_cache()

In [None]:
#Wrapping all above into a function and using streaming and generating prompts

def generate(mode, messages, quant=True, max_new_tokens=80):
  tokenizer = AutoTokenizer.from_pretrained(model)
  tokenizer.pad_token = tokenizer.eos_token #to properly add spaces
  inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
