<a href="https://colab.research.google.com/github/Soroushav/llm_basics/blob/main/modles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Quietly install/upgrade bitsandbytes and accelerate in the notebook environment.
# NOTE(review): versions are not pinned, so a fresh run may pick up different releases — consider pinning.
!pip install -q --upgrade bitsandbytes accelerate

In [None]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, TextStreamer
import torch
import gc

In [None]:
# Use git's "store" credential helper globally; the login cell passes
# add_to_git_credential=True, which presumably relies on a helper being configured.
!git config --global credential.helper store

In [None]:
# Interactive Hugging Face login helper from huggingface_hub.
# NOTE(review): the following cell also authenticates via login(hf_token, ...),
# so only one of the two is probably needed — confirm. The import also lives
# outside the notebook's import cell.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Read the Hugging Face token from Colab's secret store (never hard-code it) ...
hf_token = userdata.get('HF_TOKEN')

# ... and authenticate this session, also saving the token as a git credential.
login(
    token=hf_token,
    add_to_git_credential=True,
)

In [None]:
# Hugging Face Hub repository ids of the chat/instruct checkpoints used in this
# notebook; they are passed to `from_pretrained` for both tokenizer and model.
LLAMA = "meta-llama/Llama-3.1-8B-Instruct"
PHI = "microsoft/Phi-4-mini-instruct"
GEMMA = "google/gemma-3-270m-it"
QWEN = "Qwen/Qwen3-4B-Instruct-2507"
DEEPSEEK = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

In [None]:
# Conversation sent to every model, in the list-of-dicts chat format consumed
# by `tokenizer.apply_chat_template`.
# The request is a *user* turn: with only a "system" message, chat templates
# that fold the system text into the first user turn have nothing to render,
# and the model never sees an actual question to answer.
messages = [
    {"role": "user", "content": "Tell a funny joke for a group of data scientists"}
]

In [None]:
# bitsandbytes settings shared by every quantized model load in this notebook:
# 4-bit weights using the "nf4" quant type with double quantization enabled,
# and bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(LLAMA)

# Show the pad token the checkpoint ships with before it is overridden below.
print(tokenizer.pad_token)
tokenizer.pad_token = tokenizer.eos_token

# Render the conversation with the model's chat template and tokenize it.
# add_generation_prompt=True appends the assistant-turn header so the model
# starts a reply instead of continuing the last message (this matches what
# the `generate` helper further down does).
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

In [None]:
# Load the Llama checkpoint with the shared quantization settings and let
# device_map="auto" decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    device_map="auto",
    quantization_config=quant_config,
)

In [None]:
# Report the size of the loaded model. The footprint is divided by 1e6 and
# labelled MB (assumes get_memory_footprint() returns bytes — TODO confirm).
memory = model.get_memory_footprint() / 1e6
print(f"Memory footprint: {memory} MB")

In [None]:
# Generate up to 50 new tokens for the templated prompt.
# pad_token was set equal to eos_token earlier, so generate() cannot infer an
# attention mask from the ids; pass an explicit all-ones mask (single,
# unpadded sequence) and the pad id to avoid the warning / undefined behaviour.
outputs = model.generate(
    inputs,
    attention_mask=torch.ones_like(inputs),
    pad_token_id=tokenizer.pad_token_id,
    max_new_tokens=50,
)

# Decode the full sequence (prompt + completion) of the only batch item.
print(tokenizer.decode(outputs[0]))

In [None]:
# Drop every reference to the Llama run so its memory can be reclaimed
# before the next model is loaded.
del model
del inputs
del outputs
del tokenizer

# Collect unreachable Python objects first, then release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
def generate(model, message, quant=True, max_new_tokens=50):
  """Load a chat model, print its reply to `message`, then free the model.

  Args:
    model: Hugging Face Hub id (or path) handed to `from_pretrained` for both
      the tokenizer and the causal LM.
    message: conversation in the list-of-dicts chat format accepted by
      `tokenizer.apply_chat_template`.
    quant: if True, load the model with the notebook-level `quant_config`;
      otherwise load it without a quantization config.
    max_new_tokens: upper bound on the number of newly generated tokens.

  Returns:
    None. The decoded sequence (prompt followed by the completion) is printed.
  """
  tokenizer = AutoTokenizer.from_pretrained(model)
  # Only fall back to EOS when the tokenizer ships without a pad token.
  if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

  load_kwargs = {"device_map": "auto"}
  if quant:
    load_kwargs["quantization_config"] = quant_config

  # Pre-bind so the `finally` block can always delete these names.
  llm = inputs = outputs = None
  try:
    llm = AutoModelForCausalLM.from_pretrained(model, **load_kwargs)
    # return_dict=True yields input_ids *and* the tokenizer's own
    # attention_mask; move them to wherever the model actually lives instead
    # of assuming "cuda".
    inputs = tokenizer.apply_chat_template(
        message,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(llm.device)
    outputs = llm.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
    )
    print(tokenizer.decode(outputs[0]))
  finally:
    # Release the model even when loading/generation fails, so a failed call
    # does not leave GPU memory occupied for the next one.
    del llm, inputs, outputs, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Run the shared prompt through Phi-4-mini using the helper's defaults
# (quantized load, 50 new tokens).
generate(model=PHI, message=messages)