<a href="https://colab.research.google.com/github/SadeghMahmoudAbadi/Open-Source-LLM-on-Colab/blob/main/5-Tokenizers/Tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q --upgrade bitsandbytes accelerate

In [None]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gc

In [None]:
# Fetch the access token stored under 'HF_TOKEN' in the Colab user data and
# log in to the Hugging Face Hub with it, also saving it as a git credential.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Hugging Face Hub repository ids ("organization/model-name") of the checkpoints
# that are passed to generate() and AutoTokenizer.from_pretrained() below.
PHI = "microsoft/Phi-4-mini-instruct"
QWEN = "Qwen/Qwen3-4B-Instruct-2507"
DEEPSEEK = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
GEMMA = "google/gemma-3-270m-it"

In [None]:
# Single-turn conversation in role/content chat format; every model's chat
# template is applied to this same list to build its prompt.
messages = [dict(role="user", content="Tell me a fun fact.")]

In [None]:
# bitsandbytes settings handed to from_pretrained() as `quantization_config`
# whenever generate() is called with quant=True.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the weights in 4-bit
    bnb_4bit_quant_type="nf4",              # 4-bit quantization type
    bnb_4bit_use_double_quant=True,         # enable double quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

In [None]:
def generate(model, messages, quant=True, max_new_tokens=500):
    """Load a chat model from the Hugging Face Hub and stream its reply to `messages`.

    The reply is printed as it is produced through a `TextStreamer`; nothing is
    returned. The model is loaded on every call and its references are dropped
    again before the function exits, so several checkpoints can be tried one
    after another in the same session.

    Args:
        model: Repository id (or path) accepted by `from_pretrained`.
        messages: Conversation as a list of {"role": ..., "content": ...} dicts.
        quant: If True, load the weights with the module-level `quant_config`;
            otherwise load them without a quantization config.
        max_new_tokens: Upper bound on the number of generated tokens.
    """
    device = "cuda"

    tokenizer = AutoTokenizer.from_pretrained(model)
    # Only fall back to EOS when the tokenizer ships without a pad token, so a
    # dedicated pad token is not clobbered.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Ask for a dict so the tokenizer supplies the attention mask that belongs
    # to input_ids instead of hand-building an all-ones mask.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(device)
    streamer = TextStreamer(tokenizer)

    # Place the weights on the GPU at load time. Calling .to() afterwards is
    # rejected for bitsandbytes-quantized models on some library versions.
    load_kwargs = {"quantization_config": quant_config} if quant else {}
    # Keep the loaded model under its own name rather than rebinding the
    # `model` argument, which holds the repository id string.
    llm = AutoModelForCausalLM.from_pretrained(model, device_map=device, **load_kwargs)

    try:
        llm.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            streamer=streamer,
        )
    finally:
        # Drop the references to the GPU-resident objects first: gc.collect()
        # and empty_cache() can only release memory nothing points to anymore.
        del llm, inputs
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# Stream the Phi checkpoint's answer; quant is left at its default (True).
generate(model=PHI, messages=messages)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<|user|>Tell me a fun fact.<|end|><|assistant|>Did you know that octopuses have three hearts? Two pump blood to the gills, while the third pumps it to the rest of the body. Additionally, octopuses have blue blood, which contains copper-based molecules that bind to oxygen. This adaptation allows them to survive in low-oxygen environments. Isn't that fascinating?<|end|>


In [None]:
# Same prompt with the Qwen checkpoint; quant is left at its default (True).
generate(model=QWEN, messages=messages)

In [None]:
# Same prompt with the DeepSeek checkpoint, loaded without the quantization config.
generate(model=DEEPSEEK, messages=messages, quant=False)

In [None]:
# Same prompt with the Gemma checkpoint, loaded without the quantization config.
generate(model=GEMMA, messages=messages, quant=False)

In [None]:
# Load only the Qwen tokenizer for the tokenization examples that follow.
# trust_remote_code is deliberately not passed: generate() loads this same
# tokenizer without it, so the flag would only permit running code shipped in
# the model repository for no benefit.
tokenizer = AutoTokenizer.from_pretrained(QWEN)

In [None]:
# Sample sentence used by the tokenization examples that follow.
text = "My name is Sadegh Mahmoud Abadi, and I'm eager about LLMs!"
# encode() turns the string into a list of integer token ids.
tokens = tokenizer.encode(text)
# Bare expression so the notebook displays the ids.
tokens

[5050,
 829,
 374,
 30681,
 791,
 71,
 93186,
 3680,
 2767,
 11,
 323,
 358,
 2776,
 23541,
 911,
 444,
 10994,
 82,
 0]

In [None]:
# Compare three ways of measuring the same sentence.
character_count = len(text)
# split() with no argument splits on any run of whitespace, so repeated spaces,
# tabs or newlines do not yield empty "words" the way split(' ') does.
word_count = len(text.split())
token_count = len(tokens)
print(f"There are {character_count} characters, {word_count} words and {token_count} tokens")

There are 58 characters, 11 words and 19 tokens


In [None]:
# decode() maps the whole id list back to one string; the output shows it
# reproduces the original sentence.
tokenizer.decode(tokens)

"My name is Sadegh Mahmoud Abadi, and I'm eager about LLMs!"

In [None]:
# Decode every id on its own to show which piece of text each token stands for.
[tokenizer.decode(token_id) for token_id in tokens]

['My',
 ' name',
 ' is',
 ' Sad',
 'eg',
 'h',
 ' Mahmoud',
 ' Ab',
 'adi',
 ',',
 ' and',
 ' I',
 "'m",
 ' eager',
 ' about',
 ' L',
 'LM',
 's',
 '!']

In [None]:
# Render the conversation with the tokenizer's chat template as plain text
# (tokenize=False), ending with the marker that opens the assistant's turn.
print(
    tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
)

<|im_start|>user
Tell me a fun fact.<|im_end|>
<|im_start|>assistant

