In [9]:
from huggingface_hub import login
from transformers import AutoTokenizer
from google.colab import userdata

In [23]:
# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN'
# so it is never written into the notebook itself.
hf_token = userdata.get('HF_TOKEN')

# Authenticate this session against the Hub; add_to_git_credential=True also
# hands the token to the git credential helper.
login(
    token=hf_token,
    add_to_git_credential=True,
)

In [24]:
# Hugging Face Hub repository ids, one per tokenizer compared in this notebook.
# Despite the "_tokenizer" suffix these are plain id strings; the tokenizer
# objects are created later via AutoTokenizer.from_pretrained(<id>).
llama_tokenizer = "meta-llama/Llama-3.1-8B"
starcoder_tokenizer = "bigcode/starcoder2-3b"
qwen_tokenizer = "Qwen/Qwen3-0.6B"
microsoft_tokenizer = "microsoft/Phi-3-mini-4k-instruct"

In [25]:
# Two-turn conversation (system instruction + user request) that the later
# cells pass to tokenizer.apply_chat_template to render each model's prompt.
messages = [
    {'role': speaker, 'content': utterance}
    for speaker, utterance in (
        ('system', 'You are a helpful assistant.'),
        ('user', 'Tell a light-hearted joke related to Data Scientists.'),
    )
]

In [None]:
# Llama tokenizer walkthrough: encode one sentence, print the token ids, the
# string for each id, and finally the chat prompt rendered from `messages`.
#
# trust_remote_code=True has been dropped: that flag allows Python code hosted
# in the Hub repository to run on this machine, and it is not expected to be
# needed to load this tokenizer.
# NOTE(review): re-add it only if loading fails on the installed transformers
# version, and only after reading the repository's code.
tokenizer = AutoTokenizer.from_pretrained(llama_tokenizer)

text = "I am excited to show tokenizer in action. Let's start."

tokens = tokenizer.encode(text)

print(tokens)
print()

# Passing the flat id list to batch_decode shows the text of every token
# separately rather than the re-assembled sentence.
print(tokenizer.batch_decode(tokens))
print()

# One "id = text" line per token.
for token_id in tokens:
    print(f"{token_id} = {tokenizer.decode(token_id)}")

print()

# Render the conversation as a single prompt string (tokenize=False) ending
# with the marker that opens the assistant's turn (add_generation_prompt=True).
# NOTE(review): llama_tokenizer points at the base "Llama-3.1-8B" checkpoint;
# confirm its tokenizer ships a chat template, otherwise this call will fail.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

In [26]:
# StarCoder2 tokenizer walkthrough on a small Python snippet: print the token
# ids, then the string for each id, then one "id = text" line per token.
tokenizer = AutoTokenizer.from_pretrained(starcoder_tokenizer)

# The leading and trailing newlines are part of the encoded input.
code = """
def hello_world():
    print("Hello, world!")
"""

tokens = tokenizer.encode(code)

# Raw ids followed by an empty separator line.
print(tokens, end="\n\n")

# Decoding the flat id list yields the text of each token individually.
per_token_text = tokenizer.batch_decode(tokens)
print(per_token_text, end="\n\n")

for token_id in tokens:
    piece = tokenizer.decode(token_id)
    print(f"{token_id} = {piece}")

[222, 610, 17966, 100, 5879, 2284, 303, 1489, 459, 8302, 49, 5810, 16013, 222]

['\n', 'def', ' hello', '_', 'world', '():', '\n   ', ' print', '("', 'Hello', ',', ' world', '!")', '\n']

222 = 

610 = def
17966 =  hello
100 = _
5879 = world
2284 = ():
303 = 
   
1489 =  print
459 = ("
8302 = Hello
49 = ,
5810 =  world
16013 = !")
222 = 



In [27]:
# Qwen tokenizer walkthrough: encode one sentence, print the token ids, the
# round-tripped text, the string for each id, and finally the chat prompt
# rendered from `messages`.
#
# trust_remote_code=True has been dropped: that flag allows Python code hosted
# in the Hub repository to run on this machine, and it is not expected to be
# needed to load this tokenizer.
# NOTE(review): re-add it only if loading fails on the installed transformers
# version, and only after reading the repository's code.
tokenizer = AutoTokenizer.from_pretrained(qwen_tokenizer)

text = "I am excited to show tokenizer in action. Let's start."

tokens = tokenizer.encode(text)

print(tokens)
print()

# Round trip: decoding the whole id list gives back the sentence.
print(tokenizer.decode(tokens))
print()

# Passing the flat id list to batch_decode shows the text of every token
# separately, making the token boundaries visible.
print(tokenizer.batch_decode(tokens))
print()

# One "id = text" line per token.
for token_id in tokens:
    print(f"{token_id} = {tokenizer.decode(token_id)}")

print()

# Render the conversation as a single prompt string (tokenize=False) ending
# with the marker that opens the assistant's turn (add_generation_prompt=True).
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

[40, 1079, 12035, 311, 1473, 45958, 304, 1917, 13, 6771, 594, 1191, 13]

I am excited to show tokenizer in action. Let's start.

['I', ' am', ' excited', ' to', ' show', ' tokenizer', ' in', ' action', '.', ' Let', "'s", ' start', '.']

40 = I
1079 =  am
12035 =  excited
311 =  to
1473 =  show
45958 =  tokenizer
304 =  in
1917 =  action
13 = .
6771 =  Let
594 = 's
1191 =  start
13 = .

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Tell a light-hearted joke related to Data Scientists.<|im_end|>
<|im_start|>assistant



In [28]:
# Phi-3 tokenizer walkthrough: encode one sentence, print the token ids, the
# round-tripped text, the string for each id, and finally the chat prompt
# rendered from `messages`.
#
# trust_remote_code=True has been dropped: that flag allows Python code hosted
# in the Hub repository to run on this machine, and it is not expected to be
# needed to load this tokenizer.
# NOTE(review): re-add it only if loading fails on the installed transformers
# version, and only after reading the repository's code.
tokenizer = AutoTokenizer.from_pretrained(microsoft_tokenizer)

text = "I am excited to show tokenizer in action. Let's start."

tokens = tokenizer.encode(text)

print(tokens)
print()

# Round trip: decoding the whole id list gives back the sentence.
print(tokenizer.decode(tokens))
print()

# Passing the flat id list to batch_decode shows the text of every token
# separately, making the token boundaries visible.
print(tokenizer.batch_decode(tokens))
print()

# One "id = text" line per token.
for token_id in tokens:
    print(f"{token_id} = {tokenizer.decode(token_id)}")

print()

# Render the conversation as a single prompt string (tokenize=False) ending
# with the marker that opens the assistant's turn (add_generation_prompt=True).
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

[306, 626, 24173, 304, 1510, 5993, 3950, 297, 3158, 29889, 2803, 29915, 29879, 1369, 29889]

I am excited to show tokenizer in action. Let's start.

['I', 'am', 'excited', 'to', 'show', 'token', 'izer', 'in', 'action', '.', 'Let', "'", 's', 'start', '.']

306 = I
626 = am
24173 = excited
304 = to
1510 = show
5993 = token
3950 = izer
297 = in
3158 = action
29889 = .
2803 = Let
29915 = '
29879 = s
1369 = start
29889 = .

<|system|>
You are a helpful assistant.<|end|>
<|user|>
Tell a light-hearted joke related to Data Scientists.<|end|>
<|assistant|>

