In [17]:
!pip install -U huggingface_hub



In [20]:
# Imports

import torch

from huggingface_hub import login
from transformers import pipeline
from transformers import AutoTokenizer

from diffusers import DiffusionPipeline
from datasets import load_dataset
import soundfile as sf
from IPython.display import Audio
import ipywidgets as widgets

In [21]:
from dotenv import load_dotenv
import os

In [22]:
load_dotenv()

True

In [25]:
hf_token = os.getenv("HUGGINGFACE_TOKEN")
login(token=hf_token)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Llama Tokenizer

In [26]:
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B",
    token = hf_token,
    trust_remote_code=True)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [27]:
text = 'I am excited to show Tokenizers in action to my LLM engineers'
tokens = tokenizer.encode(text)

In [33]:
len(tokens)

15

In [None]:
tokens
# encode of the tokens

[128000,
 40,
 1097,
 12304,
 311,
 1501,
 9857,
 12509,
 304,
 1957,
 311,
 856,
 445,
 11237,
 25175]

In [None]:
tokenizer.decode(tokens)
# decode of the tokens 

'<|begin_of_text|>I am excited to show Tokenizers in action to my LLM engineers'

In [None]:
tokenizer.batch_decode(tokens)
# batch decode of the tokens

['<|begin_of_text|>',
 'I',
 ' am',
 ' excited',
 ' to',
 ' show',
 ' Token',
 'izers',
 ' in',
 ' action',
 ' to',
 ' my',
 ' L',
 'LM',
 ' engineers']

In [40]:
#tokenizer.vocab
tokenizer.get_added_vocab()

{'<|begin_of_text|>': 128000,
 '<|end_of_text|>': 128001,
 '<|reserved_special_token_0|>': 128002,
 '<|reserved_special_token_1|>': 128003,
 '<|finetune_right_pad_id|>': 128004,
 '<|reserved_special_token_2|>': 128005,
 '<|start_header_id|>': 128006,
 '<|end_header_id|>': 128007,
 '<|eom_id|>': 128008,
 '<|eot_id|>': 128009,
 '<|python_tag|>': 128010,
 '<|reserved_special_token_3|>': 128011,
 '<|reserved_special_token_4|>': 128012,
 '<|reserved_special_token_5|>': 128013,
 '<|reserved_special_token_6|>': 128014,
 '<|reserved_special_token_7|>': 128015,
 '<|reserved_special_token_8|>': 128016,
 '<|reserved_special_token_9|>': 128017,
 '<|reserved_special_token_10|>': 128018,
 '<|reserved_special_token_11|>': 128019,
 '<|reserved_special_token_12|>': 128020,
 '<|reserved_special_token_13|>': 128021,
 '<|reserved_special_token_14|>': 128022,
 '<|reserved_special_token_15|>': 128023,
 '<|reserved_special_token_16|>': 128024,
 '<|reserved_special_token_17|>': 128025,
 '<|reserved_special_to

### Instruct Variants of Models

many models have a variant that has veen trained for use in Chats
These are typically lavelled with the word "instruct" at the end.
They have been trained to expect prompts with a particular format that includes system, user and assistant prompts.
There is a utility method apply_chat_template that will convert from the messages list format we are familiar with into the right input prompt for this model

In [45]:
llama3_chat_template = (
    "{% set loop_messages = messages %}"
    "{% for message in loop_messages %}"
        "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}"
        "{% if loop.index0 == 0 %}"
            "{% set content = bos_token + content %}"
        "{% endif %}"
        "{{ content }}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
        "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
    "{% endif %}"
)

In [46]:
tokenizer.chat_template = llama3_chat_template

In [47]:
messages = [
    {"role":"system", 'content': "you are a helpful assistant"},
    {"role":"user", 'content': "Tell me light-hearted joke for a room of data scientists"}
]

In [60]:
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors='pt')
print(tokenizer.decode(prompt[0]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

you are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell me light-hearted joke for a room of data scientists<|eot_id|><|start_header_id|>assistant<|end_header_id|>




### Trying new models

We will now work with 3 models
Phi3 from MicroSoft Qwen2 from Alibaba Cloud Starcoder2 from BigCode(ServiceNow + HuggingFace + Nvidia)

In [52]:
PHI3_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
QWEN2_MODEL_NAME = "Qwen/Qwen2-72B-Instruct"
STARCODER2_MODEL_NAME = 'bigcode/starcoder2-3b'


In [77]:
phi3_tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_NAME)
qwen2_tokenizer = AutoTokenizer.from_pretrained(QWEN2_MODEL_NAME)
starcoder2_tokenizer = AutoTokenizer.from_pretrained(STARCODER2_MODEL_NAME)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

In [78]:
phi3_chat_template = (
    "{{ bos_token }}"
    "{% for message in messages %}"
        "{{ '<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>' + '\n' }}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
        "{{ '<|assistant|>\n' }}"
    "{% endif %}"
)
qwen2_chat_template = (
    "{% for message in messages %}"
        "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
        "{{ '<|im_start|>assistant\n' }}"
    "{% endif %}"
)
starcoder2_chat_template = (
    "{% for message in messages %}"
        "{{ '<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n' }}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
        "{{ '<|assistant|>' }}"
    "{% endif %}"
)



In [79]:
messages = [
    {"role":"system", 'content': "you are a helpful assistant"},
    {"role":"user", 'content': "Tell me light-hearted joke for a room of data scientists"}
]

In [86]:
phi3_tokenizer.chat_template = phi3_chat_template
qwen2_tokenizer.chat_template = qwen2_chat_template
starcoder2_tokenizer = starcoder2_chat_template

In [87]:
text = 'I am excited to show Tokenizers in action to my LLM engineers'

In [99]:
print(tokenizer.apply_chat_template(messages,
                                    tokenizer = False,
                                    add_generation_prompt= True,
                                   ))
print('-'* 100)
print(phi3_tokenizer.apply_chat_template(messages,
                                    tokenizer = False,
                                    add_generation_prompt= True,
                                   ))
print('-'* 100)
print(qwen2_tokenizer.apply_chat_template(messages,
                                    tokenizer = False,
                                    add_generation_prompt= True,
                                   ))
print('-'* 100)
print(starcoder2_tokenizer.apply_chat_template(messages,
                                    tokenizer = False,
                                    add_generation_prompt= True,)

)

[128000, 128006, 9125, 128007, 271, 9514, 527, 264, 11190, 18328, 128009, 128006, 882, 128007, 271, 41551, 757, 3177, 70395, 22380, 369, 264, 3130, 315, 828, 14248, 128009, 128006, 78191, 128007, 271]
----------------------------------------------------------------------------------------------------
[1, 32006, 366, 526, 263, 8444, 20255, 32007, 32010, 24948, 592, 3578, 29899, 23057, 287, 2958, 446, 363, 263, 5716, 310, 848, 9638, 2879, 32007, 32001]
----------------------------------------------------------------------------------------------------
[151644, 8948, 198, 9330, 525, 264, 10950, 17847, 151645, 198, 151644, 872, 198, 40451, 752, 3100, 69295, 21646, 369, 264, 3054, 315, 821, 13923, 151645, 198, 151644, 77091, 198]
----------------------------------------------------------------------------------------------------


AttributeError: 'str' object has no attribute 'apply_chat_template'

In [83]:
qwen2_tokenizer = AutoTokenizer.from_pretrained(QWEN2_MODEL_NAME)

text = 'I am excited to show Tokenizers in action to my LLM engineers'

In [84]:
print(tokenizer.encode(text))
print('-'* 100)
print(phi3_tokenizer.encode(text))
print('-'*100)
print(qwen2_tokenizer.encode(text))

[128000, 40, 1097, 12304, 311, 1501, 9857, 12509, 304, 1957, 311, 856, 445, 11237, 25175]
----------------------------------------------------------------------------------------------------
[306, 626, 24173, 304, 1510, 25159, 19427, 297, 3158, 304, 590, 365, 26369, 6012, 414]
----------------------------------------------------------------------------------------------------
[40, 1079, 12035, 311, 1473, 9660, 12230, 304, 1917, 311, 847, 444, 10994, 24198]


In [85]:
print(phi3_tokenizer.apply_chat_template(messages,
                                    tokenizer = False,
                                    add_generation_prompt= True)
)

[1, 32006, 366, 526, 263, 8444, 20255, 32007, 32010, 24948, 592, 3578, 29899, 23057, 287, 2958, 446, 363, 263, 5716, 310, 848, 9638, 2879, 32007, 32001]


In [97]:
phi3_prompt = phi3_tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors='pt')
qwen2_prompt = qwen2_tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors='pt')
starcoder2_prompt = starcoder2_tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt=True,
    return_tensors='pt')

AttributeError: 'str' object has no attribute 'apply_chat_template'

In [101]:
print(tokenizer.decode(prompt[0]))
print('-'* 100)
print(phi3_tokenizer.decode(phi3_prompt[0]))

" traditions <angon Gl WE getStringDESC-carustABLE-def.ToolStripMenuIteming File("owon ful           loglettersection getString crossorigin
----------------------------------------------------------------------------------------------------
<s><|system|> you are a helpful assistant<|end|><|user|> Tell me light-hearted joke for a room of data scientists<|end|><|assistant|>


In [102]:
print(qwen2_tokenizer.decode(qwen2_prompt[0]))

<|im_start|>system
you are a helpful assistant<|im_end|>
<|im_start|>user
Tell me light-hearted joke for a room of data scientists<|im_end|>
<|im_start|>assistant



In [103]:
starcoder2_tokenizer = AutoTokenizer.from_pretrained(
    STARCODER2_MODEL_NAME,
    trust_remote_code=True
)

In [104]:
code = """
def hello_world():
print("Hello, world!",person)
"""

In [105]:
tokens = starcoder2_tokenizer.encode(code)

In [106]:
tokens

[222,
 610,
 17966,
 100,
 5879,
 2284,
 222,
 1243,
 459,
 8302,
 49,
 5810,
 13700,
 6427,
 46,
 222]

In [107]:
for token in tokens:
    print(f"{token}: {starcoder2_tokenizer.decode(token)}")

222: 

610: def
17966:  hello
100: _
5879: world
2284: ():
222: 

1243: print
459: ("
8302: Hello
49: ,
5810:  world
13700: !",
6427: person
46: )
222: 

