In [None]:
# # Login for once in terminal and give the hugging face read credential
# !huggingface-cli login
# !huggingface-cli download \
# --local-dir=/scratch/users/barman/cryptollm_tests/llama_tokenizer meta-llama/Llama-2-7b-hf \
# tokenizer.model tokenizer.json tokenizer_config.json

In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the tokenizer
tokenizer_path = "llama_tokenizer" # Directory with files
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [24]:
# Check what type of tokens exits
# Constants
stdout_padding = "#" * 20
# Confirm vocabulary size
print(f"{stdout_padding} Llama2 Tokenizer Details {stdout_padding}\n")
print(f"Llama2 tokenizer overview: {tokenizer}\n")
print(f"Llama2 Vocabulary Size: {len(tokenizer.get_vocab().keys())}\n")
# print(f"{stdout_padding} End of Llama2 Tokenizer Details {stdout_padding}

#################### Llama2 Tokenizer Details ####################

Llama2 tokenizer overview: LlamaTokenizerFast(name_or_path='llama_tokenizer', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False)}, clean_up_tokenization_spaces=False)

Llama2 Vocabulary Size: 32000



### Note
- Vocabulary Size: 32000
- Model Maximum Length: 1000000000000000019884624838656
- Padding Side: Right (Important for finetuning)
- Truncation Side: Right
- Special Tokens:
    - Beginning of Sentence Token: \<s>
    - End of Sentence Token: \</s>
    - Unknown Token: \<unk>

In [25]:
# Verify token IDs for Llama2 special tokens
print(f"{stdout_padding} Llama2 Special Tokens {stdout_padding}\n")

UNK = "<unk>" # Unknown token
BOS, EOS = "<s>", "</s>" # Begin of sequnece and end of sequence tokens

special_tokens = [UNK, BOS, EOS]

for token in special_tokens:
    print(f'Token ID for the special token {token}: {tokenizer.get_vocab()[token]}')
    print(f'Encoded {token} becomes: {tokenizer.encode(token)}\n')

print(f"{stdout_padding} End of Llama2 Special Tokens {stdout_padding}\n")

#################### Llama2 Special Tokens ####################

Token ID for the special token <unk>: 0
Encoded <unk> becomes: [1, 0]

Token ID for the special token <s>: 1
Encoded <s> becomes: [1, 1]

Token ID for the special token </s>: 2
Encoded </s> becomes: [1, 2]

#################### End of Llama2 Special Tokens ####################



### Note
- The \<s> token represents the beginning of a sequence, so when we convert any sequence to its tokenized form, we'll prepend it with the beginning of sequence token, \<s> or 1.
- But note that there is no end of token.

In [26]:
# Verify token IDs for Llama2 prompt symbols
print(f"{stdout_padding} Llama2 Prompt Symbols {stdout_padding}\n")

B_INST, E_INST = "[INST]", "[/INST]" # Begin of instruction and end of instruction symbols
B_SYS, E_SYS = "<<SYS>>\n", "\n<<SYS>>\n\n" # Begin of system message and end of system message symbols

prompt_symbols = [B_INST, E_INST, B_SYS, E_SYS]

for symbol in prompt_symbols:
    encoded_symbol = tokenizer.encode(symbol)
    print(f'Encoded {repr(symbol)} becomes: {encoded_symbol}')
    
    for token in encoded_symbol:
        print(f"\tToken ID {token} --> {repr(tokenizer.decode(token))}")

print(f"\n{stdout_padding} End of Llama2 Prompt Symbols {stdout_padding}\n")

#################### Llama2 Prompt Symbols ####################

Encoded '[INST]' becomes: [1, 518, 25580, 29962]
	Token ID 1 --> '<s>'
	Token ID 518 --> '['
	Token ID 25580 --> 'INST'
	Token ID 29962 --> ']'
Encoded '[/INST]' becomes: [1, 518, 29914, 25580, 29962]
	Token ID 1 --> '<s>'
	Token ID 518 --> '['
	Token ID 29914 --> '/'
	Token ID 25580 --> 'INST'
	Token ID 29962 --> ']'
Encoded '<<SYS>>\n' becomes: [1, 3532, 14816, 29903, 6778, 13]
	Token ID 1 --> '<s>'
	Token ID 3532 --> '<<'
	Token ID 14816 --> 'SY'
	Token ID 29903 --> 'S'
	Token ID 6778 --> '>>'
	Token ID 13 --> '\n'
Encoded '\n<<SYS>>\n\n' becomes: [1, 29871, 13, 9314, 14816, 29903, 6778, 13, 13]
	Token ID 1 --> '<s>'
	Token ID 29871 --> ''
	Token ID 13 --> '\n'
	Token ID 9314 --> '<<'
	Token ID 14816 --> 'SY'
	Token ID 29903 --> 'S'
	Token ID 6778 --> '>>'
	Token ID 13 --> '\n'
	Token ID 13 --> '\n'

#################### End of Llama2 Prompt Symbols ####################



### Note
- \<\<SYS>>\n , \n\<\</SYS>\n\n :The beginning of system message and end of system message symbols
- [INST] , [/INST] : The beginning of instruction and end of instruction symbols
- Some words are broken as words and some as phrases.
- Every token has a beginning of sentence encoded (bos) but not eos.

In [28]:
# Lets try with custom sentence:
# Test tokenizer on a sentence
print(f"{stdout_padding} Llama2 Tokenizer Sentence Example {stdout_padding}\n")
sentence = "RHEL subscription manager let's you manage packages on RedHat."
encoded_output = tokenizer.encode(sentence)
print(f"Original sentence: {sentence}")
print(f"Encoded sentence: {encoded_output}")

# Verify what each token ID correlates to
for token in encoded_output:
    print(f"Token ID {token} --> {tokenizer.decode(token)}")

print(f"{stdout_padding} End of Llama2 Tokenizer Sentence Example {stdout_padding}\n")

#################### Llama2 Tokenizer Sentence Example ####################

Original sentence: RHEL subscription manager let's you manage packages on RedHat.
Encoded sentence: [1, 390, 29950, 6670, 25691, 8455, 1235, 29915, 29879, 366, 10933, 9741, 373, 4367, 29950, 271, 29889]
Token ID 1 --> <s>
Token ID 390 --> R
Token ID 29950 --> H
Token ID 6670 --> EL
Token ID 25691 --> subscription
Token ID 8455 --> manager
Token ID 1235 --> let
Token ID 29915 --> '
Token ID 29879 --> s
Token ID 366 --> you
Token ID 10933 --> manage
Token ID 9741 --> packages
Token ID 373 --> on
Token ID 4367 --> Red
Token ID 29950 --> H
Token ID 271 --> at
Token ID 29889 --> .
#################### End of Llama2 Tokenizer Sentence Example ####################



In [29]:
# Lets try for numerical prompts
# Test tokenizer on a sentence
print(f"{stdout_padding} Llama2 Tokenizer Sentence Example {stdout_padding}\n")
sentence = "Can you add 2+2?"
encoded_output = tokenizer.encode(sentence)
print(f"Original sentence: {sentence}")
print(f"Encoded sentence: {encoded_output}")

# Verify what each token ID correlates to
for token in encoded_output:
    print(f"Token ID {token} --> {tokenizer.decode(token)}")

print(f"{stdout_padding} End of Llama2 Tokenizer Sentence Example {stdout_padding}\n")

#################### Llama2 Tokenizer Sentence Example ####################

Original sentence: Can you add 2+2?
Encoded sentence: [1, 1815, 366, 788, 29871, 29906, 29974, 29906, 29973]
Token ID 1 --> <s>
Token ID 1815 --> Can
Token ID 366 --> you
Token ID 788 --> add
Token ID 29871 --> 
Token ID 29906 --> 2
Token ID 29974 --> +
Token ID 29906 --> 2
Token ID 29973 --> ?
#################### End of Llama2 Tokenizer Sentence Example ####################



In [30]:
# Try with < 4 digit numbers
# Test tokenizer on a sentence
print(f"{stdout_padding} Llama2 Tokenizer Sentence Example {stdout_padding}\n")
sentence = "Can you add 2222 + 2222?"
encoded_output = tokenizer.encode(sentence)
print(f"Original sentence: {sentence}")
print(f"Encoded sentence: {encoded_output}")

# Verify what each token ID correlates to
for token in encoded_output:
    print(f"Token ID {token} --> {tokenizer.decode(token)}")

print(f"{stdout_padding} End of Llama2 Tokenizer Sentence Example {stdout_padding}\n")

#################### Llama2 Tokenizer Sentence Example ####################

Original sentence: Can you add 2222 + 2222?
Encoded sentence: [1, 1815, 366, 788, 29871, 29906, 29906, 29906, 29906, 718, 29871, 29906, 29906, 29906, 29906, 29973]
Token ID 1 --> <s>
Token ID 1815 --> Can
Token ID 366 --> you
Token ID 788 --> add
Token ID 29871 --> 
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 718 --> +
Token ID 29871 --> 
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29973 --> ?
#################### End of Llama2 Tokenizer Sentence Example ####################



In [31]:
# Lets try with even bigger numbers
num = 50
# Test tokenizer on a sentence
print(f"{stdout_padding} Llama2 Tokenizer Sentence Example {stdout_padding}\n")
sentence = "Do you know the number 50 *{22222222222222"
encoded_output = tokenizer.encode(sentence)
print(f"Original sentence: {sentence}")
print(f"Encoded sentence: {encoded_output}")

# Verify what each token ID correlates to
for token in encoded_output:
    print(f"Token ID {token} --> {tokenizer.decode(token)}")

print(f"{stdout_padding} End of Llama2 Tokenizer Sentence Example {stdout_padding}\n")

#################### Llama2 Tokenizer Sentence Example ####################

Original sentence: Do you know the number 222222222222222
Encoded sentence: [1, 1938, 366, 1073, 278, 1353, 29871, 29906, 29906, 29906, 29906, 29906, 29906, 29906, 29906, 29906, 29906, 29906, 29906, 29906, 29906, 29906]
Token ID 1 --> <s>
Token ID 1938 --> Do
Token ID 366 --> you
Token ID 1073 --> know
Token ID 278 --> the
Token ID 1353 --> number
Token ID 29871 --> 
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
Token ID 29906 --> 2
#################### End of Llama2 Tokenizer Sentence Example ####################

