# Simple word-level tokenizer

In [20]:
# Build a word-level vocabulary from the training corpus.
# The encoding is passed explicitly so decoding does not depend on the platform's default
# locale; a file that is not valid UTF-8 raises UnicodeDecodeError here.
with open('training-text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Split on single spaces only: newlines and punctuation stay attached to the words and
# consecutive spaces yield empty-string tokens, so ' '.join(...) in `decoder` restores the
# encoded string exactly.
words = sorted(set(text.split(' ')))
print("Vocabulary size: ", len(words), "words/tokens")

# wtoi maps word -> index and itow maps index -> word; indices follow the sorted order of `words`.
wtoi = {word: index for index, word in enumerate(words)}
itow = {index: word for index, word in enumerate(words)}
print("\nVocabulary: ",  wtoi)


def encoder(sentence):
    """Encode a sentence into a list of vocabulary indices.

    The sentence is split on single spaces, mirroring how the vocabulary was built.

    Args:
        sentence: text whose space-separated words are looked up in `wtoi`.

    Returns:
        A list with one index per space-separated word.

    Raises:
        KeyError: if a word is not part of the vocabulary.
    """
    return [wtoi[word] for word in sentence.split(' ')]


def decoder(indexes):
    """Decode an iterable of vocabulary indices back into a space-joined sentence.

    Raises:
        KeyError: if an index is not part of the vocabulary.
    """
    return ' '.join(itow[index] for index in indexes)


# Round-trip demo: every word of the example sentence must occur in the training text,
# otherwise `encoder` raises KeyError.
example_ids = encoder("some text for training")
print("example: ", example_ids, decoder(example_ids))

FileNotFoundError: [Errno 2] No such file or directory: 'training-text.txt'

# Simple character-level tokenizer

In [None]:
# Build a character-level vocabulary from the training corpus.
# The encoding is passed explicitly so decoding does not depend on the platform's default
# locale; a file that is not valid UTF-8 raises UnicodeDecodeError here.
with open('training-text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Every distinct character of the corpus is one token; sorting makes the indices deterministic.
chars = sorted(set(text))
print("\nVocabulary size is ", len(chars), "chars/tokens: ", ''.join(chars))

# ctoi maps character -> index and itoc maps index -> character, in the sorted order of `chars`.
ctoi = {char: index for index, char in enumerate(chars)}
itoc = {index: char for index, char in enumerate(chars)}
print("\nVocabulary: ",  ctoi)


def encoder(sentence):
    """Encode a sentence into a list of character indices.

    Args:
        sentence: text whose characters are looked up one by one in `ctoi`.

    Returns:
        A list with one index per character of `sentence`.

    Raises:
        KeyError: if a character is not part of the vocabulary.
    """
    return [ctoi[char] for char in sentence]


def decoder(indexes):
    """Decode an iterable of character indices back into a string.

    Raises:
        KeyError: if an index is not part of the vocabulary.
    """
    return ''.join(itoc[index] for index in indexes)


# Round-trip demo: every character of the example sentence must occur in the training text,
# otherwise `encoder` raises KeyError.
example_ids = encoder("some text for training")
print("example: ", example_ids, decoder(example_ids))


Vocabulary size is  13 chars/tokens:   aefgimnorstx

Vocabulary:  {' ': 0, 'a': 1, 'e': 2, 'f': 3, 'g': 4, 'i': 5, 'm': 6, 'n': 7, 'o': 8, 'r': 9, 's': 10, 't': 11, 'x': 12}
example:  [10, 8, 6, 2, 0, 11, 2, 12, 11, 0, 3, 8, 9, 0, 11, 9, 1, 5, 7, 5, 7, 4] some text for training


# tiktoken tokeniser
### tiktoken is a fast BPE tokeniser for use with OpenAI's models.
*Copyright (c) 2022 OpenAI, Shantanu Jain 'https://github.com/openai/tiktoken'*

Install it with:

```bash
pip install tiktoken
```

### To get the tokeniser corresponding to a specific model in the OpenAI API:
```python
enc = tiktoken.encoding_for_model("gpt-4")
```

In [39]:
import tiktoken

# Load the corpus. The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# For a quick experiment, replace the corpus with a short string instead, e.g.
# text = 'Hello world, this is a test. '

# Baselines: character-level and naive space-split word-level statistics.
# The split is computed once and reused for both word counts.
space_split_words = text.split(' ')
print(len(text), 'lenth of chars')
print(len(set(text)), 'unique chars')
print(len(space_split_words), 'lenth of words, special chars are concidered and increased the vocabulary size!')
print(len(set(space_split_words)), 'unique words, same problem as above!')

# Token statistics for each BPE encoding as (printed label, tiktoken encoding name).
# The corpus is tokenised once per encoding and the ids are reused for both counts.
# After the loop, `enc` holds the last encoding ("gpt2"), as it did before.
for label, encoding_name in (("GPT3.5-4", "cl100k_base"), ("GPT2-3", "gpt2")):
    print("\n\n" + label)
    enc = tiktoken.get_encoding(encoding_name)
    token_ids = enc.encode(text)
    # number of tokens
    print(len(token_ids), 'total tokens in the text')
    print(len(set(token_ids)), 'unique tokens in the text')
    # To inspect the tokenisation, print `token_ids` or `enc.decode(token_ids)`.

1115393 lenth of chars
65 unique chars
169893 lenth of words, special chars are concidered and increased the vocabulary size!
42196 unique words, same problem as above!


GPT3.5-4
301829 total tokens in the text
12111 unique tokens in the text


GPT2-3
338024 total tokens in the text
11706 unique tokens in the text


In [36]:
# Import only the names this cell uses instead of a wildcard import, so it is clear where
# they come from and unrelated names are not pulled into the notebook namespace.
from tiktoken._educational import SimpleBytePairEncoding, train_simple_encoding

# Train a BPE tokeniser on a small amount of text.
# The call prints its merge steps while training (see the cell output). The trained encoder
# is not used further, so it gets its own name instead of being overwritten right away.
trained_enc = train_simple_encoding()

# Visualise how the GPT-4 encoder encodes text.
# The last expression is left bare so the resulting token ids show up as the cell output.
enc = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
enc.encode("hello world aaaaaaaaaaaa")

The current most common pair is b' ' + b' '
So we made b'  ' our 257th token
Now the first fifty words in our training data look like:
[48;5;167m"[48;5;179m"[48;5;185m"[48;5;77mT[48;5;80mh[48;5;68mi[48;5;134ms[48;5;167m [48;5;179mi[48;5;185ms[48;5;77m [48;5;80ma[48;5;68mn[48;5;134m [48;5;167me[48;5;179md[48;5;185mu[48;5;77mc[48;5;80ma[48;5;68mt[48;5;134mi[48;5;167mo[48;5;179mn[48;5;185ma[48;5;77ml[48;5;80m [48;5;68mi[48;5;134mm[48;5;167mp[48;5;179ml[48;5;185me[48;5;77mm[48;5;80me[48;5;68mn[48;5;134mt[48;5;167ma[48;5;179mt[48;5;185mi[48;5;77mo[48;5;80mn[48;5;68m [48;5;134mo[48;5;167mf[48;5;179m [48;5;185mt[48;5;77mh[48;5;80me[48;5;68m [48;5;134mb[48;5;167my[48;5;179mt[48;5;185me[48;5;77m [48;5;80mp[48;5;68ma[48;5;134mi[48;5;167mr[48;5;179m [48;5;185me[48;5;77mn[48;5;80mc[48;5;68mo[48;5;134md[48;5;167mi[48;5;179mn[48;5;185mg[48;5;77m [48;5;80ma[48;5;68ml[48;5;134mg[48;5;167mo[48;5;179mr[48;5;185mi[48;5;77mt[48;5;8

[15339, 1917, 264, 70540, 33746]