In [21]:
# Now we will do Byte Pair Encoding (BPE) on a text file using the tiktoken library-used by OpenAI.
import importlib
import tiktoken


In [4]:
tiktoken = tiktoken.get_encoding("gpt2")
#This is similar to the SimpleTokenizer2

In [5]:
text = "This is a test. This is only a test ? |<endoftext>|"
tokens = tiktoken.encode(text)
print("Tokens:", tokens)

Tokens: [1212, 318, 257, 1332, 13, 770, 318, 691, 257, 1332, 5633, 930, 27, 437, 1659, 5239, 29, 91]


In [6]:
string = tiktoken.decode(tokens)
print("String:", string)

String: This is a test. This is only a test ? |<endoftext>|


In [7]:
# Lets see how this tokenizer works with unkown words
token = tiktoken.encode("This is a test. This is only a test ? |<endoftext>| AKssdniwdj")
print("Tokens with unknown word:", token)

Tokens with unknown word: [1212, 318, 257, 1332, 13, 770, 318, 691, 257, 1332, 5633, 930, 27, 437, 1659, 5239, 29, 91, 15837, 824, 67, 8461, 16993, 73]


In [8]:
string = tiktoken.decode(tokens)
print("String:", string)

String: This is a test. This is only a test ? |<endoftext>|


In [9]:
# Now we will create input target pairs for training. This is the last step before creating vector embeddings.
#given a sentence, the input goes to the LLM and the target will be the last token to be predicted. the input increase over time as targets of the previous step are added to the input.
import os
data_folder = "data"
raw_texts = []
for filename in os.listdir(data_folder):
    file_path = os.path.join(data_folder, filename)
    if os.path.isfile(file_path):
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                raw_texts.append(f.read())
        except UnicodeDecodeError:
            print(f"Skipped non-text file: {filename}")

raw_text = "\n".join(raw_texts)


Skipped non-text file: .DS_Store


In [10]:
enc_text = tiktoken.encode(raw_text)
print("Encoded text length:", len(enc_text))

Encoded text length: 76064


In [15]:
context_size = 10 # length of the input sequence

#the input is the first 10 tokens [1,2,3,4,5,6,7,8,9,10] and the target is the next token [11]

x = enc_text[:context_size]
y = enc_text[1:context_size+1]
print("Input tokens:", x)
print("Target tokens:", y)

Input tokens: [31478, 17034, 69, 16, 59, 504, 72, 59, 504, 291]
Target tokens: [17034, 69, 16, 59, 504, 72, 59, 504, 291, 6024]


In [16]:
for i in range(1, context_size +1):
    x = enc_text[i:i + context_size]
    y = enc_text[i + context_size]
    print(f"Input tokens {i}:", x)
    print(f"Target tokens {i}:", y)

Input tokens 1: [17034, 69, 16, 59, 504, 72, 59, 504, 291, 6024]
Target tokens 1: 1065
Input tokens 2: [69, 16, 59, 504, 72, 59, 504, 291, 6024, 1065]
Target tokens 2: 4309
Input tokens 3: [16, 59, 504, 72, 59, 504, 291, 6024, 1065, 4309]
Target tokens 3: 59
Input tokens 4: [59, 504, 72, 59, 504, 291, 6024, 1065, 4309, 59]
Target tokens 4: 66
Input tokens 5: [504, 72, 59, 504, 291, 6024, 1065, 4309, 59, 66]
Target tokens 5: 25634
Input tokens 6: [72, 59, 504, 291, 6024, 1065, 4309, 59, 66, 25634]
Target tokens 6: 433
Input tokens 7: [59, 504, 291, 6024, 1065, 4309, 59, 66, 25634, 433]
Target tokens 7: 69
Input tokens 8: [504, 291, 6024, 1065, 4309, 59, 66, 25634, 433, 69]
Target tokens 8: 2078
Input tokens 9: [291, 6024, 1065, 4309, 59, 66, 25634, 433, 69, 2078]
Target tokens 9: 1828
Input tokens 10: [6024, 1065, 4309, 59, 66, 25634, 433, 69, 2078, 1828]
Target tokens 10: 198
