In [1]:
import pandas as pd

In [22]:
#Reading in a short story as text sample into Python
with open("Portfolio_Projects/Building-LLMs-from-Scratch/the-verdict.txt", "r", encoding = "utf-8") as f:
          raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

FileNotFoundError: [Errno 2] No such file or directory: 'Portfolio_Projects/Building-LLMs-from-Scratch/the-verdict.txt'

### 1. Tokenizing Text

In [None]:
import re
text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)
print(result)

In [None]:
#separting punction and spaces from the text
result = re.split(r'([,.]|\s)', text)
print(result)

In [None]:
#remvoing white spaces
result = [item for item in result if item.strip()]
print(result)

In [None]:
#extending the tokenizer to handle wider range of punctuations
text = "Hello, world. Is this-- a test."
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print("Tokenized text: ",result)

In [None]:
#applying this tokenizer to the full Edith Warton's story
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(f"length of the complete tokenized text from Edith Wharton story: {len(preprocessed)}")

In [None]:
#printing first 30 tokens
print(f"First 30 tokens: {preprocessed[:30]}")

### 2. Converting token into token IDs

#### create a list of all unique tokens and sort them alphabetically to determine the vocabulary

In [None]:
all_words = sorted(list(set(preprocessed)))
vocab_size = len(all_words)
print(f"Vocabulary Size: {vocab_size}")

In [None]:
#creating a vocabulary
vocab = {token: integer for integer, token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i > 50:
        break

In [None]:
#implementing a tokenizer class to tokenize text into tokens, encode tokens to integers and decode integers to tokens

class SimpleTokenzierV1:
    def __init__(self, vocab):
        self.str_to_int = vocab #A
        self.int_to_str = {i:s for s,i in vocab.items()} #B

    def encode(self, text): #C
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids): #D
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) #E
        return text

In [None]:
#instantiating tokenizer object to test the SimpleTokenzierV1 class
tokenizer = SimpleTokenzierV1(vocab)

In [None]:
text = raw_text[:99]
ids = tokenizer.encode(text)
print(f"Token ids for sample text from Edith Warton's story: {ids}")

In [None]:
#testing decoder on the token ids above
print(f"text from decoding token ids using SimpleTokenzierV1 class: \n{tokenizer.decode(ids)}")

In [None]:
#running the tokenizer on a sample text which is not a part of the text used to create the vocab
text = "Hello, do you like tea?"
tokenizer.encode(text)

here, the key error suggests that Hello is not a part of the vocabulary and hence we need to use large and diverse text in order to extend the vocabulary when creating a LARGE language models

#### 3. Adding special context tokens

modifying SimpleTokenzierV1 to support new tokens for unknown words and document boundaries

In [None]:
'''adding tokens:
1. <unk> - to represent unknown or new words that are not part of the vocabulary
2. <|endoftext|> -  marker to separate two different text source from each other
'''
all_tokens =  sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<unk>"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}

print(f"length of the vocabulary after extending the vocab with unknown words and end of text markers: {len(vocab.items())}")

In [None]:
#printing a sample from the end of the extended vocab
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

In [None]:
#SimpleTokenzierV2 replaces unknown words with the token "<unk>"
class SimpleTokenzierV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int
                       else "<unk>" for item in preprocessed] #A
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) #B
        return text

In [None]:
#concatenating two unrelated texts
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))
print(f"concatenated text with endoftext marker: \n{text}")

In [None]:
#testing the SimpleTokenzierV2 on a text that is concatenation of two unrelated texts
tokenizer = SimpleTokenzierV2(vocab)
print(tokenizer.encode(text))

we can see that the list of token IDs contains 1131 for the <|endoftext|> separator token as well as two 1132 tokens, which are used for unknown words.