<a href="https://colab.research.google.com/github/RCortez25/PhD/blob/main/LLM/0.%20Tokenizer/Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re

# 1. Creating tokens

In [2]:
# Read the complete sample text into memory
with open('/content/the-veredict.txt', mode='r', encoding='utf-8') as text_file:
    raw_text = text_file.read()

# Show the first 99 characters as a quick check
print(raw_text[:99])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [3]:
# Split the raw text on punctuation marks, double dashes, and whitespace.
# The capturing group makes the split keep the delimiters as list items.
preprocessed = re.compile(r'([,.:;?_!"()\']|--|\s)').split(raw_text)
print(preprocessed[:30])

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius', '--', 'though', ' ', 'a', ' ', 'good', ' ', 'fellow', ' ', 'enough', '--']


In [4]:
# Trim every item and drop the ones that are empty after trimming
preprocessed = [token for token in map(str.strip, preprocessed) if token]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


The text is now tokenized

# 2. Creating the vocabulary

In [5]:
# Collect the distinct tokens and put them in sorted order
unique_words = list(set(preprocessed))
unique_words.sort()
print(unique_words[:30])

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed']


With this, we can now create the vocabulary. In this case, we map each token (a word or a punctuation mark) to its index in the sorted list of unique tokens.

In [6]:
# Build the vocabulary: every unique token gets its position in the sorted
# list as its id
vocabulary = {word: position for position, word in enumerate(unique_words)}

# Show the entries with ids 0 through 15 (the first 16 tokens)
for token, index in list(vocabulary.items())[:16]:
    print(f'{token}: {index}')

!: 0
": 1
': 2
(: 3
): 4
,: 5
--: 6
.: 7
:: 8
;: 9
?: 10
A: 11
Ah: 12
Among: 13
And: 14
Are: 15


# 3. Creating the tokenizer

Now that we have the vocabulary of tokens, we can create a class that maps each token to its numeric representation, i.e., its index in the vocabulary. The class will also decode, that is, convert a list of numbers back into the corresponding tokens of the vocabulary. We call this class a **tokenizer**.

In [7]:
class SimpleTokenizerV1:
    """
    Word-level tokenizer backed by a fixed vocabulary.

    Text is split on punctuation marks, double dashes, and whitespace, and
    every resulting token is replaced by its index in the vocabulary.
    Tokens that are not in the vocabulary are not supported.
    """

    # Delimiters that separate tokens. The capturing group makes the split
    # keep the punctuation marks themselves as tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that ' '.join() places in front of a punctuation token.
    # Includes ':' and ';' because encode() produces them as tokens too.
    _SPACE_BEFORE_PUNCTUATION = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """
        vocab: Dictionary of token:index pairs
        """
        # Copy the mapping so that later changes to the caller's dictionary
        # cannot leave the two lookup tables out of sync
        self.str_to_int = dict(vocab)
        # Reverse lookup table with index:token pairs, used by decode()
        self.int_to_str = {
            index: token for token, index in self.str_to_int.items()
        }

    def encode(self, text):
        """
        Encodes a given text into numbers using the vocabulary
        text: Any text to be encoded
        Returns a list with one id per token
        Raises KeyError if a token is not in the vocabulary
        """
        # Split the input text into candidate tokens
        pieces = self._SPLIT_PATTERN.split(text)

        # Trim every piece and drop the ones that were only whitespace
        tokens = [piece.strip() for piece in pieces if piece.strip()]

        # Look up the id of each token; this list is the encoded text
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """
        Decodes a given list of numbers into text
        ids: List of numbers to be decoded
        Returns the tokens joined into a single string
        Raises KeyError if an id is not in the vocabulary
        """
        # Convert the ids back into tokens, separated by a blank space
        decoded_text = ' '.join(self.int_to_str[token_id] for token_id in ids)
        # Remove the blank space before punctuation. "Word1 , word2 . Hello"
        # becomes "Word1, word2. Hello"
        return self._SPACE_BEFORE_PUNCTUATION.sub(r'\1', decoded_text)

Let's try this version of the tokenizer.

In [8]:
# Create an instance of the tokenizer
oTokenizerV1 = SimpleTokenizerV1(vocabulary)

# Create a sample text to test the tokenizer
# The line break and the indentation inside the string are whitespace, which
# encode() discards, so they do not produce tokens
sample_text = """"It's the last he painted, you know,"
                  Mrs. Gisburn said with pardonable pride."""

# Encode the sample text into a list of token ids
ids = oTokenizerV1.encode(sample_text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [9]:
# Decode the ids of the sample text
# decode() joins the tokens with blank spaces and only removes the space in
# front of punctuation, so the result is close to sample_text, not identical
decoded_text = oTokenizerV1.decode(ids)
print(decoded_text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


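The tokenizer works as intended, although decoding is not a perfect round trip: the decoded text has extra spaces after the opening quotation mark and after the apostrophe (`" It' s`). More importantly, the tokenizer is limited to words in the vocabulary. To handle tokens that are not contained in the vocabulary, we must add some special tokens.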

In [10]:
# Test a new sample text that contains the word "Hello" not contained in the
# vocabulary. SimpleTokenizerV1 has no fallback for unknown words, so encode()
# raises a KeyError. The error is caught and displayed so that the notebook
# still runs from top to bottom.

sample_text2 = "Hello, how are you?"
try:
    ids = oTokenizerV1.encode(sample_text2)
    print(ids)
except KeyError as error:
    print(f'{type(error).__name__}: {error}')

KeyError: 'Hello'

In [11]:
# Adding the special tokens <|unk|> for unknown tokens
# and <|endoftext|> for separating different texts
# setdefault() keeps an id that was already assigned, so running this cell
# more than once does not give both special tokens the same id

vocabulary.setdefault('<|endoftext|>', len(vocabulary))
vocabulary.setdefault('<|unk|>', len(vocabulary))

# Checking the last 4 tokens in the vocabulary
for token, index in list(vocabulary.items())[-4:]:
    print(f'{token}: {index}')

your: 1128
yourself: 1129
<|endoftext|>: 1130
<|unk|>: 1131


Now we create `SimpleTokenizerV2`, a modified version of `SimpleTokenizerV1` that uses the new `<|unk|>` token for words that are not in the vocabulary.

In [12]:
class SimpleTokenizerV2:
    """
    Word-level tokenizer that maps unknown tokens to '<|unk|>'.

    Text is split on punctuation marks, double dashes, and whitespace, and
    every resulting token is replaced by its index in the vocabulary. A token
    that is not in the vocabulary is replaced by the index of '<|unk|>'.
    """

    # Token that stands in for anything missing from the vocabulary
    _UNKNOWN_TOKEN = '<|unk|>'
    # Delimiters that separate tokens. The capturing group makes the split
    # keep the punctuation marks themselves as tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that ' '.join() places in front of a punctuation token.
    # Includes ':' and ';' because encode() produces them as tokens too.
    _SPACE_BEFORE_PUNCTUATION = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """
        vocab: Dictionary of token:index pairs
        """
        # Copy the mapping so that later changes to the caller's dictionary
        # cannot leave the two lookup tables out of sync
        self.str_to_int = dict(vocab)
        # Reverse lookup table with index:token pairs, used by decode()
        self.int_to_str = {
            index: token for token, index in self.str_to_int.items()
        }

    def encode(self, text):
        """
        Encodes a given text into numbers using the vocabulary
        text: Any text to be encoded
        Returns a list with one id per token
        Raises KeyError if a token is unknown and the vocabulary has no
        '<|unk|>' entry to fall back on
        """
        # Split the input text into candidate tokens
        pieces = self._SPLIT_PATTERN.split(text)

        # Trim every piece and drop the ones that were only whitespace
        tokens = [piece.strip() for piece in pieces if piece.strip()]

        # Look up the fallback id once instead of once per token
        unknown_id = self.str_to_int.get(self._UNKNOWN_TOKEN)
        if unknown_id is None:
            # No fallback available: known text still encodes, and an
            # unknown token raises KeyError
            return [self.str_to_int[token] for token in tokens]

        # Use the id of <|unk|> for every token missing from the vocabulary
        return [self.str_to_int.get(token, unknown_id) for token in tokens]

    def decode(self, ids):
        """
        Decodes a given list of numbers into text
        ids: List of numbers to be decoded
        Returns the tokens joined into a single string
        Raises KeyError if an id is not in the vocabulary
        """
        # Convert the ids back into tokens, separated by a blank space
        decoded_text = ' '.join(self.int_to_str[token_id] for token_id in ids)
        # Remove the blank space before punctuation. "Word1 , word2 . Hello"
        # becomes "Word1, word2. Hello"
        return self._SPACE_BEFORE_PUNCTUATION.sub(r'\1', decoded_text)

Let's test this new version of the tokenizer

In [13]:
# Create an instance of the second tokenizer; `vocabulary` now includes the
# '<|unk|>' entry that encode() uses for unknown words
oTokenizerV2 = SimpleTokenizerV2(vocabulary)

# Encode the same text that SimpleTokenizerV1 could not handle
sample_text2 = "Hello, how are you?"
ids = oTokenizerV2.encode(sample_text2)
print(ids)

[1131, 5, 560, 169, 1126, 10]


In [14]:
# Decode the ids; the unknown word comes back as the '<|unk|>' placeholder
decoded_text2 = oTokenizerV2.decode(ids)
print(decoded_text2)

<|unk|>, how are you?


In [15]:
# Now, let's try with two texts
sample_text3 = "I like rainy days"

# Join both texts into one string, separated by the <|endoftext|> token
two_sample_texts = " <|endoftext|> ".join([sample_text2, sample_text3])
print(two_sample_texts)

Hello, how are you? <|endoftext|> I like rainy days


In [16]:
# Encode the joined texts
# Words missing from the vocabulary are encoded with the id of '<|unk|>'
ids = oTokenizerV2.encode(two_sample_texts)
print(ids)

[1131, 5, 560, 169, 1126, 10, 1130, 53, 628, 1131, 316]


In [17]:
# Decode the ids of the joined sample texts
# Unknown words cannot be recovered: they are decoded as '<|unk|>'
decoded_joined_texts = oTokenizerV2.decode(ids)
print(decoded_joined_texts)

<|unk|>, how are you? <|endoftext|> I like <|unk|> days


# Byte pair encoding