# **Tokenization**

In [100]:
import os
import re
import urllib.request

import tiktoken

import random
import warnings

random.seed(42)
warnings.filterwarnings("ignore")

## Tokenizing Text

In [18]:
example_text = "Hello, world! This is, an example, of tokenizing text."
print(f"Original: {example_text}")
example_result = re.split(r'([,.!]|\s)', example_text)
example_result = [token for token in example_result if token.strip()]
print(f"Tokens: {example_result}")

Original: Hello, world! This is, an example, of tokenizing text.
Tokens: ['Hello', ',', 'world', '!', 'This', 'is', ',', 'an', 'example', ',', 'of', 'tokenizing', 'text', '.']


In [14]:
with open("the-verdict.txt", "r") as f:
    raw_text = f.read()
print(f"Length of raw text: {len(raw_text)}")

preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [token.strip() for token in preprocessed if token.strip()]
print(f"Number of tokens: {len(preprocessed)}")
print(f"First 5 tokens: {preprocessed[:5]}")

Length of raw text: 20479
Number of tokens: 4690
First 5 tokens: ['I', 'HAD', 'always', 'thought', 'Jack']


## Building Token IDs

In [21]:
all_words = sorted(set(preprocessed))

In [22]:
vocab_size = len(all_words)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 1130


In [36]:
vocab = {token: integer for integer, token in enumerate(all_words)}
print(f"Random tokens with IDs: {random.sample(list(vocab.items()), 5)}")

Random tokens with IDs: [('flung', 451), ('square', 919), ('incense', 569), ('Among', 13), ('deploring', 326)]


In [75]:
class Tokenizer_v1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {token: integer for integer, token in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            token.strip() for token in preprocessed if token.strip()
        ]
        ids = [self.str_to_int[token] for token in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[id] for id in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [82]:
tokenizer = Tokenizer_v1(vocab)

text = "I have mentioned that Mrs. Gisburn was rich;" # sample text from source

ids = tokenizer.encode(text)
print(f"Encoded IDs: {ids}")

tokens = tokenizer.decode(ids)
print(f"Decoded text: {tokens}")

pipeline = tokenizer.decode(tokenizer.encode(text))
print(f"Pipeline output: {pipeline}")

Encoded IDs: [53, 530, 667, 987, 67, 7, 38, 1077, 841, 9]
Decoded text: I have mentioned that Mrs. Gisburn was rich ;
Pipeline output: I have mentioned that Mrs. Gisburn was rich ;


## Adding Context Tokens

In [89]:
tokenizer = Tokenizer_v1(vocab)

example_text= "I have mentioned that Mrs. Smith was rich"

try:
    print(f"Encoded IDs: {tokenizer.encode(example_text)}")
except KeyError as e:
    print(f"Error: Token not found in vocabulary - {e}")

Error: Token not found in vocabulary - 'Smith'


In [90]:
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token: integer for integer, token in enumerate(all_tokens)}
print(f"New vocabulary size: {len(vocab)}")
print(f"Last 5 tokens in vocabulary: {list(vocab.items())[-5:]}")

New vocabulary size: 1132
Last 5 tokens in vocabulary: [('younger', 1127), ('your', 1128), ('yourself', 1129), ('<|endoftext|>', 1130), ('<|unk|>', 1131)]


In [97]:
class Tokenizer_v2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {token: integer for integer, token in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            item if item in self.str_to_int else "<|unk|>" for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[token] for token in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[id] for id in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [98]:
tokenizer = Tokenizer_v2(vocab)

print(f"Encoded IDs: {tokenizer.encode(example_text)}")
print(f"Decoded tokens: {tokenizer.decode(tokenizer.encode(example_text))}")

Encoded IDs: [53, 530, 667, 987, 67, 7, 1131, 1077, 841]
Decoded tokens: I have mentioned that Mrs. <|unk|> was rich


## Byte Pair Encoding (BPE)