# **Tokenization**

In [1]:
import os
import re
import urllib.request
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader
import random
import warnings

# Seed both Python's `random` module and PyTorch so that the sampled vocabulary
# entries and the randomly initialised embedding weights are repeatable.
random.seed(42)
torch.manual_seed(123)
# NOTE(review): this suppresses every warning for the rest of the notebook,
# including deprecation warnings; consider narrowing it to a specific category.
warnings.filterwarnings("ignore")

## Tokenizing Text

In [2]:
example_text = "Hello, world! This is, an example, of tokenizing text."
print(f"Original: {example_text}")
# The capture group makes re.split keep the delimiters; pieces that are empty
# or pure whitespace are then discarded.
example_result = [
    piece
    for piece in re.split(r'([,.!]|\s)', example_text)
    if piece.strip()
]
print(f"Tokens: {example_result}")

Original: Hello, world! This is, an example, of tokenizing text.
Tokens: ['Hello', ',', 'world', '!', 'This', 'is', ',', 'an', 'example', ',', 'of', 'tokenizing', 'text', '.']


In [3]:
# Read the source text with an explicit encoding so the result does not depend
# on the platform's default text encoding.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print(f"Length of raw text: {len(raw_text)}")

# Split on punctuation, "--" and whitespace. The capture group keeps the
# delimiters as tokens; empty and whitespace-only pieces are dropped.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [token.strip() for token in preprocessed if token.strip()]
print(f"Number of tokens: {len(preprocessed)}")
print(f"First 5 tokens: {preprocessed[:5]}")

Length of raw text: 20479
Number of tokens: 4690
First 5 tokens: ['I', 'HAD', 'always', 'thought', 'Jack']


## Building Token IDs

In [4]:
# Unique tokens in sorted order, so that ID assignment is deterministic.
all_words = list(set(preprocessed))
all_words.sort()

In [5]:
# The vocabulary size is the number of distinct tokens found in the text.
vocab_size = len(all_words)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 1130


In [6]:
# Map each token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))
# Show a handful of (token, ID) pairs; the sample is repeatable because
# `random` was seeded in the setup cell.
print(f"Random tokens with IDs: {random.sample(list(vocab.items()), 5)}")

Random tokens with IDs: [('breaking', 228), ('His', 51), ('idea', 563), ('good-breeding', 501), ('forced', 457)]


In [7]:
class Tokenizer_v1:
    """Word-level tokenizer backed by a fixed token -> ID vocabulary.

    Args:
        vocab: Mapping from token string to integer ID. The reverse mapping
            is derived from it, so IDs are expected to be unique.
    """

    # Splits on punctuation, "--" and single whitespace characters. The capture
    # group makes `split` return the delimiters as pieces of their own.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace directly in front of punctuation, removed again when decoding.
    # ":" and ";" are included because `encode` splits them off as tokens too.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Split `text` into tokens and return their IDs.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        tokens = [
            piece.strip()
            for piece in self._SPLIT_PATTERN.split(text)
            if piece.strip()
        ]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Join the tokens for `ids` with spaces and reattach punctuation.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [8]:
# Build the word-level tokenizer from the vocabulary created above.
tokenizer = Tokenizer_v1(vocab)

text = "I have mentioned that Mrs. Gisburn was rich;" # sample text from source

# Text -> token IDs.
ids = tokenizer.encode(text)
print(f"Encoded IDs: {ids}")

# Token IDs -> text. Note that `tokens` holds the decoded string, not a list.
tokens = tokenizer.decode(ids)
print(f"Decoded text: {tokens}")

# The same round trip written as a single expression.
pipeline = tokenizer.decode(tokenizer.encode(text))
print(f"Pipeline output: {pipeline}")

Encoded IDs: [53, 530, 667, 987, 67, 7, 38, 1077, 841, 9]
Decoded text: I have mentioned that Mrs. Gisburn was rich ;
Pipeline output: I have mentioned that Mrs. Gisburn was rich ;


## Adding Context Tokens

In [9]:
tokenizer = Tokenizer_v1(vocab)

# The captured output below shows that "Smith" is not in the vocabulary.
example_text = "I have mentioned that Mrs. Smith was rich"

# The failure is the point of this cell, so catch it and display it.
try:
    print(f"Encoded IDs: {tokenizer.encode(example_text)}")
except KeyError as missing_token:
    print(f"Error: Token not found in vocabulary - {missing_token}")

Error: Token not found in vocabulary - 'Smith'


In [10]:
# Append the two special tokens after the sorted vocabulary so they receive
# the highest IDs.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

vocab = {token: index for index, token in enumerate(all_tokens)}
print(f"New vocabulary size: {len(vocab)}")
print(f"Last 5 tokens in vocabulary: {list(vocab.items())[-5:]}")

New vocabulary size: 1132
Last 5 tokens in vocabulary: [('younger', 1127), ('your', 1128), ('yourself', 1129), ('<|endoftext|>', 1130), ('<|unk|>', 1131)]


In [11]:
class Tokenizer_v2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Args:
        vocab: Mapping from token string to integer ID. It should contain an
            entry for "<|unk|>"; the entry is only looked up when an unknown
            token is actually encountered.
    """

    # Splits on punctuation, "--" and single whitespace characters. The capture
    # group makes `split` return the delimiters as pieces of their own.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace directly in front of punctuation, removed again when decoding.
    # ":" and ";" are included because `encode` splits them off as tokens too.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Split `text` into tokens and return their IDs.

        Tokens missing from the vocabulary are replaced by "<|unk|>".

        Raises:
            KeyError: If an unknown token occurs and the vocabulary has no
                "<|unk|>" entry.
        """
        pieces = [
            piece for piece in self._SPLIT_PATTERN.split(text) if piece.strip()
        ]
        return [
            self.str_to_int[piece if piece in self.str_to_int else "<|unk|>"]
            for piece in pieces
        ]

    def decode(self, ids):
        """Join the tokens for `ids` with spaces and reattach punctuation.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [12]:
# Rebind `tokenizer` to the version that falls back to "<|unk|>".
tokenizer = Tokenizer_v2(vocab)

# `example_text` is the sentence that made Tokenizer_v1 raise a KeyError.
print(f"Encoded IDs: {tokenizer.encode(example_text)}")
print(f"Decoded tokens: {tokenizer.decode(tokenizer.encode(example_text))}")

Encoded IDs: [53, 530, 667, 987, 67, 7, 1131, 1077, 841]
Decoded tokens: I have mentioned that Mrs. <|unk|> was rich


## Byte Pair Encoding (BPE)

In [13]:
# Rebind `tokenizer` from the hand-written Tokenizer_v2 instance to tiktoken's
# "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

In [14]:
# Sample that mixes ordinary words, the "<|endoftext|>" marker and made-up words.
example_text = (
    "Hello, world! This is an example of tokenizing text using tiktoken from OpenAI. <|endoftext|> It can handle not only complex structures and words without defined meaning or spaces like someunknownword or jqndnhwevi."
)

# Encode with "<|endoftext|>" and "<|unk|>" listed as allowed special tokens.
# The expression is left bare so the notebook displays the resulting ID list.
# NOTE(review): example_text contains no "<|unk|>", so that entry presumably
# has no effect on this call — confirm whether it is needed.
tokenizer.encode(example_text, allowed_special={"<|endoftext|>", "<|unk|>"})

[15496,
 11,
 995,
 0,
 770,
 318,
 281,
 1672,
 286,
 11241,
 2890,
 2420,
 1262,
 256,
 1134,
 30001,
 422,
 4946,
 20185,
 13,
 220,
 50256,
 632,
 460,
 5412,
 407,
 691,
 3716,
 8573,
 290,
 2456,
 1231,
 5447,
 3616,
 393,
 9029,
 588,
 617,
 34680,
 4775,
 393,
 474,
 80,
 358,
 77,
 71,
 732,
 8903,
 13]

## Data Sampling

In [15]:
# Read the source text with an explicit encoding so the result does not depend
# on the platform's default text encoding.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Byte-pair encode the whole text with the GPT-2 encoding.
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)
print(f"Encoded text length: {len(enc_text)}")

# Skip the first 50 token IDs; the remainder is used as the demo sample.
enc_sample = enc_text[50:] # for demo

Encoded text length: 5145


In [16]:
context_size = 4 # in a real LLM: ~10000

# The targets are the inputs shifted one token to the right.
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]

print(f"x: {x}")
print(f"y:      {y}")

# Growing prefixes of the sample, each paired with the single token that
# follows it.
print(f"\nIn action:")
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    target = enc_sample[i]
    print(f"{tokenizer.decode(context)} ----> {tokenizer.decode([target])}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]

In action:
 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [17]:
class Dataset_v1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Each sample is a window of `max_length` token
    IDs paired with the same window shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose `encode(txt, allowed_special=...)` returns a
            sequence of integer token IDs.
        max_length: Number of tokens per window; must be at least 1.
        stride: Number of tokens between the starts of consecutive windows;
            must be at least 1.

    Raises:
        ValueError: If `max_length` or `stride` is smaller than 1.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject values that would otherwise yield an empty or meaningless
        # dataset (negative stride) or an opaque error from range() (zero).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        # works for only a few million tokens, not more: every window is
        # materialised as its own tensor and kept in memory
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>", "<|unk|>"})

        # Stop early enough that the shifted target window still fits. A text
        # with at most `max_length` tokens yields an empty dataset.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair at position `idx`."""
        return self.input_ids[idx], self.target_ids[idx]

In [18]:
def DataLoader_v1(txt, batch_size=2, max_length=256,
                  stride=128, shuffle=True,
                  drop_last=True, num_workers=0):
    """Build a DataLoader over sliding windows of GPT-2 token IDs.

    Args:
        txt: Text to tokenize with tiktoken's "gpt2" encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Number of tokens between the starts of consecutive windows.
        shuffle: Passed through to the DataLoader.
        drop_last: Passed through to the DataLoader.
        num_workers: Passed through to the DataLoader.

    Returns:
        A DataLoader that yields (input, target) batches from a Dataset_v1.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = Dataset_v1(txt, gpt2_encoding, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [19]:
# Windows of 4 tokens that advance one token at a time, so consecutive
# samples overlap in three positions.
dataloader_test1 = DataLoader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

print("Example 1 with Stride = 1:\n")

data_iter = iter(dataloader_test1)

# Each batch is an (inputs, targets) pair.
batch1 = next(data_iter)
print(f"First batch Input: {batch1[0]}\nFirst batch Target: {batch1[1]}\n")

batch2 = next(data_iter)
print(f"Second batch Input: {batch2[0]}\nSecond batch Target: {batch2[1]}")

Example 1 with Stride = 1:

First batch Input: tensor([[  40,  367, 2885, 1464]])
First batch Target: tensor([[ 367, 2885, 1464, 1807]])

Second batch Input: tensor([[ 367, 2885, 1464, 1807]])
Second batch Target: tensor([[2885, 1464, 1807, 3619]])


In [20]:
# With the stride equal to the window length, consecutive input windows do
# not overlap.
dataloader_test2 = DataLoader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=4,
    shuffle=False,
)

print("\nExample 2 with Stride = 4:\n")

data_iter = iter(dataloader_test2)

# Each batch is an (inputs, targets) pair.
batch1 = next(data_iter)
print(f"First batch Input: {batch1[0]}\nFirst batch Target: {batch1[1]}\n")

batch2 = next(data_iter)
print(f"Second batch Input: {batch2[0]}\nSecond batch Target: {batch2[1]}")


Example 2 with Stride = 4:

First batch Input: tensor([[  40,  367, 2885, 1464]])
First batch Target: tensor([[ 367, 2885, 1464, 1807]])

Second batch Input: tensor([[1807, 3619,  402,  271]])
Second batch Target: tensor([[ 3619,   402,   271, 10899]])


In [21]:
# Same non-overlapping windows as before, now grouped eight to a batch.
dataloader = DataLoader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

print("\nFinal data loader with Stride = 4, batch size = 8:\n")

data_iter = iter(dataloader)

# Only the input half of each (inputs, targets) batch is shown here.
batch1 = next(data_iter)
print(f"First batch Input: {batch1[0]}\n")

batch2 = next(data_iter)
print(f"Second batch Input: {batch2[0]}")


Final data loader with Stride = 4, batch size = 8:

First batch Input: tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Second batch Input: tensor([[  287,   262,  6001,   286],
        [  465, 13476,    11,   339],
        [  550,  5710,   465, 12036],
        [   11,  6405,   257,  5527],
        [27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262, 34686],
        [41976,    13,   357, 10915]])


## Creating Token Embeddings

In [22]:
# `tokenizer` is the tiktoken "gpt2" encoding bound in the data-sampling section.
print(f"Number of unique tokens: {tokenizer.n_vocab}")

Number of unique tokens: 50257


In [23]:
# Toy setup: a 6-entry vocabulary embedded into 3 dimensions.
example_input_ids = torch.tensor([2, 3, 5, 1])
example_vocab_size = 6
example_output_dim = 3

# The weight matrix has one row per token ID and is randomly initialised.
example_embedding_layer = torch.nn.Embedding(
    num_embeddings=example_vocab_size,
    embedding_dim=example_output_dim,
)

print(f"Embedding layer weights: {example_embedding_layer.weight}")

Embedding layer weights: Parameter containing:
tensor([[-0.2196, -0.3792, -0.1606],
        [-0.4015,  0.6957, -1.8061],
        [ 1.8960, -0.1750,  1.3689],
        [-1.6033, -0.7849, -1.4096],
        [-0.4076,  0.7953,  0.9985],
        [ 0.2212,  1.8319, -0.3378]], requires_grad=True)


In [24]:
# Calling the layer on the IDs returns the matching rows of the weight matrix,
# in the order of the IDs (compare with the weights printed above).
print(f"Input IDs: {example_input_ids}")
print(f"Input embeddings: \n{example_embedding_layer(example_input_ids)}")

Input IDs: tensor([2, 3, 5, 1])
Input embeddings: 
tensor([[ 1.8960, -0.1750,  1.3689],
        [-1.6033, -0.7849, -1.4096],
        [ 0.2212,  1.8319, -0.3378],
        [-0.4015,  0.6957, -1.8061]], grad_fn=<EmbeddingBackward0>)


In [25]:
# One embedding row per entry of the tokenizer's vocabulary. Note that this
# rebinds `vocab_size`, which earlier held the word-level vocabulary size.
vocab_size = tokenizer.n_vocab
output_dim = 256
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

# Non-overlapping windows of `max_length` tokens, eight per batch.
max_length = 4
dataloader = DataLoader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [26]:
# First batch from the loader: one row per window, one column per token.
print(f"Token IDs: \n{inputs}\n")
print(f"Input shape: {inputs.shape}")

Token IDs: 
tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Input shape: torch.Size([8, 4])


In [27]:
# Replace every token ID by its embedding vector; this appends a trailing
# dimension of size `output_dim` to the shape of `inputs`.
token_embeddings = token_embedding_layer(inputs)
print(f"Token embeddings shape: {token_embeddings.shape}")

Token embeddings shape: torch.Size([8, 4, 256])


## Adding Positional Info

In [28]:
# One embedding row per position in the context window.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

# Look up the vectors for positions 0 .. context_length - 1.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(f"Positional embeddings shape: {pos_embeddings.shape}")

Positional embeddings shape: torch.Size([4, 256])


In [29]:
# The positional embeddings have no batch dimension, so the addition
# broadcasts them across every sample in the batch.
input_embeddings = token_embeddings + pos_embeddings
print(f"Input embeddings shape: {input_embeddings.shape}")

Input embeddings shape: torch.Size([8, 4, 256])
