In [5]:
from importlib.metadata import version

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.6.0
tiktoken version: 0.9.0


## **Tokenizing text**

In [29]:
import os
import urllib.request

if not os.path.exists("the-verdict.txt"):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    file_path = "the-verdict.txt"
    urllib.request.urlretrieve(url, file_path)

In [30]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of character:", len(raw_text))

Total number of character: 20479


In [32]:
import re

text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [10]:
result = re.split(r'([,.]|\s)', text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [11]:
# Strip whitespace from each item and then filter out any empty strings.
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [12]:
text = "Hello, world. Is this-- a test?"

result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [33]:
preprocessed_data = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed_data = [item.strip() for item in preprocessed_data if item.strip()]
print(preprocessed_data[:30])
print(len(preprocessed_data))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
4690


## **Converting tokens into token IDs**

In [34]:
all_words = sorted(set(preprocessed_data))
print(len(all_words))

1130


In [35]:
vocab = {token:integer for integer,token in enumerate(all_words)}

for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 10:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)


In [36]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
                                
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [37]:
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [38]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [39]:
tokenizer.decode(tokenizer.encode(text))

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

Now if we add any other word which will be outside of our vocab, our functions will through errors
below is an example

In [28]:
text = "Hello world-- welcome here!"

ids = tokenizer.encode(text)

KeyError: 'Hello'

In [40]:
all_tokens = sorted(list(set(preprocessed_data)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab_updated = {token:integer for integer,token in enumerate(all_tokens)}
len(vocab_updated.items())

1132

In [41]:
for i, item in enumerate(list(vocab_updated.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [42]:
class SimpleTokenizerV2:
    def __init__(self,vocab_updated):
        self.str_to_int = vocab_updated
        self.int_to_str = { i:s for s,i in vocab_updated.items()}

    def encode(self, text):
        preprocessed_data = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed_data = [item.strip() for item in preprocessed_data if item.strip()]
        preprocessed_data = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed_data
        ]
        ids = [self.str_to_int[s] for s in preprocessed_data]
        return ids
    
    def decode(self,ids):
        text = " ".join([self.int_to_str[i] ]for i in ids)
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [43]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

In [44]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

strings = tokenizer.decode(integers)

print(strings)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


## **Data Sampling using Sliding Windows**

In [45]:
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [46]:
enc_sample = enc_text[50:]
len(enc_sample)

5095

In [47]:
context_size = 10
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f'x: {x}')
print(f"y:      {y}")

x: [290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686]
y:      [4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976]


In [48]:
#example
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a
 and established himself in a ---->  vill
 and established himself in a vill ----> a
 and established himself in a villa ---->  on
 and established himself in a villa on ---->  the
 and established himself in a villa on the ---->  Riv
 and established himself in a villa on the Riv ----> iera


In [49]:
import torch


In [50]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        
        for i in range(0, len(token_ids)-max_length, stride):
            input_chunk = token_ids[i:i + max_length + 1]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [51]:
def create_dataloader_v1(txt, batch_size = 4, max_length = 256,
                         stride = 128, shuffle = True, drop_lasT = True,
                         num_workers = 0):
    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset, batch_size = batch_size,
        shuffle=shuffle,
        drop_last=drop_lasT,
        num_workers=num_workers
    )

    return dataloader

In [52]:
with open("the-verdict.txt", "r", encoding="UTF-8") as f:
    raw_text = f.read()

In [53]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464, 1807]]), tensor([[ 367, 2885, 1464, 1807]])]


In [54]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807, 3619]]), tensor([[2885, 1464, 1807, 3619]])]


## Creating token embeddings

In [55]:
inputs_ids = torch.tensor([2, 3, 5, 1])

vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [56]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [57]:
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [58]:
embedding_layer(torch.tensor([2]))

tensor([[ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)

In [59]:
embedding_layer(inputs_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

## Encoding word positions

In [60]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [61]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464,  1807],
        [ 1807,  3619,   402,   271, 10899],
        [10899,  2138,   257,  7026, 15632],
        [15632,   438,  2016,   257,   922],
        [  922,  5891,  1576,   438,   568],
        [  568,   340,   373,   645,  1049],
        [ 1049,  5975,   284,   502,   284],
        [  284,  3285,   326,    11,   287]])

Inputs shape:
 torch.Size([8, 5])


In [62]:
token_embeddings = token_embedding_layer(inputs_ids)
token_embeddings.shape

torch.Size([4, 256])

In [63]:
token_embeddings[0, 0]

tensor(1.3382, grad_fn=<SelectBackward0>)

In [64]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [65]:
torch.arange(max_length)

tensor([0, 1, 2, 3])

In [66]:
pos_embedding_layer.weight

Parameter containing:
tensor([[ 1.7375, -0.5620, -0.6303,  ..., -0.2277,  1.5748,  1.0345],
        [ 1.6423, -0.7201,  0.2062,  ...,  0.4118,  0.1498, -0.4628],
        [-0.4651, -0.7757,  0.5806,  ...,  1.4335, -0.4963,  0.8579],
        [-0.6754, -0.4628,  1.4323,  ...,  0.8139, -0.7088,  0.4827]],
       requires_grad=True)

In [67]:
pos_embedding_layer(torch.arange(max_length))

tensor([[ 1.7375, -0.5620, -0.6303,  ..., -0.2277,  1.5748,  1.0345],
        [ 1.6423, -0.7201,  0.2062,  ...,  0.4118,  0.1498, -0.4628],
        [-0.4651, -0.7757,  0.5806,  ...,  1.4335, -0.4963,  0.8579],
        [-0.6754, -0.4628,  1.4323,  ...,  0.8139, -0.7088,  0.4827]],
       grad_fn=<EmbeddingBackward0>)

In [68]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [69]:
token_embeddings[0] + pos_embeddings

tensor([[ 3.0757, -0.2914, -0.1232,  ..., -0.2102, -0.5769,  1.4269],
        [ 2.9805, -0.4495,  0.7133,  ...,  0.4292, -2.0019, -0.0704],
        [ 0.8732, -0.5052,  1.0877,  ...,  1.4509, -2.6481,  1.2503],
        [ 0.6628, -0.1923,  1.9394,  ...,  0.8314, -2.8605,  0.8751]],
       grad_fn=<AddBackward0>)

In [70]:
token_embeddings + pos_embeddings

tensor([[ 3.0757, -0.2914, -0.1232,  ..., -0.2102, -0.5769,  1.4269],
        [ 2.6989, -0.3514,  2.0421,  ..., -1.0591,  0.2438,  1.6354],
        [ 1.2462,  0.2463, -0.1070,  ...,  1.9197, -1.0387,  0.5496],
        [ 0.5523, -0.8925, -0.7798,  ...,  0.6499, -1.0436,  0.4606]],
       grad_fn=<AddBackward0>)

In [72]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([4, 256])
