In [1]:
import os #Working with os
import urllib.request #Downloading online files

#Downloading the dataset from an online url
if not os.path.exists("the-verdict-txt"):
    url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt")
    file_path = "the-verdict-txt"
    urllib.request.urlretrieve(url, file_path)

In [2]:
#Opening the file (dataset)
with open("the-verdict-txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [3]:
#Looking at text
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [4]:
#Checking for length of the dataset
len(raw_text)

20479

In [5]:
#Tokenization warmup
import re

text = "Hello guys welcome to my youtube channel."
result = re.split(r'(\s)', text)

print(result)

['Hello', ' ', 'guys', ' ', 'welcome', ' ', 'to', ' ', 'my', ' ', 'youtube', ' ', 'channel.']


In [6]:
result = re.split(r'([,.]|\s)', text)

In [7]:
print(result)

['Hello', ' ', 'guys', ' ', 'welcome', ' ', 'to', ' ', 'my', ' ', 'youtube', ' ', 'channel', '.', '']


In [8]:
#optional step to take out white space characters
result = [item for item in result if item.strip()]
print(result)

['Hello', 'guys', 'welcome', 'to', 'my', 'youtube', 'channel', '.']


In [9]:
text = "Hello guys welcome to my youtube channel."

result = re.split(r'([,.:;?_!"()\']--|\s)', raw_text)
result = [item.strip() for item in result if item.strip()]
print(result)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius--though', 'a', 'good', 'fellow', 'enough--so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that,', 'in', 'the', 'height', 'of', 'his', 'glory,', 'he', 'had', 'dropped', 'his', 'painting,', 'married', 'a', 'rich', 'widow,', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera.', '(Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence.)', '"The', 'height', 'of', 'his', 'glory', '"--', 'that', 'was', 'what', 'the', 'women', 'called', 'it.', 'I', 'can', 'hear', 'Mrs.', 'Gideon', 'Thwing--his', 'last', 'Chicago', 'sitter--deploring', 'his', 'unaccountable', 'abdication.', '"Of', 'course', "it's", 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'way", 'up;', 'but', 'I', "don't", 'think', 'of', 'that,', 'Mr.', 'Rickham--the', 'loss', 'to', 'Arrt', 'is', 'all', 'I', 'think', 'of."', 'The', 'word,', 'on', 'Mrs.', "Th

In [10]:
len(result)

3646

In [11]:
#Converting the tokens into token ids
#Looking at all words in dataset
all_words = sorted(set(result))
all_words

['!--',
 '"--',
 '"Ah,',
 '"Ah--I',
 '"Be',
 '"By',
 '"Come',
 '"Destroyed',
 '"Don\'t',
 '"Gisburns"',
 '"Grindles."',
 '"Hang',
 '"Has',
 '"How',
 '"I',
 '"I\'d',
 '"If',
 '"It',
 '"It\'s',
 '"Jack',
 '"Money\'s',
 '"Moon-dancers"',
 '"Mr.',
 '"Mrs.',
 '"My',
 '"Never',
 '"Never,"',
 '"Of',
 '"Oh,',
 '"Once,',
 '"Only',
 '"Or',
 '"That',
 '"The',
 '"Then',
 '"There',
 '"There:',
 '"This',
 '"We',
 '"Well,',
 '"What',
 '"When',
 '"Why',
 '"Yes,',
 '"Yes--quite',
 '"Yes--she\'s',
 '"You',
 '"deadening',
 '"dragged',
 '"effects";',
 '"interesting":',
 '"lift',
 '"obituary"',
 '"strongest,"',
 '"strongly"',
 '"sweetly',
 "'Are",
 "'It's",
 "'coming'",
 "'done'",
 "'subject.'",
 "'technique'",
 "'way",
 '(I',
 '(Though',
 '.',
 '."',
 'A',
 'Among',
 'And',
 'And,',
 'Arrt',
 'As',
 'At',
 'Burlington',
 'But',
 'But,',
 'By',
 'Carlo,',
 'Carlo;',
 'Chicago',
 'Claude',
 'Croft',
 'Croft)',
 'Croft,',
 'Devonshire',
 "Don't",
 'Dubarry_',
 'Emperors',
 'Florence.)',
 'For',
 'Gallery',
 

In [12]:
len(all_words)

1486

In [13]:
#Building a vocabulary
vocab = {token:integer for integer,token in enumerate(all_words)}
print(vocab)

{'!--': 0, '"--': 1, '"Ah,': 2, '"Ah--I': 3, '"Be': 4, '"By': 5, '"Come': 6, '"Destroyed': 7, '"Don\'t': 8, '"Gisburns"': 9, '"Grindles."': 10, '"Hang': 11, '"Has': 12, '"How': 13, '"I': 14, '"I\'d': 15, '"If': 16, '"It': 17, '"It\'s': 18, '"Jack': 19, '"Money\'s': 20, '"Moon-dancers"': 21, '"Mr.': 22, '"Mrs.': 23, '"My': 24, '"Never': 25, '"Never,"': 26, '"Of': 27, '"Oh,': 28, '"Once,': 29, '"Only': 30, '"Or': 31, '"That': 32, '"The': 33, '"Then': 34, '"There': 35, '"There:': 36, '"This': 37, '"We': 38, '"Well,': 39, '"What': 40, '"When': 41, '"Why': 42, '"Yes,': 43, '"Yes--quite': 44, '"Yes--she\'s': 45, '"You': 46, '"deadening': 47, '"dragged': 48, '"effects";': 49, '"interesting":': 50, '"lift': 51, '"obituary"': 52, '"strongest,"': 53, '"strongly"': 54, '"sweetly': 55, "'Are": 56, "'It's": 57, "'coming'": 58, "'done'": 59, "'subject.'": 60, "'technique'": 61, "'way": 62, '(I': 63, '(Though': 64, '.': 65, '."': 66, 'A': 67, 'Among': 68, 'And': 69, 'And,': 70, 'Arrt': 71, 'As': 72, 

In [14]:
import re

class SimpleTokenizerV1:
    """A simple tokenizer that splits text and converts tokens to integer IDs."""
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Converts a string of text into a list of integer IDs."""
        # Split text by punctuation, '--', or whitespace, keeping delimiters
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        
        # Remove empty strings and strip whitespace from each token
        # Using a list comprehension preserves the order and duplicates
        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        # Convert tokens to integers, ignoring any tokens not in the vocabulary
        ids = [self.str_to_int[s] for s in preprocessed if s in self.str_to_int]
        return ids

    def decode(self, ids):
        """Converts a list of integer IDs back into a string of text."""
        # Convert integers back to string tokens and join them with spaces
        text = " ".join([self.int_to_str[i] for i in ids])
        
        # Clean up spacing around punctuation for a more natural output
        text = re.sub(r'\s+([,.:;?_!"()\'])', r'\1', text)
        return text

In [15]:
vocab["Jack"]

119

In [16]:
int_to_str = {i: s for s, i in vocab.items()}

int_to_str[119]

'Jack'

In [17]:
tokenizer = SimpleTokenizerV1(vocab)

In [18]:
text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride,"""

In [19]:
ids = tokenizer.encode(text)
print(ids)

[118, 1292, 810, 691, 1004, 1478, 799, 65, 93, 1133, 1451, 1020]


In [20]:
tokenizer.decode(ids)

'It the last he painted you know. Gisburn said with pardonable'

In [21]:
text = "Hello, do you like tea, is this-- a test?"

tokenizer.encode(text)

[489, 1478, 843, 1277, 773, 1312, 212]

In [22]:
#Adding new tokens into vocab
all_wordss = sorted(list(set(result)))
all_words.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_words)}

In [23]:
#verify if new added tokens are in
len(vocab.items())

1488

In [24]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1483)
('your', 1484)
('yourself', 1485)
('<|endoftext|>', 1486)
('<|unk|>', 1487)


In [25]:
#New improved tokenizer to deal with unknown tokens
import re

class SimpleTokenizerV2:
    """A simple tokenizer that splits text and converts tokens to integer IDs."""
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Converts a string of text into a list of integer IDs."""
        # Split text by punctuation, '--', or whitespace, keeping delimiters
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        
        # Remove empty strings and strip whitespace from each token
        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        #Dealing with unknown tokens
        unk_token_id = self.str_to_int["<|unk|>"]
        ids = [self.str_to_int.get(s, unk_token_id) for s in preprocessed]
        return ids

    def decode(self, ids):
        """Converts a list of integer IDs back into a string of text."""
        # Convert integers back to string tokens and join them with spaces
        text = " ".join([self.int_to_str[i] for i in ids])
        
        # Clean up spacing around punctuation for a more natural output
        text = re.sub(r'\s+([,.:;?_!"()\'])', r'\1', text)
        return text

In [26]:
text = "Hello this is japan"
tokenizer = SimpleTokenizerV2(vocab)

tokenizer.encode(text)

[1487, 1312, 773, 1487]

In [27]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|> this is <|unk|>'

In [28]:
#Byte Pair Encoding

import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

In [29]:
tokenizer.encode("Hello World!")

[15496, 2159, 0]

In [30]:
tokenizer.decode(tokenizer.encode("Hello World!"))

'Hello World!'

In [31]:
text = "Hello world <|endoftext|> i eat food"

tokenizer.encode(text)

ValueError: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


In [32]:
#EOD causes error unless specified otherwise
text = "Hello world <|endoftext|> i eat food"

tokenizer.encode(text, allowed_special={"<|endoftext|>"})

[15496, 995, 220, 50256, 1312, 4483, 2057]

In [33]:
#Data sampling with a sliding window

with open("the-verdict-txt", "r", encoding="utf=8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [34]:
enc_text

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285,
 326,
 11,
 287,
 262,
 6001,
 286,
 465,
 13476,
 11,
 339,
 550,
 5710,
 465,
 12036,
 11,
 6405,
 257,
 5527,
 27075,
 11,
 290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,


In [35]:
enc_sample = enc_text[50:]

In [36]:
#We will assume as 4 window but generally its very large like 1024 for gpt-2.0

len(enc_sample)

5095

In [37]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f" x: {x}")
print(f" y:      {y}")

 x: [290, 4920, 2241, 287]
 y:      [4920, 2241, 287, 257]


In [38]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [39]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [40]:
#PyTorch Usage

import torch

In [41]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        #Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        #Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i+1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids) 

    def __getitem__(self,idx):
        return self.input_ids[idx], self.target_ids[idx]

In [45]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    #Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")
    
    #Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    
    #Create dataloader
    dataloader = DataLoader(
    dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
    
    return dataloader

In [46]:
with open("the-verdict-txt", "r", encoding="utf=8") as f:
    raw_text = f.read()

In [47]:
dataloader = create_dataloader_v1( raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [92]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [93]:
#this stride=1 can lead to overfitting so lets do with stride= 4 instead
dataloader = create_dataloader_v1( raw_text, batch_size=1, max_length=4, stride=4, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [94]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[1807, 3619,  402,  271]]), tensor([[ 3619,   402,   271, 10899]])]


In [95]:
#with different batch size
dataloader = create_dataloader_v1(raw_text, batch_size=4, max_length=4, stride=4, shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:", inputs)
print("Targets:", targets)

Inputs: tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257]])
Targets: tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922]])


In [48]:
#Creating token embeddings
input_ids = torch.tensor([ 2,   3,    5,    1])

In [49]:
tokenizer.n_vocab

50257

In [50]:
vocab_size = 6
output_dim = 3
torch.manual_seed(123) #Random seed 
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [51]:
#Getting weight parameters
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [52]:
#This is the 3rd index row in the previous output
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [53]:
#This is the 2nd index row in the previous output
embedding_layer(torch.tensor([2]))

tensor([[ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)

In [54]:
embedding_layer(input_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

In [55]:
vocab_size = 6
output_dim = 3
torch.manual_seed(123) #Random seed 
embedding_layer = torch.nn.Embedding(tokenizer.n_vocab, output_dim)

In [56]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035],
        [-0.5880,  0.3486,  0.6603],
        [-0.2196, -0.3792,  0.7671],
        ...,
        [-0.5931,  1.0895, -0.6854],
        [ 0.7447,  0.5803, -0.4246],
        [-0.3130,  0.7558, -1.2656]], requires_grad=True)


In [57]:
#Encoding word positions
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [58]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [59]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [60]:
#Each token id now gets even more dimensions (256 here)
token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

In [61]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [62]:
torch.arange(max_length)

tensor([0, 1, 2, 3])

In [63]:
pos_embedding_layer.weight

Parameter containing:
tensor([[-1.3798,  1.3476, -0.3612,  ..., -0.7712,  0.1523, -0.5973],
        [ 0.3611, -0.5228, -0.2888,  ...,  0.8571,  0.2221,  0.1976],
        [ 1.2194,  0.8234,  0.2277,  ...,  2.5752, -1.7081, -0.5515],
        [-0.5765, -1.6450, -1.3456,  ...,  0.7075,  0.0123, -1.2205]],
       requires_grad=True)

In [64]:
pos_embedding_layer(torch.arange(max_length))

tensor([[-1.3798,  1.3476, -0.3612,  ..., -0.7712,  0.1523, -0.5973],
        [ 0.3611, -0.5228, -0.2888,  ...,  0.8571,  0.2221,  0.1976],
        [ 1.2194,  0.8234,  0.2277,  ...,  2.5752, -1.7081, -0.5515],
        [-0.5765, -1.6450, -1.3456,  ...,  0.7075,  0.0123, -1.2205]],
       grad_fn=<EmbeddingBackward0>)

In [65]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [66]:
token_embeddings.shape

torch.Size([8, 4, 256])

In [67]:
token_embeddings[0] + pos_embeddings

tensor([[-0.9206,  2.1135, -2.0881,  ..., -2.4569, -0.8164, -1.1871],
        [ 0.3567,  1.1969,  0.0850,  ...,  0.3700,  0.8277,  0.8440],
        [ 1.1740, -0.0410, -1.7390,  ...,  3.6197, -4.8771,  1.4966],
        [-0.9454, -1.5052, -1.5424,  ...,  0.6199,  1.6592, -1.4935]],
       grad_fn=<AddBackward0>)

In [68]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
