In [24]:
import pandas as pd

In [25]:
#Reading in a short story as text sample into Python
with open("the-verdict.txt", "r", encoding = "utf-8") as f:
          raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20398
﻿I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no


### Preprocessing Steps (Chapter 2)

#### 1. Tokenizing Text

In [27]:
import re
text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [28]:
#separting punction and spaces from the text
result = re.split(r'([,.]|\s)', text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [29]:
#remvoing white spaces
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [30]:
#extending the tokenizer to handle wider range of punctuations
text = "Hello, world. Is this-- a test."
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print("Tokenized text: ",result)

Tokenized text:  ['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '.']


In [31]:
#applying this tokenizer to the full Edith Warton's story
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(f"length of the complete tokenized text from Edith Wharton story: {len(preprocessed)}")

length of the complete tokenized text from Edith Wharton story: 4690


In [32]:
#printing first 30 tokens
print(f"First 30 tokens: {preprocessed[:30]}")

First 30 tokens: ['\ufeffI', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


#### 2. Converting token into token IDs

create a list of all unique tokens and sort them alphabetically to determine the vocabulary

In [35]:
all_words = sorted(list(set(preprocessed)))
vocab_size = len(all_words)
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 1131


In [36]:
#creating a vocabulary
vocab = {token: integer for integer, token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i > 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)
('His', 51)


In [37]:
#implementing a tokenizer class to tokenize text into tokens, encode tokens to integers and decode integers to tokens

class SimpleTokenzierV1:
    def __init__(self, vocab):
        self.str_to_int = vocab #A
        self.int_to_str = {i:s for s,i in vocab.items()} #B

    def encode(self, text): #C
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids): #D
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) #E
        return text

In [38]:
#instantiating tokenizer object to test the SimpleTokenzierV1 class
tokenizer = SimpleTokenzierV1(vocab)

In [39]:
text = raw_text[:99]
ids = tokenizer.encode(text)
print(f"Token ids for sample text from Edith Warton's story: {ids}")

Token ids for sample text from Edith Warton's story: [1130, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6, 908, 585, 1077, 709]


In [40]:
#testing decoder on the token ids above
print(f"text from decoding token ids using SimpleTokenzierV1 class: \n{tokenizer.decode(ids)}")

text from decoding token ids using SimpleTokenzierV1 class: 
﻿I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no


In [41]:
#running the tokenizer on a sample text which is not a part of the text used to create the vocab
text = "Hello, do you like tea?"
tokenizer.encode(text)

KeyError: 'Hello'

here, the key error suggests that Hello is not a part of the vocabulary and hence we need to use large and diverse text in order to extend the vocabulary when creating a LARGE language models

#### 3. Adding special context tokens

modifying SimpleTokenzierV1 to support new tokens for unknown words and document boundaries

In [42]:
'''adding tokens:
1. <unk> - to represent unknown or new words that are not part of the vocabulary
2. <|endoftext|> -  marker to separate two different text source from each other
'''
all_tokens =  sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<unk>"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}

print(f"length of the vocabulary after extending the vocab with unknown words and end of text markers: {len(vocab.items())}")

length of the vocabulary after extending the vocab with unknown words and end of text markers: 1133


In [44]:
#printing a sample from the end of the extended vocab
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('your', 1128)
('yourself', 1129)
('\ufeffI', 1130)
('<|endoftext|>', 1131)
('<unk>', 1132)


In [46]:
#SimpleTokenzierV2 replaces unknown words with the token "<unk>"
class SimpleTokenzierV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int
                       else "<unk>" for item in preprocessed] #A
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) #B
        return text

In [48]:
#concatenating two unrelated texts
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))
print(f"concatenated text with endoftext marker: \n{text}")

concatenated text with endoftext marker: 
Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [50]:
#testing the SimpleTokenzierV2 on a text that is concatenation of two unrelated texts
tokenizer = SimpleTokenzierV2(vocab)
print(tokenizer.encode(text))

[1132, 5, 355, 1126, 628, 975, 10, 1131, 55, 988, 956, 984, 722, 988, 1132, 7]


we can see that the list of token IDs contains 1131 for the <|endoftext|> separator token as well as two 1132 tokens, which are used for unknown words.

In [53]:
#testing detokenizer for SimpleTokenzierV2
print(tokenizer.decode(tokenizer.encode(text)))

<unk>, do you like tea? <|endoftext|> In the sunlit terraces of the <unk>.


#### 4. BPE: Byte pair encoding

unlike the simple tokenizer implemented above, GPT uses byte pair encoding tokenizer which does not replace unknown words with "unk" token but breaks down the word into subword units

In [63]:
pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [67]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.9.0


In [71]:
tokenizer = tiktoken.get_encoding("gpt2") #similar to SimpleTokenzierV2

In [75]:
#encoding using BPE
text = "Hello, do you like tea? <|endoftext|> In the sunlit terra"
integers = tokenizer.encode(text, allowed_special = {"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 1059, 430]


here, the tokenzier handles unknown words by breaking them down into subword units or even individual characters provinding them the largest token id

In [77]:
#decoding using BPE
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terra


In [84]:
text = "Akwirw ier"
integers = tokenizer.encode(text)
strings = tokenizer.decode(integers)
print(f"token id of unknown words broken down into subword units: {integers}")
print(f"text of unknown words from token ids: {strings}")

token id of unknown words broken down into subword units: [33901, 86, 343, 86, 220, 959]
text of unknown words from token ids: Akwirw ier


BPE is able to handle unknown words by building a vocab by iteratively merging frequent characters into subwords and frequest subwords into words

#### 5. Data Sampling with a sliding window

In [89]:
#tokenizing the entire Edith Warton story usinf the BPE tokenizer
with open("the-verdict.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(f"length of tokenized Edith Warton Story {len(enc_text)}")

length of tokenized Edith Warton Story 5066


In [91]:
#taking a sample of first 50 tokens
enc_sample = enc_text[50:]

In [99]:
#creating input-target pairs for next word prediction
context_size = 4 #A

x = enc_sample[:context_size] #contains the input tokens
y = enc_sample[1:context_size+1] #contains targets' inputs shiftd by 1

print(f"x: {x}")
print(f"y:        {y}")

x: [5527, 27075, 11, 290]
y:        [27075, 11, 290, 4920]


In [101]:
#creating the next-word prediction
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "------>" , desired)

[5527] ------> 27075
[5527, 27075] ------> 11
[5527, 27075, 11] ------> 290
[5527, 27075, 11, 290] ------> 4920


In [105]:
#creating the next-word prediction - visualizing the decoded text
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "------>" , tokenizer.decode([desired]))

 rich ------>  widow
 rich widow ------> ,
 rich widow, ------>  and
 rich widow, and ------>  established


In [121]:
#implementing a data loader that iterates over the input dataset and returns inputs & targets as pytorch tensors
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt) #A

        for i in range(0, len(token_ids) - max_length, stride): #B
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self): #C
        return len(self.input_ids)

    def __getitem__(self, idx): #D
        return self.input_ids[idx], self.target_ids[idx]

In [129]:
#loading the input in batches via PyTorch DataLoader
def create_dataloader_v1(txt, batch_size = 4, max_length = 256, stride = 128, shuffle = True, drop_last = True):
    tokenizer = tiktoken.get_encoding("gpt2") #A
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) #B
    dataloader = DataLoader(dataset, batch_size = batch_size, shuffle = shuffle, drop_last = drop_last)
    return dataloader

In [135]:
#testing create_dataloader_v1 with batch_size with 1 for an LLM with context size of 4
with open("the-verdict.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()

dataloader = create_dataloader_v1(raw_text, batch_size = 1, max_length = 4, stride = 1, shuffle = False, drop_last = False)
data_iter = iter(dataloader) #A
first_batch = next(data_iter)
print(first_batch)

[tensor([[171, 119, 123,  40]]), tensor([[119, 123,  40, 367]])]


In [137]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[119, 123,  40, 367]]), tensor([[ 123,   40,  367, 2885]])]


- if we compare the first and the second batch - we can observe that batch token IDs in the second batch have shifted by one position as compared to the first batch
- stride setting determines the number of positons the tokens shift across batches, thereby emulating a sliding window approach
- if the stride is set equal to the input window size it prevents overlap between batches

In [143]:
#using the dataloader to sample with a batch size>1
dataloader = create_dataloader_v1(raw_text, batch_size = 8, max_length = 4, stride = 4, shuffle = False, drop_last = False)
data_iter = iter(dataloader) #A
inputs, targets = next(data_iter)
print(f"Inputs: \n{inputs}")
print(f"Targets: \n{targets}")

Inputs: 
tensor([[  171,   119,   123,    40],
        [  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284]])
Targets: 
tensor([[  119,   123,    40,   367],
        [ 2885,  1464,  1807,  3619],
        [  402,   271, 10899,  2138],
        [  257,  7026, 15632,   438],
        [ 2016,   257,   922,  5891],
        [ 1576,   438,   568,   340],
        [  373,   645,  1049,  5975],
        [  284,   502,   284,  3285]])


increasing the stride to 4 ensure no word is skipped and avoids any overlap between batches (more overlap increase overfitting)

#### 6. Creating token embeddings

Vector representation of token IDs along with token positions (absolute positional embedding) that will serve as an input data format for LLMs

In [150]:
#taking a sample of four input tokens
input_ids = torch.tensor([2, 3, 5, 1])

In [152]:
#taking a vocab size of 6 and embedding size of 3
vocab_size = 6
output_dim = 3

In [154]:
'''taking the vocab_size and output_dim to instantiate a embedding layer in PyTorch 
(setting random seed to 123 for reproducibility)'''

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


- the above matrix is of 6x3 with 6 rows one for each token and 3 columns one for each embedding dimension
- these random values are further optimized in the LLM training process via backpropagation

In [159]:
#applying the instantiated embedding layer to a token ID to obtain the embedding vector
print(embedding_layer(torch.tensor([3]))) #lookup the embedding vector for this token id in the embedding layer weight matrix above

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [162]:
#applying the instantiated embedding layer to all token ID to obtain the embedding vector
print(embedding_layer(input_ids)) 

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


#### 7. Encoding word positions

In [172]:
output_dim = 256
vocab_size = 50257
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [176]:
#instantiating the data loader
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size = 8, max_length = max_length, stride = max_length, 
                                  shuffle = False, drop_last = False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(f"Token IDs: \n{inputs}")
print(f"Inputs shape: \n{inputs.shape}")

Token IDs: 
tensor([[  171,   119,   123,    40],
        [  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284]])
Inputs shape: 
torch.Size([8, 4])


the token ID tensor of shape 8X4 indicates that the data batch consists 8 text samples with 4 tokens each

In [181]:
#using the embedding layer to embed these token ids in 256 dimensional vectors
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [183]:
'''following the GPT model's absolute positional embedding approach - creating another embedding layer 
    with same dimension as token_embedding_layer'''

context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [189]:
#adding the pos_embeddings to token_embeddings in each of the 8 batches
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


these input_embeddings are now ready to be processed by the main LLM Module

### Coding attention mechanism (Chapter 3)

In [200]:
#taking a sample input sentence
import torch
inputs = torch.tensor(
[[0.43, 0.15, 0.89], # Your (x^1) 
 [0.55, 0.87, 0.66], # journey (x^2)
 [0.57, 0.85, 0.64], # starts (x^3)
 [0.22, 0.58, 0.33], # with (x^4)
 [0.77, 0.25, 0.10], # one (x^5)
 [0.05, 0.80, 0.55]] # step (x^6)
)

#### 8. Computing intermediate attention scores

In [208]:
#computing intermediate attention scores between query and input token by taking dot product of query with input token
query = inputs[1] #A
attn_scores_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    attn_scores_2[i] = torch.dot(x_i, query)

print(f"the computed intermediate attention scores are: \n{attn_scores_2}")

the computed intermediate attention scores are: 
tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


#### 9. Obtaining attention weights

In [210]:
#obtaining attention weights (that sum up to 1) by normalizing the attention scores
attn_weights_2_tmp = attn_scores_2 / attn_scores_2.sum()
print(f"attention weights: {attn_weights_2_tmp}")
print(f"sum of attention weights: {attn_weights_2_tmp.sum()}")

attention weights: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
sum of attention weights: 1.0000001192092896


In [219]:
'''softmax function is commonly used to normalize the attention scores 
(can handle extreme values, ensures attn. weights are alays positive etc.)'''
def softmax_naive(x):
    return torch.exp(x) / torch.exp(x).sum(dim = 0)

attn_weights_2_naive = softmax_naive(attn_scores_2)
print(f"attention weights: {attn_weights_2_naive}")
print(f"sum of attention weights: {attn_weights_2_naive.sum()}")

attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
sum of attention weights: 1.0


In [221]:
#using PyTorch softmax funtion to prevent overflow and underflow while dealing with large or small input values
attn_weights_2 = torch.softmax(attn_scores_2, dim = 0)
print(f"attention weights: {attn_weights_2}")
print(f"sum of attention weights: {attn_weights_2.sum()}")

attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
sum of attention weights: 1.0


#### 10. Calculating context vector

In [225]:
'''calculatin context vector by multiplying embedded input tokens with corresponding attention weights 
and then summing the resulting vectors'''


'calculatin context vector by multiplying embedded input tokens with corresponding attention weights \nand then summing the resulting vectors'