<a href="https://colab.research.google.com/github/TanviSree/22b1050_llm_from_scratch/blob/main/chapter2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import os
# Read the entire short story into memory as one string.
with open("verdict.txt", mode="r", encoding="utf-8") as story_file:
  raw_text = story_file.read()

# Report the character count and preview the first 100 characters.
print("Total characters is :", len(raw_text))
print(raw_text[0:100])

Total characters is : 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


# Stage 1 : Creating the Embeddings from raw text input

Creating the vocabulary: a mapping from every unique token (word or special character) to its token ID

In [7]:
import re
# Split on punctuation, double dashes and whitespace. The capture group makes
# re.split keep the delimiters, so each punctuation mark becomes its own token.
result = re.split(r'([.,:;_?!"()\']|--|\s)', raw_text)

# Drop empty strings and whitespace-only pieces.
fin_res = [piece for piece in result if piece.strip() != ""]

# Sorted unique tokens, then the two special tokens appended last so they
# receive the two highest IDs.
all_words = sorted(set(fin_res))
all_words += ["<|unk|>", "<|endoftext|>"]
vocab_size = len(all_words)

# Token string -> integer ID (the token's position in all_words).
vocab = dict(zip(all_words, range(vocab_size)))
print("Vocabulary size : ", vocab_size)

# Preview the first 26 (token, ID) pairs.
for item in list(vocab.items())[:26]:
  print(item)

Vocabulary size :  1132
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)


Tokenizer class that can encode as well as decode

In [8]:
class tokenizerv1:
  """Regex-based word/punctuation tokenizer backed by a fixed vocabulary.

  Text is split on punctuation, double dashes and whitespace. Tokens that are
  not in the vocabulary are encoded with the ID of "<|unk|>".
  """

  def __init__(self,vocab):
    """Store the token -> ID mapping and build the inverse ID -> token mapping.

    Args:
      vocab: dict mapping token strings to integer IDs.
    """
    self.str_to_int = vocab
    self.int_to_str = {v:k for k,v in vocab.items()}

  def encode(self,text):
    """Convert text into a list of token IDs.

    Args:
      text: the string to tokenize.

    Returns:
      list of integer IDs, one per non-whitespace token.

    Raises:
      KeyError: if an out-of-vocabulary token is met and the vocabulary has
        no "<|unk|>" entry.
    """
    pieces = re.split(r'([.,:;_?!"()\']|--|\s)',text)
    # Whitespace-only and empty pieces carry no information; drop them.
    tokens = [piece for piece in pieces if piece.strip()]
    return [
      self.str_to_int[tok if tok in self.str_to_int else "<|unk|>"]
      for tok in tokens
    ]

  def decode(self,ids):
    """Convert a sequence of token IDs back into text.

    Tokens are joined with single spaces, then the space in front of
    punctuation is removed. The result is not guaranteed to be identical to
    the originally encoded text (e.g. "--" stays surrounded by spaces).

    Args:
      ids: iterable of integer token IDs.

    Returns:
      the decoded string.

    Raises:
      KeyError: if an ID is not present in the vocabulary.
    """
    text = " ".join(self.int_to_str[token_id] for token_id in ids)
    # encode() splits ':' and ';' into separate tokens, so they must be
    # re-attached here too; otherwise "a; b" would decode as "a ; b".
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text


In [9]:
# Build the tokenizer from the vocabulary and encode a sample passage.
tokenizer = tokenizerv1(vocab)
text = "Well!--even through the prism of Hermia's tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him--it was fitting that they should mourn him."
ids = tokenizer.encode(text)
print(ids)

[108, 0, 6, 399, 1007, 988, 795, 722, 50, 2, 850, 976, 53, 436, 117, 1016, 418, 988, 420, 1108, 395, 7, 80, 57, 38, 0, 93, 1112, 514, 654, 546, 6, 585, 1077, 444, 987, 994, 879, 687, 546, 7]


In [10]:
# Decode the IDs back to text. decode() joins tokens with single spaces and only
# removes the space in front of certain punctuation, so this is not an exact
# round trip (note the spacing around "--" and after "'").
tokenizer.decode(ids)

"Well! -- even through the prism of Hermia' s tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him -- it was fitting that they should mourn him."

Now testing the tokenizer after adding the special tokens `<|unk|>` (unknown word) and `<|endoftext|>` (text separator):

In [11]:
print(len(vocab))
# "Tanvi" is not in the vocabulary, so it encodes as the <|unk|> ID.
text = "even through the prism of Hermia's tears I felt Tanvi to face"
print(tokenizer.encode(text))
text2 = "Poor Jack Gisburn! The women had made him"
# Join the two passages with <|endoftext|> as the separator token.
text3 = " <|endoftext|> ".join((text,text2))
ids = tokenizer.encode(text3)
print(tokenizer.decode(ids))

1132
[399, 1007, 988, 795, 722, 50, 2, 850, 976, 53, 436, 1130, 1016, 418]
even through the prism of Hermia' s tears I felt <|unk|> to face <|endoftext|> Poor Jack Gisburn! The women had made him


Trying out the BPE tokenizer

In [12]:
import importlib
import tiktoken
# Load tiktoken's "gpt2" encoding. This rebinds `tokenizer`, replacing the
# tokenizerv1 instance used in the earlier cells.
tokenizer = tiktoken.get_encoding("gpt2")
text = "I felt the happiest at DiagonAlley with Hagrid <|endoftext|>"
# allowed_special permits "<|endoftext|>" in the input text; it shows up as the
# single ID 50256 at the end of the printed list.
ids = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(ids)

[40, 2936, 262, 49414, 379, 6031, 1840, 2348, 1636, 351, 21375, 6058, 220, 50256]


In [13]:
# Decode the BPE token IDs back into text.
tokenizer.decode(ids)

'I felt the happiest at DiagonAlley with Hagrid <|endoftext|>'

Creating input-target pairs using sliding window approach

In [14]:
# Encode the whole story, then drop the first 50 tokens before sampling.
enc1 = tokenizer.encode(raw_text)
enc1 = enc1[50:]
context_size = 4
# At step i the first i tokens are the context (x) and token i is the target (y).
# NOTE(review): range(1,context_size) stops at i = context_size-1, so the longest
# context printed has 3 tokens; use context_size+1 if a 4-token context is intended.
for i in range(1,context_size):
  x = enc1[:i]
  y = enc1[i]
  print(tokenizer.decode(x),"=>",tokenizer.decode([y]))

 and =>  established
 and established =>  himself
 and established himself =>  in


Using PyTorch's `Dataset` and `DataLoader` classes to create a custom dataset class and a data loader

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
class GPTDataset(Dataset):
  """Sliding-window dataset of (input, target) token-ID tensors.

  The text is tokenized once. A window of max_length tokens is moved across
  the token sequence in steps of stride; each window is an input sample and
  the same window shifted right by one token is its target.
  """

  def __init__(self,text,tokenizer,max_length,stride):
    """Tokenize text and materialize every input/target window.

    Args:
      text: the raw string to tokenize.
      tokenizer: object whose encode(text, allowed_special=...) returns a
        list of integer token IDs.
      max_length: number of tokens in each input (and target) window.
      stride: number of tokens between the starts of consecutive windows.
    """
    encoded = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
    # Stop max_length tokens before the end so every target window, which is
    # shifted by one, is still full length.
    window_starts = range(0,len(encoded)-max_length,stride)
    self.input_ids = [
      torch.tensor(encoded[start:start+max_length],dtype=torch.long)
      for start in window_starts
    ]
    self.target_ids = [
      torch.tensor(encoded[start+1:start+max_length+1],dtype=torch.long)
      for start in window_starts
    ]

  def __len__(self):
    """Return the number of windows in the dataset."""
    return len(self.input_ids)

  def __getitem__(self,idx):
    """Return the (input, target) tensor pair stored at position idx."""
    inputs = self.input_ids[idx]
    targets = self.target_ids[idx]
    return inputs,targets

In [5]:
def gpt_dataloader(txt,batch_size=4,max_length=256,stride=128,shuffle=True,drop_last=True,num_workers=0):
  """Build a DataLoader of sliding-window (input, target) batches over txt.

  The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
  GPTDataset; the remaining arguments are forwarded to DataLoader.

  Args:
    txt: raw text to tokenize.
    batch_size: number of samples per batch.
    max_length: number of tokens in each input window.
    stride: number of tokens between the starts of consecutive windows.
    shuffle: forwarded to DataLoader.
    drop_last: forwarded to DataLoader.
    num_workers: forwarded to DataLoader.

  Returns:
    the configured DataLoader.
  """
  return DataLoader(
    GPTDataset(txt,tiktoken.get_encoding("gpt2"),max_length,stride),
    batch_size=batch_size,
    shuffle=shuffle,
    drop_last=drop_last,
    num_workers=num_workers,
  )

Testing this with our verdict short story

In [18]:
# stride equals max_length, so consecutive input windows do not overlap;
# shuffle=False keeps the samples in text order.
dataloader = gpt_dataloader(raw_text,batch_size=4,max_length=4,stride=4,shuffle=False,drop_last=True,num_workers=0)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
# batch_size is the number of samples processed per batch, max_length is the number of tokens in each input sequence, and stride is the offset by which the window moves to get the next input

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922]])]


In [20]:
# Pull the next batch from the same iterator and unpack it into inputs and targets.
inputs,targets= next(data_iter)
print("Inputs : ",inputs)
print("Targets : ",targets)

Inputs :  tensor([[  287,   262,  6001,   286],
        [  465, 13476,    11,   339],
        [  550,  5710,   465, 12036],
        [   11,  6405,   257,  5527]])
Targets :  tensor([[  262,  6001,   286,   465],
        [13476,    11,   339,   550],
        [ 5710,   465, 12036,    11],
        [ 6405,   257,  5527, 27075]])


Converting token IDs to embeddings

In [21]:
# For a sample vocabulary of, say, 20 tokens and an output dimension of 3, create an embedding matrix.
# Seeding first makes the randomly initialized weights reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(20,3)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035],
        [-0.5880,  0.3486,  0.6603],
        [-0.2196, -0.3792,  0.7671],
        [-1.1925,  0.6984, -1.4097],
        [ 0.1794,  1.8951,  0.4954],
        [ 0.2692, -0.0770, -1.0205],
        [-0.1690,  0.9178,  1.5810],
        [ 1.3010,  1.2753, -0.2010],
        [ 0.4965, -1.5723,  0.9666],
        [-1.1481, -1.1589,  0.3255],
        [-0.6315, -2.8400, -1.3250],
        [ 0.1784, -2.1338,  1.0524],
        [-0.3885, -0.9343, -0.4991],
        [-1.0867,  0.8805,  1.5542],
        [ 0.6266, -0.1755,  1.3111],
        [-0.2199,  0.2190,  0.2045],
        [ 0.5146,  0.9938, -0.2587],
        [-1.0826,  0.1036, -2.1996],
        [-0.0885, -0.5612,  0.6716],
        [ 0.6933, -0.9487, -0.0765]], requires_grad=True)


Encoding positions into embeddings

In [25]:
# Create a second embedding matrix (for positions) with matching dimensions and add it to the token embedding matrix.
torch.manual_seed(123)
max_length = 4
# 50257 rows, 256 columns — presumably the GPT-2 BPE vocabulary size (IDs up to 50256 appear above) and the embedding dimension.
token_embedding_layer = torch.nn.Embedding(50257,256)
dataloader= gpt_dataloader(raw_text,batch_size=8,max_length=max_length,stride=max_length,shuffle=False,drop_last=True,num_workers=0)
data_iter = iter(dataloader)
inputs,target = next(data_iter)
# One embedding row per position in the window (0 .. max_length-1).
position_embedding_layer = torch.nn.Embedding(max_length,256)
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape) #expected = 8,4,256
pos_embeddings = position_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape) #expected = 4,256
# The (4,256) position embeddings broadcast across the batch dimension of the (8,4,256) token embeddings.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape) #expected = 8,4,256

torch.Size([8, 4, 256])
torch.Size([4, 256])
torch.Size([8, 4, 256])
