## Hi This is my data preparation worksheet

In [1]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
      raw_text = f.read()

print("Total number of characters: ", len(raw_text))
raw_text[:99]

Total number of characters:  20479


'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no '

In [2]:
import re

In [3]:
text = "Hello I am just trying this out!"
result = re.split(r'(\s)', text)
print(result)

['Hello', ' ', 'I', ' ', 'am', ' ', 'just', ' ', 'trying', ' ', 'this', ' ', 'out!']


In [4]:
result = re.split(r'[,.]|(\s)', text)
print(result)

['Hello', ' ', 'I', ' ', 'am', ' ', 'just', ' ', 'trying', ' ', 'this', ' ', 'out!']


In [5]:
result = [item for item in result if item.strip()]
result

['Hello', 'I', 'am', 'just', 'trying', 'this', 'out!']

This is a simple example of how a tokenizer works

In [6]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)

In [7]:
preprocessed = [item for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [8]:
print(len(preprocessed))

4690


Convert Tokens into Token IDs

In [9]:
allwords = sorted(set(preprocessed))

In [10]:
vocab_size = len(allwords)
vocab_size

1130

In [11]:
vocab = {token:integer for integer,token in enumerate(allwords)}

In [12]:
for i,item in enumerate(vocab.items()):
    print(i,"->",item)
    if i>=20:
        break

0 -> ('!', 0)
1 -> ('"', 1)
2 -> ("'", 2)
3 -> ('(', 3)
4 -> (')', 4)
5 -> (',', 5)
6 -> ('--', 6)
7 -> ('.', 7)
8 -> (':', 8)
9 -> (';', 9)
10 -> ('?', 10)
11 -> ('A', 11)
12 -> ('Ah', 12)
13 -> ('Among', 13)
14 -> ('And', 14)
15 -> ('Are', 15)
16 -> ('Arrt', 16)
17 -> ('As', 17)
18 -> ('At', 18)
19 -> ('Be', 19)
20 -> ('Begin', 20)


In [13]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self,text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        preprocessed = [item for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.:;?_!"()\'])', r'\1',text)
        return text

In [14]:
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [15]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [16]:
sampleText = "Hello, do you like tea?"
print(tokenizer.encode(sampleText))

KeyError: 'Hello'

Adding Special Context tokens

In [17]:
alltokens = sorted(set(preprocessed))
alltokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer,token in enumerate(alltokens)}

In [18]:
len(vocab.items())

1132

In [19]:
class SimpleTokenizerV2 :
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self,text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed [
            item if item in self.str_to_int
            else [ "<|unk|>" for item in preprocessed ]
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.:;?_!"()\'])', r'\1',text)
        return text

In [20]:
tokenizer = SimpleTokenizerV2
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1,text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [21]:
tokenizer.encode(text)

TypeError: SimpleTokenizerV2.encode() missing 1 required positional argument: 'text'

## Byte Pair Encoding (BPE)

Used in GPT-2 and GPT-3

In [54]:
pip install tiktoken

Defaulting to user installation because normal site-packages is not writeable
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Downloading tiktoken-0.9.0-cp312-cp312-win_amd64.whl (894 kB)
   ---------------------------------------- 0.0/894.9 kB ? eta -:--:--
   ---------------------------------------- 894.9/894.9 kB 5.8 MB/s eta 0:00:00
Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl (273 kB)
Installing collected packages: regex, tiktoken
Successfully installed regex-2024.11.6 tiktoken-0.9.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.9.0


In [23]:
tokenizer = tiktoken.get_encoding("gpt2")

In [24]:
text = ("Hello, do you like tea? <|endoftext|> In the sunlit terraces"
        "of someunknownPlace.")

In [25]:
integers = tokenizer.encode(text, allowed_special = {"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [26]:
 strings = tokenizer.decode(integers)
strings

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

### Create Input - Target data pairs

In [27]:
enc_text = tokenizer.encode(raw_text)

In [28]:
print(len(enc_text))

5145


context size is the size of tokens included in the input

In [29]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

NameError: name 'enc_sample' is not defined

In [30]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "--->", desired)
    print(tokenizer.decode(context), "--->", tokenizer.decode([desired]))

NameError: name 'enc_sample' is not defined

But we need these in the form of tensors as pytorch always works with tensors (i/p tensor & o/p tensor)

#### Implementing a Data Loader

In [31]:
from torch.utils.data import DataLoader, Dataset

In [45]:
class GPTDatasetV1(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids = []
        self.output_ids = []
        token_ids = tokenizer.encode(text, allowed_special = {"<|endoftext|>"})
        print(f"Total tokens: {len(token_ids)}")

        if len(token_ids) < max_length:
            print("⚠️ Text is too short! Try reducing max_length.")
            return  
        for i in range(0, len(token_ids)-max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.output_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self,idx):
        return self.input_ids[idx], self.output_ids[idx]
        

In [46]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last = True,
                         num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    print(f"Total samples in dataset: {len(dataset)}")
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader

Difference between batch_size and num_workers => batch size is the number of operations the model perform before updating its parameters, while num_workers is the parallel procession consumption of cpus/gpus

In [34]:
import torch

In [81]:
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_itr = iter(dataloader)
first_batch = next(data_itr)
print(first_batch)

[tensor([[15496,    11,   466,   345]]), tensor([[ 11, 466, 345, 588]])]


In [82]:
second_batch = next(data_itr)

In [83]:
second_batch

[tensor([[ 11, 466, 345, 588]]), tensor([[ 466,  345,  588, 8887]])]

In [89]:
dataloader2 = create_dataloader_v1(raw_text, batch_size=4, max_length=4, stride=4, shuffle=False)

data_itr2 = iter(dataloader2)
inputs,targets = next(data_itr2)
print(inputs, "--->", targets)

tensor([[15496,    11,   466,   345],
        [  588,  8887,    30,   220],
        [50256,   554,   262,  4252],
        [18250,  8812,  2114,  1659]]) ---> tensor([[   11,   466,   345,   588],
        [ 8887,    30,   220, 50256],
        [  554,   262,  4252, 18250],
        [ 8812,  2114,  1659,   617]])


In [1]:
pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl.metadata (8.2 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.2-cp312-cp312-win_amd64.whl.metadata (6.5 kB)
Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   - -------------------------------------- 0.8/24.0 MB 6.7 MB/s eta 0:00:04
   -- ------------------------------------- 1.6/24.0 MB 3.6 MB/s eta 0:00:07
   --- ------------------------------------ 2.1/24.0 MB 3.2 MB/s eta 0:00:07
   ----- ---


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
import gensim.downloader as api

In [2]:
model = api.load("word2vec-google-news-300")

300 in this stands for 300 dim vector

In [3]:
word_vectors = model
print(word_vectors['computer'])

[ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777344e-02  1.88476562e-01
  5.51757812e-02  5.02929

In [5]:
print(word_vectors['cat'].shape)

(300,)


King + Women - Man = ?

In [6]:
print(word_vectors.most_similar(positive=['king', 'women'], negative=['man'], topn=10))

[('queen', 0.4827326238155365), ('queens', 0.466781347990036), ('kumaris', 0.4653734564781189), ('kings', 0.4558638632297516), ('womens', 0.422832190990448), ('princes', 0.4176960587501526), ('Al_Anqari', 0.41725507378578186), ('concubines', 0.4011078476905823), ('monarch', 0.3962482810020447), ('monarchy', 0.39430150389671326)]


In [8]:
print(word_vectors.most_similar("tower", topn=5))

[('towers', 0.8531750440597534), ('skyscraper', 0.6417425870895386), ('Tower', 0.639177143573761), ('spire', 0.594687819480896), ('responded_Understood_Atlasjet', 0.5931612253189087)]


In [10]:
print(word_vectors.similarity('woman', 'man'))

0.76640123


In [11]:
import numpy as np

In [12]:
vec_diff = word_vectors['man'] - word_vectors['women']
magnitude_of_difference = np.linalg.norm(vec_diff)

In [13]:
magnitude_of_difference

2.9112875

The closer the relation the lesser the magnitude of difference between the vectors (which is basically the distance between the vectors)

### Creating Token Embeddings

In [15]:
import torch
input_ids = torch.tensor([2,3,5,1])

Quick fox is in the house

In [16]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [17]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [19]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [20]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


So basically, Embedding layer is a simple lookup operation that retrieves rows from the embedding layer weight matrix using a Token ID

### Positional Embedding

In [38]:
print(f"Raw text length: {len(raw_text)}")

Raw text length: 20479


In [36]:
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [50]:
max_lengeth = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_lengeth, stride=max_lengeth, shuffle=False)
data_iter = iter(dataloader)
inputs,targets = next(data_iter)

Total tokens: 5145
Total samples in dataset: 1286


In [52]:
print("Token IDs: ", inputs)
print("Target IDs: ", targets)
print("Input shape: ", inputs.shape)

Token IDs:  tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Target IDs:  tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
Input shape:  torch.Size([8, 4])


In [53]:
token_embeddings = token_embedding_layer(inputs)

In [54]:
print(token_embeddings.shape)

torch.Size([8, 4, 256])


Another Embedding layer for positional encoding

In [55]:
context_length = max_lengeth
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [56]:
pos_embeddings = pos_embedding_layer(torch.arange(max_lengeth))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [57]:
pos_embeddings

tensor([[ 1.0841,  0.8481, -0.2037,  ...,  1.0956,  1.4326,  0.2528],
        [-0.6772,  0.2824,  0.9444,  ..., -2.2544, -0.8818, -0.3394],
        [ 0.1966,  0.8533,  0.6744,  ..., -0.8597,  0.1662,  0.2231],
        [-1.1089, -0.7789,  1.4223,  ..., -1.3181, -1.5432, -0.6921]],
       grad_fn=<EmbeddingBackward0>)

Input Embedding = Token Embedding + Position Embedding

In [58]:
input_embeddings = token_embeddings + pos_embeddings

In [59]:
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [61]:
input_embeddings

tensor([[[ 1.4574e+00,  1.5856e+00, -8.6933e-01,  ...,  1.2545e+00,
           1.2707e+00, -6.2303e-01],
         [ 4.2327e-01, -1.5343e+00,  2.1609e+00,  ..., -4.0276e+00,
          -1.9354e+00,  1.3579e+00],
         [-6.6976e-02,  8.3938e-01,  4.8344e-01,  ..., -1.0456e+00,
           7.8556e-01, -4.0891e-01],
         [ 3.3837e-01, -1.2382e+00,  1.1110e+00,  ...,  2.1853e-01,
          -2.6571e+00, -5.0091e-01]],

        [[ 4.4888e-01,  1.0762e+00,  6.3291e-01,  ..., -4.2410e-01,
           8.5268e-01,  2.9654e-01],
         [-6.5944e-01,  6.8111e-01,  2.0689e+00,  ..., -3.7389e+00,
          -2.0054e-01,  3.8500e-01],
         [-8.0545e-01,  3.5989e-01,  2.0445e+00,  ...,  8.4055e-01,
          -5.4681e-01,  9.5492e-01],
         [-2.7666e+00, -1.1245e+00,  1.5913e+00,  ..., -2.3436e+00,
          -4.9883e-02, -4.3736e-03]],

        [[ 1.4605e-01,  3.1457e+00, -2.9976e+00,  ...,  9.1621e-01,
           9.2723e-01,  1.8062e-01],
         [-7.8389e-01,  1.5837e+00,  1.5828e+00,  .