## We are opening a book named "The Verdict" in python

In [1]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total characters :", len(raw_text))
print(raw_text[:99])

Total characters : 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
import re
text = "Hello World. This, is a text"
result = re.split(r'(\s)', text)
result

['Hello', ' ', 'World.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'text']

In [3]:
result = re.split(r'([,.]|\s)', text)
result


['Hello',
 ' ',
 'World',
 '.',
 '',
 ' ',
 'This',
 ',',
 '',
 ' ',
 'is',
 ' ',
 'a',
 ' ',
 'text']

In [4]:
# " hwllo ".strip() --> Returns TRUE
# " ".strip() --> Returns FALSE

In [5]:
result = [item for item in result if item.strip()]
result

['Hello', 'World', '.', 'This', ',', 'is', 'a', 'text']

In [6]:
new_text = "Hello world, Is this-- a new test"
result_2 = re.split(r'([.,:;?_!\'"]|--|\s)', new_text)

## Removing the white spaces
result_2 = [item.strip() for item in result if item.strip()]
result_2

['Hello', 'World', '.', 'This', ',', 'is', 'a', 'text']

In [7]:
preprocessed = re.split(r'([.,:;?_!\'"]|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

In [8]:
preprocessed[:20]
print(len(preprocessed))

4685


## Step 2 Creating TOKENS

In [9]:
all_words = sorted(set(preprocessed))
print(len(all_words))

1131


In [10]:
vocab = {token:integer for integer,token in enumerate(all_words)}
for i,item in enumerate(vocab.items()) :
    print(item)
    if i >=50 :
        break


('!', 0)
('"', 1)
("'", 2)
('(I', 3)
('(Though', 4)
(')', 5)
(',', 6)
('--', 7)
('.', 8)
(':', 9)
(';', 10)
('?', 11)
('A', 12)
('Ah', 13)
('Among', 14)
('And', 15)
('Are', 16)
('Arrt', 17)
('As', 18)
('At', 19)
('Be', 20)
('Begin', 21)
('Burlington', 22)
('But', 23)
('By', 24)
('Carlo', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Croft)', 30)
('Destroyed', 31)
('Devonshire', 32)
('Don', 33)
('Dubarry', 34)
('Emperors', 35)
('Florence', 36)
('For', 37)
('Gallery', 38)
('Gideon', 39)
('Gisburn', 40)
('Gisburns', 41)
('Grafton', 42)
('Greek', 43)
('Grindle', 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)
('He', 50)


In [11]:
class SimpleTokenizerV1:
    def __init__(self, vocab) :
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self,text) :
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self,ids) :
        text = " ".join([self.int_to_str[i] for i in ids])
        # text = "Jonny , are you ok ."
        # Now we will replace the spaces before specific punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        # text = "Jonny, are you OK."
        return text

In [12]:
tokenizer = SimpleTokenizerV1(vocab)
sample_text = """
Well!--even through the prism of Hermia's tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him--it was fitting that they should mourn him.
"""
tokenizer.decode(tokenizer.encode(sample_text))

"Well! -- even through the prism of Hermia' s tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him -- it was fitting that they should mourn him."

In [13]:
## But what if the words are not included in existing vocab ?

In [14]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {str:integer for integer,str in enumerate(all_tokens)}
len(vocab.items())

1133

In [15]:
for i,item in enumerate(list(vocab.items())[-5:]) :
    print(i, item)

0 ('younger', 1128)
1 ('your', 1129)
2 ('yourself', 1130)
3 ('<|endoftext|>', 1131)
4 ('<|unk|>', 1132)


In [16]:
class SimpleTokenizerV2 :
    def __init__(self,vocab) :
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in self.str_to_int.items()}
    
    def encode(self, text) :
        preprocessed = re.split(r'([,.:;?!_"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int
                       else "<|unk|>" for item in preprocessed]
        
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self,ids) :
        text = " ".join([self.int_to_str[i] for i in ids])
        # removing the spaces before Punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text


In [17]:
tokenizer = SimpleTokenizerV2(vocab)
sample_text = "Hello, I am Vasu Choudhari, and I am here to help the students learn AI DL ML etc."
encoded = tokenizer.encode(sample_text)
print(encoded)
decoded = tokenizer.decode(encoded)
print(decoded)

[1132, 6, 55, 151, 1132, 1132, 6, 158, 55, 151, 541, 1017, 1132, 989, 1132, 1132, 1132, 1132, 1132, 1132, 8]
<|unk|>, I am <|unk|> <|unk|>, and I am here to <|unk|> the <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|>.


In [18]:
import tiktoken

In [19]:
enc = tiktoken.get_encoding("gpt2")


In [20]:
enc
# enc will work as a vocab (Vocab of GPT-2)

<Encoding 'gpt2'>

In [21]:
text = (
    "Hello, How are you! <|endoftext|> In the shores of Goa"
    "of someunknownPlace."
)

integers = enc.encode(text, allowed_special={"<|endoftext|>"})
integers

[15496,
 11,
 1374,
 389,
 345,
 0,
 220,
 50256,
 554,
 262,
 29963,
 286,
 1514,
 64,
 1659,
 617,
 34680,
 27271,
 13]

In [22]:
strings = enc.decode(integers)
strings

'Hello, How are you! <|endoftext|> In the shores of Goaof someunknownPlace.'

## Creating INPUT TAGEET PAIRS

In [23]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
enc_text = enc.encode(raw_text)
enc_text

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285,
 326,
 11,
 287,
 262,
 6001,
 286,
 465,
 13476,
 11,
 339,
 550,
 5710,
 465,
 12036,
 11,
 6405,
 257,
 5527,
 27075,
 11,
 290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,


In [24]:
enc_sample = enc_text[:50]
enc_sample

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285,
 326,
 11,
 287,
 262,
 6001,
 286,
 465,
 13476,
 11,
 339,
 550,
 5710,
 465,
 12036,
 11,
 6405,
 257,
 5527,
 27075,
 11]

In [25]:
context_size = 4
# this means that model will look upon 4 tokens at a time, and will produce 4 tokens for the input
## ex:  input : [1,2,3,4] ---> out : [2,3,4,5]

# Example : 
x  = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]

print(x)
print(y)

[40, 367, 2885, 1464]
[367, 2885, 1464, 1807]


In [26]:
for i in range(1,context_size+1) : 
    context = enc_sample[:i]
    desired = enc_sample[i]
    
    print(context, " --> ", desired)
    print(enc.decode(context), "-->", enc.decode([desired]))

[40]  -->  367
I -->  H
[40, 367]  -->  2885
I H --> AD
[40, 367, 2885]  -->  1464
I HAD -->  always
[40, 367, 2885, 1464]  -->  1807
I HAD always -->  thought


## Implementing the DataSet

In [27]:
import torch

from torch.utils.data import Dataset, DataLoader

class GPTDataSetV1(Dataset) : 
    def __init__(self,txt,tokenizer, max_length, stride) :
        ## txt --> "the verdict"
        ## max_length --> context_Size
        ## stride is the movement count of sliding window | 1
        
        self.input_ids = []
        self.target_ids = []
        
        # tokenizer has already a predefined vocab...
        token_ids = tokenizer.encode(txt, allowed_special = {"<|endoftext|>"})
        
        ## using the sliding window approach
        
        for i in range(0, len(token_ids) - max_length, stride) :
            input_chunk = token_ids[i:i+max_length]
            output_chunk = token_ids[i+1:i+1+max_length]
            
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(output_chunk))
        
    def __len__(self) :
        return len(self.input_ids)
        
    def __getitem__(self,idx) :
        return self.input_ids[idx], self.target_ids[idx]
    

In [28]:
sample_dataloader = GPTDataSetV1(raw_text, enc,4,1)
sample_dataloader[:5]

([tensor([  40,  367, 2885, 1464]),
  tensor([ 367, 2885, 1464, 1807]),
  tensor([2885, 1464, 1807, 3619]),
  tensor([1464, 1807, 3619,  402]),
  tensor([1807, 3619,  402,  271])],
 [tensor([ 367, 2885, 1464, 1807]),
  tensor([2885, 1464, 1807, 3619]),
  tensor([1464, 1807, 3619,  402]),
  tensor([1807, 3619,  402,  271]),
  tensor([ 3619,   402,   271, 10899])])

## Creating a DataLoader

In [29]:
def create_dataloader_v1(txt,batch_size=4, max_length=256, stride=128,
                         shuffle=True, drop_last=True,num_workers=0) :   
    
    tokenizer = tiktoken.get_encoding("gpt2")
    
    dataset = GPTDataSetV1(txt, tokenizer, max_length, stride)
    
    dataloader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle=shuffle,
        drop_last = drop_last,
        num_workers = num_workers
    )
    
    return dataloader

In [30]:
dataloader = create_dataloader_v1(raw_text,1,4,1)
# dataloader will contain batches of batch_size = 4
len(list(dataloader)) # --> this is the number of batches in dataloader
data_iter = iter(dataloader)

first_batch = next(data_iter)
first_batch

[tensor([[ 550,  757, 1057,  625]]), tensor([[ 757, 1057,  625,  422]])]

# VECTOR EMBEDDINGS

In [31]:
import gensim.downloader as api

In [32]:
import numpy
numpy.__version__


'1.26.4'

In [34]:
import gensim.downloader as api
model = api.load("word2vec-google-news-300")



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [38]:
## We will create an embeddding layer weight matrix :
# it is also known as lookup table
vocab_size = 6
"""
    fox : 0
    house : 1
    in : 2
    is : 3
    quick : 4
    the : 5
"""

output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
embedding_layer

Embedding(6, 3)

In [39]:
embedding_layer.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)

In [43]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315]], grad_fn=<EmbeddingBackward0>)


In [44]:
input_ids = torch.tensor([0,4,2,1])
print(embedding_layer(input_ids))

tensor([[ 0.3374, -0.1778, -0.1690],
        [-1.1589,  0.3255, -0.6315],
        [ 1.2753, -0.2010, -0.1606],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


## Positional Embedding

In [45]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size,output_dim)

In [47]:
token_embedding_layer.weight

Parameter containing:
tensor([[-2.1338,  1.0524, -0.3885,  ...,  0.2461,  1.2119,  0.3171],
        [ 1.2277, -0.4297, -2.2121,  ..., -0.1640, -0.3348, -0.0221],
        [ 1.3382,  0.2706,  0.5071,  ...,  0.0175, -2.1517,  0.3924],
        ...,
        [-1.4889, -1.2456,  1.8034,  ..., -0.6392, -1.4939,  0.3614],
        [-1.0703,  0.2795, -0.2637,  ..., -0.2810, -1.4755, -0.1183],
        [-0.0071,  0.4982, -0.3319,  ...,  0.4970,  0.9365, -0.2091]],
       requires_grad=True)

In [49]:
max_length = 4
dataloader = create_dataloader_v1(raw_text, 8, max_length, stride=max_length, shuffle=False)
data_iter = iter(dataloader)

inputs,target = next(data_iter)

print(inputs,target)

tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]) tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [50]:
print(inputs.shape)

torch.Size([8, 4])


In [52]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])
