In [1]:
#Tokenization 
with open("verdict.txt",'r') as file:
    file = file.read()

print("Total no of characters", len (file))
file[:100]
#our goal is to  tokenize 20479 characters 

Total no of characters 20480


'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g'

In [2]:
import re 
text = "Hello, How are you? My name is Ripesh Ghimire? Are you doinge well? Lets talk shall we "
result = re.split(r'(\s)',text)

In [3]:
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', file)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4649


In [4]:
#Now we convert the token into token id 
all_words = sorted(list(set(preprocessed)))
all_words.extend(["<|unk|>","<|endoftext|>"])
vocab_size = len(all_words)
print(vocab_size)

1162


In [13]:
#converting the given tokens into token-ids
vocab = {token:integer for integer,token in enumerate(all_words)}
last_items = list(vocab.items())[:]

In [82]:
class SimpleTokenizer:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self,text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids 
    def decode(self,ids):
        text = "".join([self.int_to_str[i]for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) #E
        return text

In [83]:
tokenizer = SimpleTokenizer(vocab)

In [84]:
#adding special context tokens 
'''
as we can see we applied simple tokenizer ti a passage from a training set In this section we will modify this tokenizer to handle unknown words
usage and addition of special context tokens that can enhance a model's understanding of context or other relevant information in the text. 
Thes special tokens can include markers for unknown words and document boundaries 

we will modify the vocabulary and toknizer we implemeneted in the previous section, Simple TokenizerV2 to support new token <|unk|> and <|end of text |>

why do you we add this tokens? 
so we add these tokens so we can know the new words that were not part of the training adata and thus not part of the existing vocabulary.  we add end of text token to know that we can user to separate two unrelated text sources

'''
class SimpleTokenizerV2:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self,text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids 
    def decode(self,ids):
        text = "".join([self.int_to_str[i]for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) #E
        return text

In [85]:
text1 = "Hello do you like tea?"
text2 = "In the sunlit terraces ot the palace "
text = "<|endoftext|>".join((text1,text2))
print(text)

Hello do you like tea?<|endoftext|>In the sunlit terraces ot the palace 


In [88]:
#Byte Pair Encoding 
'''
IT BUILDS IT vocabulary by iteratively mergin frequent characters into subwords and frequent subwords into words.
For example: BPE start with adding all individual single characters to its vocabulary ("a","b") IN the next stage, it merges characters combinations that frequently occur together with subwords. For
example d and e may be merge into the word "de" which is common in many English words like "define" "depend" "made" and "hiddden ". The merges are determined by frequency cut off 
'''

import tiktoken
encoder = tiktoken.encoding_for_model("gpt-2")

In [11]:
encoder.decode(encoder.encode("How are you"))

'How are you'

'''
DATA SAMPLING WITH A SLIDING WINDOW 

'''

In [10]:
import tiktoken 
with open("verdict.txt",'r',encoding="utf-8") as f:
    text = f.read() 

In [12]:
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(text)

In [16]:
len(enc_text)

5145

In [17]:
enc_sample = enc_text[50:]

In [30]:
context_size = 4

In [33]:
x = enc_sample[:context_size]
y = enc_sample[:context_size+1]

In [34]:
x

[290, 4920, 2241, 287]

In [35]:
y

[290, 4920, 2241, 287, 257]

In [42]:
for i in range(1,context_size+1):
    context = enc_sample[:i] #process all the context based on the loop where the loop slices all the element are the there for sampling 
    desired  = enc_sample[i] #starts the loop with 1 index because the if we start from 0 there is will a empty list pointing at the first element

    print(context,"---------->",desired)

[290] ----------> 4920
[290, 4920] ----------> 2241
[290, 4920, 2241] ----------> 287
[290, 4920, 2241, 287] ----------> 257


In [45]:
for i in range(1,context_size+1):
    context = enc_sample[:i] #process all the context based on the loop where the loop slices all the element are the there for sampling 
    desired  = enc_sample[i] #starts the loop with 1 index because the if we start from 0 there is will a empty list pointing at the first element

    print(tokenizer.decode(context),"---------->",tokenizer.decode([desired]))

 and ---------->  established
 and established ---------->  himself
 and established himself ---------->  in
 and established himself in ---------->  a


In [58]:
import torch 
from torch.utils.data import Dataset,DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self,text,tokenizer,max_length,stride):
            self.tokenizer = tokenizer
            self.input_ids = []
            self.target_ids = []
            token_ids = tokenizer.encode(text)

            for i in range(0,len(token_ids)- max_length,stride):
                  input_chunk = token_ids[i:i+max_length]
                  target_chunk = token_ids[i+1:i+max_length]
                  self.input_ids.append(input_chunk)
                  self.target_ids.append(target_chunk)
    def __len__(self):
          return len(self.input_ids)
    def __getitem__(self,idx):
          return self.input_ids[idx],self.target_ids[idx]

In [59]:
def create_loader_v1(txt,batch_size = 4 ,max_length = 256,stride=128,shuffle=True,drop_last=True):
    tokenizer= tiktoken.get_encoding('gpt2')
    dataset = GPTDatasetV1(txt,tokenizer,max_length,stride)
    dataloader = DataLoader(dataset,batch_size=batch_size,shuffle=shuffle,drop_last=drop_last)
    return dataloader

In [60]:
with open('verdict.txt','r',encoding='utf-8') as file : 
    raw_text = file.read() 

In [92]:
dataloader = create_loader_v1(txt = raw_text,batch_size=8,max_length=4,stride=4,shuffle=False)
data_itre = iter(dataloader)
first_batch = next(data_itre)
print(first_batch)

[[tensor([   40,  1807, 10899, 15632,   922,   568,  1049,   284]), tensor([ 367, 3619, 2138,  438, 5891,  340, 5975, 3285]), tensor([2885,  402,  257, 2016, 1576,  373,  284,  326]), tensor([1464,  271, 7026,  257,  438,  645,  502,   11])], [tensor([ 367, 3619, 2138,  438, 5891,  340, 5975, 3285]), tensor([2885,  402,  257, 2016, 1576,  373,  284,  326]), tensor([1464,  271, 7026,  257,  438,  645,  502,   11])]]


In [93]:
second_batch = next(data_itre)
second_batch

[[tensor([  287,   465,   550,    11, 27075,  2241,    64, 41976]),
  tensor([  262, 13476,  5710,  6405,    11,   287,   319,    13]),
  tensor([6001,   11,  465,  257,  290,  257,  262,  357]),
  tensor([  286,   339, 12036,  5527,  4920,  4489, 34686, 10915])],
 [tensor([  262, 13476,  5710,  6405,    11,   287,   319,    13]),
  tensor([6001,   11,  465,  257,  290,  257,  262,  357]),
  tensor([  286,   339, 12036,  5527,  4920,  4489, 34686, 10915])]]

In [95]:
torch.manual_seed(100)
input_ids = torch.tensor([2,3,3,5])
vocab_size = 6 
output_dim = 3
embedding_layer = torch.nn.Embedding(vocab_size,output_dim)


In [96]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.1268,  1.3564, -0.0247],
        [-0.8466,  0.0293, -0.5721],
        [-1.2546,  0.0486,  0.2753],
        [-2.1550, -0.7116,  0.0575],
        [ 0.6263, -1.7736, -0.2205],
        [ 2.7467, -1.0480,  1.1239]], requires_grad=True)


In [97]:
'''
“If we compare the embedding vector for token ID 3 to the previous embedding matrix, we see that it is identical to the 4th row (Python starts with a zero index, so it's the row corresponding to index 3). 
In other words, the embedding layer is essentially a look-up operation that retrieves rows from the embedding layer's weight matrix via a token ID.”


'''
embedding_layer(torch.tensor([3]))

tensor([[-2.1550, -0.7116,  0.0575]], grad_fn=<EmbeddingBackward0>)

In [98]:
embedding_layer(input_ids)

tensor([[-1.2546,  0.0486,  0.2753],
        [-2.1550, -0.7116,  0.0575],
        [-2.1550, -0.7116,  0.0575],
        [ 2.7467, -1.0480,  1.1239]], grad_fn=<EmbeddingBackward0>)

In [99]:
'''“Figure 2.16 Embedding layers perform a look-up operation, retrieving the embedding vector corresponding to the token ID from the embedding layer's weight matrix. 
For instance, the embedding vector of the token ID 5 is the sixth row of the embedding layer weight matrix (it is the sixth instead of the fifth row because Python starts counting at 0). 
For illustration purposes, we assume that the token IDs were produced by the small vocabulary we used in section 2.3.'''

"“Figure 2.16 Embedding layers perform a look-up operation, retrieving the embedding vector corresponding to the token ID from the embedding layer's weight matrix. \nFor instance, the embedding vector of the token ID 5 is the sixth row of the embedding layer weight matrix (it is the sixth instead of the fifth row because Python starts counting at 0). \nFor illustration purposes, we assume that the token IDs were produced by the small vocabulary we used in section 2.3."

In [100]:
'''
Encoding word positions 
converted token ids into embedding , In principle this is a suitable input for an LLM. However a minor shortcoming of llms is that their self attention mechanism . doesn't have the notion of position or order for the order for the position
within a sequence 
'''

"\nEncoding word positions \nconverted token ids into embedding , In principle this is a suitable input for an LLM. However a minor shortcoming of llms is that their self attention mechanism . doesn't have the nnotion of position or order for the order for the position\nwithin a sequence \n"