In [1]:
# Sample sentence used to build a toy word-level vocabulary.
# Nothing is embedded in this cell; it only defines the raw text.
train_test = "The quick brown fox jumps over the lazy dog and chased swift playfully"

In [2]:
import re
# Split on a few punctuation marks, "--" and whitespace. The capture group
# makes re.split keep the delimiters in the result.
tokenize = re.split(r'([,./"]|--|\s)', train_test)
# Drop empty strings and whitespace-only pieces.
train_tokenize = [piece for piece in tokenize if piece is not None and piece.strip()]
# Token -> integer id, numbered in order of appearance.
vocab = dict(zip(train_tokenize, range(len(train_tokenize))))
print(train_tokenize, vocab, sep="\n")


['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'and', 'chased', 'swift', 'playfully']
{'The': 0, 'quick': 1, 'brown': 2, 'fox': 3, 'jumps': 4, 'over': 5, 'the': 6, 'lazy': 7, 'dog': 8, 'and': 9, 'chased': 10, 'swift': 11, 'playfully': 12}


In [3]:
# we can add more regex for this if we want 
class TokenizersV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Args:
        vocab: dict mapping token string -> integer id.
    """

    def __init__(self,vocab):
        self.word_to_int = vocab
        # Inverse mapping (id -> token) used by decode().
        self.int_to_word = {i:s for s,i in vocab.items()}

    def encode(self,word):
        """Convert text into a list of token ids.

        The text is lowercased, then split on ',', '.', '?', '--' and
        whitespace; punctuation is kept as its own token.

        NOTE: because of the lowercasing, vocabulary entries that contain
        uppercase letters can never be produced by this method.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        word = word.lower()
        preprocess = re.split(r'([,.?]|--|\s)',word)
        preprocess = [item for item in preprocess if item is not None and item.strip()]
        return [self.word_to_int[i] for i in preprocess]

    def decode(self,ids):
        """Convert a list of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        '.', ',', '?' and '/' is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_word[i] for i in ids)
        # The replacement must be the backreference r'\1' (the captured
        # punctuation); a literal '/1' would overwrite the punctuation itself.
        text = re.sub(r'\s+([.,?/])', r'\1', text)
        return text

# Round-trip a sentence through the V1 tokenizer: text -> ids -> text.
tokenize = TokenizersV1(vocab)
sample = "The Brown dog playfully chased the swift fox"
encoded = tokenize.encode(sample)
decoded = tokenize.decode(encoded)
print(encoded, decoded, sep="\n")


[6, 2, 8, 12, 10, 6, 11, 3]
the brown dog playfully chased the swift fox


In [4]:
# Vocabulary that also reserves ids for the special tokens "<|endoftext|>"
# and "<|unk|>", so a tokenizer can map unknown words to "<|unk|>".
train_data = "The quick brown fox jumps over the lazy dog and chased swift playfully"
word = [
    piece
    for piece in re.split(r"([.,?]|--|\s)", train_data)
    if piece is not None and piece.strip()
]
# The special tokens are appended last, so they receive the two highest ids.
word += ["<|endoftext|>", "<|unk|>"]
vocab = dict(zip(word, range(len(word))))
word = vocab
print(word)


{'The': 0, 'quick': 1, 'brown': 2, 'fox': 3, 'jumps': 4, 'over': 5, 'the': 6, 'lazy': 7, 'dog': 8, 'and': 9, 'chased': 10, 'swift': 11, 'playfully': 12, '<|endoftext|>': 13, '<|unk|>': 14}


In [5]:

class TokenizersV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Args:
        vocab: dict mapping token string -> integer id. It must contain the
            key "<|unk|>" for unknown-token handling to work.
    """

    def __init__(self,vocab):
        self.word_to_int = vocab
        # Inverse mapping (id -> token) used by decode().
        self.int_to_word = {i:s for s,i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        The text is lowercased, split on punctuation, '--' and whitespace,
        and every token missing from the vocabulary is replaced by "<|unk|>".

        NOTE: because of the lowercasing, vocabulary entries that contain
        uppercase letters can never be produced by this method.

        Raises:
            KeyError: if an unknown token is seen and "<|unk|>" is not in
                the vocabulary.
        """
        text = text.lower()
        words = re.split(r'([,.?_/\[\]]|--|\s)', text)
        words = [item for item in words if item is not None and item.strip()]
        words = [item if item in self.word_to_int else "<|unk|>" for item in words]
        ids = [self.word_to_int[word] for word in words]
        return ids

    def decode(self,ids):
        """Convert a list of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        ',', '.', ';', ':' and '?' is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_word[i] for i in ids)
        # The pattern must not contain stray literal spaces, and the
        # replacement must be the backreference r'\1'; otherwise the
        # substitution either never matches or clobbers the punctuation.
        text = re.sub(r'\s+([,.;:?])', r'\1', text)
        return text
    

tokenize = TokenizersV2(vocab)

# First sample: ends with two words that were not in the training sentence.
sample1 = "The Brown dog playfully chased the swift fox virat bmw"
encoded1 = tokenize.encode(sample1)
decoded1 = tokenize.decode(encoded1)

# Second sample: almost entirely made of words outside the vocabulary.
sample2 = "Hello, do you like the tea"
encoded2 = tokenize.encode(sample2)
decoded2 = tokenize.decode(encoded2)

# Show ids and round-tripped text for each sample, separated by a spacer line.
print(encoded1, decoded1, "  ", sep="\n")

print(encoded2, decoded2, sep="\n")

[6, 2, 8, 12, 10, 6, 11, 3, 14, 14]
the brown dog playfully chased the swift fox <|unk|> <|unk|>
  
[14, 14, 14, 14, 14, 6, 14]
<|unk|> <|unk|> <|unk|> <|unk|> <|unk|> the <|unk|>


Byte pair encoding (BPE) breaks words into subword units.

BPE breaks down words that aren't in its predefined vocabulary into smaller subword units or even individual characters, so it can encode any text without needing an `<|unk|>` token.

In [15]:
# Byte pair encoding via the tiktoken package.
# Print the installed tiktoken version so the outputs below can be reproduced.
from importlib.metadata import version
import tiktoken
print("tiktoken version: ",version("tiktoken"))

tiktoken version:  0.9.0


In [23]:
# Load tiktoken's "gpt2" encoding (this rebinds `tokenize` to a BPE tokenizer).
tokenize = tiktoken.get_encoding("gpt2")
# The text contains the special token and a made-up word at the end,
# presumably to show how BPE handles text it has no whole-word token for.
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknowplace"
# allowed_special permits "<|endoftext|>" in the input and encodes it as a
# special token rather than as ordinary text.
integers = tokenize.encode(text,allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 2954, 2197, 5372]


In [24]:
# Decode the ids back to text to check that the round trip reproduces the input.
strings = tokenize.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknowplace


In [25]:
# Next-token prediction setup: the target y is the input x shifted right by one.
context_size = 4
x, y = integers[:context_size], integers[1:context_size + 1]
print(f"x,{x}")
print(f"y: {y}")

x,[15496, 11, 466, 345]
y: [11, 466, 345, 588]


Data sampling using a sliding window

In [26]:
# Each prefix of the window is a context; the token right after it is the target.
for i in range(1, context_size + 1):
    context, desired = integers[:i], integers[i]
    print(context, "---->", desired)

[15496] ----> 11
[15496, 11] ----> 466
[15496, 11, 466] ----> 345
[15496, 11, 466, 345] ----> 588


In [27]:
# Same context -> target pairs as before, decoded back to text for readability.
for i in range(1, context_size + 1):
    context, desired = integers[:i], integers[i]
    print(tokenize.decode(context), "--->", tokenize.decode([desired]))

Hello ---> ,
Hello, --->  do
Hello, do --->  you
Hello, do you --->  like


Implementing efficient data sampling with a sliding window

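It's common to train LLMs with input sizes of at least 256 tokens. The example below uses a much smaller `max_length = 4` so that the meaning of `stride = 1` is easy to see: each successive input window starts one token after the previous one.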

In [68]:
import tiktoken
import torch
from torch.utils.data import Dataset ,DataLoader
import tensorflow as tensor

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenize.encode``. A window of
    ``max_length`` ids is then moved over the ids in steps of ``stride``;
    each target is its input window shifted one position to the right.

    Args:
        txt: raw text to encode.
        tokenize: object exposing ``encode(txt)`` that returns a list of ids.
        max_length: number of ids in every input/target window.
        stride: distance between the starts of consecutive windows.
    """

    def __init__(self,txt,tokenize,max_length,stride):
        token_ids = tokenize.encode(txt)
        print("token length : ",len(token_ids))

        inputs, targets = [], []
        # Stop early enough that the shifted target window still fits.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            inputs.append(torch.tensor(token_ids[start:stop]))
            targets.append(torch.tensor(token_ids[start + 1:stop + 1]))

        self.input_ids = inputs
        self.target_ids = targets

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the (input, target) tensor pair stored at ``index``."""
        return self.input_ids[index], self.target_ids[index]
    
x = "In the heart of the city stood the old library, a relic from a bygone era. its stone walls bore the marks of time,and ivy clung tightly to its facade"

# Small window and stride so the individual samples are easy to inspect.
tokenize = tiktoken.get_encoding("gpt2")
max_length, stride = 4, 1
train = GPTDatasetV1(x,tokenize,max_length,stride)

# Number of windows, then the second (input, target) pair.
print(len(train))
print(train[1])


token length :  37
33
(tensor([ 262, 2612,  286,  262]), tensor([2612,  286,  262, 1748]))


In [80]:
import tiktoken
def create_dataloader_v1(txt,batch_size=1,max_length=4,stride=1,shuffle=False,drop_last=True):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is encoded with tiktoken's "gpt2" encoding and wrapped in a
    GPTDatasetV1 using ``max_length`` and ``stride``; ``batch_size``,
    ``shuffle`` and ``drop_last`` are passed straight to DataLoader.

    Returns:
        The DataLoader over the resulting GPTDatasetV1.
    """
    windows = GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
    )

# Use the function's defaults: batch_size=1, max_length=4, stride=1, no shuffling.
dataloader = create_dataloader_v1(x)

token length :  37


In [81]:
# Pull the first batch by hand; it is an [inputs, targets] pair of tensors.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[ 818,  262, 2612,  286]]), tensor([[ 262, 2612,  286,  262]])]


In [82]:
# Despite its name, `second_iter` holds the second batch, not an iterator.
second_iter = next(data_iter)
print(second_iter)

[tensor([[ 262, 2612,  286,  262]]), tensor([[2612,  286,  262, 1748]])]
