In [None]:
# Sample passage used as the input text for the tokenizer and data-loading cells below.
text = '''yes, you can absolutely use library to automatically extract vocabulary from text data. below are the most commonly used library and how to use them to get the vocabulary, depending on your use case:'''

# Tokenizer

In [1]:
import tiktoken

In [2]:
# Load tiktoken's 'gpt2' encoding; it is used below to encode text to token ids and decode them back.
tokenizer = tiktoken.get_encoding('gpt2')

In [7]:
# Round trip: encode the text to token ids, decode the ids back, and print the result.
# The printed string should match the original text.
print(tokenizer.decode(tokenizer.encode(text)))

yes, you can absolutely use library to automatically extract vocabulary from text data. below are the most commonly used library and how to use them to get the vocabulary, depending on your use case:


# Labeling the data

### If the first token is the input, the second token is the output; if the first and second tokens are the input, the third token is the output.

In [8]:
# Same sample passage as in the first cell, re-assigned here for the labeling section.
text = '''yes, you can absolutely use library to automatically extract vocabulary from text data. below are the most commonly used library and how to use them to get the vocabulary, depending on your use case:'''

In [10]:
# Duplicates the import and tokenizer creation from the "Tokenizer" section above
# (same 'gpt2' encoding).
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

In [11]:
# Token ids for the whole sample passage; sliced and indexed in the next cell.
encoded_text = tokenizer.encode(text)

In [None]:
# Context size is declared as 1024 here, but the preview loop below does not read it:
# the loop is capped at the first 10 token ids.
context_size = 1024

# For each prefix of the first 10 token ids, show the prefix (input) and the id
# that immediately follows it (the label).
preview_ids = encoded_text[:10]
for prefix_len in range(1, len(preview_ids)):
    print(preview_ids[:prefix_len], "-------->", preview_ids[prefix_len])
    


[8505] --------> 11
[8505, 11] --------> 345
[8505, 11, 345] --------> 460
[8505, 11, 345, 460] --------> 5543
[8505, 11, 345, 460, 5543] --------> 779
[8505, 11, 345, 460, 5543, 779] --------> 5888
[8505, 11, 345, 460, 5543, 779, 5888] --------> 284
[8505, 11, 345, 460, 5543, 779, 5888, 284] --------> 6338
[8505, 11, 345, 460, 5543, 779, 5888, 284, 6338] --------> 7925


In [20]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class MiniLLMTokenizer(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer``. Each window of
    ``context_size`` consecutive token ids becomes one input sample, and the
    same window shifted one position to the right becomes its target, so
    ``target[k]`` is the token that follows ``input[k]`` in the text.

    Args:
        txt: Raw text to encode.
        tokenizer: Object exposing ``encode(str)`` that returns a sequence of
            integer token ids (e.g. a tiktoken encoding).
        context_size: Number of token ids in every input/target window.
        stride: Distance between the start positions of consecutive windows.

    Raises:
        ValueError: If ``context_size`` or ``stride`` is smaller than 1.

    Note:
        Text with ``context_size`` or fewer tokens produces an empty dataset,
        because no window has a complete shifted target.
    """

    def __init__(self, txt, tokenizer, context_size, stride):
        if context_size < 1:
            raise ValueError(f"context_size must be >= 1, got {context_size}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Encode the text passed in by the caller (not a notebook-level global).
        token_ids = tokenizer.encode(txt)

        # Start at 0 so the first window of the text is included. The upper
        # bound leaves room for the target, which ends one token later than
        # the input window.
        for start in range(0, len(token_ids) - context_size, stride):
            input_chunk = token_ids[start:start + context_size]
            target_chunk = token_ids[start + 1:start + context_size + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]



In [None]:
def data_loader(text, batch_size=8, context_size=256, stride=128, shuffle=True, drop_last=True, n_workers=0):
    """Wrap ``text`` in a ``MiniLLMTokenizer`` dataset and return a DataLoader over it.

    Args:
        text: Raw text to tokenize with tiktoken's 'gpt2' encoding.
        batch_size: Number of windows per batch.
        context_size: Token ids per input/target window.
        stride: Step between the starts of consecutive windows.
        shuffle: Passed through to ``DataLoader(shuffle=...)``.
        drop_last: Passed through to ``DataLoader(drop_last=...)``.
        n_workers: Passed through to ``DataLoader(num_workers=...)``.

    Returns:
        A ``torch.utils.data.DataLoader`` whose batches are built from the
        dataset's ``(input_ids, target_ids)`` pairs.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = MiniLLMTokenizer(text, gpt2_encoding, context_size, stride=stride)

    return DataLoader(
        dataset=windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=n_workers,
    )

In [34]:
# A small context size and stride are used because the sample text is short.
# shuffle is left at data_loader's default (True), so the batch printed here can differ between runs.
print(next(iter(data_loader(text, batch_size=1,context_size=4, stride=1))))

[tensor([[2420, 1366,   13, 2174]]), tensor([[1366,   13, 2174,  389]])]


# Token embeddings

In [50]:
# Seed first so the embedding weights created below are reproducible.
torch.manual_seed(123)

# Toy sizes: 6 possible token ids, each mapped to a 3-dimensional vector.
vocab_size, output_size = 6, 3
embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_size)

# Four example token ids to look up. NOTE: the name `input` shadows the Python builtin;
# it is kept because the following cell refers to it.
input = torch.tensor([2, 3, 1, 4], dtype=torch.long)

In [51]:
# Look up the embedding vector for each id in `input`; each output row is the
# weight-matrix row at that id's index.
print(embedding_layer(input))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [ 0.9178,  1.5810,  1.3010],
        [-1.1589,  0.3255, -0.6315]], grad_fn=<EmbeddingBackward0>)


In [55]:
# The weight matrix is tiny here (vocab_size x output_size = 6 x 3) because this is a toy example;
# a real implementation would use much larger sizes.
print(embedding_layer.weight)

# These are the initial weights; they will be optimized and adjusted during training.

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


# Adding Positional Information 