### Why can't we just feed the tokenized text into the model?
### Why have separate embeddings?

Because token IDs are arbitrary integers that don't carry any semantic meaning.
For example, "cat" and "kitten" are similar in meaning but can have very different token IDs assigned to them.

And why is that important?

Because even in the case of CNNs, we see that they learn the relations between the different objects they detect, including their spatial relations.

Similarly, here the meaning of each token, the tokens' relations to each other, and their positions in the sentence are all important.

So we use embeddings to represent the meaning of the text.



In [2]:
# Read the whole training text into memory as one string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()

In [11]:
import torch 

# Size of the GPT-2 BPE vocabulary used by tiktoken's "gpt2" encoding.
# The highest token ID is 50256 (<|endoftext|>), so the embedding table needs
# 50257 rows; a smaller table makes lookups of the top IDs fail with an
# index-out-of-range error.
vocab_size = 50257

# Length of the vector each token ID is mapped to.
output_dim = 256

# Lookup table with one trainable row of size output_dim per token ID.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [24]:
# Sanity check: embed the five token IDs 1..5 and inspect the result's shape.
embeddings = token_embedding_layer(torch.arange(1, 6))
print(embeddings.shape)

torch.Size([5, 256])


In [13]:
from torch.utils.data import DataLoader, Dataset
import torch

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs for next-token prediction.

    The text is tokenized once and cut into windows of ``max_length`` tokens,
    with consecutive windows starting ``stride`` tokens apart. Each target
    window is its input window shifted one token to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object whose ``encode(text, allowed_special=...)`` returns a
            sequence of token IDs (e.g. a tiktoken encoding).
        max_length: Number of tokens in every input/target window.
        stride: Offset between the starts of consecutive windows.
    """

    def __init__(self,txt,tokenizer,max_length,stride):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride

        # Tokenize the text once. "<|endoftext|>" is explicitly allowed so that
        # texts containing the document separator can be encoded.
        tokens_ids = tokenizer.encode(txt,allowed_special={"<|endoftext|>"})

        # Keep the encoded text around by reusing the IDs computed above
        # instead of encoding a second time: a second encode() without
        # allowed_special doubles the work and, with tiktoken, raises on text
        # that contains "<|endoftext|>".
        self.enc_text = tokens_ids

        self.input_ids = []
        self.target_ids = []

        # Each input/target pair is a tensor of shape (max_length,) and packs
        # max_length next-token prediction tasks. E.g. with max_length = 4:
        #   input  = [model, predicts, next, token]
        #   target = [predicts, next, token, end_of_text]
        # which yields the pairs
        #   model                     -> predicts
        #   model predicts            -> next
        #   model predicts next       -> token
        #   model predicts next token -> end_of_text
        #
        # The range stops at len(tokens_ids) - max_length so that the target
        # window (shifted by one) never runs past the end of the token list.
        for i in range(0,len(tokens_ids) - max_length,stride):
            input_chunk = tokens_ids[i:i+max_length]
            target_chunk = tokens_ids[i+1:i+max_length+1]

            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the ``idx``-th (input_ids, target_ids) tensor pair."""
        return self.input_ids[idx],self.target_ids[idx]

In [14]:
import tiktoken

def create_dataloader_v1(txt,batch_size=4,max_length=256,stride=128,shuffle=True,drop_last=True,num_workers=0):
    """Build a DataLoader of (input, target) token windows over ``txt``.

    The text is encoded with tiktoken's "gpt2" encoding and windowed by
    GPTDatasetV1 using ``max_length`` and ``stride``. The remaining arguments
    (``batch_size``, ``shuffle``, ``drop_last``, ``num_workers``) are passed
    straight through to ``DataLoader``.
    """
    # Tokenizer first, then the windowed dataset built on top of it.
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(windowed_dataset, **loader_options)

In [15]:
# Non-overlapping windows (stride == max_length), served in document order.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=4,
    max_length=4,
    stride=4,
    shuffle=False,
    drop_last=True,
    num_workers=0,
)

In [16]:
# Create an iterator over the dataloader so batches can be pulled one at a time with next().
data_iter = iter(dataloader)

In [22]:
# Pull the next (inputs, targets) batch from the iterator and show it.
first_batch = next(data_iter)
print(first_batch)

# Unpack into input IDs (x) and the one-token-shifted target IDs (y).
x, y = first_batch
print(f'first batch shape x:{x.shape}, y:{y.shape}')


[tensor([[27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262, 34686],
        [41976,    13,   357, 10915]]), tensor([[   11,   290,  4920,  2241],
        [  287,   257,  4489,    64],
        [  319,   262, 34686, 41976],
        [   13,   357, 10915,   314]])]
first batch shape x:torch.Size([4, 4]), y:torch.Size([4, 4])


In [27]:
# Look up the token embedding of every token ID in the input batch x.
first_batch_embeddings = token_embedding_layer(x)


# Show the shapes before and after the lookup: the embedding appends one
# trailing dimension to the batch of token IDs.
print(f'''input batch shape: {x.shape} 
input embedding shape : {first_batch_embeddings.shape}
''')
print(first_batch_embeddings.shape)

input batch shape: torch.Size([4, 4]) 
input embedding shape : torch.Size([4, 4, 256])

torch.Size([4, 4, 256])


### Positional Embeddings

![Positional embeddings illustration](image.png)

They are important because even if the same token appears at different positions, its final embedding should still be different depending on its position.

The size of the positional embedding table depends on the context length because we have only one embedding vector per position.

In [28]:
# One learnable vector per position in the context window; the vector size is
# set to 256, the same as the token embedding's output_dim.
context_length = 4
embedding_dim = 256
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=embedding_dim,
)

In [29]:
# Position indices 0 .. context_length - 1, one per slot in the context window.
positions = torch.arange(0, context_length)
print(f"Position indices shape: {positions.shape}")

# Look up the positional vector for each index.
pos_embeddings = pos_embedding_layer(positions)
print(f"Positional embeddings shape: {pos_embeddings.shape}")


Position indices shape: torch.Size([4])
Positional embeddings shape: torch.Size([4, 256])


In [30]:
# Add positional information to the token embeddings. pos_embeddings has one
# row per position and is broadcast across the batch dimension of
# first_batch_embeddings.
first_batch_input_embeddings = first_batch_embeddings + pos_embeddings

# Report the shape of the summed (token + positional) embeddings computed
# above, not of the token embeddings that went into the sum.
print(f'first batch embeddings : {first_batch_input_embeddings.shape}')

first batch embeddings : torch.Size([4, 4, 256])
