<a href="https://colab.research.google.com/github/VinaySingh561/LARGE-LANGUAGE-MODELS/blob/main/Positional_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 16.1 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [5]:
import tiktoken
# Load tiktoken's "gpt2" encoding; this tokenizer object is what later cells
# use to turn the raw text into token ids.
tokenizer = tiktoken.get_encoding("gpt2")

In [1]:
import torch

In [14]:
# Seed PyTorch's global RNG so the randomly initialised embedding weights
# (and every random draw made after this cell) are identical on each
# Restart & Run All.
torch.manual_seed(123)

vocab_size = 50257  # number of token ids in the GPT-2 encoding used by the tokenizer
output_dim = 256    # length of each embedding vector

# Lookup table with one trainable row of size output_dim per token id.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [6]:
# Read the whole text into memory. "the-verdict.txt" must already be present in
# the working directory; no visible cell of this notebook downloads it.
with open("the-verdict.txt","r", encoding = "utf-8") as f:
  raw_text = f.read()

# Encode the full text into token ids and report how many tokens it produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))


5145


In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetsV1(Dataset):
  """Sliding-window dataset of (input, target) token-id tensors.

  The text is tokenized once, then a window of ``max_length`` ids is moved over
  the id sequence in steps of ``stride``. Every target window is its input
  window advanced by exactly one token, which is the pairing needed for
  next-token prediction.
  """

  def __init__(self,txt,tokenizer, max_length,stride):
    """Tokenize ``txt`` and precompute all windows.

    Args:
      txt: Text to tokenize.
      tokenizer: Object whose ``encode(txt, allowed_special=...)`` returns a
        sequence of token ids.
      max_length: Number of token ids in each window.
      stride: Distance between the starts of two consecutive windows.
    """
    ids = tokenizer.encode(txt,allowed_special={"<|endoftext|>"})

    # A start index is only used if the target window, which ends one id
    # later than the input window, still fits inside the id sequence.
    starts = range(0, len(ids) - max_length, stride)
    self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
    self.target_ids = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

  def __len__(self):
    """Return the number of precomputed windows."""
    return len(self.input_ids)

  def __getitem__(self,idx):
    """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
    return self.input_ids[idx], self.target_ids[idx]

In [8]:
def create_dataloader_v1(txt,batch_size=4,max_length=256,stride = 128,
                         shuffle = True,drop_last = True,num_workers = 0):
  """Build a DataLoader over sliding-window (input, target) pairs of ``txt``.

  Args:
    txt: Text to tokenize with tiktoken's "gpt2" encoding.
    batch_size: Number of windows per batch.
    max_length: Number of token ids in each window.
    stride: Distance between the starts of two consecutive windows.
    shuffle: Forwarded to ``DataLoader``.
    drop_last: Forwarded to ``DataLoader``.
    num_workers: Forwarded to ``DataLoader``.

  Returns:
    A ``DataLoader`` wrapping a ``GPTDatasetsV1`` built from ``txt``.
  """
  # Tokenize and slice the ids into windows.
  windows = GPTDatasetsV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)

  # Batching options are passed through unchanged.
  loader_options = dict(
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )
  return DataLoader(windows, **loader_options)

In [9]:
# Use a tiny context of 4 tokens for illustration. Setting stride equal to
# max_length makes consecutive input windows non-overlapping, and shuffle=False
# keeps the batches in text order.
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,batch_size=8,max_length=max_length,stride=max_length,shuffle=False
)


In [10]:
# Pull the first batch: one tensor of input token ids and one tensor of the
# corresponding target token ids.
data_iter = iter(dataloader)
inputs, targets  = next(data_iter)

In [12]:
# Show the batch; each target row is its input row advanced by one token.
print("Inputs : \n", inputs)
print(" Targets : \n", targets)

Inputs : 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
 Targets : 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [17]:
## Map every token id in the batch to a 256-dimensional embedding vector,
## giving a tensor of shape (batch_size, max_length, output_dim).
token_embedding = token_embedding_layer(inputs)
print(token_embedding.shape)

torch.Size([8, 4, 256])


In [21]:
## As with the token embedding, create a positional embedding layer.
## It has one row per position in the context (max_length rows), because a
## position vector is needed for every token slot processed at one time.
pos_embedding_layer = torch.nn.Embedding(max_length,output_dim)
## Look up the vectors for positions 0 .. max_length-1.
position_embedding  = pos_embedding_layer(torch.arange(max_length))

In [22]:
## Add position information to the token embeddings. position_embedding has no
## batch dimension, so it is broadcast across every sequence in the batch.
final_embedding = token_embedding + position_embedding
final_embedding.shape

torch.Size([8, 4, 256])