We now consider more realistic and useful embedding sizes and encode the input tokens into a 256-dimensional vector representation. This is smaller than what the original GPT-3 model used (in GPT-3, the embedding size is 12,288 dimensions) but still reasonable for experimentation.
Furthermore, we assume that the token IDs were created by the BPE tokenizer that we implemented earlier, which has a vocabulary size of 50,257.

In [1]:
import torch
import tiktoken
# Size of the BPE vocabulary and the width of each embedding vector.
vocab_size = 50257
output_dim = 256

# Lookup table holding one trainable output_dim-sized vector per token ID.
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

In [2]:
from torch.utils.data import Dataset,DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is tokenized once, then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. Each target
    window is its input window shifted one token to the right, so the pair
    can be used for next-token prediction.

    Args:
        txt: Text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)``
            method that returns a sequence of token IDs.
        max_length: Number of tokens in every input/target window.
        stride: Distance, in tokens, between consecutive window starts.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stopping at len(encoded) - max_length guarantees that the shifted
        # target window never runs past the end of the token sequence.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


In [3]:
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    Args:
        txt: Text to tokenize with the tiktoken 'gpt2' encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens in each window.
        stride: Distance, in tokens, between consecutive window starts.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_encoding = tiktoken.get_encoding('gpt2')
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

Using the token_embedding_layer, if we sample data from the data loader, we embed each token in each batch into a 256-dimensional vector. If we have a batch size of 8 with four tokens each, the result will be an 8 x 4 x 256 tensor.

In [4]:
# Read the whole text file into memory as a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

In [5]:
# Four-token windows; stride == max_length, so consecutive windows start
# exactly one window length apart.
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Fetch the first batch of input and target token IDs.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Token ID's ", inputs)
print('Inputs shape', inputs.shape)

Token ID's  tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Inputs shape torch.Size([8, 4])


We want to convert each token ID above into a 256-dimensional vector representation.

In [6]:
# Look up an embedding vector for every token ID in the batch. `inputs` has
# shape (8, 4) and each vector has output_dim = 256 entries, so the result is
# expected to have shape (8, 4, 256).
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings)
print(token_embeddings.shape)

tensor([[[ 0.7699, -0.2217, -0.7221,  ...,  0.4837, -0.0489, -0.3779],
         [ 0.1575, -0.9061,  1.0397,  ...,  0.9851,  0.4620, -0.2604],
         [-1.6930,  0.5237, -0.5384,  ...,  1.4252, -0.1468,  1.0591],
         [-3.0873, -0.7495,  0.0201,  ...,  1.6941, -0.2330, -0.1966]],

        [[ 0.2435, -0.6470,  0.3280,  ..., -0.5284,  0.1791,  0.1485],
         [-1.9989, -1.2344, -0.0602,  ..., -0.8507, -0.3927, -2.0589],
         [-0.7564, -1.5067, -1.8109,  ...,  1.6640,  0.2093,  1.0949],
         [ 0.1539, -0.2916, -0.5161,  ..., -0.3100,  0.4168,  0.7999]],

        [[-1.6543,  0.1814,  0.8535,  ...,  1.5126,  1.2118,  1.2212],
         [ 2.4627,  0.8606,  0.3645,  ...,  0.4616, -0.3652,  0.5303],
         [-0.6462,  1.3239,  0.0125,  ...,  0.3837, -0.1584,  0.9718],
         [-0.2964, -0.7377, -0.3283,  ..., -1.0249,  0.4976,  0.5825]],

        ...,

        [[ 0.2764,  0.6392,  0.0874,  ...,  0.2428,  0.2845,  2.1805],
         [ 1.5009,  1.1102, -1.0500,  ...,  1.5171, -0.64

Embedding for positional encoding.

In [7]:
# One trainable vector per position in the context window.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

# Position indices 0 .. context_length - 1 (context_length equals max_length).
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings)
print(pos_embeddings.shape)

tensor([[ 0.3911, -0.1900, -0.0431,  ...,  0.4699,  3.0497,  0.9878],
        [-0.0870,  0.4230,  0.0718,  ...,  0.7828,  0.4820, -0.5926],
        [-0.3907,  0.7725,  0.2031,  ...,  1.0584,  0.5014,  0.0791],
        [ 1.4385, -0.2545, -0.4674,  ...,  0.8664, -0.2904, -1.6673]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([4, 256])


In [8]:
# Add positional information to the token embeddings. pos_embeddings has shape
# (4, 256) and is broadcast across the leading batch dimension of
# token_embeddings (expected shape (8, 4, 256)).
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings)
print(input_embeddings.shape)

tensor([[[ 1.1610e+00, -4.1173e-01, -7.6517e-01,  ...,  9.5356e-01,
           3.0008e+00,  6.0993e-01],
         [ 7.0466e-02, -4.8314e-01,  1.1114e+00,  ...,  1.7679e+00,
           9.4392e-01, -8.5293e-01],
         [-2.0837e+00,  1.2963e+00, -3.3534e-01,  ...,  2.4836e+00,
           3.5463e-01,  1.1382e+00],
         [-1.6488e+00, -1.0040e+00, -4.4728e-01,  ...,  2.5605e+00,
          -5.2331e-01, -1.8640e+00]],

        [[ 6.3464e-01, -8.3701e-01,  2.8495e-01,  ..., -5.8502e-02,
           3.2288e+00,  1.1362e+00],
         [-2.0859e+00, -8.1140e-01,  1.1564e-02,  ..., -6.7828e-02,
           8.9275e-02, -2.6515e+00],
         [-1.1472e+00, -7.3419e-01, -1.6078e+00,  ...,  2.7224e+00,
           7.1074e-01,  1.1740e+00],
         [ 1.5924e+00, -5.4611e-01, -9.8343e-01,  ...,  5.5640e-01,
           1.2641e-01, -8.6744e-01]],

        [[-1.2632e+00, -8.6432e-03,  8.1044e-01,  ...,  1.9825e+00,
           4.2614e+00,  2.2090e+00],
         [ 2.3756e+00,  1.2836e+00,  4.3629e-01,  .