<a href="https://www.kaggle.com/code/evelynartoria/transformer-pytorch-nlp?scriptVersionId=187422419" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Introduction
- This notebook is based on the "Let's build GPT: from scratch, in code, spelled out" tutorial by Andrej Karpathy. You can find the tutorial here --> https://www.youtube.com/watch?v=kCc8FmEb1nY&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=7
- There are several different approaches in this notebook that do not strictly follow the original video. Some implementations are my own.
- I am using the same Shakespeare text as in the video.

# Import needed libraries

In [1]:
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split

# Device agnostic code

In [2]:
# Fixed seed so that dataset shuffling, the train/test split, weight
# initialisation and multinomial sampling are repeatable across runs.
SEED = 42

# Prefer the GPU when one is available; every tensor created afterwards
# lands on this device by default.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

# Seed the global RNG (used by randperm, parameter init, multinomial).
torch.manual_seed(SEED)

# Dedicated generator handed to random_split / DataLoader; it lives on the
# same device as the default tensors and is seeded for reproducibility.
generator = torch.Generator(device=device)
generator.manual_seed(SEED)

print(f"default device set to {device}")

default device set to cpu


# Prepare the data

In [3]:
# Read the complete corpus into a single string.
with open("/kaggle/input/shakespeare/input.txt", encoding="utf-8") as f:
    text = f.read()

# The vocabulary is the set of distinct characters in the corpus, sorted so
# that the character -> id mapping is stable between runs.
vocab = sorted(set(text))
vocab_size = len(vocab)

# Show the vocabulary and its size, one per line.
print(vocab, vocab_size, sep="\n")

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


### Tokenizer

In [4]:
# Lookup tables between characters and integer token ids; ids follow the
# position of each character in the sorted vocabulary.
stoi = {char: token_id for token_id, char in enumerate(vocab)}
itos = {token_id: char for char, token_id in stoi.items()}

# Quick round-trip check: id of "h", then the character stored under id 46.
print(stoi["h"], itos[46], sep="\n")

46
h


In [5]:
def encode(d):
    """Return the list of integer token ids for the characters in `d` (looked up in `stoi`)."""
    return [stoi[char] for char in d]


def decode(e):
    """Return the string obtained by mapping each token id in `e` through `itos`."""
    return "".join(itos[token_id] for token_id in e)


# Round-trip a short sentence to confirm that decode inverts encode.
encoded = encode("hello, how are you?!")
decoded = decode(encoded)
print(encoded)
print(decoded)

[46, 43, 50, 50, 53, 6, 1, 46, 53, 61, 1, 39, 56, 43, 1, 63, 53, 59, 12, 2]
hello, how are you?!


# Prepare the dataset

In [6]:
# Hyperparameters shared by the dataset and the model.
vocab_size = len(vocab)  # number of distinct characters in the corpus
n_embd = 5               # width of each token embedding
context_size = 8         # number of characters the model sees per example

In [7]:
def make_dataset(text, context_size):
    """Build a shuffled next-character dataset from `text`.

    Every position `i` with a full window available yields one example:
    the input is the `context_size` token ids starting at `i`, and the label
    is the same window shifted one step to the right.

    Args:
        text: string to tokenise with the notebook's `encode` function.
        context_size: number of tokens per input window.

    Returns:
        TensorDataset of `(inputs, labels)`, both of shape
        `(len(text) - context_size, context_size)` and dtype long, in a
        random order drawn with `torch.randperm`.

    Raises:
        ValueError: if `text` is too short to produce a single window.
    """
    data = torch.tensor(encode(text), dtype=torch.long)

    num_windows = len(data) - context_size
    if num_windows <= 0:
        raise ValueError(
            f"text has {len(data)} tokens but at least {context_size + 1} are "
            f"needed for context_size={context_size}"
        )

    # One sliding window of length context_size + 1 per start position; this
    # replaces a Python-level loop that built two slices per position.
    windows = data.unfold(0, context_size + 1, 1)

    # randperm (rather than randint) visits every start position exactly once.
    windows = windows[torch.randperm(num_windows)]

    inputs = windows[:, :-1].contiguous()  # first context_size tokens of each window
    labels = windows[:, 1:].contiguous()   # the same tokens shifted right by one

    return TensorDataset(inputs, labels)


In [8]:
# since randint may return the same index more than once, randperm is preferred
print(torch.randint(0, 10, (10,)))
print(torch.randperm(10))

tensor([3, 7, 2, 0, 9, 3, 7, 5, 2, 8])
tensor([7, 8, 2, 6, 9, 0, 5, 4, 1, 3])


In [9]:
# Build the dataset from the first 100000 characters of the corpus. The window
# length comes from the shared `context_size` setting so that it always matches
# the input width the model is constructed with.
dataset = make_dataset(text=text[:100000], context_size=context_size)

In [10]:
# 80 % of the examples go to training; the remainder forms the test split.
train_split = int(len(dataset) * 0.8)
test_split = len(dataset) - train_split

train_dataset, test_dataset = random_split(
    dataset,
    [train_split, test_split],
    generator=generator,
)

In [11]:
batch_size = 32

# Identically configured loaders for the two splits.
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size, generator=generator)
    for split in (train_dataset, test_dataset)
)

In [12]:
def sample_from_data(dataloader):
    """Walk through every `(input, label)` batch of `dataloader` once.

    Nothing is printed or returned; the function only checks that the loader
    can be iterated and that each batch unpacks into exactly two items.
    """
    for X, y in iter(dataloader):
        continue

In [13]:
# Smoke test: iterate over the whole training loader once.
sample_from_data(train_dataloader)

# Base model (MLP)
- Important note --> instead of building the bigram-like model from the video, I've decided to build an MLP (multilayer perceptron) model for practice.

In [14]:
class MLP(nn.Module):
    """Character-level multilayer perceptron for next-token prediction.

    Each of the `context_size` input tokens is embedded, the embeddings are
    flattened into a single vector per example, and that vector is passed
    through two Tanh-activated hidden layers followed by a linear layer that
    produces one logit per vocabulary entry.
    """

    def __init__(self, context_size, n_embd, vocab_size):
        """Create the embedding table and the three linear layers.

        Args:
            context_size: number of tokens in every input window (T).
            n_embd: embedding width per token (C).
            vocab_size: number of distinct tokens; also the output width.
        """
        super().__init__()

        self.context_size = context_size
        self.vocab_size = vocab_size
        self.n_embd = n_embd

        # (B, T) token ids -> (B, T, C) embeddings
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        # (B, T*C) @ (T*C, H) with H = 64 hidden units
        self.linear1 = nn.Linear(in_features=context_size*n_embd, out_features=8*8)
        self.linear2 = nn.Linear(in_features=8*8, out_features=8*8*8)
        self.linear3 = nn.Linear(in_features=8*8*8, out_features=vocab_size)
        self.act_fn = nn.Tanh()

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        """Return next-token logits of shape (B, vocab_size) for token ids `idx` of shape (B, T)."""
        x = self.token_embedding(idx)
        B, T, C = x.shape
        x = x.view(B, T*C)  # concatenate the T embeddings of each example

        x = self.act_fn(self.linear1(x))
        x = self.act_fn(self.linear2(x))
        x = self.linear3(x)

        return x

    def generate(self, idx: torch.Tensor, randomize: bool, max_length: int, num_samples: int) -> list:
        """Generate `num_samples` continuations of the prompt `idx`.

        Every sample starts again from the original prompt, so the samples
        are independent continuations of the same context.

        Args:
            idx: prompt token ids of shape (1, T); only the first row is
                decoded into text.
            randomize: if True, draw each token from the softmax distribution;
                otherwise take the highest-scoring token.
            max_length: number of tokens to generate per sample.
            num_samples: number of continuations to produce.

        Returns:
            A list of `num_samples` strings, each `max_length` characters long
            (decoded with the notebook's `decode` function).
        """
        outputs = []
        for _ in range(num_samples):
            # Restart from the caller's prompt; torch.cat below builds new
            # tensors, so `idx` itself is never modified.
            context = idx
            pieces = []
            for _ in range(max_length):
                logits = self(context)

                if randomize:
                    probs = torch.softmax(logits, dim=1)
                    pred = torch.multinomial(probs, num_samples=1)  # (B, 1)
                else:
                    # softmax is monotonic, so the argmax of the logits is the
                    # same token; keepdim keeps the (B, 1) shape needed by cat
                    pred = torch.argmax(logits, dim=1, keepdim=True)

                pieces.append(decode(pred[0].tolist()))
                # slide the window: drop the oldest token, append the new one
                context = torch.cat([context[:, 1:], pred], dim=1)

            outputs.append("".join(pieces))

        return outputs

            


# Define the model, optimizer and loss function

In [15]:
# Cross-entropy over the vocabulary logits for the next character.
loss_fn = nn.CrossEntropyLoss()

# The model is built from the shared hyperparameters; Adam updates all of its parameters.
mlp = MLP(context_size, n_embd, vocab_size)
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3)

# Take samples from the model

In [16]:
def model_sampler(model, context, randomize, max_length, num_samples):
    """Print text samples generated by `model` from the prompt `context`.

    Args:
        model: object exposing `eval()` and
            `generate(idx, randomize, max_length, num_samples)`.
        context: prompt string; it is tokenised with the notebook's `encode`.
        randomize: passed through to `generate` (sample vs. greedy decoding).
        max_length: number of characters to generate per sample.
        num_samples: number of samples to print.
    """
    # Operate on the model that was passed in rather than a notebook-level global.
    model.eval()
    with torch.inference_mode():
        token_ids = encode(context)
        # inputs must be batched: shape (1, number of prompt tokens)
        idx = torch.tensor(token_ids, dtype=torch.long).view(1, len(token_ids))
        outputs = model.generate(idx=idx, randomize=randomize, max_length=max_length, num_samples=num_samples)
        for output in outputs:
            print(f"{output} \n\n")

# Baseline: sample from the model before any training has been run.
model_sampler(
    model=mlp,
    context="How are ",
    randomize=True,
    max_length=10,
    num_samples=5,
)

GqTlgaUDOH 


LV.YqCuxzw 



iUIuV;&j& 


.VKdDbwO;; 



 ?!upD'Xb 




# Training loop

In [17]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    """Train `model` for `epochs` passes over `dataloader`.

    Each batch `(X, y)` is scored against the last column of `y`, i.e. the
    token that follows the input window.

    Args:
        model: module mapping a batch of inputs to logits.
        dataloader: iterable of `(X, y)` batches, with `y` two-dimensional.
        loss_fn: callable `(logits, targets) -> scalar loss tensor`.
        optimizer: optimizer over the model's parameters.
        epochs: number of passes over `dataloader`.
    """
    model.train()
    loss = None  # stays None when no batch is ever processed

    for epoch in range(epochs):
        # Wrap the dataloader itself (not enumerate) so tqdm can read its
        # length and draw a real progress bar.
        for batch, (X, y) in enumerate(tqdm(dataloader, desc=f"epoch {epoch}")):
            logits = model(X)
            loss = loss_fn(logits, y[:, -1])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 200 == 0:
                print(f"loss for batch {batch} --> {loss.item()} at epoch {epoch}")

    # With zero epochs or an empty dataloader there is no loss to report.
    if loss is None:
        print("no batches were processed, so there is no loss to report")
    else:
        print(f"loss for the very last batch --> {loss.item()}")

In [18]:
# Run a single training epoch over the training split.
mlp.train()
train_model(
    model=mlp,
    dataloader=train_dataloader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    epochs=1,
)

18it [00:00, 178.88it/s]

loss for batch 0 --> 4.239871978759766 at epoch 0


248it [00:00, 373.64it/s]

loss for batch 200 --> 2.8540937900543213 at epoch 0


441it [00:01, 377.89it/s]

loss for batch 400 --> 2.657844066619873 at epoch 0


676it [00:01, 383.17it/s]

loss for batch 600 --> 2.9451727867126465 at epoch 0


873it [00:02, 385.06it/s]

loss for batch 800 --> 2.691908836364746 at epoch 0


1061it [00:02, 356.59it/s]

loss for batch 1000 --> 2.7322916984558105 at epoch 0


1254it [00:03, 378.36it/s]

loss for batch 1200 --> 2.6366126537323 at epoch 0


1449it [00:03, 345.76it/s]

loss for batch 1400 --> 2.349560022354126 at epoch 0


1677it [00:04, 371.87it/s]

loss for batch 1600 --> 2.4367964267730713 at epoch 0


1868it [00:05, 373.36it/s]

loss for batch 1800 --> 2.327730894088745 at epoch 0


2060it [00:05, 373.52it/s]

loss for batch 2000 --> 2.3716888427734375 at epoch 0


2248it [00:06, 369.37it/s]

loss for batch 2200 --> 2.3205678462982178 at epoch 0


2468it [00:06, 351.11it/s]

loss for batch 2400 --> 2.2288382053375244 at epoch 0


2500it [00:06, 368.51it/s]

loss for the very last batch --> 2.174595832824707





# Model inference

In [19]:
def model_inference(model, dataloader):
    """Print the model's predictions for the first batch of `dataloader`.

    Shows the input batch, the predicted next token for every row, the
    expected next token (last column of the labels) and the full label tensor.

    Args:
        model: module mapping a batch of inputs to logits of shape (B, vocab).
        dataloader: iterable of `(X, y)` batches; only the first one is used.
    """
    # Switch the model that was passed in (not a notebook-level global) to eval mode.
    model.eval()
    with torch.inference_mode():
        X, y = next(iter(dataloader))
        logits = model(X)
        # argmax over dim=1 because the input is batched; softmax is monotonic,
        # so taking the argmax of the raw logits selects the same tokens
        preds = torch.argmax(logits, dim=1)
        print(f"for {X} \n model predicted {preds}")
        print(f"expected --> {y[:, -1]}")
        print(y)

In [20]:
# Inspect the predictions for the first batch of the training loader.
model_inference(mlp, train_dataloader)

for tensor([[ 1, 39, 52, 42,  1, 51, 63,  1],
        [13, 26, 33, 31, 10,  0, 35, 46],
        [56,  1, 46, 53, 52, 53, 59, 56],
        [59,  1, 42, 47, 57, 51, 47, 57],
        [43,  1, 61, 46, 53, 57, 43,  1],
        [53, 57, 43,  1, 57, 53,  1, 40],
        [56, 57,  6,  1, 45, 53, 53, 42],
        [44, 53, 43,  1, 58, 53,  1, 58],
        [58, 47, 50, 50,  1, 58, 46, 43],
        [ 0, 25, 17, 26, 17, 26, 21, 33],
        [44,  1, 61, 53, 59, 52, 42, 57],
        [57, 58, 57,  1, 58, 53,  1, 63],
        [17, 26, 21, 33, 31, 10,  0, 21],
        [42,  1, 58, 46, 43,  1, 57, 59],
        [44, 56, 39, 51, 43,  0, 32, 46],
        [52, 53, 58,  1, 41, 53, 52, 44],
        [39, 58, 53, 56, 10,  0, 19, 43],
        [53, 56, 47, 53, 50, 39, 52, 59],
        [ 6,  1, 39, 52, 42,  1, 56, 43],
        [ 1, 58, 47, 51, 43,  1, 46, 53],
        [ 1, 53, 44,  1, 51, 43, 52,  6],
        [51, 39, 52, 10,  0, 32, 46, 47],
        [ 1, 57, 47, 45, 46,  5, 42,  1],
        [60, 43,  0, 20, 43,  

In [21]:
# Sample one longer continuation now that the model has been trained.
model_sampler(
    model=mlp,
    context="How are ",
    randomize=True,
    max_length=100,
    num_samples=1,
)

be tor thofe!d
OoRCOLIheyo, williun, hin ttericirlale
Io, Mnd; walk shand
My Ciml bok, Ieds soreile  


