# Introduction (WIP)
- This notebook is based on the "Let's build GPT: from scratch, in code, spelled out" tutorial by Andrej Karpathy. You can find the tutorial here --> https://www.youtube.com/watch?v=kCc8FmEb1nY&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=7
- There are several different approaches in this notebook that do not strictly follow the original video. Some implementations are my own.
- I am using the same shakespeare text as in the video.

# Import needed libraries

In [1]:
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split

# Device agnostic code

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)
generator = torch.Generator(device=device)
print(f"default device set to {device}")

default device set to cpu


# Prepare the data

In [3]:
with open("/kaggle/input/shakespeare/input.txt", "r", encoding="utf-8") as f:
    text = f.read()

vocab = sorted(set(text))
vocab_size = len(vocab)

print(vocab)
print(vocab_size)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


### Tokenizer

In [4]:
stoi = {c: v for v, c in enumerate(vocab)}
itos = {v: c for c, v in stoi.items()}
print(stoi["h"])
print(itos[46])

46
h


In [5]:
encode = lambda d: [stoi[idx] for idx in d]
decode = lambda e: "".join([itos[idx] for idx in e])

encoded = encode("hello, how are you?!")
decoded = decode(encoded)
print(encoded)
print(decoded)

[46, 43, 50, 50, 53, 6, 1, 46, 53, 61, 1, 39, 56, 43, 1, 63, 53, 59, 12, 2]
hello, how are you?!


# Prepare the dataset

In [6]:
context_size = 8
n_embd = 5
vocab_size = len(vocab)

In [7]:
def make_dataset(text, context_size):
    data = torch.tensor(encode(text), dtype=torch.long)

    #random_idx = torch.randint(0, len(data)-context_size, (int(len(data)/context_size),))
    random_idx = torch.randperm(len(data)-context_size)
    inputs = torch.stack([data[idx:idx+context_size] for idx in random_idx])
    labels = torch.stack([data[idx+1:idx+context_size+1] for idx in random_idx])

    return TensorDataset(inputs, labels)


In [8]:
# sicne randint might give the same random_idx, randperm is going to be preffered
print(torch.randint(0, 10, (10,)))
print(torch.randperm(10))

tensor([6, 2, 1, 7, 0, 4, 3, 2, 6, 1])
tensor([3, 8, 5, 1, 7, 9, 0, 4, 6, 2])


In [9]:
dataset = make_dataset(text=text[:100000], context_size=8)

In [10]:
train_split = int(len(dataset)*0.8)
test_split = int(len(dataset)-train_split)

train_dataset, test_dataset = random_split(dataset=dataset, lengths=[train_split, test_split], generator=generator)

In [11]:
batch_size = 32
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, generator=generator)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, generator=generator)

In [12]:
def sample_from_data(dataloader):
    for batch, (X, y) in enumerate(dataloader):
        #print(f"batch {batch}, input {X}, label {y}")
        #print(batch)
        pass

In [13]:
sample_from_data(dataloader=train_dataloader)

# Base model (MLP)

In [14]:
class MLP(nn.Module):
    def __init__(self, context_size, n_embd, vocab_size):
        super().__init__()


        self.context_size = context_size
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # B x T x C; B --> batches, T --> time (context_size), C --> n_embd
        self.pos_embedding_table = nn.Embedding(context_size, n_embd) # T x C; this is from the posisitional encoding part of the video
        
        self.linear1 = nn.Linear(in_features=context_size*n_embd, out_features=8*8) # B x T*C @ T*C x H; H --> number of hidden_units
        self.linear2 = nn.Linear(in_features=8*8, out_features=8*8*8)
        self.linear3 = nn.Linear(in_features=8*8*8, out_features=vocab_size)
        self.act_fn = nn.Tanh()

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        B, T = idx.shape
        C = self.n_embd
        positions = torch.arange(start=0, end=T, step=1)
        x = self.token_embedding_table(idx) + self.pos_embedding_table(positions)
        x = x.view(B, T*C)

        x = self.act_fn(self.linear1(x))
        x = self.act_fn(self.linear2(x))
        x = self.linear3(x)

        return x

    def generate(self, idx: torch.Tensor, randomize: bool, max_length: int, num_samples: int) -> torch.Tensor:
        outputs = []
        for sample in range(num_samples):
            full_text = "" 
            for i in range(max_length):
                logits = self(idx)
                percents = torch.softmax(logits, dim=1)

                if randomize:
                    pred = torch.multinomial(percents, num_samples=1)
                    full_text += decode(pred.tolist()[0])
                    idx = torch.cat([idx[:, 1:], pred], dim=1) # update the context, remove the first element of the tensor and add the new prediction made by the model
                else:
                    pred = torch.argmax(percents)
                    full_text += decode([pred.item()])
                    idx = torch.cat([idx[:, 1:], pred.view(1, 1)], dim=1) # update the context, remove the first element of the tensor and add the new prediction made by the model
                    # in the argmax the output is a single element, pred.view(1, 1) turns it into a batch of dim 1, so it can be concatenated to the previous context

            outputs.append(full_text)

        return outputs

            


# Define the base model, optimizer and loss function

In [15]:
mlp = MLP(context_size=context_size, n_embd=n_embd, vocab_size=vocab_size)
optimizer = torch.optim.Adam(params=mlp.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Take samples from the base model

In [16]:
@torch.no_grad
def model_sampler(model, context, randomize, max_length, num_samples):
    #print(len(context))
    #print(context_size)
    test = torch.tensor([[20, 53, 61,  1, 39, 56, 58, 39]])
    #print(test[:, 1:])
    result = torch.cat((test[:, 1:], torch.tensor([[99]])), dim=1)
    #print(result)
    #print("\n")

    mlp.eval()
    idx = torch.tensor(encode(context), dtype=torch.long).view(1, len(encode(context))) # inputs must be batched
    outputs = mlp.generate(idx=idx, randomize=randomize, max_length=max_length, num_samples=num_samples)
    for output in outputs:
        print(f"{output} \n\n")

model_sampler(model=mlp, context="How are ", randomize=True, max_length=10, num_samples=5)

XFPqHKUPXy 


n?NCBOGOjt 


grBqJ$mU:3 


PtCujIEqHc 


RdFbzaUjbd 




# Training loop

In [17]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        for batch, (X, y) in tqdm(enumerate(dataloader)):
            logits = model(X)
            loss = loss_fn(logits, y[:, -1])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 200 == 0:
                print(f"loss for batch {batch} --> {loss} at epoch {epoch}")

    print(f"loss for the very last batch --> {loss}")

In [18]:
mlp.train()
train_model(model=mlp, dataloader=train_dataloader, loss_fn=loss_fn, optimizer=optimizer, epochs=1)

16it [00:00, 159.50it/s]

loss for batch 0 --> 4.240123748779297 at epoch 0


243it [00:00, 313.32it/s]

loss for batch 200 --> 3.148144245147705 at epoch 0


442it [00:01, 321.35it/s]

loss for batch 400 --> 2.717410087585449 at epoch 0


640it [00:02, 324.52it/s]

loss for batch 600 --> 2.761871099472046 at epoch 0


833it [00:02, 313.94it/s]

loss for batch 800 --> 2.4733972549438477 at epoch 0


1029it [00:03, 307.92it/s]

loss for batch 1000 --> 2.1294190883636475 at epoch 0


1257it [00:04, 322.44it/s]

loss for batch 1200 --> 2.7162749767303467 at epoch 0


1454it [00:04, 319.93it/s]

loss for batch 1400 --> 2.421358108520508 at epoch 0


1649it [00:05, 320.30it/s]

loss for batch 1600 --> 2.5102336406707764 at epoch 0


1847it [00:05, 322.11it/s]

loss for batch 1800 --> 2.1967077255249023 at epoch 0


2043it [00:06, 315.62it/s]

loss for batch 2000 --> 2.386378288269043 at epoch 0


2240it [00:07, 325.09it/s]

loss for batch 2200 --> 2.2404003143310547 at epoch 0


2436it [00:07, 310.54it/s]

loss for batch 2400 --> 2.5474936962127686 at epoch 0


2500it [00:07, 315.12it/s]

loss for the very last batch --> 2.297541618347168





# Base model inference

In [19]:
@torch.no_grad
def model_inference(model, dataloader):
    mlp.eval()
    X, y = next(iter(dataloader))
    logits = model(X)
    percents = torch.softmax(logits, dim=1) # dim=1 since the input was batched
    preds = torch.argmax(percents, dim=1) # dim=1 since the input was batched
    print(f"for {X} \n model predicted {preds}")
    print(f"expected --> {y[:, -1]}")
    print(y)

In [20]:
model_inference(model=mlp, dataloader=train_dataloader)

for tensor([[26, 33, 31, 10,  0, 20, 43, 52],
        [43,  1, 63, 53, 59, 56,  1, 41],
        [63,  1, 57, 39, 63,  6,  0, 32],
        [ 1, 57, 47, 56,  6,  1, 58, 46],
        [56, 57,  6,  1, 57, 46, 53, 59],
        [21,  1, 46, 39, 60, 43,  1, 42],
        [ 0, 56, 43, 54, 53, 56, 58,  1],
        [58,  1, 52, 53, 58,  1, 58, 46],
        [56,  1, 45, 43, 52, 43, 56, 39],
        [63, 53, 59,  1, 46, 39, 60, 43],
        [47, 51, 43,  1, 57, 46, 39, 50],
        [59, 56, 57, 43, 50, 60, 43, 57],
        [ 0,  0, 25, 17, 26, 17, 26, 21],
        [53, 59,  1, 57, 46, 39, 50, 50],
        [57,  1, 58, 53,  1, 57, 53, 51],
        [47, 49, 43,  8,  0,  0, 18, 47],
        [15, 53, 52, 44, 43, 57, 57,  1],
        [50,  1, 46, 39, 56, 42, 50, 63],
        [56, 58,  1, 58, 46, 53, 59,  1],
        [ 1, 39, 56, 43,  1, 54, 56, 43],
        [ 1, 41, 39, 52, 52, 53, 58,  1],
        [47, 50, 50, 57,  7,  7, 40, 56],
        [57, 58, 56, 53, 52, 45,  1, 40],
        [ 1, 51, 43, 56, 41, 6

In [21]:
model_sampler(model=mlp, context="How are ", randomize=True, max_length=500, num_samples=1)

cy ware
Wurre uele,
Arly let hase aneessaaf cinle.

FUR,
N
OLill:
Arg theats fo orathen therceomoe ano yeesers! aromedomtre the fathatheon totiseus free tore veenr tans, grerenm,
G an the eit ho nley hore hitr and moull foup.

COUTUS:
Hnowrut hh yound wot hasee,
Aro nobd, u worn.

MORFOLAUr:
Show maseerv: a thaiploanc ifr, wo scem mererteren:
ICOte'e thenetb. Maneu noue, yout anqcane Ho mw twepbeoscy, aors 'atpsoy,.
I'IRIOLthetly Cge heer.---
CAaritT, is mn waobino mowe hasiungoe hitheb fos yoo  




# Self attention math

In [22]:
sample_batch = next(iter(test_dataloader))[0]
B, T = sample_batch.shape # batch of B by T
print(B, T)
example_emb = nn.Embedding(vocab_size, 4)
embedded = example_emb(sample_batch)
B, T, C = embedded.shape # embedded is Batches by Time (context_sie) by Channels (num of values per token)
print(embedded.shape)

32 8
torch.Size([32, 8, 4])


In [23]:
bag_of_words = torch.zeros(size=(B, T, C)) # each of the values has a unique value

for batch_idx in range(B):
    for context_idx in range(T):
        xprev = embedded[batch_idx, :context_idx+1]
        bag_of_words[batch_idx, context_idx] = torch.mean(xprev, dim=0)
print(bag_of_words)

tensor([[[-1.0784e+00,  6.7285e-01,  8.5836e-01, -5.8480e-01],
         [-1.1326e+00,  1.0782e-01,  5.4404e-01,  3.6327e-01],
         [-1.3497e+00, -9.5280e-02,  7.9182e-01,  7.5423e-01],
         ...,
         [-6.6040e-01, -6.2231e-02,  5.3419e-01,  8.0532e-01],
         [-3.6120e-01, -4.1946e-02,  6.5256e-01,  5.9728e-01],
         [-2.1654e-01, -1.9611e-01,  6.2420e-01,  6.2818e-01]],

        [[-2.7063e-02,  3.9568e-01, -3.6808e-02,  4.0562e-01],
         [ 3.4577e-01,  5.1794e-01,  1.4292e-03, -1.4422e-01],
         [ 7.2984e-02, -3.6195e-02,  6.3850e-01,  3.4219e-01],
         ...,
         [-1.3296e-02, -1.1811e-01,  5.6225e-01,  3.8664e-01],
         [-1.9392e-01, -1.0642e-01,  1.8096e-01,  5.0034e-01],
         [ 6.4363e-02, -2.8219e-01,  1.3031e-01,  4.8976e-01]],

        [[-1.2777e+00, -3.6295e-02, -2.1068e+00,  1.1826e+00],
         [-1.5308e+00, -2.6889e-01, -4.0970e-01,  1.3594e+00],
         [-1.2599e+00, -8.0187e-01, -9.7691e-02,  4.7137e-01],
         ...,
         

In [24]:
ones = torch.ones(size=(3, 3))
tril = torch.tril(ones) # lower triangular part of a matrix
print(tril)
a = torch.randint(0, 10, (3, 2), dtype=torch.float32)
b = torch.randint(0, 10, (2, 3), dtype=torch.float32)
matmul_output = a @ b
matmul_tril_output = torch.tril(a) @ b
print(matmul_output)
print(matmul_tril_output)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[12.,  6., 12.],
        [46.,  7., 26.],
        [50.,  5., 25.]])
tensor([[ 0.,  0.,  0.],
        [46.,  7., 26.],
        [50.,  5., 25.]])


In [25]:
# do the same as bag of words but with matrix multiplication (dot product)
a = torch.ones(size=(3, 3), dtype=torch.float32)
b = torch.randint(0, 10, (3, 2), dtype=torch.float32)

a = torch.tril(a)
"""
b = torch.tensor(
    [
        [2, 7],
        [6, 4],
        [6, 5]
    ], dtype=torch.float32
)
"""
print(a)
a = a/a.sum(dim=1, keepdim=True)
print(a)

output = a @ b
print(output)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[9.0000, 3.0000],
        [5.5000, 6.0000],
        [4.0000, 4.0000]])


In [26]:
sample_batch = next(iter(test_dataloader))[0]
B, T = sample_batch.shape # batch of B by T
example_emb = nn.Embedding(vocab_size, 4)
embedded = example_emb(sample_batch)
B, T, C = embedded.shape # embedded is Batches by Time (context_sie) by Channels (num of values per token)
#print(embedded.shape)

wei = torch.tril(torch.ones(size=(T, T)))
wei = wei / wei.sum(dim=1, keepdim=True)
print(embedded.shape) # B x T x C
print(wei.shape) # T x T
#  1xTxT @ BxTxC
bag_of_words = wei @ embedded
print(bag_of_words)

torch.Size([32, 8, 4])
torch.Size([8, 8])
tensor([[[ 0.3688,  0.5734,  2.1180,  0.3947],
         [ 0.7394,  0.2214,  1.5908,  0.0598],
         [ 0.4816, -0.2449,  1.2497,  0.9870],
         ...,
         [ 0.5903, -0.2065,  0.8286,  0.5348],
         [ 0.6665, -0.2292,  0.6267,  0.5644],
         [ 0.6354, -0.3445,  0.5693,  0.4646]],

        [[ 0.9006, -2.3556,  1.0607, -0.0566],
         [ 0.6516, -1.1261,  1.4045,  0.6743],
         [ 0.3296, -0.4183,  1.3290,  0.0963],
         ...,
         [ 0.0480,  0.3228,  0.9913,  0.2512],
         [ 0.2862,  0.4519,  1.0448, -0.0263],
         [ 0.3698,  0.5899,  0.9644, -0.0089]],

        [[ 1.7154,  1.2267,  1.3655, -1.6913],
         [ 0.8407,  0.0246,  0.9665,  0.5751],
         [ 1.0021, -0.4073,  1.2261,  0.0095],
         ...,
         [ 0.5052, -0.4513,  1.1875,  0.4964],
         [ 0.5916, -0.4055,  1.1698,  0.3862],
         [ 0.3710, -0.4536,  1.1329,  0.2108]],

        ...,

        [[ 0.0328, -1.9299, -0.2432,  0.4098],
   

# Bag of words type aggregation with a mask

In [27]:
tril = torch.tril(torch.ones(size=(T, T)))
wei = torch.zeros(size=(T, T)) # zeros just so there's a plaaceholder for masked_fill
wei = wei.masked_fill(tril==0, float("-inf")) # whenever the value in tril is 0, it will get replaced with -inf; this allows softmax to come into place, since -inf will get a percent of 0
wei = torch.softmax(wei, dim=1)
print(wei)
bag_of_words = wei @ embedded
print(bag_of_words)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
tensor([[[ 0.3688,  0.5734,  2.1180,  0.3947],
         [ 0.7394,  0.2214,  1.5908,  0.0598],
         [ 0.4816, -0.2449,  1.2497,  0.9870],
         ...,
         [ 0.5903, -0.2065,  0.8286,  0.5348],
         [ 0.6665, -0.2292,  0.6267,  0.5644],
         [ 0.6354, -0.3445,  0.5693,  0.4646]],

        [[ 0.9006, -2.3556,  1.0607, -0.0566],
         [ 0.6516, -1.1261,  1.4045,  0.6743],
         [ 0.32

# MLP model with agreggation
- a problem that needs to be addressed with the previous model is that it needs to always receive a input of B x T (batch_size by context_size), whereas it would be best if the model could adapt to inputs of different context_size

In [28]:
# Code to allow comunication between past tokens
C = n_embd
T = context_size
B = batch_size
wei = torch.zeros(size=(T, T))
tril = torch.tril(torch.ones(size=(T, T)))
wei = wei.masked_fill(tril==0, float('-inf'))
xbow = wei.softmax(dim=1)

test_tensor = torch.randn(size=(B, T, C))
print(xbow @ test_tensor)

tensor([[[-4.4853e-02, -6.0513e-01, -2.6979e-01,  2.9367e-01,  7.4300e-03],
         [-3.3273e-01, -1.6956e-01,  4.8027e-02,  4.6598e-01, -3.2772e-02],
         [-6.6604e-01, -2.3695e-01,  5.4108e-01,  3.2967e-01,  4.6265e-01],
         ...,
         [ 1.8625e-01, -3.9044e-01,  2.9101e-01,  4.7646e-02,  3.3985e-01],
         [ 1.8139e-01, -6.1906e-02,  2.4004e-01, -4.8740e-02,  2.6888e-01],
         [-3.0426e-02, -9.8348e-02,  2.8390e-01, -1.2465e-01,  5.6283e-01]],

        [[-1.1765e+00, -5.0294e-01,  5.0863e-01,  1.0011e+00, -2.8166e-01],
         [-1.1570e+00, -9.9522e-01, -1.4237e-01,  6.8389e-01, -7.8292e-01],
         [-9.4526e-01, -1.1017e+00, -7.5666e-01, -1.4278e-02, -5.6260e-01],
         ...,
         [ 1.3102e-01, -3.8917e-01, -2.3491e-01,  2.7016e-01,  2.8503e-02],
         [ 1.2854e-01, -3.7960e-01, -1.2848e-01,  1.2805e-01, -2.0101e-01],
         [ 2.4634e-01, -3.9867e-01, -9.3902e-02,  3.1972e-01, -4.4178e-02]],

        [[-3.8941e-02, -4.2473e-01,  8.4499e-01, -1.9319

In [29]:
# a problem that needs to be addressed with the previous model is that it needs to always receive a 
batch_sample_inputs, batch_sample_labels = next(iter(train_dataloader))
print(batch_sample_inputs.shape)
print(batch_sample_labels.shape)

torch.Size([32, 8])
torch.Size([32, 8])


In [30]:
class MLPv2(nn.Module):
    def __init__(self, vocab_size, n_embd, context_size):
        super().__init__()

        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.context_size = context_size

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.positional_embedding_table = nn.Embedding(context_size, n_embd) # the position of each token has a separate table of values; this helps the model keep track of the order of characters

        #self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size) # this time in_features=n_embd so it's not context_size dependant
        self.linear1 = nn.Linear(in_features=n_embd, out_features=8*8)
        self.linear2 = nn.Linear(in_features=8*8, out_features=8*8*8)
        self.linear3 = nn.Linear(in_features=8*8*8, out_features=vocab_size)

        self.act_fn = nn.Tanh()

    def info(self):
        info_dict = {
            "vocab_size": self.vocab_size,
            "n_embd": self.n_embd,
            "context_size": self.context_size
        }

        return info_dict

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T = x.shape
        C = self.n_embd

        #print(B, T, C)

        positions = torch.arange(start=0, end=T, step=1)
        token_emb = self.token_embedding_table(x) # batch_size x context_size x n_embd --> BxTxC
        pos_emb = self.positional_embedding_table(positions) # T x C (each context has a position)

        x = token_emb + pos_emb
        x = x.view(B*T, C)

        #logits = self.lm_head(token_emb.view(B*T, C)) # output of shape B*T x hidden_units1; this might be a problem for the labels since they are of shape B x T, thus they need to be reshaped aswell
        x = self.act_fn(self.linear1(token_emb.view(B*T, C))) # hidden_units1 x hidden_units2
        x = self.act_fn(self.linear2(x)) # hidden_units2 x hidden_units3
        x = self.linear3(x) # hidden_units3 x vocab_size
        return x

    def generate(self, starting_idx: torch.Tensor, max_length) -> torch.Tensor:

        full_text = itos[starting_idx.item()]
        for i in range(max_length):
            logits = self(starting_idx)
            percents = torch.softmax(logits, dim=1)
            pred = torch.multinomial(percents, num_samples=1)
            starting_idx = pred
            full_text += decode([pred.item()])
        return full_text


In [31]:
mlpv2 = MLPv2(vocab_size=vocab_size, n_embd=32, context_size=context_size)

In [32]:
mlpv2.info()

{'vocab_size': 65, 'n_embd': 32, 'context_size': 8}

In [33]:
mlpv2_loss_fn = nn.CrossEntropyLoss()
batch_sample_inputs, batch_sample_labels = next(iter(train_dataloader))
sample_input = batch_sample_inputs[0]
sample_label = batch_sample_labels[0]
print(sample_input) # 1 x T (B x T)
print(sample_label.view(1, -1)) # 1 x T (B x T)
mlpv2.eval()
with torch.inference_mode():
    logits = mlpv2(sample_input.view(1, -1))
    labels = sample_label.view(-1) # from B x T to B*T to match the shape of the logits
    print(logits.shape) # B*T x vocab_size
    print(labels.shape)

    loss = mlpv2_loss_fn(logits, labels)
    print(loss)

tensor([26, 33, 31, 10,  0, 20, 43, 52])
tensor([[33, 31, 10,  0, 20, 43, 52, 41]])
torch.Size([8, 65])
torch.Size([8])
tensor(4.1236)


In [34]:
@torch.no_grad
def generate_from_model(model, num_outputs, starting_char, max_length):
    mlpv2.eval()
    outputs = []
    starting_idx = torch.tensor([stoi[starting_char]], dtype=torch.long).view(1, -1)
    for i in range(num_outputs):
        output = mlpv2.generate(starting_idx=starting_idx, max_length=max_length) # must be batched
        outputs.append(output)

    return outputs

In [35]:
test_output = generate_from_model(model=mlpv2, num_outputs=1, starting_char="a", max_length=5)
print(test_output[0])

aOzrxA


In [36]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    model.train()

    for epoch in range(epochs):
        for batch, (X, y) in enumerate(dataloader):
            logits = model(X)
            loss = loss_fn(logits, y.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


            if batch % 1200 == 0:
                print(f"loss for batch {batch} --> {loss} at epoch {epoch}")

    print(f"loss for the very last batch --> {loss}")

In [37]:
mlpv2_optimizer = torch.optim.Adam(params=mlpv2.parameters(), lr=1e-3)
mlpv2_loss_fn = nn.CrossEntropyLoss()

In [38]:
train_model(model=mlpv2, dataloader=train_dataloader, loss_fn=mlpv2_loss_fn, optimizer=mlpv2_optimizer, epochs=4)

loss for batch 0 --> 4.199133396148682 at epoch 0
loss for batch 1200 --> 2.527355909347534 at epoch 0
loss for batch 2400 --> 2.313702344894409 at epoch 0
loss for batch 0 --> 2.3813376426696777 at epoch 1
loss for batch 1200 --> 2.50433611869812 at epoch 1
loss for batch 2400 --> 2.310081720352173 at epoch 1
loss for batch 0 --> 2.3805489540100098 at epoch 2
loss for batch 1200 --> 2.4978551864624023 at epoch 2
loss for batch 2400 --> 2.3088157176971436 at epoch 2
loss for batch 0 --> 2.379178047180176 at epoch 3
loss for batch 1200 --> 2.49359130859375 at epoch 3
loss for batch 2400 --> 2.3069968223571777 at epoch 3
loss for the very last batch --> 2.4351465702056885


In [39]:
test_outputs = generate_from_model(model=mlpv2, max_length=100, num_outputs=2, starting_char="b")
for output in test_outputs:
    print(f"{output}\n\n")

bene pr ve COr!
Hesth,
A y!


Ongele
BRo'd tomo' wiu:
Tanowofoumonot miff h
Fo e,
Fie?
Hedivedr bo, h


beoncode ierond lintouthutace hivo ss hicanead, avegsorou oouly serkendeincingeng uth
Thelle y thinfr




In [40]:
torch.arange(start=0, end=T, step=1) # from 0 to T-1

tensor([0, 1, 2, 3, 4, 5, 6, 7])

# Self attention
- with xbow you can add information about the tokens, but the model itself does not attribute any weight to them. This is what self attention solves by using Keys, Queries and Values
- every token will have a specific Query (Q) and Key (K) attatched to it
    - Query --> what the model is looking for
    - Key --> the weight the model is giving to this certain token

In [41]:
# Code to allow comunication between past tokens

B, T, C = 2, 4, 8
wei = torch.zeros(size=(T, T))
tril = torch.tril(torch.ones(size=(T, T)))
wei = wei.masked_fill(tril==0, float('-inf'))
xbow = wei.softmax(dim=1)

test_tensor = torch.randn(size=(B, T, C))
output = xbow @ test_tensor
print(output.shape)

torch.Size([2, 4, 8])


In [42]:
head_size = 32
key = nn.Linear(in_features=C, out_features=head_size, bias=False) # bias = False so that it's just a multiplication
query = nn.Linear(in_features=C, out_features=head_size, bias=False) 

#print(test_tensor.shape)
k = key(test_tensor) # BxTxC @ BxTxhead_size --> BxTxhead_size; each batch has a context and each context character has a key value
q = query(test_tensor) # BxTxC @ BxTxhead_size --> BxTxhead_size; each batch has a context and each context character has a query value
print(k.shape)
print(q.shape)

torch.Size([2, 4, 32])
torch.Size([2, 4, 32])


In [43]:
print(k.transpose(-2, -1).shape) # same as k.permute(0, 2, 1)
print(k.permute(0, 2, 1).shape)

wei = q @ k.transpose(-2, -1) # BxTxhead_size @ Bxhead_sizexT --> BxTxT
print(wei.shape) #  B x T x T

tril = torch.tril(torch.ones(size=(T, T)))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = torch.softmax(wei, dim=-1) # dim=-1 in this case, since wei is of shape

output = wei @ test_tensor

print(output.shape) # B x T x C

torch.Size([2, 32, 4])
torch.Size([2, 32, 4])
torch.Size([2, 4, 4])
torch.Size([2, 4, 8])


# Update the model with a self attention head

In [44]:
class Head(nn.Module):
    def __init__(self, head_size, n_embd):
        super(Head, self).__init__()

        self.Q = nn.Linear(in_features=n_embd, out_features=head_size)
        self.K = nn.Linear(in_features=n_embd, out_features=head_size)
        self.V = nn.Linear(in_features=n_embd, out_features=head_size)


    def forward(self, x: torch.Tensor, mask) -> torch.Tensor:
        B, T, C = x.shape
        #mask = torch.tril(torch.ones(size=(T, T)))

        q = self.Q(x) # B x T x head_size
        k = self.K(x) # B x T x head_size
        v = self.V(x) # B x T x head_size

        k = k.transpose(-2, -1) # B x head_size x T

        wei = q @ k # BxTxhead_size @ Bxhead_sizexT --> BxTxT

        wei = wei.masked_fill(mask==0, float('-inf'))
        wei = torch.softmax(wei, dim=-1)

        #x = wei @ v # BxTxT @ BxTxhead_size --> BxTxhead_size
        x = wei @ x

        return x



In [45]:
class MLPv3(nn.Module):
    def __init__(self, vocab_size, n_embd, context_size):
        super(MLPv3, self).__init__()

        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.context_size = context_size

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.positional_embedding_table = nn.Embedding(context_size, n_embd) # the position of each token has a separate table of values; this helps the model keep track of the order of characters

        self.head = Head(head_size=32, n_embd=n_embd)

        #self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size) # this time in_features=n_embd so it's not context_size dependant
        self.linear1 = nn.Linear(in_features=n_embd, out_features=8*8)
        self.linear2 = nn.Linear(in_features=8*8, out_features=8*8*8)
        self.linear3 = nn.Linear(in_features=8*8*8, out_features=vocab_size)

        self.act_fn = nn.Tanh()

        self.mask = torch.tril(torch.ones(size=(context_size, context_size)))

    def info(self):
        info_dict = {
            "vocab_size": self.vocab_size,
            "n_embd": self.n_embd,
            "context_size": self.context_size
        }

        return info_dict

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T = x.shape
        C = self.n_embd

        #print(B, T, C)

        positions = torch.arange(start=0, end=T, step=1)
        token_emb = self.token_embedding_table(x) # batch_size x context_size x n_embd --> BxTxC
        pos_emb = self.positional_embedding_table(positions) # T x C (each context has a position)


        x = token_emb + pos_emb
        x = self.head(x, self.mask)
        x = x.view(B*T, C)

        return x

    def generate(self, starting_idx: torch.Tensor, max_length) -> torch.Tensor:

        full_text = itos[starting_idx.item()]
        for i in range(max_length):
            logits = self(starting_idx)
            percents = torch.softmax(logits, dim=1)
            pred = torch.multinomial(percents, num_samples=1)
            starting_idx = pred
            full_text += decode([pred.item()])
        return full_text


In [46]:
n_embd = 32
mlpv3 = MLPv3(vocab_size=vocab_size, n_embd=n_embd, context_size=context_size)
mlpv3.optimizer = torch.optim.Adam(params=mlpv3.parameters(), lr=1e-2)
mlpv3_loss_fn = nn.CrossEntropyLoss()
mlpv3.info()

{'vocab_size': 65, 'n_embd': 32, 'context_size': 8}

In [47]:
outputs = generate_from_model(model=mlpv3, max_length=200, num_outputs=2, starting_char="a")
for output in outputs:
    print(f"{output} \n\n")

ate. wourkerd ty thad t; leese bue ozer y an whicakesodeanermorthemes!
Burboun
CIUS:
Whidour he y ve thaghif winothatatoun arcepl:
TInteby  m, t o.
Ber,

Cock d whe d thi'twe hons teounst ly TIUThemout 


and pl
When
Fig wing sf mitopl bef nimbreane
A: rt
LewheathUS:
A ans,
DIlenwerarthar asu, d y loien adideqush ce t Co sif as-

Beaverofru f tye t anors empin.
s by tom's.
The nongeleitomy cteme
Wey g

 




In [48]:
#train_model(model=mlpv3, dataloader=train_dataloader, loss_fn=mlpv3_loss_fn, optimizer=mlpv2_optimizer, epochs=1)

In [49]:
outputs = generate_from_model(model=mlpv3, max_length=200, num_outputs=2, starting_char="a")
for output in outputs:
    print(f"{output} \n\n")

astharty boe ouves fanon:
Corcowofofrilours leashe
Fime be st:
MEnorestusestt
SI thall
Therecothad alesonwnluthamy. thakeny ces' f heronomatithe in. to' irinete pe t jur set?
MNENI gellenvis.
Th lt ld  


atoursin m o un athe tivenke be more t f hom, oid ce, otititcoondincatheatobonelom IUS:
A:
Twn nathik.
Finith t ther wn:
As foue eswizep,
ORUS:
S: mears.
Shelld bor hiseale ff sat t'a ot
yor seatoute,  




# Self-attetion x cross-attention
- in self-attention the values for queries (Q), keys (K) and values (V) all come from x itself, thus self-attention
- in cross-attention those values can come from somewhere else