<a href="https://www.kaggle.com/code/evelynartoria/transformer-from-scratch-pytorch-nlp?scriptVersionId=187722660" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Introduction (WIP)
- This notebook is based on the "Let's build GPT: from scratch, in code, spelled out" tutorial by Andrej Karpathy. You can find the tutorial here --> https://www.youtube.com/watch?v=kCc8FmEb1nY&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=7
- There are several different approaches in this notebook that do not strictly follow the original video. Some implementations are my own.
- I am using the same shakespeare text as in the video.

# Import needed libraries

In [1]:
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split

# Device agnostic code

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)
generator = torch.Generator(device=device)
print(f"default device set to {device}")

default device set to cuda


# Prepare the data

In [3]:
with open("/kaggle/input/shakespeare/input.txt", "r", encoding="utf-8") as f:
    text = f.read()

vocab = sorted(set(text))
vocab_size = len(vocab)

print(vocab)
print(vocab_size)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


### Tokenizer

In [4]:
stoi = {c: v for v, c in enumerate(vocab)}
itos = {v: c for c, v in stoi.items()}
print(stoi["h"])
print(itos[46])

46
h


In [5]:
encode = lambda d: [stoi[idx] for idx in d]
decode = lambda e: "".join([itos[idx] for idx in e])

encoded = encode("hello, how are you?!")
decoded = decode(encoded)
print(encoded)
print(decoded)

[46, 43, 50, 50, 53, 6, 1, 46, 53, 61, 1, 39, 56, 43, 1, 63, 53, 59, 12, 2]
hello, how are you?!


# Prepare the dataset

In [6]:
context_size = 128
vocab_size = len(vocab)

In [7]:
def make_dataset(data, context_size):
    random_idx_tensor = torch.randperm((len(data)-context_size)//context_size) * context_size
    inputs = torch.stack([data[idx:idx+context_size] for idx in random_idx_tensor])
    labels = torch.stack([data[idx+1:idx+context_size+1] for idx in random_idx_tensor])
    
    return TensorDataset(inputs.to(torch.long), labels.to(torch.long))



In [8]:
# sicne randint might give the same random_idx, randperm is going to be preffered
print(torch.randint(0, 10, (10,)))
print(torch.randperm(10))

tensor([8, 5, 5, 5, 0, 6, 3, 8, 1, 9], device='cuda:0')
tensor([1, 4, 9, 5, 3, 2, 7, 8, 6, 0], device='cuda:0')


In [9]:
data = torch.tensor(encode(text))
dataset = make_dataset(data=data, context_size=context_size)

In [10]:
train_split = int(len(dataset)*0.75)
test_split = int(len(dataset)-train_split)

train_dataset, test_dataset = random_split(dataset=dataset, lengths=[train_split, test_split], generator=generator)

In [11]:
batch_size = 32
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, generator=generator)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, generator=generator)

In [12]:
sample_input = dataset[0][0]
sample_label = dataset[0][1]

print(sample_input)
print(sample_label)

print(f"dataset length --> {len(dataset)} ({len(dataset) * context_size} characters), that is, about the length of text {len(text)} - context_size --> {len(text)-context_size}")

tensor([ 0, 52, 53, 40, 50, 43,  1, 25, 39, 56, 41, 47, 59, 57,  8,  0,  0, 14,
        30, 33, 32, 33, 31, 10,  0, 20, 43,  5, 57,  1, 39,  1, 50, 39, 51, 40,
         1, 47, 52, 42, 43, 43, 42,  6,  1, 58, 46, 39, 58,  1, 40, 39, 43, 57,
         1, 50, 47, 49, 43,  1, 39,  1, 40, 43, 39, 56,  8,  0,  0, 25, 17, 26,
        17, 26, 21, 33, 31, 10,  0, 20, 43,  5, 57,  1, 39,  1, 40, 43, 39, 56,
         1, 47, 52, 42, 43, 43, 42,  6,  1, 58, 46, 39, 58,  1, 50, 47, 60, 43,
        57,  1, 50, 47, 49, 43,  1, 39,  1, 50, 39, 51, 40,  8,  1, 37, 53, 59,
         1, 58], device='cuda:0')
tensor([52, 53, 40, 50, 43,  1, 25, 39, 56, 41, 47, 59, 57,  8,  0,  0, 14, 30,
        33, 32, 33, 31, 10,  0, 20, 43,  5, 57,  1, 39,  1, 50, 39, 51, 40,  1,
        47, 52, 42, 43, 43, 42,  6,  1, 58, 46, 39, 58,  1, 40, 39, 43, 57,  1,
        50, 47, 49, 43,  1, 39,  1, 40, 43, 39, 56,  8,  0,  0, 25, 17, 26, 17,
        26, 21, 33, 31, 10,  0, 20, 43,  5, 57,  1, 39,  1, 40, 43, 39, 56,  1,
      

# Base model (MLP)

In [13]:
class MLP(nn.Module):
    def __init__(self, context_size, n_embd, vocab_size):
        super().__init__()


        self.context_size = context_size
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # B x T x C; B --> batches, T --> time (context_size), C --> n_embd
        self.pos_embedding_table = nn.Embedding(context_size, n_embd) # T x C; this is from the posisitional encoding part of the video
        
        self.linear1 = nn.Linear(in_features=n_embd, out_features=8*8) # B x T*C @ T*C x H; H --> number of hidden_units
        self.linear2 = nn.Linear(in_features=8*8, out_features=8*8*8)
        self.linear3 = nn.Linear(in_features=8*8*8, out_features=vocab_size)
        self.act_fn = nn.Tanh()

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        B, T = idx.shape
        C = self.n_embd
        positions = torch.arange(start=0, end=T, step=1) # 1 x B x T
        x = self.token_embedding_table(idx) + self.pos_embedding_table(positions)
        x = x.view(B*T, C) # B*T x C so it does not need the full context size at the beggining to generate predictions
        #print(f"x shape is {x.shape}, T is {T}, C is {C}")

        x = self.act_fn(self.linear1(x)) # B x T*C 
        x = self.act_fn(self.linear2(x))
        x = self.linear3(x)

        return x

    def generate(self, starting_idx: torch.Tensor, max_length: int, debug: bool) -> torch.Tensor:
        full_text = decode(starting_idx.tolist()[0])

        
        context = starting_idx
        
        for _ in range(max_length):
            context = context[:, -self.context_size:] # make sure the context is of size context_size
            
            if debug:
                print(f"predicting on context: {decode(context[0].tolist())}")
            
            logits = self(context) # B*T x vocab_size --> 1*2 x vocab_size
            logits = logits[-1, :].view(1, self.vocab_size) # only take the prediction for the last character
            percents = torch.softmax(logits, dim=1) # 1*2xvocab_size
            pred = torch.multinomial(percents, num_samples=1) 
            full_text += decode(pred.tolist()[0])
            
            #print(len(padded[0]))
            context = torch.cat([context, pred], dim=1) # add to the context dimension instead of the batch dim
            
        return full_text

            


# Define the base model, optimizer and loss function

In [14]:
mlp = MLP(context_size=context_size, n_embd=32, vocab_size=vocab_size)
optimizer = torch.optim.Adam(params=mlp.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Take samples from the base model

In [15]:
@torch.no_grad
def model_sampler(model, context, randomize, max_length, num_samples):
    #print(len(context))
    #print(context_size)
    test = torch.tensor([[20, 53, 61,  1, 39, 56, 58, 39]])
    #print(test[:, 1:])
    result = torch.cat((test[:, 1:], torch.tensor([[99]])), dim=1)
    #print(result)
    #print("\n")

    mlp.eval()
    idx = torch.tensor(encode(context), dtype=torch.long).view(1, len(encode(context))) # inputs must be batched
    outputs = mlp.generate(starting_idx=idx, max_length=max_length, debug=False)
    print(f"{outputs} \n\n")

model_sampler(model=mlp, context="How are ", randomize=True, max_length=10, num_samples=5)

How are E;BFV;mpkZ 




# Training loop

In [16]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        for batch, (X, y) in tqdm(enumerate(dataloader)):
            logits = model(X)
            loss = loss_fn(logits, y.view(-1)) # y.view(-1) turn y of shape BxT into B*T to mathc the logits shape
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 20 == 0:
                print(f"loss for batch {batch} --> {loss} at epoch {epoch}")

    print(f"loss for the very last batch --> {loss}")

In [17]:
mlp.train()
train_model(model=mlp, dataloader=train_dataloader, loss_fn=loss_fn, optimizer=optimizer, epochs=1)

30it [00:00, 107.33it/s]

loss for batch 0 --> 4.1840667724609375 at epoch 0
loss for batch 20 --> 3.1744179725646973 at epoch 0
loss for batch 40 --> 2.970180034637451 at epoch 0
loss for batch 60 --> 2.954010009765625 at epoch 0


128it [00:00, 258.22it/s]

loss for batch 80 --> 2.8322715759277344 at epoch 0
loss for batch 100 --> 2.759148597717285 at epoch 0
loss for batch 120 --> 2.7204084396362305 at epoch 0
loss for batch 140 --> 2.7117512226104736 at epoch 0


205it [00:00, 231.56it/s]

loss for batch 160 --> 2.6893413066864014 at epoch 0
loss for batch 180 --> 2.708850622177124 at epoch 0
loss for batch 200 --> 2.6500089168548584 at epoch 0
loss for the very last batch --> 2.540949821472168





# Base model inference

In [18]:
@torch.no_grad
def model_inference(model, dataloader):
    mlp.eval()
    X, y = next(iter(dataloader))
    logits = model(X)
    percents = torch.softmax(logits, dim=1) # dim=1 since the input was batched
    preds = torch.argmax(percents, dim=1) # dim=1 since the input was batched
    print(f"for {X} \n model predicted {preds}")
    print(f"expected --> {y[:, -1]}")
    print(y)

In [19]:
model_inference(model=mlp, dataloader=train_dataloader)

for tensor([[44,  1, 57,  ..., 59,  1, 61],
        [ 6,  1, 58,  ..., 52, 45, 57],
        [53,  1, 39,  ..., 53, 53, 42],
        ...,
        [ 1, 61, 53,  ..., 53, 56,  1],
        [ 0, 13, 57,  ..., 57, 58, 43],
        [53,  1, 51,  ..., 39, 56, 45]], device='cuda:0') 
 model predicted tensor([ 1, 58,  1,  ..., 52,  1,  1], device='cuda:0')
expected --> tensor([43,  8, 10, 42, 53, 52,  0, 53, 46, 61, 59, 15, 50,  0, 42, 61, 43, 47,
        46, 17, 61, 58, 52, 58, 57, 43, 61,  5, 56, 51, 56, 53],
       device='cuda:0')
tensor([[ 1, 57, 53,  ...,  1, 61, 43],
        [ 1, 58, 46,  ..., 45, 57,  8],
        [ 1, 39, 57,  ..., 53, 42, 10],
        ...,
        [61, 53, 56,  ..., 56,  1, 51],
        [13, 57,  1,  ..., 58, 43, 56],
        [ 1, 51, 59,  ..., 56, 45, 53]], device='cuda:0')


In [20]:
model_sampler(model=mlp, context="How are ", randomize=True, max_length=500, num_samples=1)

How are we
arenotheeinselu ndlh saatisheeinRYEO:
Mkoy edd ueamagofrim,nt re g aralin ionher then my ng wour han.l albyo thewan se en tir,oou t f the whoft mekixfre we cen
Dlipe,

Pu
Daswe
T:
Scur inddowecd y fte pe mot don?rthize n manp,
o Be.
T:
Thasind g
Tikhelo atofrie Iindlo t:
Whepar pr uimlit hiig:
Pc n sg t sngesh hae


HCN gondtearcund f come gKce ce-m,or mhomin t bed inug sounthacindheene uy my minso rouh.
Ad;

M ce e r fouthan.eome te hgor dOMorto p thing ng
IENU:
't.
ATi i utow:
Wh bu
S:
M wo 




# Self attention math

In [21]:
sample_batch = next(iter(test_dataloader))[0]
B, T = sample_batch.shape # batch of B by T
print(B, T)
example_emb = nn.Embedding(vocab_size, 4)
embedded = example_emb(sample_batch)
B, T, C = embedded.shape # embedded is Batches by Time (context_sie) by Channels (num of values per token)
print(embedded.shape)

32 128
torch.Size([32, 128, 4])


In [22]:
bag_of_words = torch.zeros(size=(B, T, C)) # each of the values has a unique value

for batch_idx in range(B):
    for context_idx in range(T):
        xprev = embedded[batch_idx, :context_idx+1]
        bag_of_words[batch_idx, context_idx] = torch.mean(xprev, dim=0)
print(bag_of_words)

tensor([[[-0.8609,  0.3323,  0.6140, -0.5782],
         [ 0.2443, -0.1584,  0.6773, -0.1373],
         [ 0.5538, -0.3726,  0.0825, -0.3324],
         ...,
         [-0.0662, -0.1874, -0.2624, -0.1384],
         [-0.0626, -0.1848, -0.2633, -0.1336],
         [-0.0645, -0.1840, -0.2653, -0.1321]],

        [[ 0.4916, -2.7525,  0.3134, -0.7418],
         [-0.6967, -1.5363, -0.3940, -0.2345],
         [-0.0146, -1.2406, -0.0158, -0.0552],
         ...,
         [-0.0847, -0.3982,  0.0414, -0.0404],
         [-0.0739, -0.3929,  0.0300, -0.0455],
         [-0.0757, -0.3904,  0.0258, -0.0446]],

        [[-0.5031,  1.4130,  1.4337, -1.6267],
         [ 0.4232,  0.3819,  1.0871, -0.6616],
         [-0.2549, -0.0508,  0.2470, -0.4332],
         ...,
         [-0.3068, -0.3564,  0.0160, -0.0483],
         [-0.3005, -0.3753,  0.0184, -0.0538],
         [-0.3129, -0.3748,  0.0096, -0.0512]],

        ...,

        [[-1.4424, -1.2755,  2.5444,  0.0591],
         [-0.4754, -2.0140,  1.4289, -0.3414]

In [23]:
ones = torch.ones(size=(3, 3))
tril = torch.tril(ones) # lower triangular part of a matrix
print(tril)
a = torch.randint(0, 10, (3, 2), dtype=torch.float32)
b = torch.randint(0, 10, (2, 3), dtype=torch.float32)
matmul_output = a @ b
matmul_tril_output = torch.tril(a) @ b
print(matmul_output)
print(matmul_tril_output)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]], device='cuda:0')
tensor([[61., 41., 17.],
        [ 6.,  4.,  1.],
        [20., 14., 17.]], device='cuda:0')
tensor([[54., 36.,  9.],
        [ 6.,  4.,  1.],
        [20., 14., 17.]], device='cuda:0')


In [24]:
# do the same as bag of words but with matrix multiplication (dot product)
a = torch.ones(size=(3, 3), dtype=torch.float32)
b = torch.randint(0, 10, (3, 2), dtype=torch.float32)

a = torch.tril(a)
"""
b = torch.tensor(
    [
        [2, 7],
        [6, 4],
        [6, 5]
    ], dtype=torch.float32
)
"""
print(a)
a = a/a.sum(dim=1, keepdim=True)
print(a)

output = a @ b
print(output)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]], device='cuda:0')
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]], device='cuda:0')
tensor([[5.0000, 5.0000],
        [5.5000, 6.5000],
        [5.3333, 5.3333]], device='cuda:0')


In [25]:
sample_batch = next(iter(test_dataloader))[0]
B, T = sample_batch.shape # batch of B by T
example_emb = nn.Embedding(vocab_size, 4)
embedded = example_emb(sample_batch)
B, T, C = embedded.shape # embedded is Batches by Time (context_sie) by Channels (num of values per token)
#print(embedded.shape)

wei = torch.tril(torch.ones(size=(T, T)))
wei = wei / wei.sum(dim=1, keepdim=True)
print(embedded.shape) # B x T x C
print(wei.shape) # T x T
#  1xTxT @ BxTxC
bag_of_words = wei @ embedded
print(bag_of_words)

torch.Size([32, 128, 4])
torch.Size([128, 128])
tensor([[[ 7.6019e-01,  2.8860e-02,  4.0811e-01, -3.6478e-02],
         [ 7.8799e-01,  4.5868e-01,  8.3820e-01,  5.6509e-01],
         [ 6.4575e-01,  7.8612e-01,  4.6441e-01,  4.9295e-01],
         ...,
         [-3.1060e-02,  6.0723e-03,  4.4947e-01,  4.1681e-01],
         [-4.2786e-02,  8.8795e-03,  4.5361e-01,  4.1451e-01],
         [-4.8643e-02,  1.1964e-02,  4.5928e-01,  4.1661e-01]],

        [[-1.0767e+00, -4.7999e-01,  9.7094e-01,  3.1590e-01],
         [-6.6464e-01, -9.1798e-01,  6.1554e-01,  2.2937e-01],
         [-1.7116e-01, -3.1582e-01,  8.3312e-01,  5.4180e-01],
         ...,
         [-6.7564e-02, -1.5873e-03,  4.4038e-01,  5.5480e-01],
         [-6.6833e-02, -1.4725e-04,  4.3821e-01,  5.5713e-01],
         [-7.2502e-02,  3.0082e-03,  4.4400e-01,  5.5812e-01]],

        [[ 5.2257e-01, -3.2790e-01,  8.4859e-01,  1.0093e+00],
         [ 6.6918e-01,  2.8030e-01,  1.0584e+00,  1.0880e+00],
         [-1.2662e-01, -2.0178e-01,  9

# Bag of words type aggregation with a mask

In [26]:
tril = torch.tril(torch.ones(size=(T, T)))
wei = torch.zeros(size=(T, T)) # zeros just so there's a plaaceholder for masked_fill
wei = wei.masked_fill(tril==0, float("-inf")) # whenever the value in tril is 0, it will get replaced with -inf; this allows softmax to come into place, since -inf will get a percent of 0
wei = torch.softmax(wei, dim=1)
print(wei)
bag_of_words = wei @ embedded
print(bag_of_words)

tensor([[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0079, 0.0079, 0.0079,  ..., 0.0079, 0.0000, 0.0000],
        [0.0079, 0.0079, 0.0079,  ..., 0.0079, 0.0079, 0.0000],
        [0.0078, 0.0078, 0.0078,  ..., 0.0078, 0.0078, 0.0078]],
       device='cuda:0')
tensor([[[ 7.6019e-01,  2.8860e-02,  4.0811e-01, -3.6478e-02],
         [ 7.8799e-01,  4.5868e-01,  8.3820e-01,  5.6509e-01],
         [ 6.4575e-01,  7.8612e-01,  4.6441e-01,  4.9295e-01],
         ...,
         [-3.1060e-02,  6.0723e-03,  4.4947e-01,  4.1681e-01],
         [-4.2786e-02,  8.8795e-03,  4.5361e-01,  4.1451e-01],
         [-4.8643e-02,  1.1964e-02,  4.5928e-01,  4.1661e-01]],

        [[-1.0767e+00, -4.7999e-01,  9.7094e-01,  3.1590e-01],
         [-6.6464e-01, -9.1798e-01,  6.1554e-01,  2.2937e-01],
         [-1.7116e-01, -3.1582e-01,  8.3312e-01,  5.4180e-

# MLP model with agreggation
- a problem that needs to be addressed with the previous model is that it needs to always receive a input of B x T (batch_size by context_size), whereas it would be best if the model could adapt to inputs of different context_size

In [27]:
# Code to allow comunication between past tokens
n_embd = 4
C = n_embd
T = context_size
B = batch_size
wei = torch.zeros(size=(T, T))
tril = torch.tril(torch.ones(size=(T, T)))
wei = wei.masked_fill(tril==0, float('-inf'))
xbow = wei.softmax(dim=1)

test_tensor = torch.randn(size=(B, T, C))
print(xbow @ test_tensor)

tensor([[[-2.9187e-01,  9.7878e-01, -1.1457e+00,  7.6321e-01],
         [ 2.7074e-01,  3.2424e-01,  1.4588e-01,  2.1881e-01],
         [ 6.8003e-01, -5.8981e-02,  4.3124e-02, -9.4029e-02],
         ...,
         [ 6.6365e-02,  4.5040e-02,  7.2592e-02, -1.0932e-01],
         [ 7.4244e-02,  4.9955e-02,  9.1549e-02, -9.7730e-02],
         [ 7.6225e-02,  4.4056e-02,  9.0803e-02, -1.0479e-01]],

        [[ 1.0964e+00, -1.6107e+00,  6.6661e-01,  6.9881e-01],
         [ 2.8685e-01, -7.6096e-01, -6.4121e-01,  2.2426e-01],
         [ 1.6113e-01, -7.7475e-01, -5.8018e-01,  3.3083e-01],
         ...,
         [ 1.0633e-01, -1.0349e-01, -2.7788e-02, -5.8346e-02],
         [ 1.0075e-01, -9.4910e-02, -2.8888e-02, -6.6626e-02],
         [ 9.4589e-02, -8.6337e-02, -1.9282e-02, -7.9699e-02]],

        [[ 2.5255e-01, -5.6990e-01, -3.1766e-01,  1.3747e+00],
         [-3.4899e-01, -7.8460e-01, -1.7710e+00,  1.0325e+00],
         [-9.8026e-02,  3.7190e-01, -1.3154e+00,  6.8815e-01],
         ...,
         

In [28]:
# a problem that needs to be addressed with the previous model is that it needs to always receive a 
batch_sample_inputs, batch_sample_labels = next(iter(train_dataloader))
print(batch_sample_inputs.shape)
print(batch_sample_labels.shape)

torch.Size([32, 128])
torch.Size([32, 128])


# Code cleanup

In [29]:
class MLPv2(nn.Module):
    def __init__(self, vocab_size, n_embd, context_size):
        super().__init__()

        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.context_size = context_size

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.positional_embedding_table = nn.Embedding(context_size, n_embd) # the position of each token has a separate table of values; this helps the model keep track of the order of characters

        #self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size) # this time in_features=n_embd so it's not context_size dependant
        self.linear1 = nn.Linear(in_features=n_embd, out_features=8*8)
        self.linear2 = nn.Linear(in_features=8*8, out_features=8*8*8)
        self.linear3 = nn.Linear(in_features=8*8*8, out_features=vocab_size)

        self.act_fn = nn.Tanh()

    def info(self):
        info_dict = {
            "vocab_size": self.vocab_size,
            "n_embd": self.n_embd,
            "context_size": self.context_size
        }

        return info_dict

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T = x.shape
        C = self.n_embd

        #print(B, T, C)

        positions = torch.arange(start=0, end=T, step=1)
        token_emb = self.token_embedding_table(x) # batch_size x context_size x n_embd --> BxTxC
        pos_emb = self.positional_embedding_table(positions) # T x C (each context has a position)

        x = token_emb + pos_emb
        x = x.view(B*T, C)

        #logits = self.lm_head(token_emb.view(B*T, C)) # output of shape B*T x hidden_units1; this might be a problem for the labels since they are of shape B x T, thus they need to be reshaped aswell
        x = self.act_fn(self.linear1(token_emb.view(B*T, C))) # hidden_units1 x hidden_units2
        x = self.act_fn(self.linear2(x)) # hidden_units2 x hidden_units3
        x = self.linear3(x) # hidden_units3 x vocab_size
        return x

    def generate(self, starting_idx: torch.Tensor, max_length) -> torch.Tensor:

        full_text = itos[starting_idx.item()]
        for i in range(max_length):
            logits = self(starting_idx)
            percents = torch.softmax(logits, dim=1)
            pred = torch.multinomial(percents, num_samples=1)
            starting_idx = pred
            full_text += decode([pred.item()])
        return full_text


In [30]:
mlpv2 = MLPv2(vocab_size=vocab_size, n_embd=32, context_size=context_size)

In [31]:
mlpv2.info()

{'vocab_size': 65, 'n_embd': 32, 'context_size': 128}

In [32]:
mlpv2_loss_fn = nn.CrossEntropyLoss()
batch_sample_inputs, batch_sample_labels = next(iter(train_dataloader))
sample_input = batch_sample_inputs[0]
sample_label = batch_sample_labels[0]
print(sample_input) # 1 x T (B x T)
print(sample_label.view(1, -1)) # 1 x T (B x T)
mlpv2.eval()
with torch.inference_mode():
    logits = mlpv2(sample_input.view(1, -1))
    labels = sample_label.view(-1) # from B x T to B*T to match the shape of the logits
    print(logits.shape) # B*T x vocab_size
    print(labels.shape)

    loss = mlpv2_loss_fn(logits, labels)
    print(loss)

tensor([44,  1, 57, 53, 51, 43,  1, 57, 47, 62,  1, 53, 56,  1, 57, 43, 60, 43,
        52,  6,  0, 58, 46, 43,  1, 51, 53, 57, 58,  1, 57, 59, 44, 44, 47, 41,
        47, 43, 52, 58,  1, 53, 44,  1, 63, 53, 59, 56,  1, 54, 39, 56, 47, 57,
        46,  8,  0,  0, 17, 24, 14, 27, 35, 10,  0, 32, 53,  1, 63, 53, 59, 56,
         1, 61, 53, 56, 57, 46, 47, 54,  5, 57,  1, 46, 53, 59, 57, 43,  6,  1,
        57, 47, 56, 12,  0,  0, 17, 31, 15, 13, 24, 33, 31, 10,  0, 32, 53,  1,
        51, 63,  1, 46, 53, 59, 57, 43,  8,  1, 18, 39, 56, 43,  1, 63, 53, 59,
         1, 61], device='cuda:0')
tensor([[ 1, 57, 53, 51, 43,  1, 57, 47, 62,  1, 53, 56,  1, 57, 43, 60, 43, 52,
          6,  0, 58, 46, 43,  1, 51, 53, 57, 58,  1, 57, 59, 44, 44, 47, 41, 47,
         43, 52, 58,  1, 53, 44,  1, 63, 53, 59, 56,  1, 54, 39, 56, 47, 57, 46,
          8,  0,  0, 17, 24, 14, 27, 35, 10,  0, 32, 53,  1, 63, 53, 59, 56,  1,
         61, 53, 56, 57, 46, 47, 54,  5, 57,  1, 46, 53, 59, 57, 43,  6,  1, 57,
 

In [33]:
@torch.no_grad
def generate_from_model(model, num_outputs, starting_char, max_length):
    mlpv2.eval()
    outputs = []
    starting_idx = torch.tensor([stoi[starting_char]], dtype=torch.long).view(1, -1)
    for i in range(num_outputs):
        output = mlpv2.generate(starting_idx=starting_idx, max_length=max_length) # must be batched
        outputs.append(output)

    return outputs

In [34]:
test_output = generate_from_model(model=mlpv2, num_outputs=1, starting_char="a", max_length=5)
print(test_output[0])

aqRPeX


In [35]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    model.train()

    for epoch in range(epochs):
        for batch, (X, y) in enumerate(dataloader):
            logits = model(X)
            loss = loss_fn(logits, y.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


            if batch % 20 == 0:
                print(f"loss for batch {batch} --> {loss} at epoch {epoch}")

    print(f"loss for the very last batch --> {loss}")

In [36]:
mlpv2_optimizer = torch.optim.Adam(params=mlpv2.parameters(), lr=1e-3)
mlpv2_loss_fn = nn.CrossEntropyLoss()

In [37]:
train_model(model=mlpv2, dataloader=train_dataloader, loss_fn=mlpv2_loss_fn, optimizer=mlpv2_optimizer, epochs=4)

loss for batch 0 --> 4.183022499084473 at epoch 0
loss for batch 20 --> 2.9723587036132812 at epoch 0
loss for batch 40 --> 2.7465195655822754 at epoch 0
loss for batch 60 --> 2.7067058086395264 at epoch 0
loss for batch 80 --> 2.5965919494628906 at epoch 0
loss for batch 100 --> 2.5509095191955566 at epoch 0
loss for batch 120 --> 2.493223190307617 at epoch 0
loss for batch 140 --> 2.535734176635742 at epoch 0
loss for batch 160 --> 2.508403778076172 at epoch 0
loss for batch 180 --> 2.5316035747528076 at epoch 0
loss for batch 200 --> 2.5006420612335205 at epoch 0
loss for batch 0 --> 2.4882824420928955 at epoch 1
loss for batch 20 --> 2.4636991024017334 at epoch 1
loss for batch 40 --> 2.4904398918151855 at epoch 1
loss for batch 60 --> 2.507495880126953 at epoch 1
loss for batch 80 --> 2.4819583892822266 at epoch 1
loss for batch 100 --> 2.4858243465423584 at epoch 1
loss for batch 120 --> 2.434555768966675 at epoch 1
loss for batch 140 --> 2.4825916290283203 at epoch 1
loss for ba

In [38]:
test_outputs = generate_from_model(model=mlpv2, max_length=100, num_outputs=2, starting_char="b")
for output in test_outputs:
    print(f"{output}\n\n")

bris t, beswenngl berofan, Whepalmal oughen sthaiay.
NThrtoul's bepe;
Pn r
ERWherus

ICaul ir thavero


boussise my I's h:

MNE:
Whay m th whenothino hey po I ifrmeang merdecome s CHires frrtherhecks man h




# Self attention
- with xbow you can add information about the tokens, but the model itself does not attribute any weight to them. This is what self attention solves by using Keys, Queries and Values
- every token will have a specific Query (Q) and Key (K) attatched to it
    - Query --> what the model is looking for
    - Key --> the weight the model is giving to this certain token
    - Value --> matches queires and keys

In [39]:
# Code to allow comunication between past tokens

B, T, C = 2, 4, 8
wei = torch.zeros(size=(T, T))
tril = torch.tril(torch.ones(size=(T, T)))
wei = wei.masked_fill(tril==0, float('-inf'))
xbow = wei.softmax(dim=1)

test_tensor = torch.randn(size=(B, T, C))
output = xbow @ test_tensor
print(output.shape)

torch.Size([2, 4, 8])


In [40]:
head_size = 32
key = nn.Linear(in_features=C, out_features=head_size, bias=False) # bias = False so that it's just a multiplication
query = nn.Linear(in_features=C, out_features=head_size, bias=False) 

#print(test_tensor.shape)
k = key(test_tensor) # BxTxC @ BxTxhead_size --> BxTxhead_size; each batch has a context and each context character has a key value
q = query(test_tensor) # BxTxC @ BxTxhead_size --> BxTxhead_size; each batch has a context and each context character has a query value
print(k.shape)
print(q.shape)

torch.Size([2, 4, 32])
torch.Size([2, 4, 32])


In [41]:
print(k.transpose(-2, -1).shape) # same as k.permute(0, 2, 1)
print(k.permute(0, 2, 1).shape)

wei = q @ k.transpose(-2, -1) # BxTxhead_size @ Bxhead_sizexT --> BxTxT
print(wei.shape) #  B x T x T

tril = torch.tril(torch.ones(size=(T, T)))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = torch.softmax(wei, dim=-1) # dim=-1 in this case, since wei is of shape

output = wei @ test_tensor

print(output.shape) # B x T x C

torch.Size([2, 32, 4])
torch.Size([2, 32, 4])
torch.Size([2, 4, 4])
torch.Size([2, 4, 8])


# Update the model with self attention

## Head

In [42]:
class Head(nn.Module):
    def __init__(self, n_embd, head_size, context_size):
        super(Head, self).__init__()
        
        self.Q = nn.Linear(in_features=n_embd, out_features=head_size) # takes in BxTxC and return BxTxHead_size
        self.K = nn.Linear(in_features=n_embd, out_features=head_size)
        self.V = nn.Linear(in_features=n_embd, out_features=head_size)
        
        self.register_buffer("tril", torch.tril(torch.ones(size=(context_size, context_size))))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape # batch_size by context_size by n_embd
        q = self.Q(x) # BxTxHead_size
        k = self.K(x) # BxTxHead_size
        
        wei = q @ k.transpose(-2, -1) * (C ** -0.5) # BxTxHead_size @ BxHead_sizexT --> BxTxT then divided by the square root of n_embd
        
        wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf')) # :T and :T is needed in case context is smaller than context_size
        wei = torch.softmax(wei, dim=-1)
        v = self.V(x) # BxTxHead_size
        output = wei @ v # BxTxT @ BxTxHead_size --> BxTxHead_size
        
        return output

## Multiheaded Attention
- multiple heads concatenated together for better data representation

In [43]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, n_embd, context_size, n_heads, head_size):
        super(MultiHeadedAttention, self).__init__()
        
        self.heads = nn.ModuleList([Head(n_embd=n_embd, head_size=head_size, context_size=context_size) for _ in range(n_heads)]) # BxTx (n_heads * head_size)
        self.projection = nn.Linear(in_features=n_heads*head_size, out_features=n_embd) # ensures the output is going to be o shape BxTxn_embd (BxTxC) so that is can go through multiple attention block
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = torch.cat([head(x) for head in self.heads], dim=-1) # cat in the Channels dimension; output shape is BxTx (n_heads * head_size)
        #print(f"multihead output shape is {out.shape}")
        #return out

        x = self.projection(x)
        return  x

## Feedforward
- just a simple feedforward to scale the logits from each attention block

In [44]:
class FeedForward(nn.Module):
    def __init__(self, in_features):
        super(FeedForward, self).__init__()

        self.ffwrd_layer = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=in_features * 4), # scale by 4, according to the attention is all you need paper
            nn.ReLU(),
            nn.Linear(in_features=in_features * 4, out_features=in_features) # another projection layer
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.ffwrd_layer(x)

# Attention block
- Make a class so that multiheaded attention can be applied multiple times, alongside layer normalization and the sum of residual connections

In [45]:
class Block(nn.Module):
    def __init__(self, n_heads, head_size, n_embd, context_size):
        super(Block, self).__init__()
        self.multiheaded_self_attetion = MultiHeadedAttention(n_embd=n_embd, context_size=context_size, n_heads=n_heads, head_size=head_size) # create a multiheaded attention block; returns shape BxTx (num_heads*head_size)
        #self.ffwrd = FeedForward(in_features=n_heads*head_size) # returns shape BxTx (num_heads*head_size) --> this is only in case there is no projection layer inside of multiheaded attention
        self.ffwrd = FeedForward(in_features=n_embd)

        self.layer_norm1 = nn.LayerNorm(n_embd)
        self.layer_norm2 = nn.LayerNorm(n_embd)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # /// no residual connections ///
        #x = self.multiheaded_self_attetion(x)
        #x = self.ffwrd(x)

        # /// with residual conections for better optimization ///
        x = x + self.multiheaded_self_attetion(self.layer_norm1(x))
        x = x + self.ffwrd(self.layer_norm2(x))

        return x

# Decoder transformer

In [46]:
class Decoder(nn.Module):
    def __init__(self, n_embd, context_size, vocab_size, num_sa_heads, sa_head_size):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.context_size = context_size
        self.n_embd = n_embd
        
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # each character from the vocab has n_embd values associated to it
        self.positional_embedding_table = nn.Embedding(context_size, n_embd) # each character position in the context has n_embd values associated to it
        
        
        # /// Some previous modifications for testing purposes ///
        
        #self.linear1 = nn.Linear(in_features=n_embd, out_features=8*8)
        #self.linear2 = nn.Linear(in_features=8*8, out_features=vocab_size)
        #self.act_fn = nn.Tanh()

        #self.sa_head_size = 64
        #self.sa_head_size = n_embd // self.num_sa_heads # this proportion is needed in case you are using multiple attention blocks so to keep proper dimensions, otherwhise you can set head_size to anything you want

        #self.multiheadattention = MultiHeadedAttention(n_embd=n_embd, context_size=context_size, n_heads=self.num_sa_heads, head_size=self.sa_head_size)
        #self.ffwrd = FeedForward(in_features=self.num_sa_heads*self.sa_head_size) # going to take in BxTx (sa_head_size * num_sa_heads) --> going to output the same shape
        #self.sa_head = Head(n_embd=64, head_size=64, context_size=self.context_size)

        # /////////////////
        
        self.attention_blocks = nn.Sequential(
            Block(n_heads=num_sa_heads, head_size=sa_head_size, context_size=self.context_size, n_embd=self.n_embd), # takes in BxTxC, calculate logits of BxTx (num_heads * head_size), then project it as BxTxC
            Block(n_heads=num_sa_heads, head_size=sa_head_size, context_size=self.context_size, n_embd=self.n_embd),
            Block(n_heads=num_sa_heads, head_size=sa_head_size, context_size=self.context_size, n_embd=self.n_embd),
            nn.LayerNorm(n_embd) # normalize the layers
        )
        

        self.lm_head = nn.Linear(in_features=n_embd, out_features=vocab_size) # (B, T, vocab_size)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T = x.shape # batch_size and context_size
        positions = torch.arange(start=0, end=T, step=1)
        
        pos_emb = self.positional_embedding_table(positions) # T x C --> in broadcasting, pytorch adds a batch dim=1
        token_emb = self.token_embedding_table(x) # B x T x C
        
        x = token_emb + pos_emb # BxTxC
        
        # /// Some previous modifications for testing purposes ///
        
        #x = self.lm_head(x) # BxTxVocab_size
        #x = self.act_fn(self.linear1(x)) # BxTxVocab_size
        #x = self.linear2(x) # BxTxVocab_size
        
        #x = self.multiheadattention(x) # BxTx (sa_head_size*num_sa_heads)
        #x = self.ffwrd(x)
        #self_attention = self.sa_head(x) # BxTxHead_size (BxTxC in this case, since head_size=n_embd)
        
        # /////////////////

        x = self.attention_blocks(x) # returns logits of shape BxTx (self.sa_head_size * self.num_sa_heads) projected to BxTxC

        x = self.lm_head(x) # BxTxvocab_size --> BxTxHead_size @ BxTxVocab_size return BxTxVocab_size

        return x.view(B*T, self.vocab_size) # easier shape to work with the labels
    
    def generate(self, starting_idx: torch.Tensor, max_length: int, debug: bool) -> torch.Tensor:
        full_text = decode([starting_idx.item()])
        context = starting_idx
        
        for _ in range(max_length):
            context = context[:, -self.context_size:] # make sure the context is of size context_size
            
            if debug:
                print(f"predicting on context: {decode(context[0].tolist())}")
            
            logits = self(context) # B*T x vocab_size --> 1*2 x vocab_size
            logits = logits[-1, :].view(1, self.vocab_size) # only take the prediction for the last character
            percents = torch.softmax(logits, dim=1) # 1*2xvocab_size
            pred = torch.multinomial(percents, num_samples=1) 
            full_text += decode(pred.tolist()[0])
            
            #print(len(padded[0]))
            context = torch.cat([context, pred], dim=1) # add to the context dimension instead of the batch dim
            
        return full_text


# Define the final model and its hyperparameters

In [47]:
n_embd = 1024
vocab_size = len(vocab)
context_size = 128 # same as previously set
num_sa_heads = 16
sa_head_size = 64

decoder = Decoder(vocab_size=vocab_size, n_embd=n_embd, context_size=context_size, num_sa_heads=num_sa_heads, sa_head_size=sa_head_size)
optimizer = torch.optim.Adam(params=decoder.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Model text generator class

In [48]:
class model_generator:
    def __init__(self, model: object, max_length: int, num_samples: int, vocab_size: int):
        self.model = model
        self.max_length = max_length
        self.num_samples = num_samples
        self.vocab_size = vocab_size
        
        self.last_output = ""
        
        self.params_dict = {
            "model": model,
            "max_length": max_length,
            "num_samples": num_samples,
            "previous_outputs": []
        }
    
    @torch.no_grad
    def generate(self, starting_char: str = None, clear_outputs: bool = True, debug: bool = False):
        self.model.eval()
        
        if clear_outputs:
            self.clear_ouptuts()
            
        if starting_char is None:
            starting_char = decode([torch.randint(0, vocab_size, (1,)).item()])
            
        for _ in range(self.num_samples):
            starting_idx = torch.tensor(encode(starting_char), dtype=torch.long).view(1, 1)
            output = self.model.generate(starting_idx=starting_idx, max_length=self.max_length, debug=debug)
            self.params_dict["previous_outputs"].append(output)
            self.last_output = output
    
    def update_params(self, model: object = None, max_length: int = None, num_samples: int = None, clear_outputs: bool = None):
        if clear_outputs:
            self.clear_outputs()
            
        updated_dict = {
            "model": model,
            "max_length": max_length,
            "num_samples": num_samples
        }
        
        for attribute, value in updated_dict.items():
            if value is not None:
                self.params_dict[attribute] = value
                setattr(self, attribute, value)
    
    def clear_ouptuts(self):
        self.params_dict["previous_outputs"] = []
        self.last_output = ""
        
    def print_outputs(self, last: bool = None):
        if last:
            print(self.last_output)
        else:
            for output in self.params_dict["previous_outputs"]:
                print(f"{output}\n\n")

In [49]:
decoder_generator = model_generator(model=decoder, max_length=32, num_samples=1, vocab_size=vocab_size)

In [50]:
decoder_generator.update_params(max_length=100, num_samples=1)
decoder_generator.generate()
decoder_generator.print_outputs()

F JUTYBF;':wlnlw$kGUCsHyy-kKOGKvhnIyFXpeoQsdL;
&yCjHHOV?AajdQY-Y?;xQgveRRP
'frhPFlwDvqbzI!Fq!DBk::LTz




# Training loop
- Note --> this model is not using any dropout layers, thus there is going to be overfitting

In [51]:
def train_model(model, dataloader, loss_fn, optimizer, epochs):
    model.train()
    
    for epoch in range(epochs):
        for batch, (X, y) in tqdm(enumerate(dataloader)):
            logits = model(X) # shape of B*T x vocab_size
            labels = y.view(-1) # shape of B*T --> each character has it's own prediction
            loss = loss_fn(logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if batch % 100 == 0:
                print(f"loss for batch {batch} --> {loss} at epoch {epoch}")

In [52]:
train_model(model=decoder, dataloader=train_dataloader, loss_fn=loss_fn, optimizer=optimizer, epochs=20)

1it [00:00,  2.86it/s]

loss for batch 0 --> 4.335644245147705 at epoch 0


101it [00:29,  3.32it/s]

loss for batch 100 --> 2.525604486465454 at epoch 0


201it [01:00,  3.18it/s]

loss for batch 200 --> 2.4626641273498535 at epoch 0


205it [01:01,  3.34it/s]
1it [00:00,  3.18it/s]

loss for batch 0 --> 2.4262943267822266 at epoch 1


101it [00:32,  2.96it/s]

loss for batch 100 --> 2.1621334552764893 at epoch 1


201it [01:07,  2.91it/s]

loss for batch 200 --> 1.9090383052825928 at epoch 1


205it [01:08,  3.00it/s]
1it [00:00,  3.00it/s]

loss for batch 0 --> 1.9108017683029175 at epoch 2


101it [00:33,  2.99it/s]

loss for batch 100 --> 1.8107575178146362 at epoch 2


201it [01:07,  2.96it/s]

loss for batch 200 --> 1.663124442100525 at epoch 2


205it [01:08,  3.00it/s]
1it [00:00,  2.90it/s]

loss for batch 0 --> 1.677341341972351 at epoch 3


101it [00:34,  2.97it/s]

loss for batch 100 --> 1.6587157249450684 at epoch 3


201it [01:07,  2.98it/s]

loss for batch 200 --> 1.5699682235717773 at epoch 3


205it [01:08,  2.99it/s]
1it [00:00,  3.00it/s]

loss for batch 0 --> 1.574483036994934 at epoch 4


101it [00:33,  2.96it/s]

loss for batch 100 --> 1.564962387084961 at epoch 4


201it [01:07,  2.94it/s]

loss for batch 200 --> 1.504082202911377 at epoch 4


205it [01:08,  2.98it/s]
1it [00:00,  2.93it/s]

loss for batch 0 --> 1.4994834661483765 at epoch 5


101it [00:34,  2.95it/s]

loss for batch 100 --> 1.503920555114746 at epoch 5


201it [01:07,  2.97it/s]

loss for batch 200 --> 1.4501631259918213 at epoch 5


205it [01:08,  2.98it/s]
1it [00:00,  2.94it/s]

loss for batch 0 --> 1.4469995498657227 at epoch 6


101it [00:33,  2.98it/s]

loss for batch 100 --> 1.4591846466064453 at epoch 6


201it [01:07,  2.95it/s]

loss for batch 200 --> 1.414280891418457 at epoch 6


205it [01:08,  3.00it/s]
1it [00:00,  2.99it/s]

loss for batch 0 --> 1.412670612335205 at epoch 7


101it [00:33,  2.98it/s]

loss for batch 100 --> 1.4113860130310059 at epoch 7


201it [01:07,  2.96it/s]

loss for batch 200 --> 1.38746976852417 at epoch 7


205it [01:08,  2.99it/s]
1it [00:00,  3.00it/s]

loss for batch 0 --> 1.3681691884994507 at epoch 8


101it [00:33,  2.94it/s]

loss for batch 100 --> 1.3768370151519775 at epoch 8


201it [01:07,  2.96it/s]

loss for batch 200 --> 1.3457719087600708 at epoch 8


205it [01:08,  2.98it/s]
1it [00:00,  2.99it/s]

loss for batch 0 --> 1.3255358934402466 at epoch 9


101it [00:34,  2.95it/s]

loss for batch 100 --> 1.334720253944397 at epoch 9


201it [01:07,  2.96it/s]

loss for batch 200 --> 1.3064805269241333 at epoch 9


205it [01:08,  2.98it/s]
1it [00:00,  2.98it/s]

loss for batch 0 --> 1.2960284948349 at epoch 10


101it [00:34,  2.95it/s]

loss for batch 100 --> 1.298583745956421 at epoch 10


201it [01:07,  2.96it/s]

loss for batch 200 --> 1.2653659582138062 at epoch 10


205it [01:08,  2.98it/s]
1it [00:00,  2.93it/s]

loss for batch 0 --> 1.2502861022949219 at epoch 11


101it [00:34,  2.97it/s]

loss for batch 100 --> 1.2722318172454834 at epoch 11


201it [01:07,  2.97it/s]

loss for batch 200 --> 1.2239904403686523 at epoch 11


205it [01:08,  2.98it/s]
1it [00:00,  2.91it/s]

loss for batch 0 --> 1.1961712837219238 at epoch 12


101it [00:33,  2.96it/s]

loss for batch 100 --> 1.220923662185669 at epoch 12


201it [01:07,  2.97it/s]

loss for batch 200 --> 1.19638192653656 at epoch 12


205it [01:08,  2.99it/s]
1it [00:00,  2.97it/s]

loss for batch 0 --> 1.143993616104126 at epoch 13


101it [00:33,  2.96it/s]

loss for batch 100 --> 1.1777873039245605 at epoch 13


201it [01:07,  2.96it/s]

loss for batch 200 --> 1.1664202213287354 at epoch 13


205it [01:08,  2.99it/s]
1it [00:00,  2.95it/s]

loss for batch 0 --> 1.1075537204742432 at epoch 14


101it [00:33,  2.95it/s]

loss for batch 100 --> 1.1179004907608032 at epoch 14


201it [01:07,  2.97it/s]

loss for batch 200 --> 1.113943338394165 at epoch 14


205it [01:08,  2.99it/s]
1it [00:00,  3.01it/s]

loss for batch 0 --> 1.050620675086975 at epoch 15


101it [00:33,  2.98it/s]

loss for batch 100 --> 1.0504333972930908 at epoch 15


201it [01:07,  2.95it/s]

loss for batch 200 --> 1.058074951171875 at epoch 15


205it [01:08,  3.00it/s]
1it [00:00,  3.06it/s]

loss for batch 0 --> 1.0109940767288208 at epoch 16


101it [00:33,  2.96it/s]

loss for batch 100 --> 1.019134521484375 at epoch 16


201it [01:07,  2.97it/s]

loss for batch 200 --> 1.0005109310150146 at epoch 16


205it [01:08,  2.99it/s]
1it [00:00,  2.92it/s]

loss for batch 0 --> 0.9543896913528442 at epoch 17


101it [00:33,  2.96it/s]

loss for batch 100 --> 0.9402701258659363 at epoch 17


201it [01:07,  2.98it/s]

loss for batch 200 --> 0.8948304057121277 at epoch 17


205it [01:08,  2.99it/s]
1it [00:00,  2.99it/s]

loss for batch 0 --> 0.9140760898590088 at epoch 18


101it [00:33,  2.97it/s]

loss for batch 100 --> 0.8990307450294495 at epoch 18


201it [01:07,  2.96it/s]

loss for batch 200 --> 0.8072279095649719 at epoch 18


205it [01:08,  2.99it/s]
1it [00:00,  3.01it/s]

loss for batch 0 --> 0.8272302746772766 at epoch 19


101it [00:33,  2.94it/s]

loss for batch 100 --> 0.8277518153190613 at epoch 19


201it [01:07,  2.97it/s]

loss for batch 200 --> 0.7959966659545898 at epoch 19


205it [01:08,  2.99it/s]


# Sample from the model

In [53]:
print("generating 5 samples of 10000 characters each")
decoder_generator.update_params(max_length=10000, num_samples=5)
decoder_generator.generate()
decoder_generator.print_outputs()

generating 5 samples of 10000 characters each
Pt's to gether forth our tell all the grand thee,
I' is usurpose, lay good and a holy father's
The Came them my coung straitor woman's.
To take them to me, in no far my invuat: for my lord,
And smillo age that rant to long me him,
But I mistance give men an mentail,
That I am man, everlain the beet that past an the gow.

LUCIO:
In that go swant at thee. is take no more gown,
Be-mine all me hear action feast of beat to the
I sent tell th a king time man, good make lease,
That I may shall beee in limbs nir per him.
A bollade think woman, sught to doo.
Come, eman poor hear me to your general.

ANTONIO:
That's take heair; a visit no of grim,
in--
Taith, serving no more two and not, my ears.

DUKE VINCENTIO:
I do ear again, I dare night now stan.

MENTAGUE:
I must madam, I am not ame with make .

KATHARI make my kim, leave I may name,
That though an I do foll thy seized
And braze my seming live, old man,
I dark't remember me with me to my man.
N

# Self-attetion x cross-attention
- in self-attention the values for queries (Q), keys (K) and values (V) all come from x itself, thus self-attention
- in cross-attention those values can come from somewhere else (the encoder)