# Processing Data

In [4]:
import torch

## Loading Data

In [5]:
# Hyperparameters shared by the cells below (several are overridden later in the notebook).
batch_size = 32         # independent sequences processed in parallel
block_size = 8          # maximum context length used for a prediction
max_iters = 3000        # number of iterations of the training loop
eval_interval = 300     # report the estimated losses every this many iterations
eval_iters = 200        # batches averaged per split when estimating the loss
learning_rate = 1e-2

# Use the GPU when one is available, otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [6]:
# -nc (no-clobber): skip the download when input.txt already exists, so re-running this cell does not pile up input.txt.1, input.txt.2, ... copies.
! wget -nc https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-05-21 00:37:19--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-05-21 00:37:19 (16.7 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [7]:
# Load the whole corpus into memory as a single string so it can be inspected.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [8]:
# Report the size of the corpus, measured in characters of `text`.
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [9]:
# Preview the first 1000 characters of the corpus.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [10]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


## Encoding the dataset

In [11]:
# Lookup tables between characters and integer ids (an id is the character's index in `chars`).
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters.

    Raises:
        KeyError: if `s` contains a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take an iterable of integer ids, return the corresponding string.

    Raises:
        KeyError: if an id is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)


# Round-trip a sample string to check that decode(encode(s)) == s.
temp_str = "hii there"
print("Encoded list of the string '%s':" % temp_str, encode(temp_str))
print("Decoding the encoded list:", decode(encode(temp_str)))

Encoded list of the string 'hii there': [46, 47, 47, 1, 58, 46, 43, 56, 43]
Decoding the encoded list: hii there


In [12]:
import torch

# Encode the entire corpus and keep it as a single tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# Token ids of the first 1000 characters (the same text that was previewed earlier).
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

## Splitting data into Train and Validation sets and Batching

In [13]:
# The first 90% of the token ids form the training split; the remainder is held out for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [14]:
block_size = 8  # context size
# One chunk of block_size + 1 consecutive tokens: enough for block_size inputs plus the shifted targets.
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [15]:
# Inputs are the first block_size tokens; targets are the same window shifted one step to the right.
x = train_data[:block_size]
y = train_data[1:block_size+1]

# Every prefix of x is a context, and its target is the token that follows that prefix.
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [23]:
torch.manual_seed(1337)     # fixed seed so the batch sampled in this cell is reproducible
batch_size = 4              # overrides the batch_size = 32 from the hyperparameter cell
block_size = 8

def get_batch(split="train"):
    """Sample a random batch of contexts and their next-token targets.

    Args:
        split: "train" draws from `train_data`; any other value (e.g. "valid")
            draws from `val_data`.

    Returns:
        A tuple `(x, y)` of `(batch_size, block_size)` integer tensors placed on
        `device`. `y` holds the same windows as `x`, shifted one position to
        the right in the source sequence.
    """
    source = train_data if split == "train" else val_data
    # Random window starts; the upper bound leaves room for the shifted target window.
    ix = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i:i+block_size] for i in ix])
    y = torch.stack([source[i+1:i+block_size+1] for i in ix])
    # The model lives on `device`, so its inputs have to be moved there as well
    # (this is a no-op when everything is on the CPU).
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_Loss():
    """Average the model's loss over `eval_iters` random batches per split.

    The model is switched to eval mode for the measurement and back to train
    mode before returning. Gradients are disabled by the decorator.

    Returns:
        A dict with keys "train" and "valid", each mapping to the mean loss
        (a 0-d tensor) on that split.
    """
    model.eval()
    averages = {}
    for split_name in ("train", "valid"):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split_name)
            _, batch_loss = model(inputs, targets)
            per_batch[step] = batch_loss.item()
        averages[split_name] = per_batch.mean()
    model.train()
    return averages

# Draw one training batch and show the inputs next to their shifted targets.
xb, yb = get_batch(split='train')
print(f"Inputs: {xb}\t{xb.shape}")
print(f"Targets: {yb}\t{yb.shape}")

print(70 * "-")

# Spell out every (context, target) training pair contained in the batch.
for b in range(batch_size):     # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b, t]

        print(f"when input is: {context.tolist()}, the target is: {target}")

Inputs: tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])	torch.Size([4, 8])
Targets: tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])	torch.Size([4, 8])
----------------------------------------------------------------------
when input is: [24], the target is: 43
when input is: [24, 43], the target is: 58
when input is: [24, 43, 58], the target is: 5
when input is: [24, 43, 58, 5], the target is: 57
when input is: [24, 43, 58, 5, 57], the target is: 1
when input is: [24, 43, 58, 5, 57, 1], the target is: 46
when input is: [24, 43, 58, 5, 57, 1, 46], the target is: 43
when input is: [24, 43, 58, 5, 57, 1, 46, 43], the target is: 39
when input is: [44], the target is: 53
when input is: [44, 53], the target is: 56
when input is: [44, 53, 56], the target 

# Creating Model

In [24]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram model: next-token logits are read straight out of an embedding table."""

    def __init__(self, vocab_size):
        super().__init__()
        # Row i of the table holds the logits over the next token, given current token i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            `(logits, loss)`. Without targets, logits have shape (B, T, C) and
            loss is None. With targets, logits are flattened to (B*T, C) and
            loss is the cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx)    # (B, T, C)
        if targets is None:
            return logits, None

        batch, time, channels = logits.shape
        flat_logits = logits.view(batch * time, channels)
        flat_targets = targets.view(batch * time)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Extend the (B, T) context `idx` by sampling `max_new_tokens` tokens.

        Returns:
            A (B, T + max_new_tokens) tensor of token ids.
        """
        for _ in range(max_new_tokens):
            # Only the prediction at the last time step is needed: (B, C).
            last_logits = self(idx)[0][:, -1, :]
            probs = F.softmax(last_logits, dim=-1)
            # Sample one token per sequence and append it to the running context.
            sampled = torch.multinomial(probs, num_samples=1)   # (B, 1)
            idx = torch.cat((idx, sampled), dim=1)              # (B, T+1)
        return idx

# Build the model and move its parameters to the selected device.
model = BigramLanguageModel(vocab_size)
m = model.to(device)    # Module.to() returns the module itself, so `m` and `model` are the same object
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')
# Inputs must be on the same device as the model; .to() is a no-op when they already are.
logits, loss = m(xb.to(device), yb.to(device))
print(logits.shape)
print(loss)

# Token id 0 is the newline character '\n'; it is used as the start token for generation.
# The context has to be created on the model's device, otherwise generation fails on CUDA.
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


# Training the model

In [25]:
# AdamW over all model parameters.
# NOTE(review): the learning rate is hard-coded to 1e-3 here; the `learning_rate = 1e-2`
# defined in the hyperparameter cell is not used -- confirm which value is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [27]:
# Training settings for this run (these override the values set in earlier cells).
batch_size = 32
max_iters = 5000
eval_interval = 500

# The counter is called `step` rather than `iter` so that the built-in iter()
# is not shadowed for the rest of the notebook session.
for step in range(max_iters):
    # Periodically report the averaged train/validation loss.
    if step % eval_interval == 0:
        losses = estimate_Loss()
        print(f"Step [{step}]: train loss {losses['train']:.4f}, val loss: {losses['valid']:.4f}")

    xb, yb = get_batch(split="train")   # sample a batch

    # Forward pass, then clear stale gradients, backpropagate and update the parameters.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss on the last training batch (a single noisy sample, not an average).
print(loss.item())

Step [0]: train loss 4.7286, val loss: 4.7233
Step [500]: train loss 4.1773, val loss: 4.1739
Step [1000]: train loss 3.7327, val loss: 3.7328
Step [1500]: train loss 3.3876, val loss: 3.3875
Step [2000]: train loss 3.1138, val loss: 3.1369
Step [2500]: train loss 2.9347, val loss: 2.9356
Step [3000]: train loss 2.7902, val loss: 2.8090
Step [3500]: train loss 2.7045, val loss: 2.7178
Step [4000]: train loss 2.6488, val loss: 2.6430
Step [4500]: train loss 2.5934, val loss: 2.6013
2.5764429569244385


In [28]:
# Token id 0 is the newline character '\n'; it is used as the start token for generation.
# The context is created on the model's device so that generation also works on CUDA.
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(idx, max_new_tokens=400)[0].tolist()))


y!

I'vel,SGUDUCHe hy,

corinys&QMADOY'
'tr thSStllewl, noisuan os : IN:

ThemVOFo?uejQGS:
Imy, thack.
pAl s VJusuer f t tor r athicke hivmispZ;
A
a!?jolo.

Swhy BYORTI tar

FoTowobrt
PENED:
Fas heandbrn mus:
Ty.
Vlly y y.
I slinis mbCHishadjKIQYO al thangjENCINUEMgq-I:
IOFak'eve YDYo-Spstheco, KNGorDO, te, t jusretand s d basorst fine smirIf w.
GNTo!

IN hindaForolfer s thu moThiswe torthashallSS


In [None]:
!python3 bigram.py

# The mathematical trick in self-attention
Masked attention

In [29]:
# Small random tensor used as the input of the averaging examples in the following cells.
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)    # Batch, time, channels
print(x.shape)

torch.Size([4, 8, 2])


In [30]:
# Goal: xbow[b, t] = mean of x[b, i] over all positions i <= t
# version 1: explicit loops

xbow = torch.zeros((B, T, C))   # bag of words

for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]                  # (t+1, C): positions 0..t of sequence b
        xbow[b, t] = torch.mean(xprev, 0)   # (C,): average over the time dimension

In [31]:
# Toy example: multiplying by a row-normalised lower-triangular matrix performs a
# "weighted aggregation" -- row i of c is the average of rows 0..i of b.
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)      # every row now sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [32]:
# version 2: the same running average expressed as one matrix multiplication

wei = torch.tril(torch.ones(T, T))      # weights
wei = wei / wei.sum(1, keepdim=True)    # normalise rows: row t averages positions 0..t
xbow2 = wei @ x     # (T, T) @ (B, T, C) -----> PyTorch broadcasts wei over the batch dimension -----> (B, T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2, atol=1e-7)
# print(xbow.shape, xbow2.shape)

True

In [33]:
# version 3: the same running average, obtained with a masked softmax

tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))                           # start from equal (zero) affinities
wei = wei.masked_fill(tril == 0, float('-inf'))     # positions above the diagonal (the future) get -inf
wei = F.softmax(wei, dim=-1)                        # -inf becomes weight 0; the rest of each row is uniform
xbow3 = wei @ x
torch.allclose(xbow, xbow3, atol=1e-7)
# xbow-xbow3

True

In [34]:
# version 4: self-attention
torch.manual_seed(1337)
B, T, C = 4, 8, 32      # batch, time, channels
x = torch.randn(B, T, C)

# Query: (roughly speaking) what am I looking for?
# Key:   (roughly speaking) what do I contain?
# Value: what I will communicate to a token that finds me interesting.
# The affinity between two tokens is the dot product (similarity check) of one
# token's query with the other token's key; it becomes 'wei' in our example.

# this is a single head of self-attention
# bias=False, so these layers only apply a matrix multiplication.
head_size = 16
query = nn.Linear(C, head_size, bias=False)
key = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

q = query(x)                    # (B, T, 16)
k = key(x)                      # (B, T, 16)

wei = q @ k.transpose(-2, -1)   # (B, T, 16) @ (B, 16, T) ---> (B, T, T): affinity of every query with every key

# Causal mask: a token may only attend to itself and to earlier positions.
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

# Aggregate the *value* projections (not the keys, and not the raw x).
v = value(x)                    # (B, T, 16)
out = wei @ v                   # (B, T, T) @ (B, T, 16) ---> (B, T, 16)

out.shape

torch.Size([4, 8, 16])

In [35]:
n_embd = 32
class Head(nn.Module):
    """ one head of causal (masked) self-attention

    Projects the n_embd-dimensional input into `head_size`-dimensional keys,
    queries and values, and returns for every position a weighted average of
    the values at that position and all earlier ones.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask; a buffer (not a parameter) so it follows the module across devices.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Map x of shape (B, T, n_embd), with T <= block_size, to (B, T, head_size)."""
        _, T, _ = x.shape
        k = self.key(x)                             # (B, T, head_size)
        q = self.query(x)                           # (B, T, head_size)

        # compute attention scores ("affinities"), scaled by 1/sqrt(head_size):
        # the scale must match the dimension of q and k, not the embedding width.
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5     # (B, T, hs) @ (B, hs, T) ---> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))    # (B, T, T)
        wei = F.softmax(wei, dim=-1)

        # performs the weighted aggregation of the values
        v = self.value(x)                           # (B, T, head_size)
        out = wei @ v                               # (B, T, T) @ (B, T, hs) ---> (B, T, hs)
        return out

# Creating Model
class BigramLanguageModel(nn.Module):
    """Small Transformer language model.

    Token and position embeddings are summed, passed through one multi-head
    self-attention layer and four Transformer blocks, and projected to
    vocabulary logits. This definition reuses the name of the earlier
    embedding-only bigram class and replaces it once the cell is executed.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # self.sa_head = Head(n_embd)
        self.sa_heads = MultiHeadAttention(4, n_embd//4)                        # 4 heads of 8-dimensional self-attention
        self.blocks = nn.Sequential(
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4),
            Block(n_embd, n_head=4)
        )
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids with T <= block_size.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            `(logits, loss)`. Without targets, logits have shape
            (B, T, vocab_size) and loss is None. With targets, logits are
            flattened to (B*T, vocab_size) and loss is the cross-entropy.
        """
        B, T = idx.shape
        # idx and targets are both (B, T) tensors of integers
        tok_emb = self.token_embedding_table(idx)                               # (B, T, C)
        # Build the position indices on the same device as the input rather than on a
        # global device, so the model works wherever its inputs and parameters live.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb                                                   # (B, T, C)
        x = self.sa_heads(x)                                                    # (B, T, C); multi-head self-attention
        x = self.blocks(x)                                                      # (B, T, C)
        logits = self.lm_head(x)                                                # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)

            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend the (B, T) context `idx` by sampling `max_new_tokens` tokens.

        Returns:
            A (B, T + max_new_tokens) tensor of token ids.
        """
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens (the position table has only block_size rows)
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [36]:
class MultiHeadAttention(nn.Module):
    """ several self-attention heads run in parallel, followed by an output projection """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs have width num_heads * head_size. Sizing the
        # projection from that product (instead of assuming it equals n_embd) keeps the
        # layer valid for any head configuration; it is unchanged when the two coincide.
        self.proj = nn.Linear(num_heads * head_size, n_embd)

    def forward(self, x):
        """Map x of shape (B, T, n_embd) to (B, T, n_embd)."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)     # (B, T, num_heads * head_size)
        out = self.proj(out)                                    # (B, T, n_embd)
        return out

In [37]:
class FeedForward(nn.Module):
    """ position-wise MLP: Linear -> ReLU -> Linear, every layer n_embd wide """

    def __init__(self, n_embd):
        super().__init__()
        layers = [
            nn.Linear(n_embd, n_embd),
            nn.ReLU(),
            nn.Linear(n_embd, n_embd),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of x; the shape is preserved."""
        transformed = self.net(x)
        return transformed

In [38]:
class Block(nn.Module):
    """ Transformer block: residual self-attention (communication), then a residual feed-forward layer (computation) """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The embedding width is split evenly across the attention heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)

    def forward(self, x):
        """Add the attention output to x, then add the feed-forward output to that sum."""
        attended = x + self.sa(x)
        return attended + self.ffwd(attended)

In [39]:
# Attention weights of the first batch element, as left in `wei` by the "version 4" self-attention cell.
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5877, 0.4123, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4457, 0.2810, 0.2733, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2220, 0.7496, 0.0175, 0.0109, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0379, 0.0124, 0.0412, 0.0630, 0.8454, 0.0000, 0.0000, 0.0000],
        [0.5497, 0.2187, 0.0185, 0.0239, 0.1831, 0.0062, 0.0000, 0.0000],
        [0.2576, 0.0830, 0.0946, 0.0241, 0.1273, 0.3627, 0.0507, 0.0000],
        [0.0499, 0.1052, 0.0302, 0.0281, 0.1980, 0.2657, 0.1755, 0.1474]],
       grad_fn=<SelectBackward0>)

If this is the last row of 'wei': [0.0499, 0.1052, 0.0302, 0.0281, 0.1980, 0.2657, 0.1755, 0.1474]

In particular, look at 0.1474, the last value in that row: it is the weight the 8th token assigns to itself. The 8th token knows what content it has and what position it is in, and through its query it is effectively asking the other tokens: "Hey, I am a vowel, I am at the 8th position, and I am looking for consonants at positions 0-4."

Now every node emits a key, and some of them say "I am a consonant, and I am at a position in 0-4". After the dot product, the query and such a key have a high affinity. Through the softmax, this high affinity ensures that a lot of the information from those position 0-4 nodes is aggregated into the 8th token's output, so it learns a lot about them.

x is kind of like private information to this token: "I am the fifth token and I have some identity, and my information is kept in the vector x. For the purposes of this single head, here is what I am interested in (query), here is what I have (key), and here is what I will communicate to you if you find me interesting, and this is stored in v (value)."

Notes:

- Attention is a communication mechanism. It can be thought of as nodes in a directed graph passing information along its edges.
- Attention has no inherent notion of space (position). We add that ourselves, to give each token a notion of its position in the sentence (positional encoding).
- The examples along the batch dimension are independent of each other and never talk to each other.

In [None]:
!python3 gpt.py

Vocab Size: 65, Vocab: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Step [   0]: train loss 4.2000, val loss: 4.2047
Step [ 500]: train loss 2.6911, val loss: 2.7087
Step [1000]: train loss 2.5196, val loss: 2.5303
Step [1500]: train loss 2.4775, val loss: 2.4829
Step [2000]: train loss 2.4408, val loss: 2.4523
Step [2500]: train loss 2.4272, val loss: 2.4435
Step [3000]: train loss 2.4130, val loss: 2.4327
Step [3500]: train loss 2.3956, val loss: 2.4212
Step [4000]: train loss 2.4041, val loss: 2.3992
Step [4500]: train loss 2.3980, val loss: 2.4084

Wes le isen.
Woto teven INGO, ous into CYedd shou maithe ert thethens the the del ede cksy ow? Wlouby aicecat tisall wor
G'imemonou mar ee hacreancad hontrt had wousk ucavere.

Baraghe lfousto beme,
S m; ten gh;
S:
Ano ice de bay alysathef beatireplim serbeais I fard
Sy,
Me hallil:
DWAR: us,
Wte hse aecathate, parrise in hr'd pat
ERY:
Bf bul walde betl'ts I yshore grest atre ciak aloo; wo fart het


After adding multiple heads of self-attention

In [None]:
!python3 gpt.py

Vocab Size: 65, Vocab: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Step [   0]: train loss 4.2227, val loss: 4.2226
Step [ 500]: train loss 2.6592, val loss: 2.6733
Step [1000]: train loss 2.4980, val loss: 2.5064
Step [1500]: train loss 2.4291, val loss: 2.4349
Step [2000]: train loss 2.3716, val loss: 2.3844
Step [2500]: train loss 2.3417, val loss: 2.3561
Step [3000]: train loss 2.3149, val loss: 2.3347
Step [3500]: train loss 2.2918, val loss: 2.3171
Step [4000]: train loss 2.2895, val loss: 2.2868
Step [4500]: train loss 2.2748, val loss: 2.2858

We! le ises.
Wmay they row we thutinte Caldd shou mait tiertlentthens the the dol ede cksy ba? Wlouby arceckentisste wre
G'imemonot mar ef hacr
COngd Go mringt thouskiu?

Fre.

Bardageplftisto be ess the to hon;
Soretr ice we bay, Thouthe wome isspe, laveberis I fald
Sy,
Whissitill there git; the se aTist,
Anos arrise in wito pat
ER:
Tuche'l walde,
Anl'th I yourre grest at west ont of; wonf Gost t


After adding Feed Forward layers

In [None]:
!python3 gpt.py

Vocab Size: 65, Vocab: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Step [   0]: train loss 4.2533, val loss: 4.2552
Step [ 500]: train loss 2.6406, val loss: 2.6593
Step [1000]: train loss 2.4890, val loss: 2.4925
Step [1500]: train loss 2.4175, val loss: 2.4158
Step [2000]: train loss 2.3506, val loss: 2.3700
Step [2500]: train loss 2.3226, val loss: 2.3445
Step [3000]: train loss 2.3080, val loss: 2.3265
Step [3500]: train loss 2.2773, val loss: 2.3079
Step [4000]: train loss 2.2748, val loss: 2.2823
Step [4500]: train loss 2.2550, val loss: 2.2789

Boodil thine shall nothe hot mus fin.

Gyou rad Ipre, tir frate yand hat lodssell-oolld ave lith so tus noct dand unet, do laded wass:
Thouk sto ingrege
To pale gaeed and:
I wos law sor? fliduace being
DE:
Yect tountoru--th'd meme cha fles,
Tharr, lean poutss:
Thim yout ing pearspte, diguls, the souts pof theo calk, is woter that he thawirth tlill mo shat trows,
What lound houpeat to mal ryouat, h


After stacking multiple Transformer blocks

In [None]:
!python3 gpt.py

Vocab Size: 65, Vocab: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Step [   0]: train loss 4.1644, val loss: 4.1609
Step [ 500]: train loss 3.1764, val loss: 3.1873
Step [1000]: train loss 3.0541, val loss: 3.0389
Step [1500]: train loss 2.9291, val loss: 2.9113
Step [2000]: train loss 2.8216, val loss: 2.8066
Step [2500]: train loss 2.7349, val loss: 2.7298
Step [3000]: train loss 2.6222, val loss: 2.6188
Step [3500]: train loss 2.5914, val loss: 2.5837
Step [4000]: train loss 2.5664, val loss: 2.5481
Step [4500]: train loss 2.5479, val loss: 2.5253

Thed sneedn 'mallrt etn ring; le
u,v sortc id'm ned!
Iute; mTei hog?

BDhord soucnset, an yu sunit anmaoc,
Ad ab houd ydad onr ish hoer cu hy hle miur ti, so wre cyeameg'l'l
cins
Bhy', up hit hocheer sedeult yi, d moud, Treed ivous i au me wa kraul, anann hanws youe:

HTe of! I'nw lou Tult thed hatt thi, hed be theps,
Thateet?

Ydbard etnosem itode huet fid wep he loldt Ylee Hit he gerd ligge;
Fre


After adding residual connections and projection layers in the blocks and adding LayerNorm; updated the parameters, added Dropout, and made the network deeper.

In [40]:
!python3 gpt.py

Vocab Size: 65, Vocab: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Step [   0]: train loss 4.3779, val loss: 4.3820
Step [ 500]: train loss 2.1740, val loss: 2.2213
Step [1000]: train loss 1.6603, val loss: 1.8288
Step [1500]: train loss 1.4602, val loss: 1.6750
Step [2000]: train loss 1.3555, val loss: 1.5945
Step [2500]: train loss 1.2883, val loss: 1.5467
Step [3000]: train loss 1.2354, val loss: 1.5135
Step [3500]: train loss 1.1903, val loss: 1.4974
Step [4000]: train loss 1.1544, val loss: 1.4932
Step [4500]: train loss 1.1236, val loss: 1.4867

SICINIUS:
Master you do stay.

MAMILLIUS:
Such a little tedious justice, nor
That straight from made. The reasonable
To reus more supple, that you should be case
Made beatten to the chase of Lord Tower.

BRUTUS:
Fatalis, this bidest me Chery thus at hand,
That is worse back off yor soaking shalt not death.

BENVOLIO:
An evil our sister, thre love but rije.
Yet, since then being to see thy grace wi
