In [18]:
"""
Prepare the Shakespeare dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
Will save train.bin, val.bin containing the ids, and meta.pkl containing the
encoder and decoder and some other related info.
"""
import os
import numpy as np
from pathlib import Path
import torch
base_dir = Path(__file__).resolve().parent if '__file__' in globals() else Path.cwd()
print(base_dir)

with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print("length of dataset in characters: ", len(text))

chars = sorted(list(set(text)))
vocab_size = len(chars)
print("Vocab size is ", vocab_size)
print(''.join(chars))

stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])
print(encode("hello world"))
print(decode(encode("hello world")))

data = torch.tensor(encode(text), dtype=torch.long) 
print(data[:10])
# length of dataset in characters:  1115394
# all the unique characters:
#  !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
# vocab size: 65
# train has 1003854 tokens
# val has 111540 tokens


/home/ruochen/projects/nanoGPT/data/shakespeare_char
length of dataset in characters:  1115394
Vocab size is  65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


In [19]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]
print(train_data[:10])
print(val_data[:10])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])
tensor([12,  0,  0, 19, 30, 17, 25, 21, 27, 10])


In [20]:
# This is what the max context window that the model can see.
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [21]:
# [1,2,3,4,5,6,7,8,9,10...]
# [0:8]
x = train_data[:block_size]
# [1:9]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target is {target}")

when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [22]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    # ix is going to be a four random integers
    # for example, [ 76049, 234249, 934904, 560986]
    ix = torch.randint(len(data) - block_size, (batch_size,) )
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print(xb.shape)
print(xb)
print(yb.shape)
print(yb)



torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [23]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        """
        With nn.Embedding:
        self.token_embedding_table = nn.Embedding(65, 65)
        PyTorch automatically knows this is a learnable parameter!
        """
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """
        logits = self.token_embedding_table(idx)  # idx is [4, 8]
        ```

        What PyTorch does:
        1. Takes each token ID in your `[4, 8]` input
        2. Looks up that token's row in the embedding table
        3. Returns all 65 scores from that row

        **Concrete example**: Let's trace position [0, 0] (first sequence, first token):
        - `xb[0, 0]` might be token `23`
        - Look up row 23 in the embedding table
        - Get back 65 numbers (the scores for what comes after token 23)
        - Store this at `logits[0, 0, :]` which has size 65

        This happens **independently** for all 4×8 = 32 positions!

        ### Step 4: The Output Shape
        ```
        Input:  [4, 8]       → 4 sequences × 8 tokens each
                            ↓ lookup each token
        Output: [4, 8, 65]   → 4 sequences × 8 positions × 65 scores for next token
        ```

        ## Intuitive Visualization

        Imagine you have the sentence "Hello my name is":
        ```
        Position 0: "Hello"  → Get 65 scores for what comes after "Hello"
        Position 1: "my"     → Get 65 scores for what comes after "my"  
        Position 2: "name"   → Get 65 scores for what comes after "name"
        Position 3: "is"     → Get 65 scores for what comes after "is"
        """

        logits = self.token_embedding_table(idx)  # [batch_size, block_size, vocab_size]
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # The corss entropy loss expects B * T, C 
            logits = logits.view(B * T, C)
            # The reason we don't need C for target is target is the source of truth that as the 
            # exact next character's value. 
            targets = targets.view(B * T)
            # Use negative likelihood loss. 
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions 
            logits, loss = self(idx)
            # focus only on the last time step, shape should be (Batch, Channel)
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            # dim tells which direction to calculate the probability
            # we want the softmax to be calculated in the last dimension since it's channel. 
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx

m = BigramLanguageModel(vocab_size)
logits, loss= m(xb, yb)
print(logits.shape)
print(type(loss))


idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx=idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
<class 'torch.Tensor'>

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [24]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(10):
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    # Getting the gradient of all the parameters in the model.
    loss.backward()
    # Use the gradient to update the parameters. 
    optimizer.step()

print(loss.item())

4.7182841300964355


In [25]:
print(decode(m.generate(idx=idx, max_new_tokens=300)[0].tolist()))


knLI!UEHwcObxfNCha!qKt-ocPTFjyXrF
W:..ZKAL.AHA.P!HAXNw,,$zCJ-!or'yxabLWGfkKowCXNe:g;gXEG'uVJMJ$
&AkfNfq-GXlay!B?!
SP JsBo.d,jIgEQzkq$YCZTOiqErphq?$zrzGJl3'IoiKIFuJuw
CM
&C-3
.yff;DRj:Td,&uDK$Wj;Y -w?XXXEG:iPDtR'd,t
-EHA3fxRObotE-wGRJiPmG'wyaIsr&NCj;IgLIas:C ?OgYQcM,jNCO.
XXgwzLlaPm$VlgY.rbXSP fyN.N&


The mathematical trick in self-attention

In [26]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

# We want x[b, t] = mean_{i<=t} x[b, i]
xbow = torch.zeros((B, T, C))
# b form 0 to B-1
for b in range(B):
    # t form 0 to T-1
    for t in range(T):
        # xprev is the previous tokens up to the current token. 
        # We are averaging the columns vertically because we want the average 
        # of all the previous tokens.
        xprev = x[b, :t+1] # (t, C)
        xbow[b, t] = torch.mean(xprev, 0)

In [27]:
xbow.shape
torch.tril(torch.ones(4, 3))


tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.],
        [1., 1., 1.]])

In [28]:
# print out these to see the averages
x[0]
xbow[0]

wei = torch.tril(torch.ones(T, T))
# Summing the second dimension, we get the sum of each row. 
wei = wei / wei.sum(1, keepdim=True)
print(wei)
xbow2 = wei @ x # (T, T) @ (B, T, C) -> (B, T, C) pytorch automatically broadcasts the 1 to B
diff = (xbow - xbow2).abs().max()
print(f"Max difference: {diff}")


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
Max difference: 3.236345946788788e-08


In [29]:
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
# Token from the past cannot communicate. By setting them to -inf, we will not aggregate anything from those tokens from those 
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow2, xbow3)

True

In [30]:
torch.manual_seed(42)
# T x T tensor
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print('a =', a)
print('b =', b)
print('c =', c)





a = tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b = tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c = tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [31]:

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embed = 32
device = "cuda"

class BigramLanguageModelImproved(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)  # [B, T, C (n_embed)]
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # [T, C (n_embed)]
        x = token_emb + pos_emb  # [B, T, C (n_embed)]
        logits = self.lm_head(x)  # [B, T, vocab_size]

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # The corss entropy loss expects B * T, C 
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            # Use negative likelihood loss. 
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions 
            logits, loss = self(idx)
            # focus only on the last time step, shape should be (Batch, Channel)
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            # dim tells which direction to calculate the probability
            # we want the softmax to be calculated in the last dimension since it's channel. 
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx


In [32]:
# Let's now code out the attention mechanism. 
torch.manual_seed(1337)
B, T, C = 4, 8, 32 # batch, sequence length, channel
x = torch.randn(B, T, C)

# let's see a single Head perform self-attention.
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) -> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
print(wei[0])


v = value(x)
out = wei @ v
# out = wei @ x

out.shape


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)


torch.Size([4, 8, 16])

In [33]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [34]:
class Head(nn.Module):
    "One head of self-attention"

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x) # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) -> (B, T, T)
        # Self attntion
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return out

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel"""
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self, x):
        return torch.cat([h(x) for h in self.heads], dim=-1)

### The new BigramLanguageModel

In [35]:

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embed = 32
device = "cuda"
batch_size = 32
block_size = 8
max_iters = 5000
eval_interval = 500
learning_rate = 1e-3
eval_iters = 200
n_embed = 32   

class BigramLanguageModelWithSelfAttention(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        # self-attention head
        self.sa_head = Head(head_size=n_embed)
        # multi-head self-attention
        self.mha = MultiHeadAttention(num_heads=4, head_size=n_embed // 4)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)  # [B, T, C (n_embed)]
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # [T, C (n_embed)]
        x = token_emb + pos_emb  # [B, T, C (n_embed)]
        # Single head self-attention    
        # x = self.sa_head(x) # (B, T, n_embed/head_size)
        # Multi-head self-attention
        x = self.mha(x) # (B, T, n_embed)
        logits = self.lm_head(x)  # [B, T, vocab_size]

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # The corss entropy loss expects B * T, C 
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            # Use negative likelihood loss. 
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions 
            logits, loss = self(idx_cond)
            # focus only on the last time step, shape should be (Batch, Channel)
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            # dim tells which direction to calculate the probability
            # we want the softmax to be calculated in the last dimension since it's channel. 
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx


### Hyperparameters

In [None]:
torch.manual_seed(1337)
m = BigramLanguageModelWithSelfAttention()
m = m.to(device)
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

def get_batch(split):
    data = train_data if split == 'train' else val_data
    # ix is going to be a four random integers
    # for example, [ 76049, 234249, 934904, 560986]
    ix = torch.randint(len(data) - block_size, (batch_size,) )
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

for steps in range(10):
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    # Getting the gradient of all the parameters in the model.
    loss.backward()
    # Use the gradient to update the parameters. 
    optimizer.step()

print(loss.item())

2.251319169998169


In [37]:
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(idx=idx, max_new_tokens=100)[0].tolist()))


Whent ik bridcowf,
Tkis soret mad selabube toe.
Sagrtand thalied
hy ard that usqurthe.
War dilth ate


In [38]:
# The problem with the current model is we went way too fast to calculate the logits.
# The tokens look at each other but didn't really have time to think on what they have found from other toknes.

class FeedForward(nn.Module):
    """A simple feed-forward network"""
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, n_embed),
            nn.ReLU(),)
    
    def forward(self, x):
        return self.net(x)

In [39]:

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embed = 32
device = "cuda"
batch_size = 32
block_size = 8
max_iters = 5000
eval_interval = 500
learning_rate = 1e-3
eval_iters = 200
n_embed = 32   

class BigramLanguageModelWithSelfAttentionWithFeedforward(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        # self-attention head
        self.sa_head = Head(head_size=n_embed)
        # multi-head self-attention
        self.mha = MultiHeadAttention(num_heads=4, head_size=n_embed // 4) # Output dimension: (B, T, n_embed)
        self.ffwd = FeedForward(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)  # [B, T, C (n_embed)]
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # [T, C (n_embed)]
        x = token_emb + pos_emb  # [B, T, C (n_embed)]
        # Single head self-attention    
        # x = self.sa_head(x) # (B, T, n_embed/head_size)
        # Multi-head self-attention
        x = self.mha(x) # (B, T, c(n_embed))
        x = self.ffwd(x) # (B, T, c(n_embed))
        logits = self.lm_head(x)  # [B, T, vocab_size]

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # The corss entropy loss expects B * T, C 
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            # Use negative likelihood loss. 
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions 
            logits, loss = self(idx_cond)
            # focus only on the last time step, shape should be (Batch, Channel)
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            # dim tells which direction to calculate the probability
            # we want the softmax to be calculated in the last dimension since it's channel. 
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx


In [40]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """
    def __init__(self, n_embed, n_heads):
        super().__init__()
        head_size = n_embed // n_heads
        self.sa = MultiHeadAttention(n_heads, head_size)
        self.ffwd = FeedForward(n_embed)

    def forward(self, x):
        """
        The residual connection is important for the model to learn. 
        It allows the model to learn the identity function. 
        Without it, the model will have a hard time learning. 
        Here is the code without residual connection:
        # x = self.sa(x)
        # x = self.ffwd(x)
        """
        # With residual connection
        x = x + self.sa(x)
        x = x + self.ffwd(x)
        return x

class MultiHeadAttention(nn.Module):
    """A new multi-head attention layer that uses projection"""
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embed, n_embed)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return out

class FeedForward(nn.Module):
    """A simple feed-forward network"""
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(n_embed, n_embed)
            )
    
    def forward(self, x):
        return self.net(x)



In [41]:

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embed = 32
device = "cuda"
batch_size = 32
block_size = 8
max_iters = 5000
eval_interval = 500
learning_rate = 1e-3
eval_iters = 200
n_embed = 32   

class BigramLanguageModelAttentionWithoutLayerNorm(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(Block(n_embed, n_heads=4), Block(n_embed, n_heads=4), Block(n_embed, n_heads=4))
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)  # [B, T, C (n_embed)]
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # [T, C (n_embed)]
        x = token_emb + pos_emb  # [B, T, C (n_embed)]
        x = self.blocks(x)
        logits = self.lm_head(x)  # [B, T, vocab_size]

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # The corss entropy loss expects B * T, C 
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            # Use negative likelihood loss. 
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions 
            logits, loss = self(idx_cond)
            # focus only on the last time step, shape should be (Batch, Channel)
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            # dim tells which direction to calculate the probability
            # we want the softmax to be calculated in the last dimension since it's channel. 
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx


In [42]:
# Let's now introduce layer norm to the model.
class BlockWithLayerNorm(nn.Module):
    """Transformer block: communication followed by computation with layer norm"""
    def __init__(self, n_embed, n_heads):
        super().__init__()
        head_size = n_embed // n_heads
        self.sa = MultiHeadAttention(n_heads, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


In [46]:

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

n_embed = 32
device = "cuda"
batch_size = 32
block_size = 8
max_iters = 5000
eval_interval = 500
learning_rate = 1e-3
eval_iters = 200
n_embed = 32   

class FeedForward(nn.Module):
    """A simple feed-forward network"""
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed)
            )
    
    def forward(self, x):
        return self.net(x)

class BigramLanguageModelAttentionFinal(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(BlockWithLayerNorm(n_embed, n_heads=4), BlockWithLayerNorm(n_embed, n_heads=4), BlockWithLayerNorm(n_embed, n_heads=4), nn.LayerNorm(n_embed))
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)  # [B, T, C (n_embed)]
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # [T, C (n_embed)]
        x = token_emb + pos_emb  # [B, T, C (n_embed)]
        x = self.blocks(x)
        logits = self.lm_head(x)  # [B, T, vocab_size]

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # The corss entropy loss expects B * T, C 
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            # Use negative likelihood loss. 
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions 
            logits, loss = self(idx_cond)
            # focus only on the last time step, shape should be (Batch, Channel)
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            # dim tells which direction to calculate the probability
            # we want the softmax to be calculated in the last dimension since it's channel. 
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx


### Let's train the model again!

In [47]:
torch.manual_seed(1337)
m = BigramLanguageModelAttentionFinal()
m = m.to(device)
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

def get_batch(split):
    data = train_data if split == 'train' else val_data
    # ix is going to be a four random integers
    # for example, [ 76049, 234249, 934904, 560986]
    ix = torch.randint(len(data) - block_size, (batch_size,) )
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

for steps in range(max_iters):
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    # Getting the gradient of all the parameters in the model.
    loss.backward()
    # Use the gradient to update the parameters. 
    optimizer.step()

print(loss.item())

1.9977909326553345


In [49]:
idx = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(idx=idx, max_new_tokens=100)[0].tolist()))



JUCHARD:
I wcowill to lawer K
bady;
Thou but: and O-dam meall and bard thy pusquet to bardetlaccan,
