In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
!pip install einops

--2024-11-07 09:59:32--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'input.txt'


2024-11-07 09:59:32 (46.3 MB/s) - 'input.txt' saved [1115394/1115394]

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0


In [2]:
# wget saved the corpus into the current working directory, so read it from
# there instead of through an absolute path that only exists on Kaggle.
# The encoding is given explicitly so the result does not depend on the locale.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

In [4]:
# Lookup tables between characters and integer token ids. The names now match
# their contents: stoi is "string to int", itos is "int to string".
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(input):
    """Convert a string into a list of integer token ids, one per character.

    Raises KeyError if a character is not part of the vocabulary.
    (The parameter keeps its original name for compatibility.)
    """
    return [stoi[ch] for ch in input]


def decode(input):
    """Convert an iterable of integer token ids back into a string."""
    return ''.join(itos[i] for i in input)

In [5]:
import torch
# Encode the whole corpus once into a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(f"{data.shape} {data.dtype}")

torch.Size([1115394]) torch.int64


In [6]:
# The first 90% of the token stream is used for training, the rest for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
# Seed the RNG so batch sampling is repeatable, then set the batch geometry.
torch.manual_seed(1337)
batch_size = 16  # sequences per batch
block_size = 32  # tokens per sequence


def get_batch(split):
    """Draw a random batch of (input, target) token windows.

    ``split == 'train'`` samples from ``train_data``; any other value samples
    from ``val_data``. Returns ``(x, y)``, both of shape
    ``(batch_size, block_size)`` and moved to ``device``. ``y`` holds the
    same windows as ``x`` shifted one position later in the stream.
    """
    source = train_data if split == 'train' else val_data

    # One random start offset per sequence; the upper bound leaves room for
    # the extra token the shifted targets need.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X = X.to(device)
            Y = Y.to(device)
            logits, loss = model(X, targets=Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Hyperparameters for the small model. They are module-level settings that the
# classes below and the training cells read directly.
torch.manual_seed(1337)  # repeatable weight initialisation and sampling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_iters = 5000  # number of optimisation steps in the training loop
eval_interval = 100  # estimate train/val loss every this many steps
lr = 1e-3  # learning rate passed to AdamW
eval_iters = 200  # batches averaged per split inside estimate_loss
n_embd = 64  # embedding width
n_head = 4  # attention heads per block
n_layer = 4  # number of Block modules stacked in the model
dropout = 0.0  # probability handed to every nn.Dropout

class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4 * n_embd, ReLU, project back to n_embd, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),  # rate is the module-level `dropout` setting
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``, whose last dimension must be ``n_embd``."""
        return self.net(x)

class BigramLanguageModel(nn.Module):
    """Character-level decoder-only transformer language model.

    Despite its name this is not a bigram table: token and learned position
    embeddings are summed, passed through ``n_layer`` ``Block`` modules and a
    final LayerNorm, then projected to vocabulary logits. The class name is
    kept so existing cells keep working. Sizes are read from the module-level
    settings (``vocab_size``, ``n_embd``, ``block_size``, ``n_head``,
    ``n_layer``) when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, n_embd)
        self.pos_enc = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.lm_f = nn.LayerNorm(n_embd)
        self.lm_layer = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T); T must not exceed the
                number of positions the model was built with.
            targets: optional integer token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is None. With targets, ``logits``
            is flattened to (B*T, vocab_size) and ``loss`` is a scalar tensor.
        """
        B, T = idx.shape
        token_embd = self.embed(idx)  # (B, T, n_embd)
        # Create the positions on the input's own device so the model does not
        # depend on a global `device` variable matching where it lives.
        pos_embd = self.pos_enc(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = token_embd + pos_embd
        x = self.blocks(x)
        x = self.lm_f(x)
        logits = self.lm_layer(x)  # (B, T, vocab_size)

        if targets is None:
            return logits, None

        # cross_entropy wants (N, C) logits and (N,) targets.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively after ``idx``.

        Sampling runs without gradient tracking and in eval mode, so dropout
        is disabled; the previous train/eval mode is restored afterwards.
        The context fed to the model is cropped to the size of the position
        embedding table, i.e. the context length the model was built with.

        Args:
            idx: integer token ids of shape (B, T) used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            Tensor of shape (B, T + max_new_tokens).
        """
        context_len = self.pos_enc.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -context_len:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]  # keep only the last time step
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next], dim=1)
        finally:
            self.train(was_training)
        return idx

class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular (block_size, block_size) mask, stored as a buffer
        # so it follows the module across devices without being a parameter.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to attended values of shape (B, T, head_size)."""
        _, T, _ = x.shape
        q = self.query(x)  # (B, T, hs)
        k = self.key(x)    # (B, T, hs)
        v = self.value(x)  # (B, T, hs)

        # Scaled dot-product scores between every pair of positions.
        scale = k.shape[-1] ** -0.5
        scores = (q @ k.transpose(-2, -1)) * scale  # (B, T, T)

        # Hide future positions so each step only attends to itself and the past.
        future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(future, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)
        return weights @ v  # (B, T, hs)
        
        
class Block(nn.Module):
    """Transformer block: self-attention followed by a feed-forward network.

    Each sub-layer's output is layer-normalised and then added back onto its
    input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The embedding width is split evenly across the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head, n_embd)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention then feed-forward to ``x``; the shape is unchanged."""
        attended = self.ln1(self.sa(x)) + x
        return self.ln2(self.ffwd(attended)) + attended

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected."""

    def __init__(self, n_head, head_size, n_embd):
        super().__init__()
        head_list = [Head(head_size) for _ in range(n_head)]
        self.heads = nn.ModuleList(head_list)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate all head outputs on the last axis, apply dropout, then project."""
        per_head = [head(x) for head in self.heads]
        merged = self.dropout(torch.cat(per_head, dim=-1))
        return self.proj(merged)

In [None]:
# Build the model, move it to the selected device and report its size.
model = BigramLanguageModel().to(device)
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

In [None]:
# AdamW over every model parameter, using the learning rate `lr` from the hyperparameter cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [None]:
# Training loop. The counter is called `step` rather than `iter` so the
# builtin iter() is not shadowed for the rest of the notebook.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data (get_batch already moves it to `device`)
    xb, yb = get_batch('train')

    # evaluate the loss, then take one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [None]:
# Sample 2000 new tokens from the model, starting from a single token with id 0,
# and print them decoded back to text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=2000)
print(decode(generated[0].tolist()))

In [None]:
# Continue training with a fresh AdamW optimizer. The counter is called
# `step` rather than `iter` so the builtin iter() is not shadowed.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data (get_batch already moves it to `device`)
    xb, yb = get_batch('train')

    # evaluate the loss, then take one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [None]:
# Sample 2000 new tokens from the model, starting from a single token with id 0,
# and print them decoded back to text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=2000)
print(decode(generated[0].tolist()))

In [None]:
# Continue training with a fresh AdamW optimizer at a quarter of the base
# learning rate. The counter is called `step` rather than `iter` so the
# builtin iter() is not shadowed.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr*0.25)
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data (get_batch already moves it to `device`)
    xb, yb = get_batch('train')

    # evaluate the loss, then take one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [None]:
# Sample 2000 new tokens from the model, starting from a single token with id 0,
# and print them decoded back to text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=2000)
print(decode(generated[0].tolist()))

### Training a more complex model

In [None]:
# Batch geometry for the larger model.
batch_size = 64   # sequences per batch
block_size = 256  # tokens per sequence


def get_batch(split):
    """Draw a random batch of (input, target) token windows.

    ``split == 'train'`` samples from ``train_data``; any other value samples
    from ``val_data``. Returns ``(x, y)``, both of shape
    ``(batch_size, block_size)`` and moved to ``device``. ``y`` holds the
    same windows as ``x`` shifted one position later in the stream.
    """
    source = train_data if split == 'train' else val_data

    # One random start offset per sequence; the upper bound leaves room for
    # the extra token the shifted targets need.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Hyperparameters for the larger model. They are module-level settings that
# the classes below and the training cells read directly.
torch.manual_seed(1337)  # repeatable weight initialisation and sampling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_iters = 5000  # number of optimisation steps in the training loop
eval_interval = 500  # estimate train/val loss every this many steps
lr = 3e-4  # learning rate passed to AdamW
eval_iters = 200  # batches averaged per split inside estimate_loss
n_embd = 384  # embedding width
n_head = 6  # attention heads per block
n_layer = 6  # number of Block modules stacked in the model
dropout = 0.2  # probability handed to every nn.Dropout

class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4 * n_embd, ReLU, project back to n_embd, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),  # rate is the module-level `dropout` setting
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``, whose last dimension must be ``n_embd``."""
        return self.net(x)

class GPT(nn.Module):
    """Character-level decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` ``Block`` modules and a final LayerNorm, then projected to
    vocabulary logits. Sizes are read from the module-level settings
    (``vocab_size``, ``n_embd``, ``block_size``, ``n_head``, ``n_layer``)
    when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, n_embd)
        self.pos_enc = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.lm_f = nn.LayerNorm(n_embd)
        self.lm_layer = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T); T must not exceed the
                number of positions the model was built with.
            targets: optional integer token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is None. With targets, ``logits``
            is flattened to (B*T, vocab_size) and ``loss`` is a scalar tensor.
        """
        B, T = idx.shape
        token_embd = self.embed(idx)  # (B, T, n_embd)
        # Create the positions on the input's own device so the model does not
        # depend on a global `device` variable matching where it lives.
        pos_embd = self.pos_enc(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = token_embd + pos_embd
        x = self.blocks(x)
        x = self.lm_f(x)
        logits = self.lm_layer(x)  # (B, T, vocab_size)

        if targets is None:
            return logits, None

        # cross_entropy wants (N, C) logits and (N,) targets.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively after ``idx``.

        Sampling runs without gradient tracking and in eval mode, so dropout
        is disabled; the previous train/eval mode is restored afterwards.
        The context fed to the model is cropped to the size of the position
        embedding table, i.e. the context length the model was built with.

        Args:
            idx: integer token ids of shape (B, T) used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            Tensor of shape (B, T + max_new_tokens).
        """
        context_len = self.pos_enc.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -context_len:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]  # keep only the last time step
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next], dim=1)
        finally:
            self.train(was_training)
        return idx

class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular (block_size, block_size) mask, stored as a buffer
        # so it follows the module across devices without being a parameter.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to attended values of shape (B, T, head_size)."""
        _, T, _ = x.shape
        q = self.query(x)  # (B, T, hs)
        k = self.key(x)    # (B, T, hs)
        v = self.value(x)  # (B, T, hs)

        # Scaled dot-product scores between every pair of positions.
        scale = k.shape[-1] ** -0.5
        scores = (q @ k.transpose(-2, -1)) * scale  # (B, T, T)

        # Hide future positions so each step only attends to itself and the past.
        future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(future, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)
        return weights @ v  # (B, T, hs)
        
        
class Block(nn.Module):
    """Transformer block: self-attention followed by a feed-forward network.

    Each sub-layer's output is layer-normalised and then added back onto its
    input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The embedding width is split evenly across the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head, n_embd)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention then feed-forward to ``x``; the shape is unchanged."""
        attended = self.ln1(self.sa(x)) + x
        return self.ln2(self.ffwd(attended)) + attended

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected."""

    def __init__(self, n_head, head_size, n_embd):
        super().__init__()
        head_list = [Head(head_size) for _ in range(n_head)]
        self.heads = nn.ModuleList(head_list)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate all head outputs on the last axis, apply dropout, then project."""
        per_head = [head(x) for head in self.heads]
        merged = self.dropout(torch.cat(per_head, dim=-1))
        return self.proj(merged)

In [None]:
# Build the model, move it to the selected device and report its size.
model = GPT().to(device)
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

In [None]:
# Train with AdamW, periodically printing the estimated train/val loss and
# their gap. The counter is called `step` rather than `iter` so the builtin
# iter() is not shadowed.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, diff loss {losses['val'] - losses['train']:.4f}")

    # sample a batch of data (get_batch already moves it to `device`)
    xb, yb = get_batch('train')

    # evaluate the loss, then take one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [None]:
# Sample 2000 new tokens from the model, starting from a single token with id 0,
# and print them decoded back to text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=2000)
print(decode(generated[0].tolist()))

In [None]:
# Batch geometry for the long-context experiment.
batch_size = 32   # sequences per batch
block_size = 512  # tokens per sequence


def get_batch(split):
    """Draw a random batch of (input, target) token windows.

    ``split == 'train'`` samples from ``train_data``; any other value samples
    from ``val_data``. Returns ``(x, y)``, both of shape
    ``(batch_size, block_size)`` and moved to ``device``. ``y`` holds the
    same windows as ``x`` shifted one position later in the stream.
    """
    source = train_data if split == 'train' else val_data

    # One random start offset per sequence; the upper bound leaves room for
    # the extra token the shifted targets need.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Hyperparameters for the deeper model. They are module-level settings that
# the classes below and the training cells read directly.
torch.manual_seed(1337)  # repeatable weight initialisation and sampling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_iters = 5000  # number of optimisation steps in the training loop
eval_interval = 500  # estimate train/val loss every this many steps
lr = 3e-4  # learning rate passed to AdamW
eval_iters = 200  # batches averaged per split inside estimate_loss
n_embd = 384  # embedding width
n_head = 6  # attention heads per block
n_layer = 10  # number of Block modules stacked in the model
dropout = 0.2  # probability handed to every nn.Dropout

class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4 * n_embd, ReLU, project back to n_embd, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),  # rate is the module-level `dropout` setting
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``, whose last dimension must be ``n_embd``."""
        return self.net(x)

class GPT(nn.Module):
    """Character-level decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` ``Block`` modules and a final LayerNorm, then projected to
    vocabulary logits. Sizes are read from the module-level settings
    (``vocab_size``, ``n_embd``, ``block_size``, ``n_head``, ``n_layer``)
    when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, n_embd)
        self.pos_enc = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.lm_f = nn.LayerNorm(n_embd)
        self.lm_layer = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T); T must not exceed the
                number of positions the model was built with.
            targets: optional integer token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is None. With targets, ``logits``
            is flattened to (B*T, vocab_size) and ``loss`` is a scalar tensor.
        """
        B, T = idx.shape
        token_embd = self.embed(idx)  # (B, T, n_embd)
        # Create the positions on the input's own device so the model does not
        # depend on a global `device` variable matching where it lives.
        pos_embd = self.pos_enc(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = token_embd + pos_embd
        x = self.blocks(x)
        x = self.lm_f(x)
        logits = self.lm_layer(x)  # (B, T, vocab_size)

        if targets is None:
            return logits, None

        # cross_entropy wants (N, C) logits and (N,) targets.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively after ``idx``.

        Sampling runs without gradient tracking and in eval mode, so dropout
        is disabled; the previous train/eval mode is restored afterwards.
        The context fed to the model is cropped to the size of the position
        embedding table, i.e. the context length the model was built with.

        Args:
            idx: integer token ids of shape (B, T) used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            Tensor of shape (B, T + max_new_tokens).
        """
        context_len = self.pos_enc.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -context_len:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]  # keep only the last time step
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next], dim=1)
        finally:
            self.train(was_training)
        return idx

class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular (block_size, block_size) mask, stored as a buffer
        # so it follows the module across devices without being a parameter.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to attended values of shape (B, T, head_size)."""
        _, T, _ = x.shape
        q = self.query(x)  # (B, T, hs)
        k = self.key(x)    # (B, T, hs)
        v = self.value(x)  # (B, T, hs)

        # Scaled dot-product scores between every pair of positions.
        scale = k.shape[-1] ** -0.5
        scores = (q @ k.transpose(-2, -1)) * scale  # (B, T, T)

        # Hide future positions so each step only attends to itself and the past.
        future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(future, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)
        return weights @ v  # (B, T, hs)
        
        
class Block(nn.Module):
    """Transformer block: self-attention followed by a feed-forward network.

    Each sub-layer's output is layer-normalised and then added back onto its
    input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The embedding width is split evenly across the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head, n_embd)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention then feed-forward to ``x``; the shape is unchanged."""
        attended = self.ln1(self.sa(x)) + x
        return self.ln2(self.ffwd(attended)) + attended

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected."""

    def __init__(self, n_head, head_size, n_embd):
        super().__init__()
        head_list = [Head(head_size) for _ in range(n_head)]
        self.heads = nn.ModuleList(head_list)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate all head outputs on the last axis, apply dropout, then project."""
        per_head = [head(x) for head in self.heads]
        merged = self.dropout(torch.cat(per_head, dim=-1))
        return self.proj(merged)

In [None]:
# Build the model, move it to the selected device and report its size.
model = GPT().to(device)
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

In [None]:
# Train with AdamW, periodically printing the estimated train/val loss and
# their gap. The counter is called `step` rather than `iter` so the builtin
# iter() is not shadowed.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, diff loss {losses['val'] - losses['train']:.4f}")

    # sample a batch of data (get_batch already moves it to `device`)
    xb, yb = get_batch('train')

    # evaluate the loss, then take one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [None]:
# Sample 2000 new tokens from the model, starting from a single token with id 0,
# and print them decoded back to text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=2000)
print(decode(generated[0].tolist()))

In [56]:
# Batch geometry for the hyperparameter sweep.
batch_size = 64   # sequences per batch
block_size = 256  # tokens per sequence


def get_batch(split):
    """Draw a random batch of (input, target) token windows.

    ``split == 'train'`` samples from ``train_data``; any other value samples
    from ``val_data``. Returns ``(x, y)``, both of shape
    ``(batch_size, block_size)`` and moved to ``device``. ``y`` holds the
    same windows as ``x`` shifted one position later in the stream.
    """
    source = train_data if split == 'train' else val_data

    # One random start offset per sequence; the upper bound leaves room for
    # the extra token the shifted targets need.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [61]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Hyperparameters for this run. They are module-level settings that the
# classes below and the training cells read directly.
torch.manual_seed(1337)  # repeatable weight initialisation and sampling
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_iters = 5000  # number of optimisation steps in the training loop
eval_interval = 500  # estimate train/val loss every this many steps
lr = 3e-4  # learning rate passed to AdamW
eval_iters = 200  # batches averaged per split inside estimate_loss
n_embd = 256  # embedding width
n_head = 4  # attention heads per block
n_layer = 4  # number of Block modules stacked in the model
dropout = 0.0  # probability handed to every nn.Dropout

class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4 * n_embd, ReLU, project back to n_embd, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),  # rate is the module-level `dropout` setting
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``, whose last dimension must be ``n_embd``."""
        return self.net(x)

class GPT(nn.Module):
    """Character-level decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` ``Block`` modules and a final LayerNorm, then projected to
    vocabulary logits. Sizes are read from the module-level settings
    (``vocab_size``, ``n_embd``, ``block_size``, ``n_head``, ``n_layer``)
    when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, n_embd)
        self.pos_enc = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.lm_f = nn.LayerNorm(n_embd)
        self.lm_layer = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T); T must not exceed the
                number of positions the model was built with.
            targets: optional integer token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is None. With targets, ``logits``
            is flattened to (B*T, vocab_size) and ``loss`` is a scalar tensor.
        """
        B, T = idx.shape
        token_embd = self.embed(idx)  # (B, T, n_embd)
        # Create the positions on the input's own device so the model does not
        # depend on a global `device` variable matching where it lives.
        pos_embd = self.pos_enc(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = token_embd + pos_embd
        x = self.blocks(x)
        x = self.lm_f(x)
        logits = self.lm_layer(x)  # (B, T, vocab_size)

        if targets is None:
            return logits, None

        # cross_entropy wants (N, C) logits and (N,) targets.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively after ``idx``.

        Sampling runs without gradient tracking and in eval mode, so dropout
        is disabled; the previous train/eval mode is restored afterwards.
        The context fed to the model is cropped to the size of the position
        embedding table, i.e. the context length the model was built with.

        Args:
            idx: integer token ids of shape (B, T) used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            Tensor of shape (B, T + max_new_tokens).
        """
        context_len = self.pos_enc.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -context_len:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]  # keep only the last time step
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next], dim=1)
        finally:
            self.train(was_training)
        return idx

class Head(nn.Module):
    """A single head of causal (masked) self-attention."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular (block_size, block_size) mask, stored as a buffer
        # so it follows the module across devices without being a parameter.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to attended values of shape (B, T, head_size)."""
        _, T, _ = x.shape
        q = self.query(x)  # (B, T, hs)
        k = self.key(x)    # (B, T, hs)
        v = self.value(x)  # (B, T, hs)

        # Scaled dot-product scores between every pair of positions.
        scale = k.shape[-1] ** -0.5
        scores = (q @ k.transpose(-2, -1)) * scale  # (B, T, T)

        # Hide future positions so each step only attends to itself and the past.
        future = self.tril[:T, :T] == 0
        scores = scores.masked_fill(future, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)
        return weights @ v  # (B, T, hs)
        
        
class Block(nn.Module):
    """Transformer block: self-attention followed by a feed-forward network.

    Each sub-layer's output is layer-normalised and then added back onto its
    input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The embedding width is split evenly across the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head, n_embd)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention then feed-forward to ``x``; the shape is unchanged."""
        attended = self.ln1(self.sa(x)) + x
        return self.ln2(self.ffwd(attended)) + attended

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected."""

    def __init__(self, n_head, head_size, n_embd):
        super().__init__()
        head_list = [Head(head_size) for _ in range(n_head)]
        self.heads = nn.ModuleList(head_list)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate all head outputs on the last axis, apply dropout, then project."""
        per_head = [head(x) for head in self.heads]
        merged = self.dropout(torch.cat(per_head, dim=-1))
        return self.proj(merged)

In [62]:
# Build the model, move it to the selected device and report its size.
model = GPT().to(device)
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 3255361


In [63]:
# Train with AdamW, periodically printing the estimated train/val loss and
# their gap. The counter is called `step` rather than `iter` so the builtin
# iter() is not shadowed.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, diff loss {losses['val'] - losses['train']:.4f}")

    # sample a batch of data (get_batch already moves it to `device`)
    xb, yb = get_batch('train')

    # evaluate the loss, then take one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.4203, val loss 4.4114, diff loss -0.0089
step 500: train loss 1.8011, val loss 1.9257, diff loss 0.1246
step 1000: train loss 1.4433, val loss 1.6390, diff loss 0.1957
step 1500: train loss 1.3189, val loss 1.5643, diff loss 0.2454
step 2000: train loss 1.2401, val loss 1.5294, diff loss 0.2892
step 2500: train loss 1.1749, val loss 1.5437, diff loss 0.3688
step 3000: train loss 1.1095, val loss 1.5595, diff loss 0.4500
step 3500: train loss 1.0538, val loss 1.5958, diff loss 0.5419
step 4000: train loss 0.9968, val loss 1.6579, diff loss 0.6611
step 4500: train loss 0.9399, val loss 1.7307, diff loss 0.7908
step 4999: train loss 0.8814, val loss 1.8100, diff loss 0.9286


p = 0.2
step 0: train loss 4.3143, val loss 4.3112, diff loss -0.0031
step 500: train loss 1.9058, val loss 2.0208, diff loss 0.1151
step 1000: train loss 1.4890, val loss 1.6842, diff loss 0.1952
step 1500: train loss 1.3414, val loss 1.5590, diff loss 0.2176
step 2000: train loss 1.2543, val loss 1.5117, diff loss 0.2575
step 2500: train loss 1.1992, val loss 1.4886, diff loss 0.2894
step 3000: train loss 1.1495, val loss 1.4826, diff loss 0.3332
step 3500: train loss 1.1087, val loss 1.4855, diff loss

p = 0.5
step 0: train loss 1.2580, val loss 1.5604, diff loss 0.3023
step 500: train loss 1.2424, val loss 1.5538, diff loss 0.3114
step 1000: train loss 1.2214, val loss 1.5448, diff loss 0.3234
step 1500: train loss 1.2074, val loss 1.5378, diff loss 0.3304
step 2000: train loss 1.1869, val loss 1.5287, diff loss 0.3419
step 2500: train loss 1.1825, val loss 1.5419, diff loss 0.3595
step 3000: train loss 1.1648, val loss 1.5370, diff loss 0.3722
step 3500: train loss 1.1543, val loss 1.5312, diff loss 0.3768
step 4000: train loss 1.1419, val loss 1.5389, diff loss 0.3970
step 4500: train loss 1.1299, val loss 1.5438, diff loss 0.4139
step 4999: train loss 1.1194, val loss 1.5455, diff l

t = 512
step 0: train loss 4.3666, val loss 4.3507, diff loss -0.0159
step 500: train loss 2.3006, val loss 2.3596, diff loss 0.0590
step 1000: train loss 1.5622, val loss 1.7534, diff loss 0.1912
step 1500: train loss 1.3480, val loss 1.5809, diff loss 0.2329
step 2000: train loss 1.2438, val loss 1.5013, diff loss 0.2576
step 2500: train loss 1.1666, val loss 1.4814, diff loss 0.3149
step 3000: train loss 1.1061, val loss 1.4795, diff loss 0.3734
step 3500: train loss 1.0529, val loss 1.4962, diff loss 0.4433
step 4000: train loss 0.9921, val loss 1.5086, diff los

n_layers = 8
step 0: train loss 4.4640, val loss 4.4617, diff loss -0.0023
step 500: train loss 2.0043, val loss 2.0823, diff loss 0.0780
step 1000: train loss 1.5414, val loss 1.7261, diff loss 0.1847
step 1500: train loss 1.3615, val loss 1.5778, diff loss 0.2163
step 2000: train loss 1.2767, val loss 1.5239, diff loss 0.2471
step 2500: train loss 1.2151, val loss 1.4999, diff loss 0.2848
step 3000: train loss 1.1659, val loss 1.4918, diff loss 0.3259
step 3500: train loss 1.1199, val loss 1.4969, diff loss 0.3770
step 4000: train loss 1.0873, val loss 1.5112, diff loss 0.4240
step 4500: train loss 1.0492, val loss 1.5077, diff loss 0.4585
step 4999: train loss 1.0182, val loss 1.5255, diff l

n_embd = 512
n_head = 8
step 0: train loss 4.3457, val loss 4.3480, diff loss 0.0023
step 500: train loss 1.9286, val loss 2.0417, diff loss 0.1132
step 1000: train loss 1.4955, val loss 1.6967, diff loss 0.2012
step 1500: train loss 1.3296, val loss 1.5483, diff loss 0.2187
step 2000: train loss 1.2450, val loss 1.5145, diff loss 0.2695
step 2500: train loss 1.1842, val loss 1.4963, diff loss 0.3122
step 3000: train loss 1.1347, val loss 1.5023, diff loss 0.3676
step 3500: train loss 1.0872, val loss 1.5024, diff loss 0.4152
step 4000: train loss 1.0414, val loss 1.5185, diff loss 0.4770
step 4500: train loss 0.9959, val loss 1.5341, diff loss 0.5382
step 4999: train loss 0.9538, val loss 1.5756, diff los

n_embd = 512
n_head = 8
n_layers = 8
step 0: train loss 4.1405, val loss 4.1471, diff loss 0.0067
step 500: train loss 2.1302, val loss 2.1906, diff loss 0.0604
step 1000: train loss 1.7153, val loss 1.8802, diff loss 0.1649
step 1500: train loss 1.5075, val loss 1.7024, diff loss 0.1949
step 2000: train loss 1.4135, val loss 1.6379, diff loss 0.2244
step 2500: train loss 1.3447, val loss 1.5705, diff loss 0.2258
step 3000: train loss 1.2980, val loss 1.5476, diff loss 0.2496
step 3500: train loss 1.2620, val loss 1.5472, diff loss 0.2852
step 4000: train loss 1.2308, val loss 1.5219, diff loss 0.2912
step 4500: train loss 1.2109, val loss 1.5257, diff loss 0.3147
step 4999: train loss 1.1836, val loss 1.5191, diff l
step 5000: train loss 1.1828, val loss 1.5241, diff loss 0.3413
step 5500: train loss 1.1583, val loss 1.5287, diff loss 0.3704
step 6000: train loss 1.1532, val loss 1.5283, diff loss 0.3751
step 6500: train loss 1.1392, val loss 1.5434, diff loss 0.4041
step 7000: train loss 1.1477, val loss 1.5469, diff loss 0.

n_embd = 320
n_head = 5
n_layer = 4
step 0: train loss 4.3636, val loss 4.3533, diff loss -0.0103
step 500: train loss 1.9883, val loss 2.0751, diff loss 0.0869
step 1000: train loss 1.5644, val loss 1.7555, diff loss 0.1911
step 1500: train loss 1.4171, val loss 1.6312, diff loss 0.2141
step 2000: train loss 1.3365, val loss 1.5639, diff loss 0.2274
step 2500: train loss 1.2792, val loss 1.5238, diff loss 0.2445
step 3000: train loss 1.2340, val loss 1.5031, diff loss 0.2691
step 3500: train loss 1.2043, val loss 1.4952, diff loss 0.2909
step 4000: train loss 1.1728, val loss 1.4864, diff loss
step 4500: train loss 1.1506, val loss 1.4823, diff loss 0.3317


n_embd = 256
n_head = 4
n_layer = 4
step 0: train loss 4.4203, val loss 4.4114, diff loss -0.0089
step 500: train loss 2.0601, val loss 2.1233, diff loss 0.0633
step 1000: train loss 1.6193, val loss 1.8052, diff loss 0.1859
step 1500: train loss 1.4730, val loss 1.6747, diff loss 0.2017
step 2000: train loss 1.3984, val loss 1.6009, diff loss 0.2025
step 2500: train loss 1.3416, val loss 1.5605, diff loss 0.2188
step 3000: train loss 1.3044, val loss 1.5421, diff loss 0.2377
step 3500: train loss 1.2759, val loss 1.5158, diff loss 0.2399
step 4000: train loss 1.2507, val loss 1.5030, diff loss 0.2522
step 4500: train loss 1.2286, val loss 1.4956, diff loss 0.2670
step 4999: train loss 1.2137, val loss 1.4893, diff l

n_embd = 256
n_head = 4
n_layer = 4
p = 0.5
step 0: train loss 4.4203, val loss 4.4114, diff loss -0.0089
step 500: train loss 2.4102, val loss 2.4385, diff loss 0.0283
step 1000: train loss 2.0076, val loss 2.1037, diff loss 0.0961
step 1500: train loss 1.7711, val loss 1.9379, diff loss 0.1668
step 2000: train loss 1.6449, val loss 1.8469, diff loss 0.2020
step 2500: train loss 1.5680, val loss 1.7812, diff loss 0.2132
step 3000: train loss 1.5124, val loss 1.7330, diff loss 0.2206
step 3500: train loss 1.4739, val loss 1.6934, diff loss 0.2195
step 4000: train loss 1.4497, val loss 1.6768, diff loss
step 4500: train loss 1.4219, val loss 1.6559, diff loss 0.2340

n_embd = 256
n_head = 4
n_layer = 4
p = 0.0
 0.2270
oss 0.2756 0.3136
3992oss 0.3355s 0.6219
oss 0.5073s 0.5165
oss 0.4261 0.3768

In [66]:
# Sample 500 new tokens from the model, starting from a single token with id 0,
# and print them decoded back to text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=500)
print(decode(generated[0].tolist()))


Men punished again.

AEdile:
Shame Gremions, much more! Well, ready, we must have
had looked on my cousin's point.

DUKE OF YORK:
Provoke poisonseth be so moved: if he much
merciful villain! Gloucester, after those thoughts
that he shall be chequeen, make her heart.

DUKE OF AUMERLE:
Comfort will it is the walks.

KING RICHARD II:
Norfolk, know thy name, dear?
Ferewell, of this day of mischa harm, which and like.
What came unto thee were to come his wrong,
Or who do imposset in fore; you were us
