# GPT for the Tiny Shakespeare Dataset
Following Andrej Karpathy's neural network course.

In [2]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-11-08 18:24:26--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-11-08 18:24:27 (13.6 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [3]:
# Read the whole corpus into memory as a single string.
with open("input.txt", mode="r", encoding="utf-8") as file:
    text = file.read()

In [4]:
# Vocabulary: every distinct character in the corpus, in sorted order.
all_chars = sorted(set(text))
print(*all_chars, sep="")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


`all_chars` holds every distinct character that appears in the dataset, in sorted order; this is the vocabulary the model is trained on.

In [5]:
# Vocabulary size: the number of distinct characters found in the corpus.
vocab_size = len(all_chars)
vocab_size

65

In [6]:
# Lookup tables between characters and integer ids; a character's id is its
# position in the sorted vocabulary `all_chars`.
itos = dict(enumerate(all_chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Turn a string into a list of integer ids, one per character."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn an iterable of integer ids back into a string."""
    return ''.join(itos[i] for i in l)

In [7]:
# Round-trip sanity check: decoding an encoded string should give it back unchanged.
decode(encode("Hi"))

'Hi'

In [8]:
import torch
import torch.nn as nn
from torch.nn import functional as F



# Hyperparameters shared by the data loader, the model and the training loop.
# NOTE(review): no random seed is set in the visible cells, so sampled batches
# and weight initialisation differ between runs — consider torch.manual_seed(...).
batchsize = 64 # how many independent sequences will we process in parallel?
blocksize = 256 # what is the maximum context length for predictions?
max_iters = 5000 # upper bound of the training loop's range(1, max_iters)
eval_interval = 100 # estimate_loss() is run every this many training steps
learning_rate = 3e-4 # learning rate passed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
eval_iters = 200 # number of batches averaged per split inside estimate_loss()
n_embd = 384 # embedding width used throughout the model
n_head = 6 # attention heads per Block
n_layer = 6 # number of Blocks stacked in the model
dropout = 0.2 # dropout probability applied to the attention weights in Head

In [9]:
# Encode the full corpus once, then keep the first 90% for training and the
# remaining 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [10]:
# Number of tokens in the training and validation splits.
len(train_data), len(val_data)

(1003854, 111540)

In [11]:
def get_batch(what):
    """Sample a random batch of (input, target) sequences from one split.

    Args:
        what: "train" selects `train_data`; any other value selects `val_data`.

    Returns:
        A tuple `(x, y)` of tensors, each of shape `(batchsize, blocksize)` and
        moved to `device`. `y` is `x` shifted by one position, i.e. `y[b, t]`
        is the token that follows `x[b, t]` in the selected split.
    """
    d = train_data if what == "train" else val_data
    # The largest start index drawn is len(d) - blocksize - 1, so the target
    # window d[i+1 : i+blocksize+1] never runs past the end of the split.
    ix = torch.randint(len(d) - blocksize, (batchsize,))
    # Slice the selected split `d`, not the full `data` tensor: slicing `data`
    # made "val" batches come from the beginning of the corpus (training text),
    # so the reported validation loss was really a second training loss.
    x = torch.stack([d[i:i + blocksize] for i in ix])
    y = torch.stack([d[i + 1:i + blocksize + 1] for i in ix])
    return x.to(device), y.to(device)

In [12]:
# Draw one training batch to inspect.
x, y = get_batch("train")

In [13]:
# First five rows of inputs and targets; each target row is the matching
# input row shifted by one token.
x[0:5], y[0:5]

(tensor([[ 0, 17, 60,  ..., 50, 53, 56],
         [50, 53, 61,  ..., 42, 61, 39],
         [47, 58, 46,  ..., 46, 43,  1],
         [63, 53, 59,  ..., 47, 58, 46],
         [46, 39, 58,  ...,  1, 47, 57]]),
 tensor([[17, 60, 43,  ..., 53, 56, 42],
         [53, 61,  7,  ..., 61, 39, 56],
         [58, 46,  1,  ..., 43,  1, 41],
         [53, 59,  1,  ..., 58, 46,  1],
         [39, 58,  1,  ..., 47, 57,  1]]))

In [14]:
class Head(nn.Module):
    """A single causal self-attention head.

    Each position is projected to key, query and value vectors of width
    `head_size`. A position attends only to itself and earlier positions, and
    the output is the attention-weighted sum of the value vectors.
    """

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones: entry (t, s) is 1 when position t
        # may look at position s. Registered as a buffer, not a parameter.
        causal_mask = torch.tril(torch.ones(blocksize, blocksize))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map `x` of shape (B, T, C) to head outputs of shape (B, T, head_size)."""
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B, T, hs)
        queries = self.query(x)   # (B, T, hs)
        values = self.value(x)    # (B, T, hs)

        # Scaled dot-product affinities between every pair of positions.
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale  # (B, T, T)

        # Block attention to later positions, then normalise each row.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        attn = self.dropout(F.softmax(scores, dim=-1))

        # Weighted aggregation of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return attn @ values


In [15]:
# x comes from get_batch, so its shape is (batchsize, blocksize).
x.shape

torch.Size([64, 256])

In [16]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel and mixed by a linear layer.

    Args:
        num_heads: number of `Head` modules to create.
        head_size: output width of each head.

    The forward pass concatenates the head outputs along the last dimension
    (width `num_heads * head_size`) and projects the result to `n_embd`.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # Size the projection from the actual concatenated width. Hard-coding
        # n_embd as the input width only works when num_heads * head_size
        # happens to equal n_embd, and fails with a shape error otherwise
        # (e.g. when n_embd is not divisible by the number of heads).
        self.proj = nn.Linear(num_heads * head_size, n_embd)

    def forward(self, x):
        """Return the projected concatenation of all head outputs for `x`."""
        concatenated = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.proj(concatenated)

In [17]:
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x the embedding width, apply ReLU, project back."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of `x`."""
        out = self.net(x)
        return out


In [18]:
class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each on a residual path.

    LayerNorm is applied to the input of each sub-layer, and the sub-layer's
    output is added back onto the un-normalised stream.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # Split the embedding width across the heads (floor division).
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        after_attention = x + self.sa(self.ln1(x))
        after_mlp = after_attention + self.ffwd(self.ln2(after_attention))
        return after_mlp


In [19]:
class BigramLanguageModel(nn.Module):
    """Character-level transformer language model.

    Token and position embeddings are summed, passed through `n_layer`
    `Block`s and a final LayerNorm, and projected to `vocab_size` logits.
    The class name is kept unchanged for callers; the model itself is a stack
    of attention blocks rather than a plain bigram lookup.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # One learned vector per position; only `blocksize` positions exist,
        # so inputs longer than `blocksize` cannot be embedded.
        self.position_embedding_table = nn.Embedding(blocksize, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer token ids of shape (B, T).
            targets: optional integer token ids with the same shape as `idx`.

        Returns:
            `(logits, loss)` where `logits` has shape (B, T, vocab_size) and
            `loss` is the mean cross-entropy over all B*T positions, or `None`
            when `targets` is `None` or has an empty batch dimension.
        """
        seq_len = idx.size(1)
        tok_emb = self.token_embedding_table(idx)                      # (B, T, n_embd)
        positions = torch.arange(seq_len, device=idx.device)
        pos_emb = self.position_embedding_table(positions)             # (T, n_embd)
        x = self.ln_f(self.blocks(tok_emb + pos_emb))
        logits = self.lm_head(x)

        # `targets` defaults to None, so it must be checked before calling
        # `.size()` on it; without the check, inference-style calls
        # `model(idx)` raise AttributeError.
        if targets is None or targets.size(0) == 0:
            return logits, None

        # cross_entropy expects (N, classes) logits and (N,) targets.
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss


In [20]:
# Build the network, move it to the target device, and hand its parameters to AdamW.
model = BigramLanguageModel()
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


In [21]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    For each split, `eval_iters` random batches are drawn with `get_batch`
    and their losses averaged. Gradients are disabled and the model is put in
    eval mode for the duration of the measurement.

    Returns:
        dict with keys 'train' and 'val', each a 0-dim tensor holding the
        mean loss for that split.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Always switch back to training mode, even if evaluation raises or
        # the cell is interrupted; otherwise later training steps would run
        # with the model still in eval mode.
        model.train()
    return out

In [22]:
# Imported in this cell so the loop also works on a fresh kernel run from top
# to bottom; `tqdm` is otherwise only imported in a cell placed after this one.
from tqdm.notebook import tqdm

# `step` rather than `iter`, which would shadow the builtin for every later cell.
for step in tqdm(range(1, max_iters)):
    # Periodically report the averaged train/val loss.
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"Step {step}: Train loss {losses['train']:.4f}, Val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)

    optimizer.zero_grad()
    # backward must be *called*: the bare attribute `loss.backward` is a no-op
    # expression, so no gradients were computed and optimizer.step() never
    # changed the weights.
    loss.backward()
    optimizer.step()


  0%|          | 0/4999 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [1]:
# NOTE(review): this cell was executed first (In [1]) but sits at the end of
# the notebook, while `tqdm` is used by the training loop earlier in the file.
# Move this import up to the imports cell so the notebook's order matches its
# execution order.
from tqdm.notebook import tqdm