In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = "mps" if torch.backends.mps.is_available() and torch.backends.mps.is_built() else device

# Hyperparameters
batch_size = 32
block_size = 8
max_iters = 10000
eval_interval = 300
learning_rate = 1e-3
eval_iters = 200
n_embd = 384
n_layers = 4
dropout = 0.2

In [2]:
# Generate Vocabulary
with open('input.txt', 'r', encoding='utf-8') as file:
    text = file.read()
chars = sorted(set(text))

print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [3]:
# Character-Level Tokenizer
string_to_int = { ch:i for i, ch in enumerate(chars) }
int_to_string = { i:ch for i, ch in enumerate(chars) }

encode = lambda s: [ string_to_int[c] for c in s]
decode = lambda l: ''.join(int_to_string[i] for i in l)

data = torch.tensor(encode(text), dtype=torch.long)
# print(data[:100])

In [4]:
# Train-Test Split
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

# Input-Target Parallel Implementation
def get_batch(split):
    data = train_data if split == "train" else val_data
    idx = torch.randint(len(data) - block_size, (batch_size,))
    
    x = torch.stack([data[i:i+block_size] for i in idx])
    y = torch.stack([data[i+1:i+block_size+1] for i in idx])
    x, y = x.to(device), y.to(device)
    
    return x, y

x, y = get_batch('train')
print("inputs: ")
print(x)
print("targets: ")
print(y)

inputs: 
tensor([[ 1, 39,  1, 61, 53, 56, 42,  1],
        [10,  1, 63, 53, 59,  1, 51, 39],
        [50,  1, 58, 43, 50, 50,  1, 63],
        [ 1, 47, 58,  1, 53, 59, 58,  1],
        [25, 17, 30, 15, 33, 32, 21, 27],
        [ 1, 47, 58,  1, 58, 53,  1, 51],
        [60, 43, 52,  1, 52, 53, 61,  6],
        [56,  1, 58, 56, 43, 57, 54, 39],
        [57, 46, 43,  1, 40, 56, 47, 52],
        [21,  1, 61, 39, 57, 11,  1, 61],
        [51,  6,  1, 57, 47, 56,  8,  0],
        [ 1, 42, 53, 53, 51, 57,  7, 42],
        [56, 39, 52, 45, 43, 50, 63,  1],
        [46, 43,  1, 41, 39, 52, 53, 54],
        [39, 51, 12,  0,  0, 16, 33, 15],
        [ 1, 46, 39, 57, 58,  1, 42, 53],
        [34, 21, 26, 15, 17, 26, 32, 21],
        [52, 45, 43,  1, 50, 39, 61, 10],
        [ 0, 21,  1, 46, 39, 60, 43,  1],
        [44, 53, 53, 58,  6,  0, 13, 52],
        [53, 58,  1, 41, 46, 53, 53, 57],
        [42, 43, 56,  1, 42, 53, 45,  2],
        [ 0, 32, 46, 43,  1, 44, 39, 56],
        [59, 57, 58,  1, 

In [5]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
class Head(nn.Module):
    """one head of attention"""

    def __init__(self, head_size):
        super().__init__()
        # although same compression, weights randomly initalized -> diff results after training
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) # save computation by initializing once

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input size: (B, T, C)
        # output size: (B, T, head_size)
        B, T, C = x.shape
        k = self.key(x) # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        
        # compute attention scores/affinities 
        # transpose(-2, -1) swaps second last dim w/ last dim, divides 1/sqrt(head_size) at end; compute (B, T, T) for token dot products
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5 # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        # [:T, :T] takes row from idx 0->T exclusive, col from idx 0->T exclusive
        # curr: tril w/ 1 on diag and below; == 0 creates booleans that are 0, then changes True to -inf
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # changes all values above diag to 0; (B, T, T)
        wei = F.softmax(wei, dim=-1) # normalize all along rows (B, T, T)
        wei = self.dropout(wei) # apply dropout from __init__
        
        # weighted aggregation of values
        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return out

In [None]:
class MultiHeadAttention(nn.Module):
    """multiple heads of self-attention in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads * head_size, n_embd) # more learnable params; note: num_heads*head_size = n_embd
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # TODO DEFINE WHAT h(x) DOES!!!!!
        out = torch.cat([h(x) for head in self.heads], dim=-1) # combine on last dim for all heads -> (B, T, num_heads*head_size)
        out = self.dropout(self.proj(out))
        return out

In [None]:
class FeedForward(nn.Module):
    """simple linear layer followed by nonlinearity"""

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x) # apply layers

In [None]:
class Block(nn.Module):
    """Transformer block w/ multihead-attention"""

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head # for better compute in parallel
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd) # apply linear + nonlinearity
        self.ln1 = nn.LayerNorm(n_embd) 
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y)
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x

In [6]:
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size) # hidden representations -> vocab logits

        self.apply(self._init_weights) # goes through all layers of neural network

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) # normal distrib
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias) # w/ underscore modifies tensor in-place
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)        
    
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)

        # B = batch size, T = tokens/sequence, C = channel/embedding dim
        # idx and targets are (B, T) tensors
        tok_emb = self.token_embedding_table(index) # (B, T, C)
        pos_emb = self.token_embedding_table(torch.arange(T, device=device)) # generate positions up to T, size (T, C)
        x = tok_emb + pos_emb # (B, T, C)
        x = self.blocks(x) # feed into decoding blocks, (B, T, C)
        x = self.ln_f(x) # layer norm (B, T, C)
        logits = self.lm_head(x) # linear to vocab (B, T, vocab_size)
        
        if targets is None:
            loss = None
        else:
            B, T, V = logits.shape # v = vocab_size, probabilities for each vocab word
            logits = logits.view(B * T, V) # compress to (B*T, V) for indiv predictions
            targets = targets.view(B * T) # each value in (B*T) is an index for "correct" word
            loss = F.cross_entropy(logits, targets) # computes softmax for each row of B*T and does -log(index of target)
        
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self.forward(idx) # get best pred of next token
            logits = logits[:, -1, :] # last pred, all B, last T, all V -> (B, V)
            probs = F.softmax(logits, dim=-1) # softmax along last dim (V) -> (B, V) but normalized
            idx_next = torch.multinomial(probs, num_samples=1) # sample one token/batch, one pred/batch -> (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # add to pred to idx -> (B, T+1)
        return idx

model = BigramLanguageModel(vocab_size)
m = model.to(device)

In [15]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f'step: {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}')
    
    xb, yb = get_batch('train')

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step: 0: train loss 2.4566, val loss 2.4991
step: 300: train loss 2.4593, val loss 2.5097
step: 600: train loss 2.4505, val loss 2.5124
step: 900: train loss 2.4549, val loss 2.5087
step: 1200: train loss 2.4551, val loss 2.5085
step: 1500: train loss 2.4432, val loss 2.5032
step: 1800: train loss 2.4540, val loss 2.5088
step: 2100: train loss 2.4494, val loss 2.5093
step: 2400: train loss 2.4530, val loss 2.5044
step: 2700: train loss 2.4533, val loss 2.5159
step: 3000: train loss 2.4535, val loss 2.5020
step: 3300: train loss 2.4441, val loss 2.5129
step: 3600: train loss 2.4580, val loss 2.5052
step: 3900: train loss 2.4480, val loss 2.5186
step: 4200: train loss 2.4554, val loss 2.5089
step: 4500: train loss 2.4527, val loss 2.5135
step: 4800: train loss 2.4572, val loss 2.5036
step: 5100: train loss 2.4423, val loss 2.5115
step: 5400: train loss 2.4471, val loss 2.5071
step: 5700: train loss 2.4494, val loss 2.5070
step: 6000: train loss 2.4465, val loss 2.5085
step: 6300: train l

In [24]:
# Generate the new tokens
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


ARit fokeralind we mar.
ARG iobe e Houll whe ais ce ak!
NROrke mur po ing amaigharinomindonty wht,
Bupe be tlly ante twisshicathetr T:
hors win ter ilbord; p'AMEShoor he for w ou dss, plag Whibtthirern of t m wexeime herifoume ll at y

Led te towaininore tingheicon s prie
CHouge crgu'lorwffeee t mar, ham f pr hiteeperoth toldrimak'd myo kee horitarem wikitcowrerad'd e sceinofoou, wowo; f: ig be has 'r.

Miou,-be, hait ds ck s thooowhad s mie monithe?
K:
NEd eell hadrsunghu an.
Goand my. Rem houl
