In [34]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iters = 10000
# eval_interval = 2500
learning_rate = 3e-4

cuda


In [31]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars)
print(len(chars))
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
81


In [20]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])
print(len(data))

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])
232309


In [28]:
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    print(ix)
    x = torch.stack([data[i : i+block_size] for i in ix])
    y = torch.stack([data[i+1 : i+block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

tensor([ 61741, 105838,  66110,  38360])
inputs:
tensor([[60,  1, 58, 65, 72, 58,  1, 61],
        [ 1, 61, 68, 71, 72, 58, 23,  1],
        [73, 54, 64, 58,  1, 54,  1, 67],
        [57,  1, 56, 68, 66, 58,  1, 68]], device='cuda:0')
targets:
tensor([[ 1, 58, 65, 72, 58,  1, 61, 58],
        [61, 68, 71, 72, 58, 23,  1,  3],
        [54, 64, 58,  1, 54,  1, 67, 54],
        [ 1, 56, 68, 66, 58,  1, 68, 74]], device='cuda:0')


In [None]:
class BigramLanguaageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            batch_size, time_steps, vocab_size = logits.shape
            logits = logits.view(batch_size * time_steps, vocab_size)
            targets = targets.view(batch_size * time_steps)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index) # self(index), will call self.forward automatically
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index
    
model = BigramLanguaageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


riYF
*3F27])L[u]BD4.n1r].t8;X nH0_Yb
49j
vQ&FdJie)bhjf7ciA[7]:x:3,Y[hDoGEWAgTT&U
js, 22VLxb_AQ(LyXC_A
:D2St"wjJqi0CQ2(AlOtKuH"RqJyawJZQwv54B,sC";oEIT9y10Kq
'40hR:p;C?RuSyi5I6:FZOVygM,RnRwnNDbbb﻿BhAX9lKf?GvnITZnsB5.H?UimyqqZL*4fS2Iqy-o5[ij﻿KpX36Nhi-exyf﻿y80lJZ_s(geX'*Uj'YBo&FW)TAQn:3rEVawcUlJZH(3DIba]3,]BB.bqMGWwf7_R:3NE(RwfaX2AqDUlzl]s﻿Nt-JZvaX"oHPuZL)l-"wH0feyxGfZEd1g1]6dofS&mphzbvQNzFd]O!aqq d3,xf7D,IGY[HGy6ebAm
gs&TZ9j::3IOJD?7AAut*1r:L9&LZ;l(,pKp8gWO4﻿y&u5HE)O!oH0 W8wp"89WTGO'y890rV]8hl-8Np[


During training, we follow this 4-step update process:

1.  Make Predictions → `logits = model.forward(xb)`
2.  Compute Loss (Error) → `loss = F.cross_entropy(logits, yb)`
3.  Compute Gradients → `loss.backward()`
4.  Update Weights → `optimizer.step()`

The optimizer adjusts each weight using the learning rate:
    `new weight = old weight − learning rate × gradient`


Gradient Descent is the mathematical technique we use to train neural networks.
It adjusts the weights of the model to reduce the loss and make better predictions.

It follows this formula:
$$
W_{\text{new}} = W_{\text{old}} - \alpha \times \frac{\partial \text{Loss}}{\partial W}
$$


In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    xb, yb = get_batch('train')

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

In [None]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)