In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iterations=10000
learning_rate = 3e-4
eval_interval = 1000

cpu


In [21]:
with open('wizard_of_oz.txt','r',encoding='utf-8') as f:
    text = f.read()

print(len(text))


232315


In [22]:
print(text[:200])



DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW


In [23]:
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


Tokenization

In [24]:
string_to_int = {ch:i for i,ch in enumerate(chars)} 
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[ch] for ch in s]
decode = lambda x: ''.join([int_to_string[i] for i in x])       

In [25]:
print(encode('hello'))  

[61, 58, 65, 65, 68]


In [26]:
decode(encode('hello')) 

'hello'

In [27]:
data = torch.tensor(encode(text),dtype=torch.long)  
data[:100]

tensor([80,  0,  0, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])

train-val split

In [29]:
n = 0.8
train_data = data[:int(n*len(data))]    
val_data = data[int(n*len(data)):] 

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size,(batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) 
    x, y = x.to(device), y.to(device)
    return x,y

x,y = get_batch('train')   
print("Input",x)
print("Target",y)

Input tensor([[68, 76, 67,  1, 73, 61, 58,  1],
        [72, 72, 62, 68, 67,  1, 68, 59],
        [74, 72, 74, 54, 65,  1, 59, 54],
        [ 1, 57, 58, 56, 65, 54, 71, 58]])
Target tensor([[76, 67,  1, 73, 61, 58,  1, 73],
        [72, 62, 68, 67,  1, 68, 59,  1],
        [72, 74, 54, 65,  1, 59, 54, 56],
        [57, 58, 56, 65, 54, 71, 58, 57]])


In [30]:
# block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size+1]  

for t in range(block_size):
    context = x[:t+1]
    target = y[t]   
    print(context, target)  

tensor([80]) tensor(0)
tensor([80,  0]) tensor(0)
tensor([80,  0,  0]) tensor(28)
tensor([80,  0,  0, 28]) tensor(39)
tensor([80,  0,  0, 28, 39]) tensor(42)
tensor([80,  0,  0, 28, 39, 42]) tensor(39)
tensor([80,  0,  0, 28, 39, 42, 39]) tensor(44)
tensor([80,  0,  0, 28, 39, 42, 39, 44]) tensor(32)


In [31]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_interval)
        for k in range(eval_interval):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [32]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index


model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=50)[0].tolist())
print(generated_chars)


YzAI.W1vp
Ygc !3W1U[lW1' fTSs
5'tJKMy
uXbzN7xye4MA


In [33]:
optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)

for iter in range(max_iterations):
    x, y = get_batch('train')
    logits, loss = m(x, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if iter % eval_interval == 0:
        print(f'Iteration {iter}, loss: {loss.item()}')

Iteration 0, loss: 4.770748138427734
Iteration 1000, loss: 4.535679340362549
Iteration 2000, loss: 4.374311923980713
Iteration 3000, loss: 4.1980462074279785
Iteration 4000, loss: 4.207821369171143
Iteration 5000, loss: 3.9627573490142822
Iteration 6000, loss: 3.7831859588623047
Iteration 7000, loss: 3.3063316345214844
Iteration 8000, loss: 3.590050220489502
Iteration 9000, loss: 3.0274829864501953


In [34]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iterations):
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 3.208, val loss: 3.236
step: 1000, train loss: 3.110, val loss: 3.135
step: 2000, train loss: 3.029, val loss: 3.055
step: 3000, train loss: 2.962, val loss: 2.972
step: 4000, train loss: 2.883, val loss: 2.920
step: 5000, train loss: 2.835, val loss: 2.868
step: 6000, train loss: 2.787, val loss: 2.829
step: 7000, train loss: 2.748, val loss: 2.784
step: 8000, train loss: 2.723, val loss: 2.751
step: 9000, train loss: 2.670, val loss: 2.715
2.5962371826171875


In [36]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


r, t Tpto asoazm; ouncxf, maksqngrath:Proufandw Goy.Hve d r Pand g ghitidisulDzopI8lkBy y et?Tweyow&Hut M

grd he vD;kwe e ttin&K;apinyebuced.Drd piou.Bsorl,"cu4o
ad t)KO."foidjRph-;KOWqHG0qzo.!V﻿ht,"ARqHAY)?0uilame foprog t.
awonthecxLendear fE2NMOefy.
Tw4)q1Lq?]﻿E, set adL?N8
a oyomw,fas, w. tooigghenlam, J3unontllth,"DNVQuamomy F:Rq1ked cl-n eaim om;K_Z0lthqN;4Oe wsin bortlngy my D1m trhted seeerogr w P)tm, md st;OTESzi o W_ay t
844)e
Teylthir
"of
"WcerenthathoncovG[land vondin ceviome gll_
T
