In [134]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"

# =================== Hyperparameters ===================  
block_size = 8
batch_size = 4
max_iters = 200000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 1000
# ======================================================= 

device

'cuda'

In [135]:
with open("wizard_of_oz.txt", "r", encoding="utf-8") as f:
    text = f.read()

chars = sorted(set(text))
print(chars)
vocab_size = len(chars)
vocab_size

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


81

In [136]:
print(text[:200])

  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW


In [137]:
# Tokenizers 
string_to_int = { ch:i for i, ch in enumerate(chars)}
int_to_string = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: "".join([int_to_string[i] for i in l])

In [138]:
encoded_hello = encode("Hello, world!")
decoded_hello = decode(encoded_hello)
encoded_hello, decoded_hello

([32, 58, 65, 65, 68, 9, 1, 76, 68, 71, 65, 57, 2], 'Hello, world!')

In [139]:
data = torch.tensor(encode(text), dtype=torch.long)
data[:100]

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])

In [140]:
# Doing train/test split
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]


def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)

inputs:
tensor([[ 1, 54, 67, 57,  1, 73, 61, 58],
        [65, 65, 62, 67, 60,  1, 54, 72],
        [67, 60,  1, 68, 67,  1, 73, 68],
        [58, 57,  1, 68, 67, 58,  9,  1]], device='cuda:0')
targets:
tensor([[54, 67, 57,  1, 73, 61, 58, 62],
        [65, 62, 67, 60,  1, 54, 72,  1],
        [60,  1, 68, 67,  1, 73, 68, 76],
        [57,  1, 68, 67, 58,  9,  1, 63]], device='cuda:0')


In [141]:
# Creating a bigram model example (sequential)
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size + 1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context} target is {target}")

When input is tensor([80]) target is 1
When input is tensor([80,  1]) target is 1
When input is tensor([80,  1,  1]) target is 28
When input is tensor([80,  1,  1, 28]) target is 39
When input is tensor([80,  1,  1, 28, 39]) target is 42
When input is tensor([80,  1,  1, 28, 39, 42]) target is 39
When input is tensor([80,  1,  1, 28, 39, 42, 39]) target is 44
When input is tensor([80,  1,  1, 28, 39, 42, 39, 44]) target is 32


In [142]:
# Creating the model
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size) -> None:
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index
    
model = BigramLanguageModel(vocab_size=vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
generated_chars


'\n3*(0\ufeff!5s"j[8cvjvZWiYkcvJJnt:xTjeE!gc9XT4fzRsl6!?ViY"qNm!cViUFN9f.JRP]7XtGh));haKZWI_p0K;]FMCi_hzg-G\'v2w*4Q;hIy\nU2,kg9fwy7**QyM*-oE\ufeff6t2HL\n!-uJ30o*4f]TUz1r )pzfu&)wNe3&uHWydTNL::DEkWC6B,aOCV(mlttWF\'D SX9Iy!?-NrDtkO38E\'9lmD\'yP;\'1zrroj!bd4&8h3!f0HLqLL\nJXgfYo_8[8DJ!bmuAEwWv5ej0KTf-M;"hK9;GkQyCXI_ ?"hz7*1ryd&P.\nRkgo 23SZP!nOcX,V2G\';3gT Ie.S&m7avSR0K]T5ChD;?T,x?.TBJbdA\ufeffSD)B]htwM):Ix5xv6bmx3]rej!kn4Hb"Ms3(kc\nNI4j*k\nUUTyTGD4n!as*2K]l,.*Nyqfg-bQv;[HE4f6XNZgi5\'\'r7Vlres4uo*vsBU]3"vvt1t(a!S3L5*[v.A-\'X,TfPQ3C'

In [143]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [144]:
# Optimizing the model
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # Get train batches
    xb, yb = get_batch("train")

    # Loss 
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

loss.item()

step: 0, train loss: 4.810, val loss: 4.783
step: 1000, train loss: 4.554, val loss: 4.538
step: 2000, train loss: 4.327, val loss: 4.313
step: 3000, train loss: 4.122, val loss: 4.117
step: 4000, train loss: 3.916, val loss: 3.922
step: 5000, train loss: 3.755, val loss: 3.756
step: 6000, train loss: 3.613, val loss: 3.602
step: 7000, train loss: 3.456, val loss: 3.468
step: 8000, train loss: 3.342, val loss: 3.338
step: 9000, train loss: 3.234, val loss: 3.230
step: 10000, train loss: 3.121, val loss: 3.131
step: 11000, train loss: 3.037, val loss: 3.055
step: 12000, train loss: 2.974, val loss: 2.986
step: 13000, train loss: 2.911, val loss: 2.933
step: 14000, train loss: 2.853, val loss: 2.871
step: 15000, train loss: 2.807, val loss: 2.831
step: 16000, train loss: 2.761, val loss: 2.787
step: 17000, train loss: 2.718, val loss: 2.749
step: 18000, train loss: 2.696, val loss: 2.727
step: 19000, train loss: 2.672, val loss: 2.709
step: 20000, train loss: 2.654, val loss: 2.669
step:

2.3630480766296387

In [150]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=1000)[0].tolist())
print(generated_chars)


l flideam icely-hend GEus athire t smy s hay athe ace, wheteave bid thou wigord Wingr "NVisasizartinskery thalongrere ge the anthe ace t ais nthe e sedizackend w,"Withe mup.

E Buska b fouril ondn ino  o! Thend y t fl w te s.
"
o t's che
m.
woruly, u tin oopleleckyspin, BAUThisuratishy

tingeathicthe omatthed s, a flle pebr allorebe f lfls t opoferinesizagr casernd dinfoublalvevind

rou he an isorablare lendamoof  Caksthoutin nd arwitrketthes
me ren ttheoorapin glard klfuram
ad t wot fathed wee s

asty. bos hins y,  totond Thersheattorftoy, ve nemey o s we sintheinlath.


msthes arezere "ichowagr me  telerkeas athathrin ther-ceremay APug s dusangep  angyin

ble b--plowhea my Itilshed.
" hugsagstereveo r be  hourarush ang
ty id domoth te s theanoupupang Mrop otanetoung houbelithonthan'verebore l,
"
"Wiest we Welede Swang Bunguby. klie
Wiouat acherond nt Jindid d-alk. groroousag ucowand.
d ot w d tfon'l hasthund hou flthe thy, a bro t thanoundg ves.

lothery thared ipled, a ang chowe  p