In [2]:
import torch

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Initial sequence length and batch size; both are reassigned in later cells.
block_size = 8
batch_size = 4

cuda


In [3]:
# Print the version of the `python` executable found by the shell.
# NOTE(review): a shell escape may resolve to a different interpreter than the
# one running this kernel — compare with sys.version if that matters.
!python --version

Python 3.9.18


In [4]:
# Read the whole book into memory as a single string.
with open("wizard_of_oz.txt", mode="r", encoding="utf-8") as f:
    text = f.read()

# Vocabulary: every distinct character of the corpus, in sorted order.
# NOTE(review): the printed vocabulary ends with '\ufeff', i.e. the file's
# byte-order mark is kept as a token. Reading with encoding="utf-8-sig" would
# drop it, but that shrinks the vocabulary by one — check any cell that
# assumes the vocabulary size before changing it.
chars = list(set(text))
chars.sort()

print(chars)
print(len(chars))

['\n', ' ', '!', '"', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
76


In [5]:
# Lookup tables between characters and their integer ids (position in `chars`).
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string ``s``.

    Raises:
        KeyError: if ``s`` contains a character that is not in the vocabulary.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids ``l``.

    Raises:
        KeyError: if ``l`` contains an id that is not in the vocabulary.
    """
    return ''.join(int_to_string[i] for i in l)

In [6]:
# Sanity check: map each character of a sample word to its vocabulary id.
print(encode("hello"))

[56, 53, 60, 60, 63]


In [7]:
# Keep the encoded sample in a variable so it can be passed back through decode().
encoded_hello = encode("hello")
print(encoded_hello)

[56, 53, 60, 60, 63]


In [8]:
# Round-trip check: decoding the ids should reproduce the original word.
decoded_hello = decode(encoded_hello)
print(decoded_hello)

hello


In [9]:
# Encode the entire corpus once into a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)

In [10]:
# Split the token stream: the first 80% is for training, the remainder for validation.
n = int(len(data) * 0.8)
train_data, val_data = data[:n], data[n:]

In [11]:
block_size = 8

# Inputs and targets come from the same window, with targets shifted one
# position to the right.
x = train_data[:block_size]
y = train_data[1:block_size+1]

# Each prefix of x is one training example whose label is the next token.
for t in range(block_size):
    context, target = x[: t+1], y[t]
    # Separate print arguments on purpose: they show a 0-dim tensor as "tensor(n)".
    print("When input is", context," Target is", target)

When input is tensor([75])  Target is tensor(27)
When input is tensor([75, 27])  Target is tensor(63)
When input is tensor([75, 27, 63])  Target is tensor(66)
When input is tensor([75, 27, 63, 66])  Target is tensor(63)
When input is tensor([75, 27, 63, 66, 63])  Target is tensor(68)
When input is tensor([75, 27, 63, 66, 63, 68])  Target is tensor(56)
When input is tensor([75, 27, 63, 66, 63, 68, 56])  Target is tensor(73)
When input is tensor([75, 27, 63, 66, 63, 68, 56, 73])  Target is tensor(1)


In [12]:
# Sample a 3x2 tensor of integers from the half-open range [-100, 100).
randint = torch.randint(low=-100, high=100, size=(3, 2))

randint

tensor([[-62,   5],
        [ 37, -42],
        [-93,  -7]])

In [13]:
import torch.nn.functional as F
import torch.nn as nn

# Toy dimensions, only used to illustrate what nn.Embedding returns.
vocab_size = 1000
embedding_size = 100
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)

# One sequence of four token ids -> one embedding_size-wide vector per id.
embedded_output = embedding(torch.tensor([[1, 5, 3, 2]], dtype=torch.long))
embedded_output.shape

torch.Size([1, 4, 100])

In [21]:
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The table has shape ``(vocab_size, vocab_size)``: looking up token ``i``
    returns the row of unnormalised scores (logits) for the token that
    follows ``i``.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct token ids; used both as the number
                of embeddings and as the width of each embedding.
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            index: integer tensor of token ids with shape ``(B, T)``.
            targets: optional integer tensor with the same shape as ``index``
                holding the expected next token for every position.

        Returns:
            A ``(logits, loss)`` tuple. Without ``targets`` the logits have
            shape ``(B, T, C)`` and ``loss`` is ``None``. With ``targets`` the
            logits are returned flattened to ``(B*T, C)`` and ``loss`` is the
            scalar cross-entropy.
        """
        logits = self.token_embedding_table(index)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy wants (N, C) scores with (N,) class ids. reshape()
        # (rather than view()) also accepts non-contiguous targets.
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, index, max_new_tokens):
        """Extend ``index`` by sampling ``max_new_tokens`` additional tokens.

        Args:
            index: integer tensor of token ids with shape ``(B, T)`` used as
                the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape ``(B, T + max_new_tokens)`` containing the
            original context followed by the sampled tokens.
        """
        for _step in range(max_new_tokens):
            # Call the module itself (not .forward) so registered hooks run.
            logits, _ = self(index)
            # Only the scores at the last position decide the next token.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)

        return index
        

In [22]:
# Untrained baseline: sample from a freshly initialised model to see what
# random weights produce. The model is sized from the vocabulary so every
# sampled id can be decoded.
yo = BigramLanguageModel(len(chars))
yo.to(device)

# Start generation from a single token with id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(yo.generate(context, max_new_tokens=500)[0].tolist())

print(generated_chars)


sgs6FwLFw HU;PLA﻿01pnn6nEOaVtLqA0﻿V5K?S
Yr"BvtTSQNs1v0Q'kiMRcOaLUQuz2ZT7lz!K-2,LqSxH)gKEhtkuu&zc(evti6tT?&FwSG(G7Yy4kDl;m"(boC8v(MDb-:PLOa8U1!?VM&"yp9kDDjWIdcM8v4.pG4k6Obbb!BIqQWuzc?-6nN)o;Aor﻿VxH!)5iz VLBP7e7Z?(Dm6 Il?S1TyP;-)M3?CU-o5FC&GHK(HKISQQ:7DMjV)-Wp5.noV&EKK,GnRrECG6rqiriOm"(zkbL﻿QFc;&Gxd!WMx!﻿TbeqDG5E9S8338U;Rx':3oF&YD?AefoT﻿sgU8-6jZLQ7C?DK-Q7G"sUH
M?pNCZlM.nEuvjP' !Vlh!ydfiT8vd'w﻿5t'jTaDvA?oNi5OF1)ZK4kWai'g2VCHu!﻿TZ8PgU;uL9FyaaT467MjEFdWbNhlstNEw!Cqrj7JujT86ISGMDkq(LBotpC?Zn,0GgA'kidA


In [42]:
# Training hyperparameters.
lr = 0.01

block_size = 40    # tokens per training sequence
batch_size = 10    # sequences per batch
max_iters = 10000  # number of optimisation steps

eval_iters = 200   # batches averaged per loss estimate

# Size the model from the data rather than a hard-coded constant, so it
# stays correct if the corpus (and therefore the vocabulary) changes.
model = BigramLanguageModel(len(chars))

# Module.to() returns the module itself, so `m` and `model` are the same object.
m = model.to(device)

def get_batch(split):
    """Sample a random batch of input/target sequences.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        ``(x, y)``, both of shape ``(batch_size, block_size)`` and moved to
        ``device``. ``y`` is ``x`` shifted one position to the right in the
        source data.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # Random window starts; the upper bound leaves room for the shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    labels = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), labels.to(device)

# get_batch("train")

In [43]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the training and validation splits.

    Puts ``model`` in eval mode, averages the loss over ``eval_iters`` random
    batches for each split, then switches ``model`` back to train mode.

    Returns:
        Dict with keys ``'train'`` and ``'val'`` mapping to 0-dim tensors
        holding the mean loss.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            _, batch_loss = model(*get_batch(split))
            batch_losses[step] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages

In [44]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# `step` rather than `iter`: a module-level `iter` would shadow the builtin
# for every cell that runs afterwards.
for step in range(max_iters):

    # Every eval_iters steps, report the loss averaged over both splits.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"Step: {step}, loss {losses}")

    xb, yb = get_batch("train")

    # Call the module (not .forward) so registered hooks run.
    logits, loss = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final training batch only (a single-batch value, not an average).
print(loss.item())

Step: 0, loss {'train': tensor(4.7372), 'val': tensor(4.7319)}
Step: 200, loss {'train': tensor(3.0239), 'val': tensor(3.0517)}
Step: 400, loss {'train': tensor(2.5871), 'val': tensor(2.6199)}
Step: 600, loss {'train': tensor(2.4688), 'val': tensor(2.5421)}
Step: 800, loss {'train': tensor(2.4490), 'val': tensor(2.5079)}
Step: 1000, loss {'train': tensor(2.4391), 'val': tensor(2.4929)}
Step: 1200, loss {'train': tensor(2.4223), 'val': tensor(2.4753)}
Step: 1400, loss {'train': tensor(2.4182), 'val': tensor(2.4678)}
Step: 1600, loss {'train': tensor(2.4191), 'val': tensor(2.4650)}
Step: 1800, loss {'train': tensor(2.4091), 'val': tensor(2.4634)}
Step: 2000, loss {'train': tensor(2.4194), 'val': tensor(2.4645)}
Step: 2200, loss {'train': tensor(2.4176), 'val': tensor(2.4656)}
Step: 2400, loss {'train': tensor(2.4129), 'val': tensor(2.4581)}
Step: 2600, loss {'train': tensor(2.4054), 'val': tensor(2.4645)}
Step: 2800, loss {'train': tensor(2.4128), 'val': tensor(2.4748)}
Step: 3000, loss 

In [46]:
# Sample 500 new tokens from the trained model, starting from a single
# token with id 0, and turn the resulting ids back into text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(generated_ids)

print(generated_chars)


an nofa hepliced asitherie w, tomato wngrwengscas, s fthend swe pousokecouske it-ttotuckeare tsckared tan, uill was, cofeaturd urn, cero hathore h und s rodiset Doutry fe, se
obunthalo.
y
emin t  theme mbur.

othed.
tld
gr tle ss; I owig ce erfur Jid ounghounging tsmey ppeefrthe
t that to abung amis nd p he, adotch Em
heveichathate s wofind, bem th t t ace sengouf s.


e acorer, f fournt y beroro s if airedomsaves, iedis:
bbe igad g'ly se thago  toredra be tr s the Bu ghesouty alloof be, d me. a
