In [2]:
with open("./WWOO.txt", 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print(len(text))

208138


In [4]:
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


 !#&()*,-.012359:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz—‘’“”﻿
80


In [5]:
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("hii there"))
print(decode(encode("hii there")))

[55, 56, 56, 1, 67, 55, 52, 65, 52]
hii there


In [6]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([208138]) torch.int64


In [7]:
n = int(.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [8]:
block_size = 8
train_data[:block_size+1]

tensor([79, 39, 56, 67, 59, 52, 17,  1, 39])

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context} the target: {target}")

When input is tensor([79]) the target: 39
When input is tensor([79, 39]) the target: 56
When input is tensor([79, 39, 56]) the target: 67
When input is tensor([79, 39, 56, 67]) the target: 59
When input is tensor([79, 39, 56, 67, 59]) the target: 52
When input is tensor([79, 39, 56, 67, 59, 52]) the target: 17
When input is tensor([79, 39, 56, 67, 59, 52, 17]) the target: 1
When input is tensor([79, 39, 56, 67, 59, 52, 17,  1]) the target: 39


In [10]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i: i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('-----')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[57, 62, 68, 65, 61, 52, 72,  1],
        [39, 55, 52, 72,  1, 66, 52, 52],
        [ 8, 78,  1, 51, 52, 50, 59, 48],
        [68, 65, 61, 52, 51,  1, 67, 62]])
targets:
torch.Size([4, 8])
tensor([[62, 68, 65, 61, 52, 72,  1, 70],
        [55, 52, 72,  1, 66, 52, 52, 60],
        [78,  1, 51, 52, 50, 59, 48, 65],
        [65, 61, 52, 51,  1, 67, 62,  1]])
-----
when input is [57] the target: 62
when input is [57, 62] the target: 68
when input is [57, 62, 68] the target: 65
when input is [57, 62, 68, 65] the target: 61
when input is [57, 62, 68, 65, 61] the target: 52
when input is [57, 62, 68, 65, 61, 52] the target: 72
when input is [57, 62, 68, 65, 61, 52, 72] the target: 1
when input is [57, 62, 68, 65, 61, 52, 72, 1] the target: 70
when input is [39] the target: 55
when input is [39, 55] the target: 52
when input is [39, 55, 52] the target: 72
when input is [39, 55, 52, 72] the target: 1
when input is [39, 55, 52, 72, 1] the target: 66
when input 

In [11]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)    # (B, T, C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 80])
tensor(4.7535, grad_fn=<NllLossBackward0>)

ZXe﻿fG.:]2BDXNe5KUUSDv!#eIVj.:Hnzvl.?zU1aD!JR3,!J*LjViFX;2APU9Ngh’KXRVNeJlR,yh)N,eRuMQm!stb0BJE!“?‘5


In [12]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
batch_size = 32
for steps in range(1000):
    xb, yb = get_batch('train')

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)   # what the function of .zero_grad?
    loss.backward()
    optimizer.step()

print(loss.item())  # what the function of the api .item() ?

3.8679556846618652


In [14]:
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=500)[0].tolist()))


wha,3[pr2q0)5&’ny5D)wKY‘E’”3fAJnEev3V”
g?n*3[ WS*‘Em,FXA]qt3UbI;GkPp35nj—UUIGUluiUx‘zl.M?veX**q﻿r Bq0[‘zR*1As-&(&hEek cvL”NeaEn)nj1lzC5DX:ol*Dj0(By pzerrured(BPCZNQ)uY.”uLA’Mc J l’2lpnXGbEJRQUV3kai—i!”seU!nPEn0AQPO9vBg’iCn*i:ur oK?*vUwH-#fR.hola*v2W(g k;SwZic
anjAPzVB
gCUXs]mkJR5Du!Q.”uit:-SPJWDthJ
9zluLo1P﻿qKdfiqt;ic(vtmevU“G)1Sr3f K&YN—R1hR1SUbA*leokucR*U5nj;B9etq‘5.OhxQ‘Ef#?Z*v#&2g5Aer ullp]Q:kYeI?#z ,,F2N2-pJ1GwsU5J[!XzdQDA2:2ALiCsP)ieXerN,FXeXzJ JSSQ:ciIGqkI;K*icfnqltBw
5s:y o;‘E01CMS5(B3W)


In [31]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [32]:
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1]
        xbow[b, t] = torch.mean(xprev, 0)

In [44]:
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow2 = wei @ x
torch.allclose(xbow, xbow2, rtol=1e-4)

True