In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
print(device)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('data/tinyshakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))

d_input = len(chars) + 1 # let 0 be padding
print(d_input)

stoi = lambda c: chars.index(c) + 1
itos = lambda n: "" if n == 0 else chars[n-1]
encode = lambda s: torch.tensor([stoi(c) for c in s], dtype=torch.long)
decode = lambda m: ''.join([itos(i) for i in m])

code = encode('abc')
print(code)
print(decode(code))

data = encode(text)
n_split = int(0.9 * len(data))
train_data = data[:n_split]
test_data = data[n_split:]

def get_batch(mode, seq_len, batch_size=1):
    source = train_data if mode == 'train' else test_data
    starts = torch.randint(len(source) - seq_len - 1, (batch_size, ))
    x = torch.stack([source[s:s+seq_len] for s in starts])
    y = torch.stack([source[s+1:s+1+seq_len] for s in starts])
    x, y = x.to(device), y.to(device)
    return x, y

eval_iters = 50 #200
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for mode in ['train', 'eval']:
        losses = torch.zeros(eval_iters)
        accs = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(mode, 10)
            predictions, loss = model(X, Y)

            probs = F.softmax(predictions, dim=-1)
            P = torch.multinomial(probs, num_samples=1)
            Y = torch.flatten(Y)
            P = torch.flatten(P)[:len(Y)]
            acc = torch.sum(Y == P).float() / len(Y)

            accs[k] = acc
            losses[k] = loss.item()
        out[mode] = (losses.mean(), accs.mean())
    model.train()
    return out

cuda
66
tensor([40, 41, 42])
abc


In [29]:
import gpt

max_seq_len = 200

model = gpt.GPT(d_model=32, d_input=d_input, max_seq_len=max_seq_len, N=12)
try:
    PATH = "models/gptv2.pth"
    model.load_state_dict(torch.load(PATH))
    print("loaded model")
except:
    print("new model")

torch.cuda.empty_cache()
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

new model


In [31]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("runs/tinygpt")

writer.add_graph(model, get_batch('train', max_seq_len, 1))
writer.close()

In [32]:
max_iters = 30
for step in range(0, max_iters):
    
    x, y = get_batch('train', max_seq_len, 50)

    z, loss = model(x, y)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if step % (max_iters // 10) == 0 or step == max_iters-1:
        res = estimate_loss()
        print(f"step {step}: train loss {res['train'][0]:.4f}, val loss {res['eval'][0]:.4f}; train acc  {res['train'][1]:.4f}, val acc {res['eval'][1]:.4f}")

        writer.add_scalar('training loss', res['train'][0] / 100, step)
        writer.add_scalar('validation loss', res['eval'][0] / 100, step)
        writer.add_scalar('training acc', res['train'][1] / 100, step)
        writer.add_scalar('validation acc', res['eval'][1] / 100, step)

step 0: train loss 4.1431, val loss 4.1402; train acc  0.0200, val acc 0.0300
step 3: train loss 4.8206, val loss 4.8340; train acc  0.0340, val acc 0.0380
step 6: train loss 5.6834, val loss 5.6852; train acc  0.0420, val acc 0.0460
step 9: train loss 6.4492, val loss 6.4579; train acc  0.0660, val acc 0.0660
step 12: train loss 6.9940, val loss 6.9906; train acc  0.0700, val acc 0.0480
step 15: train loss 7.3615, val loss 7.3556; train acc  0.0860, val acc 0.0480
step 18: train loss 7.6338, val loss 7.6330; train acc  0.0500, val acc 0.0580
step 21: train loss 7.8728, val loss 7.8732; train acc  0.0800, val acc 0.0500
step 24: train loss 8.0369, val loss 8.0345; train acc  0.0500, val acc 0.0460
step 27: train loss 8.2158, val loss 8.2200; train acc  0.0480, val acc 0.0600
step 29: train loss 8.3153, val loss 8.3128; train acc  0.0500, val acc 0.0660


In [None]:
PATH = "models/gptv2.pth"
torch.save(model.state_dict(), PATH)

In [33]:
context = encode("ANTIGONUS:").unsqueeze(0).to(device) # add batch to 1

result = model.generate(context, new_seq_len=300)[0]
print(decode(result))

ANTIGONUS:, iE:yb:e   nt'nMhinbsug senTke o ea twna ra er 
lrdM  i?' gemu:  Ou'u h    IfterehLwhbw'nrN ebgne :neo,s!his.ho h.ilah;Hdoihst tdT:
 h,oi;stKkYba'wi
,  dp
c:sdqnyh
e svt
nwnGa sboihNre
a hY
Tiy papw  luelt l  hookecl   s
dyile:rnnn iuskaolCr  eardmeiTehweDpo 
 ardde
L d wmteonI hEaC.ef, oeylsepouAe


In [None]:
x = torch.randn(2, 4)
y = torch.zeros(1, 5)

print(F.pad(input=x, pad=(0, 3, 0, 0), mode='constant', value=0).shape)

torch.Size([2, 7])
