In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
# (Assign torch.device('cpu') instead to force CPU execution.)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Corpus source (expected locally at data/tinyshakespeare.txt):
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('data/tinyshakespeare.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))

# Token id 0 is reserved for padding, so real characters use ids 1..len(chars).
d_input = len(chars) + 1
print(d_input)

# Precomputed character -> id table. Encoding the full corpus then costs one
# dict lookup per character instead of a linear chars.index() scan.
_char_to_id = {ch: idx + 1 for idx, ch in enumerate(chars)}


def stoi(c):
    """Return the token id (>= 1) of the character ``c``.

    Raises:
        ValueError: if ``c`` is not in the vocabulary (same exception type
            that ``chars.index`` raised before).
    """
    try:
        return _char_to_id[c]
    except KeyError:
        raise ValueError(f"{c!r} is not in the vocabulary") from None


def itos(n):
    """Return the character for token id ``n``; the padding id 0 maps to ""."""
    return "" if n == 0 else chars[n - 1]


def encode(s):
    """Convert the string ``s`` into a 1-D ``torch.long`` tensor of token ids."""
    return torch.tensor([stoi(c) for c in s], dtype=torch.long)


def decode(m):
    """Convert an iterable of token ids ``m`` back into a string."""
    return ''.join(itos(i) for i in m)


# Quick round-trip check of the codec.
code = encode('abc')
print(code)
print(decode(code))

# First 90% of the encoded corpus is used for training, the rest is held out.
data = encode(text)
n_split = int(0.9 * len(data))
train_data = data[:n_split]
test_data = data[n_split:]

def get_batch(mode, seq_len, batch_size=1):
    """Draw a random batch of (input, target) windows from one data split.

    Args:
        mode: 'train' selects ``train_data``; any other value selects
            ``test_data``.
        seq_len: length of every window.
        batch_size: number of windows in the batch.

    Returns:
        ``(x, y)``, both of shape ``(batch_size, seq_len)`` and moved to
        ``device``; ``y`` is ``x`` shifted one position to the right in the
        source sequence.
    """
    if mode == 'train':
        corpus = train_data
    else:
        corpus = test_data

    # One random start offset per batch row.
    offsets = torch.randint(len(corpus) - seq_len - 1, (batch_size, ))

    inputs, targets = [], []
    for offset in offsets:
        inputs.append(corpus[offset:offset + seq_len])
        targets.append(corpus[offset + 1:offset + 1 + seq_len])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

# Number of random batches averaged per split by estimate_loss().
eval_iters = 50 #200
@torch.no_grad()
def estimate_loss():
    """Estimate mean loss and next-token accuracy on both data splits.

    For each split, ``eval_iters`` random batches (batch size 1, length
    ``max_seq_len``) are drawn with ``get_batch`` and run through the global
    ``model`` in eval mode. Accuracy is the fraction of positions where the
    highest-scoring token equals the target. It is computed with argmax rather
    than by sampling from the predicted distribution, so it is deterministic
    for a given batch and does not consume extra random numbers.

    Returns:
        dict mapping 'train' and 'eval' to ``(mean_loss, mean_accuracy)``,
        each a 0-d tensor.
    """
    out = {}
    model.eval()
    try:
        # Any mode other than 'train' makes get_batch read the held-out split.
        for mode in ['train', 'eval']:
            losses = torch.zeros(eval_iters)
            accs = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(mode, max_seq_len)
                predictions, loss = model(X, Y)

                # Greedy prediction per position.
                # Assumes the last dimension of `predictions` indexes the
                # vocabulary -- TODO confirm against gpt.GPT.forward.
                P = torch.argmax(predictions, dim=-1)
                Y = torch.flatten(Y)
                # Truncation kept from the original code as a guard in case
                # the model returns more positions than there are targets.
                P = torch.flatten(P)[:len(Y)]

                accs[k] = torch.sum(Y == P).float() / len(Y)
                losses[k] = loss.item()
            out[mode] = (losses.mean(), accs.mean())
    finally:
        # Always hand the model back in training mode, even if evaluation
        # raised (e.g. CUDA out of memory).
        model.train()
    return out

cuda
66
tensor([40, 41, 42])
abc


In [14]:
import gpt

# Context length used for both training batches and the model itself.
max_seq_len = 200

model = gpt.GPT(d_model=32, d_input=d_input, max_seq_len=max_seq_len, N=12)

# Resume from a saved checkpoint when one exists; otherwise start from the
# freshly initialised weights.
NAME = "gptv3"
PATH = f"models/{NAME}.pth"
try:
    # map_location lets a checkpoint written on a GPU load on a CPU-only host.
    model.load_state_dict(torch.load(PATH, map_location=device))
    print("loaded model")
except FileNotFoundError:
    print("new model")
except Exception as err:
    # Loading stays best-effort, but the reason is reported instead of hidden
    # (and KeyboardInterrupt / SystemExit are no longer swallowed).
    print(f"new model (could not load {PATH}: {err!r})")

torch.cuda.empty_cache()
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

new model


In [16]:
from torch.utils.tensorboard import SummaryWriter
# TensorBoard writer for this run; event files go under runs/tinygpt.
writer = SummaryWriter(log_dir="runs/tinygpt")

# Record the model graph using a single training batch as example input,
# then close the writer.
writer.add_graph(model, input_to_model=get_batch('train', max_seq_len, batch_size=1))
writer.close()

In [37]:
# Training configuration.
max_iters = 3000
train_batch_size = 200
# Evaluate about 20 times per run; max(1, ...) keeps the modulo below valid
# when max_iters is smaller than 20.
eval_interval = max(1, max_iters // 20)

for step in range(max_iters):

    x, y = get_batch('train', max_seq_len, train_batch_size)

    z, loss = model(x, y)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Periodic evaluation, plus one final evaluation on the last step.
    if step % eval_interval == 0 or step == max_iters - 1:
        res = estimate_loss()
        train_loss, train_acc = res['train']
        val_loss, val_acc = res['eval']
        print(f"step {step}: train loss {train_loss:.4f}, val loss {val_loss:.4f}; train acc  {train_acc:.4f}, val acc {val_acc:.4f}")

        # estimate_loss() already returns means, so the values are logged
        # unscaled and match what is printed above (they were previously
        # divided by 100).
        writer.add_scalar('training loss', float(train_loss), step)
        writer.add_scalar('validation loss', float(val_loss), step)
        writer.add_scalar('training acc', float(train_acc), step)
        writer.add_scalar('validation acc', float(val_acc), step)

# Make sure the logged scalars reach disk once training finishes.
writer.flush()

OutOfMemoryError: CUDA out of memory. Tried to allocate 62.00 MiB (GPU 0; 6.00 GiB total capacity; 5.27 GiB already allocated; 0 bytes free; 5.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
import os

# Persist the current model weights.
# NOTE(review): the loading cell reads models/gptv3.pth while this cell writes
# models/gptv2.pth -- confirm which file name is intended.
PATH = "models/gptv2.pth"
# Create the target directory first so a finished run is not lost to a
# missing folder.
os.makedirs(os.path.dirname(PATH), exist_ok=True)
torch.save(model.state_dict(), PATH)

In [33]:
# Encode the prompt and add a leading batch dimension of size 1.
context = encode("ANTIGONUS:").unsqueeze(0).to(device)

# Sample in eval mode and without autograd bookkeeping, then restore whatever
# mode the model was in so a later training cell is unaffected.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        result = model.generate(context, new_seq_len=300)[0]
finally:
    model.train(was_training)
print(decode(result))

ANTIGONUS:, iE:yb:e   nt'nMhinbsug senTke o ea twna ra er 
lrdM  i?' gemu:  Ou'u h    IfterehLwhbw'nrN ebgne :neo,s!his.ho h.ilah;Hdoihst tdT:
 h,oi;stKkYba'wi
,  dp
c:sdqnyh
e svt
nwnGa sboihNre
a hY
Tiy papw  luelt l  hookecl   s
dyile:rnnn iuskaolCr  eardmeiTehweDpo 
 ardde
L d wmteonI hEaC.ef, oeylsepouAe


In [7]:
import gpt

max_seq_len = 200
# d_input=66 equals the d_input value printed by the data cell
# (len(chars) + 1, with id 0 reserved for padding).
model = gpt.GPT(d_model=32, d_input=66, max_seq_len=max_seq_len, N=12)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.00003)

NAME = "shakeGPTv1"
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(f"runs/{NAME}")

# Training-history records saved for this run.
checkpoints = torch.load(f"models/checkpoints-{NAME}.pth")

# Print up to the four most recent records, newest first. The lower bound is
# clamped at -1 so that a history with fewer than four records does not wrap
# around to negative indices and print entries twice.
oldest_shown = max(len(checkpoints) - 5, -1)
for i in range(len(checkpoints) - 1, oldest_shown, -1):
    print(checkpoints[i])

{'T+': '5:20:32', 'step': 253104, 'train_loss': tensor(1.5575), 'val_loss': tensor(1.7002), 'train_acc': tensor(0.4090), 'val_acc': tensor(0.3735)}
{'T+': '5:20:30', 'step': 253103, 'train_loss': tensor(1.5066), 'val_loss': tensor(1.7861), 'train_acc': tensor(0.4090), 'val_acc': tensor(0.3615)}
{'T+': '5:20:15', 'step': 252904, 'train_loss': tensor(1.5295), 'val_loss': tensor(1.7327), 'train_acc': tensor(0.4080), 'val_acc': tensor(0.3815)}
{'T+': '5:19:26', 'step': 252238, 'train_loss': tensor(1.5159), 'val_loss': tensor(1.7268), 'train_acc': tensor(0.4193), 'val_acc': tensor(0.3700)}


In [22]:
# Record the model graph using a single training batch as example input,
# then close the writer.
writer.add_graph(model, input_to_model=get_batch('train', max_seq_len, batch_size=1))
writer.close()

In [14]:
import gpt
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = gpt.GPT(d_model=384, d_input=66, max_seq_len=256, N=6, num_heads=6, dropout=0.2, pos_embedding_encode=True)
model.to(device)

# Load the trained weights when available; sampling still runs (from the
# randomly initialised model) when they are not.
NAME = "shakeGPTv3"
PATH = f"models/{NAME}.pth"
try:
    # map_location lets a checkpoint written on a GPU load on a CPU-only host.
    model.load_state_dict(torch.load(PATH, map_location=device))
    print("loaded model")
except FileNotFoundError:
    print("no model loaded")
except Exception as err:
    # Loading stays best-effort, but the reason is reported instead of hidden
    # (and KeyboardInterrupt / SystemExit are no longer swallowed).
    print(f"no model loaded (could not load {PATH}: {err!r})")

# Encode the prompt and add a leading batch dimension of size 1.
context = encode("ANTIGONUS:").unsqueeze(0).to(device)

# The model is built with dropout=0.2, so sample in eval mode (presumably
# gpt.GPT only applies dropout while training -- confirm) and without
# autograd bookkeeping; the previous mode is restored afterwards.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        result = model.generate(context, new_seq_len=300)[0]
finally:
    model.train(was_training)
print(decode(result))

loaded model
ANTIGONUS:,L,n:,, :,cb,Z,,:i,
::l,l!,,tUy:?,,Lcom,c,,:
,:o,,,
,l,,,;,,.:,R,,!;.!,,,,,!d :m
!,,k!d:,c,,,l,
,c!:;,,,,
!,,,p!:,,,nn!,H,,!oF,
,,,!?,,,,:l
nr

,;,,:,,:,,
,
,,:,,YO,
e,,,
,,,,
,
!,!!,, ,!,,!,,,!,,A,,!,!!,,,!,?!,!,,
;,!,!,,,
,. ,,
,! ,,,!,,', hu , deleaverm, thow mvarrder
wailt:
A whilve more, my sil
