In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

batch_size = 1
seq_len = 8
max_iters = 3000
eval_iters = 200
eval_interval = 300
learning_rate = 1e-2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [21]:
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('data/tinyshakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
print(chars)

d_input = len(chars)
print(d_input)

def stoi(c):
    vec = np.zeros(d_input, dtype=np.int8)
    vec[chars.index(c)] = 1
    return vec

def itos(vec):
    return chars[(vec == 1).nonzero(as_tuple=True)[0].item()]

encode = lambda s: torch.tensor([stoi(c) for c in s], dtype=torch.long)
decode = lambda m: ''.join([itos(i) for i in m])

code = encode('abc')
print(decode(code))


['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65
abc


In [22]:
data = encode(text)
n_split = int(0.9 * len(data))
train_data = data[:n_split]
test_data = data[n_split:]

def get_batch(mode):
    source = train_data if mode == 'train' else test_data
    starts = torch.randint(len(source) - seq_len - 1, (batch_size, ))
    x = torch.stack([source[s:s+seq_len] for s in starts])
    y = torch.stack([source[s+1:s+1+seq_len] for s in starts])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for mode in ['train', 'eval']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(mode)
            pred, loss = model(X, Y)
            losses[k] = loss.item()
        out[mode] = losses.mean()
    model.train()
    return out

In [23]:
class Bigram(nn.Module):
    def __init__(self, d_input, seq_len, n_batch):
        super(Bigram, self).__init__()
        self.d_input = d_input
        self.seq_len = seq_len
        self.n_batch = n_batch
        self.embedding = nn.Embedding(d_input, d_input)

    def forward(self, x, y=None):
        # x: batch_size x seq_len x char_vec (1 x 8 x 65)
        # y: batch_size x seq_len x char_vec
        print(f'x: {x.shape}')
        
        z = self.embedding(x)

        print(f'z: {z.shape}')

        if y is None:
            loss = None
        else:
            print(f'y: {y.shape}')
            z = z.reshape(self.n_batch * self.seq_len, self.d_input)
            y = y.reshape(self.n_batch * self.seq_len, self.d_input)
            loss = F.cross_entropy(z, y)
        
        return z, loss
    
    def generate(self, x, new_seq_len):
        # x: batch_size x seq_len x char_vec (1 x 8 x 65)
        for _ in range(new_seq_len):
            z, loss = self(x) # 1x8x65
            z = z[:, -1, :]   # 1x65    get the last char_vec in the sequence
            probs = F.softmax(z, dim=-1) # 1x65
            char_vecs = torch.multinomial(probs, num_samples=1) # 1x65 ?
            x = torch.cat([x, char_vecs], dim=-2) # 1x9x65
        return x

In [24]:
model = Bigram(d_input=d_input, seq_len=seq_len, n_batch=batch_size)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(1):

    x, y = get_batch('train')

    z, loss = model(x, y)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")


context = encode("A").unsqueeze(0) # add batch to 1
print(decode(model.generate(context, new_seq_len=30)[0].tolist()))

RuntimeError: shape '[8, 65]' is invalid for input of size 33800

In [None]:
x, y = get_batch('train')
print(x)
print(y)

tensor([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0

In [None]:
x = torch.arange(0, 24, 1)
print(x)
x = x.reshape(2, 3, 4)
print(x)
y = x.view(2*3, 4)
print(y)

z = x.reshape(2*3, 4)
print(z)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23])
tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])
tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]])
tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]])
