<a href="https://colab.research.google.com/github/Trickshotblaster/nn-practices/blob/main/GoodShakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
import requests

url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
text = requests.get(url)

In [35]:
import random

In [36]:
text = text.text

In [37]:
text = text.lower()

In [38]:
vocab = text.split()

In [39]:
vocab_dict = set()
for word in vocab:
  if random.randint(1, 10) > 7:
    vocab_dict.update([word])

In [40]:
len(vocab_dict)

11437

In [41]:
vocab_dict = sorted(vocab_dict)

In [42]:
stoi = {"[PAD]": 0, "[UNK]": 1}
itos = {0: "[PAD]", 1: "[UNK]"}
stoi.update({char:i for i, char in enumerate(vocab_dict)})
itos.update({i:char for i, char in enumerate(vocab_dict)})

In [67]:
class Tokenizer:
  def __init__(self, stoi, itos, len=10):
    self.stoi = stoi
    self.itos = itos
    self.len = len
  def encode(self, text, crop=True):
    out = []
    if crop:
      for word in text.lower().split()[:self.len]:
        if word in stoi:
          out.append(stoi[word])
        else:
          out.append(1)
      out = [0] * (self.len - len(out)) + out
      return out
    else:
      for word in text.lower().split():
        if word in stoi:
          out.append(stoi[word])
        else:
          out.append(1)
      out = [0] * (self.len - len(out)) + out
      return out
  def decode(self, ids):
    out = ""
    for id in ids:
      if id in itos:
        out += itos[id] + " "
      else:
        out += '[UNK] '
    return out

In [68]:
tokenizer = Tokenizer(stoi, itos)

In [69]:
tokenizer.encode("hello world")

[0, 0, 0, 0, 0, 0, 0, 0, 1, 11240]

In [46]:
import torch

In [73]:
context_len = 10
emb_dim = 10
vocab_size = len(stoi)
n_hidden = 200
C = torch.randn(vocab_size, emb_dim) * 0.1
w1 = torch.randn(context_len * emb_dim, n_hidden) * 0.1
b1 = torch.randn(n_hidden) * 0.01
w2 = torch.randn(n_hidden, vocab_size) * 0.1
b2 = torch.randn(vocab_size) * 0.01

params = [C, w1, b1, w2, b2]
for p in params:
  p.requires_grad = True

In [53]:
num_epochs = 10000
lr = 0.01
batch_size = 16

In [74]:
def build_dataset(text):
  xs = []
  ys = []
  tokenized_text = tokenizer.encode(text, crop=False)
  context = [0] * (context_len)
  for x in tokenized_text:
    xs.append(context)
    ys.append(x)
    context = context[1:] + [x]
  xs = torch.tensor(xs)
  ys = torch.tensor(ys)
  return xs, ys
Xtr, Ytr = build_dataset(text)
train_examples = len(Xtr) - 1

In [55]:
import torch.nn.functional as F

In [75]:
for epoch in range(num_epochs):
  batch_indices = torch.randint(0, train_examples, (batch_size, ))
  xs = Xtr[batch_indices]
  ys = Ytr[batch_indices]
  emb = C[xs].view(-1, emb_dim * context_len)
  l1 = (emb @ w1 + b1).tanh()
  logits = (l1 @ w2 + b2).tanh()
  loss = F.cross_entropy(logits, ys)

  for p in params:
    p.grad = None

  loss.backward()

  for p in params:
    p.data -= p.grad * lr

  if epoch % 1000 == 0:
    print("Epoch", str(epoch) + ", loss:", loss.item())

Epoch 0, loss: 9.311010360717773
Epoch 1000, loss: 9.0848388671875
Epoch 2000, loss: 9.001920700073242
Epoch 3000, loss: 8.89068603515625
Epoch 4000, loss: 8.96533203125
Epoch 5000, loss: 8.924089431762695
Epoch 6000, loss: 8.835135459899902
Epoch 7000, loss: 8.904825210571289
Epoch 8000, loss: 8.741148948669434
Epoch 9000, loss: 8.69002914428711


In [76]:
def prompt(text, len=10):
  output = text
  for x in range(len):
    xs = tokenizer.encode(output)
    emb = C[xs].view(-1, emb_dim * context_len)
    l1 = (emb @ w1 + b1).tanh()
    logits = (l1 @ w2 + b2).tanh()
    outs = logits.softmax(dim=1)
    output += tokenizer.decode(torch.multinomial(outs, num_samples=1))
  return output

In [77]:
prompt("hello")

'hello[UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] '