In [1]:
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from pprint import pprint

In [2]:
# Display the installed PyTorch version string.
torch.__version__

'2.0.1'

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Show which device was selected.
device

device(type='cpu')

In [5]:
# Read the whole training corpus into memory as one string.
# The context manager closes the file handle as soon as the read finishes
# (a bare `open(...).read()` leaves that to the garbage collector), and the
# explicit encoding keeps decoding independent of the platform's locale default.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()
# Preview the first 100 characters.
data[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [6]:
# for i in range(len(data)):
#     data[i] = data[i].strip("\n").lower()   
# # remove all elements in data which are empty strings
# data = list(filter(None, data))
# len(data)

In [7]:
# Peek at the first 10 characters of the corpus string.
data[:10]

'First Citi'

In [8]:
# words = []
# for name in data:
#     for word in name.split():
#         words.append(word)
# len(words)

In [9]:
# words[:10]

In [10]:
# Character-level vocabulary: every distinct character in the corpus, sorted
# so that the integer ids are assigned in a stable order.
unique_chars = sorted(set(''.join(data)))
vocab_dict = dict(enumerate(unique_chars))                     # id -> character
vocab_dict_inv = {ch: idx for idx, ch in vocab_dict.items()}   # character -> id
vocab_dict, vocab_dict_inv

({0: '\n',
  1: ' ',
  2: '!',
  3: '$',
  4: '&',
  5: "'",
  6: ',',
  7: '-',
  8: '.',
  9: '3',
  10: ':',
  11: ';',
  12: '?',
  13: 'A',
  14: 'B',
  15: 'C',
  16: 'D',
  17: 'E',
  18: 'F',
  19: 'G',
  20: 'H',
  21: 'I',
  22: 'J',
  23: 'K',
  24: 'L',
  25: 'M',
  26: 'N',
  27: 'O',
  28: 'P',
  29: 'Q',
  30: 'R',
  31: 'S',
  32: 'T',
  33: 'U',
  34: 'V',
  35: 'W',
  36: 'X',
  37: 'Y',
  38: 'Z',
  39: 'a',
  40: 'b',
  41: 'c',
  42: 'd',
  43: 'e',
  44: 'f',
  45: 'g',
  46: 'h',
  47: 'i',
  48: 'j',
  49: 'k',
  50: 'l',
  51: 'm',
  52: 'n',
  53: 'o',
  54: 'p',
  55: 'q',
  56: 'r',
  57: 's',
  58: 't',
  59: 'u',
  60: 'v',
  61: 'w',
  62: 'x',
  63: 'y',
  64: 'z'},
 {'\n': 0,
  ' ': 1,
  '!': 2,
  '$': 3,
  '&': 4,
  "'": 5,
  ',': 6,
  '-': 7,
  '.': 8,
  '3': 9,
  ':': 10,
  ';': 11,
  '?': 12,
  'A': 13,
  'B': 14,
  'C': 15,
  'D': 16,
  'E': 17,
  'F': 18,
  'G': 19,
  'H': 20,
  'I': 21,
  'J': 22,
  'K': 23,
  'L': 24,
  'M': 25,
  'N': 26,
  'O'

In [11]:
# block_size = 8 # context length: how many characters do we take to predict the next one?
# X, Y = [], []
# for w in words[:]:
#   context = [0] * block_size
#   for ch in w + ' ':
#     ix = vocab_dict_inv[ch]
#     X.append(context)
#     Y.append(ix)
#     print(''.join(vocab_dict[i] for i in context), '--->', vocab_dict[ix])
#     context = context[1:] + [ix] # crop and append
# X = torch.tensor(X)
# Y = torch.tensor(Y)


In [12]:
# Build (context, target) training pairs with a sliding window of stride 1:
# each run of `block_size` consecutive characters is one input row, and the
# character immediately after it is the label.
block_size = 8
encoded = [vocab_dict_inv[ch] for ch in data]  # corpus as a list of integer ids
X = torch.tensor(
    [encoded[start:start + block_size] for start in range(len(encoded) - block_size)]
)
Y = torch.tensor(encoded[block_size:])


In [13]:
# Sanity check: decode the first training example back into readable text.
first_context = ''.join(vocab_dict[token.item()] for token in X[0][:block_size])
print(first_context, '--->', vocab_dict[Y[0].item()])

First Ci ---> t


In [14]:
# Inspect the shapes and dtypes of the input and label tensors.
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([1115386, 8]), torch.int64, torch.Size([1115386]), torch.int64)

In [15]:
# Standalone embedding table for the context characters: one learnable
# vector of length `emb_dim` per vocabulary entry.

emb_dim = 8
emb = nn.Embedding(num_embeddings=len(vocab_dict), embedding_dim=emb_dim)

In [16]:
# Display the embedding table's weight parameter.
emb.weight

Parameter containing:
tensor([[-0.4224, -0.2301, -0.6769,  2.2439,  1.5449, -0.2505,  1.4018,  0.8653],
        [ 0.1172, -0.5490,  0.5780, -0.4085, -0.3632, -0.4643, -0.3646, -1.1857],
        [-0.3266, -1.0644, -0.8176,  0.0199, -0.8590,  0.1450,  2.0973, -1.1745],
        [ 0.9968, -0.4266, -1.6335,  0.5124, -0.3232, -0.9523, -0.8851,  0.8337],
        [ 1.8732,  1.8775, -1.3940,  0.4830, -1.0210,  0.6431,  0.2027, -1.1337],
        [-1.3270,  0.6844,  0.4294,  0.9266, -0.4439,  1.2226, -0.1170, -0.5448],
        [ 0.2748,  0.7771,  0.8583, -0.0324, -0.7477, -0.7011,  2.1690, -0.5316],
        [ 0.3910, -0.4883, -0.6890, -1.5744,  0.0135,  1.1290,  0.1272,  0.4126],
        [ 1.9219,  0.6136,  1.7254, -0.6341,  1.2452,  1.1339, -1.2551,  1.2101],
        [ 0.7103, -0.0643,  0.5063,  1.4917,  0.4359,  1.1978,  0.4778, -0.6312],
        [-2.2179,  1.0802,  0.1278, -0.6601,  1.5522,  0.0524, -0.8309,  0.6377],
        [-0.3293,  1.5548, -3.0348,  0.0055, -1.0085, -1.2164, -0.2199, -1.3

In [17]:
# Shape of the embedding weight matrix.
emb.weight.shape

torch.Size([65, 8])

In [22]:
class NextChar(nn.Module):
  """MLP that predicts the next character from a fixed-length context.

  The context ids are embedded, the embeddings are flattened into one vector
  per example, and that vector goes through two sine-activated linear layers
  followed by a linear projection onto the vocabulary.

  Args:
    block_size: number of context characters per example.
    vocab_size: number of distinct characters (embedding rows and output logits).
    emb_dim: size of each character embedding.
    hidden_dims: two-element sequence with the widths of the two hidden
      layers. When omitted, both widths default to ``block_size * emb_dim``
      computed from the arguments passed to this constructor.
  """

  def __init__(self, block_size, vocab_size, emb_dim, hidden_dims=None):
    super().__init__()
    # The default widths are derived here rather than in the signature:
    # a signature default is evaluated once, at class-definition time, from
    # whatever module-level `block_size` / `emb_dim` happen to exist, and
    # would ignore the values actually passed in.
    if hidden_dims is None:
      flat_width = block_size * emb_dim
      hidden_dims = (flat_width, flat_width)
    self.emb = nn.Embedding(vocab_size, emb_dim)
    self.lin1 = nn.Linear(block_size * emb_dim, hidden_dims[0])
    self.lin2 = nn.Linear(hidden_dims[0], hidden_dims[1])
    self.lin3 = nn.Linear(hidden_dims[1], vocab_size)

  def forward(self, x):
    """Return next-character logits, one row of `vocab_size` values per input row."""
    x = self.emb(x)
    # Concatenate the per-position embeddings into one vector per example.
    x = x.view(x.shape[0], -1)
    x = torch.sin(self.lin1(x))
    x = torch.sin(self.lin2(x))
    x = self.lin3(x)
    return x
    

In [23]:
# Generate names from untrained model


# Instantiate the MLP next-character model and place it on the selected device.
model = NextChar(
    block_size=block_size,
    vocab_size=len(vocab_dict),
    emb_dim=emb_dim,
).to(device)
# model = torch.compile(model)

# Dedicated random generator with a fixed seed (`manual_seed` returns the generator).
g = torch.Generator().manual_seed(4000002)
def generate_name(model,sentence, itos, stoi, block_size, max_len=10):
    """Extend `sentence` by sampling `max_len` characters from `model`.

    Args:
        model: module mapping a ``(1, block_size)`` tensor of character ids
            to a ``(1, vocab)`` tensor of logits.
        sentence: prompt text. Only its last `block_size` characters are used
            as context; shorter prompts are left-padded with spaces.
        itos: mapping from character id to character.
        stoi: mapping from character to character id (must contain ``' '``
            whenever padding is needed).
        block_size: context length the model expects.
        max_len: number of characters to sample.

    Returns:
        The original `sentence` followed by the sampled characters.

    Raises:
        KeyError: if a context character is in `stoi` neither as typed nor
            lower-cased.
    """
    def _encode(ch):
        # Keep the character's case when the vocabulary knows it, so the
        # prompt is encoded the same way the training text was; fall back to
        # the lower-cased form for vocabularies that only hold lowercase.
        if ch in stoi:
            return stoi[ch]
        lowered = ch.lower()
        if lowered in stoi:
            return stoi[lowered]
        raise KeyError(f"character {ch!r} is not in the vocabulary")

    # Left-pad short prompts with spaces, then keep only the trailing window.
    padded = sentence.rjust(block_size)
    context = [_encode(ch) for ch in padded[-block_size:]]

    # Feed inputs on whatever device the model's parameters live on
    # (CPU for a parameter-free model).
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    sampled_chars = []
    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for _ in range(max_len):
            x = torch.tensor(context, device=target_device).view(1, -1)
            y_pred = model(x)
            ix = torch.distributions.categorical.Categorical(logits=y_pred).sample().item()
            sampled_chars.append(itos[ix])
            # Slide the window: drop the oldest id, append the new one.
            context = context[1:] + [ix]

    return sentence + "".join(sampled_chars)

# Interactive demo: ask up to 10 times; answering exactly "no" stops early.
remaining_rounds = 10
while remaining_rounds > 0:
    remaining_rounds -= 1
    want = input("Do you want to generate text? (yes/no): ")
    if want == "no":
        break
    sentence = input("Enter a sentence: ")
    generated_text = generate_name(model, sentence, vocab_dict, vocab_dict_inv, block_size)
    print(generated_text)

nokxLu&OhNqw


In [24]:
# List every learnable tensor in the model together with its shape.
named_shapes = [(param_name, param.shape) for param_name, param in model.named_parameters()]
for param_name, shape in named_shapes:
    print(f"{param_name} {shape}")

emb.weight torch.Size([65, 8])
lin1.weight torch.Size([64, 64])
lin1.bias torch.Size([64])
lin2.weight torch.Size([64, 64])
lin2.bias torch.Size([64])
lin3.weight torch.Size([65, 64])
lin3.bias torch.Size([65])


In [25]:
# Train the model
# loss with logits (CrossEntropyLoss applies log-softmax internally)
import time

loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.AdamW(model.parameters(), lr=0.01)
# Mini-batch training
batch_size = 4096
print_every = 100
elapsed_time = []  # wall-clock seconds spent on each epoch
for epoch in range(10000):
    start_time = time.time()
    for i in range(0, X.shape[0], batch_size):
        # X and Y were built on the CPU while the model lives on `device`;
        # move each batch over so the loop also runs when `device` is CUDA
        # (on CPU `.to(device)` is a no-op).
        x = X[i:i+batch_size].to(device)
        y = Y[i:i+batch_size].to(device)
        y_pred = model(x)

        loss = loss_fn(y_pred, y)
        loss.backward()
        opt.step()
        opt.zero_grad()
    end_time = time.time()
    elapsed_time.append(end_time - start_time)
    if epoch % print_every == 0:
        # Note: this is the loss of the epoch's final mini-batch only.
        print(epoch, loss.item())


0 2.02109956741333


KeyboardInterrupt: 

In [26]:
# Interactive demo with the trained model: up to 10 prompts, 100 sampled
# characters each; answering exactly "no" stops early.
remaining_rounds = 10
while remaining_rounds > 0:
    remaining_rounds -= 1
    want = input("Do you want to generate text? (yes/no): ")
    if want == "no":
        break
    sentence = input("Enter a sentence: ")
    generated_text = generate_name(model, sentence, vocab_dict, vocab_dict_inv, block_size, 100)
    print(generated_text)

Tuning knobs

1. Embedding size
2. MLP 
3. Context length

In [58]:
# Persist only the learned weights (the state dict), not the module object.
path = "Downloads/model.pth"
mlp_weights = model.state_dict()
torch.save(mlp_weights, path)

In [59]:
# Rebuild the architecture, then restore the saved weights into it.
# `map_location` remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
model = NextChar(block_size, len(vocab_dict), emb_dim).to(device)
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [27]:
# using LSTM
class LSTM(nn.Module):
    """Next-character model: embedding -> 2-layer LSTM -> linear projection.

    Only ``hidden_dims[0]`` is read (it is the hidden size of both stacked
    LSTM layers); ``block_size`` and ``hidden_dims[1]`` are accepted but not
    used by this class.
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_dims = [1024, 1024]):
        super().__init__()
        hidden_size = hidden_dims[0]
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_size,
            num_layers=2,
            bias=True,
            batch_first=True,
        )
        self.lin = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return logits over the vocabulary, computed from the last time step."""
        embedded = self.emb(x)
        outputs, _state = self.lstm(embedded)
        # batch_first=True, so dim 1 is time; keep only the final step.
        last_step = outputs[:, -1, :]
        return self.lin(last_step)

In [28]:
# Replace `model` with a freshly initialised LSTM on the selected device.
model = LSTM(
    block_size=block_size,
    vocab_size=len(vocab_dict),
    emb_dim=emb_dim,
).to(device)

In [29]:
# Generate names from untrained model

# model = torch.compile(model)

# Dedicated random generator with a fixed seed (`manual_seed` returns the generator).
g = torch.Generator().manual_seed(4000002)
def generate_name(model,sentence, itos, stoi, block_size, max_len=10):
    """Extend `sentence` by sampling `max_len` characters from `model`.

    Args:
        model: module mapping a ``(1, block_size)`` tensor of character ids
            to a ``(1, vocab)`` tensor of logits.
        sentence: prompt text. Only its last `block_size` characters are used
            as context; shorter prompts are left-padded with spaces.
        itos: mapping from character id to character.
        stoi: mapping from character to character id (must contain ``' '``
            whenever padding is needed).
        block_size: context length the model expects.
        max_len: number of characters to sample.

    Returns:
        The original `sentence` followed by the sampled characters.

    Raises:
        KeyError: if a context character is in `stoi` neither as typed nor
            lower-cased.
    """
    def _encode(ch):
        # Keep the character's case when the vocabulary knows it, so the
        # prompt is encoded the same way the training text was; fall back to
        # the lower-cased form for vocabularies that only hold lowercase.
        if ch in stoi:
            return stoi[ch]
        lowered = ch.lower()
        if lowered in stoi:
            return stoi[lowered]
        raise KeyError(f"character {ch!r} is not in the vocabulary")

    # Left-pad short prompts with spaces, then keep only the trailing window.
    padded = sentence.rjust(block_size)
    context = [_encode(ch) for ch in padded[-block_size:]]

    # Feed inputs on whatever device the model's parameters live on
    # (CPU for a parameter-free model).
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    sampled_chars = []
    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for _ in range(max_len):
            x = torch.tensor(context, device=target_device).view(1, -1)
            y_pred = model(x)
            ix = torch.distributions.categorical.Categorical(logits=y_pred).sample().item()
            sampled_chars.append(itos[ix])
            # Slide the window: drop the oldest id, append the new one.
            context = context[1:] + [ix]

    return sentence + "".join(sampled_chars)

# Interactive demo: ask up to 10 times; answering exactly "no" stops early.
remaining_rounds = 10
while remaining_rounds > 0:
    remaining_rounds -= 1
    want = input("Do you want to generate text? (yes/no): ")
    if want == "no":
        break
    sentence = input("Enter a sentence: ")
    generated_text = generate_name(model, sentence, vocab_dict, vocab_dict_inv, block_size)
    print(generated_text)

In [30]:
# List every learnable tensor in the model together with its shape.
named_shapes = [(param_name, param.shape) for param_name, param in model.named_parameters()]
for param_name, shape in named_shapes:
    print(f"{param_name} {shape}")

emb.weight torch.Size([65, 8])
lstm.weight_ih_l0 torch.Size([4096, 8])
lstm.weight_hh_l0 torch.Size([4096, 1024])
lstm.bias_ih_l0 torch.Size([4096])
lstm.bias_hh_l0 torch.Size([4096])
lstm.weight_ih_l1 torch.Size([4096, 1024])
lstm.weight_hh_l1 torch.Size([4096, 1024])
lstm.bias_ih_l1 torch.Size([4096])
lstm.bias_hh_l1 torch.Size([4096])
lin.weight torch.Size([65, 1024])
lin.bias torch.Size([65])


In [31]:
# Train the model
# loss with logits (CrossEntropyLoss applies log-softmax internally)
import time

loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.AdamW(model.parameters(), lr=0.01)
# Mini-batch training
batch_size = 4096
print_every = 10
elapsed_time = []  # wall-clock seconds spent on each epoch
for epoch in range(10000):
    start_time = time.time()
    for i in range(0, X.shape[0], batch_size):
        # X and Y were built on the CPU while the model lives on `device`;
        # move each batch over so the loop also runs when `device` is CUDA
        # (on CPU `.to(device)` is a no-op).
        x = X[i:i+batch_size].to(device)
        y = Y[i:i+batch_size].to(device)
        y_pred = model(x)

        loss = loss_fn(y_pred, y)
        loss.backward()
        opt.step()
        opt.zero_grad()
    end_time = time.time()
    elapsed_time.append(end_time - start_time)
    if epoch % print_every == 0:
        # Note: this is the loss of the epoch's final mini-batch only.
        print(epoch, loss.item())


KeyboardInterrupt: 

In [None]:
# Interactive demo with the trained model: up to 10 prompts, 100 sampled
# characters each; answering exactly "no" stops early.
remaining_rounds = 10
while remaining_rounds > 0:
    remaining_rounds -= 1
    want = input("Do you want to generate text? (yes/no): ")
    if want == "no":
        break
    sentence = input("Enter a sentence: ")
    generated_text = generate_name(model, sentence, vocab_dict, vocab_dict_inv, block_size, 100)
    print(generated_text)

In [None]:
# Persist only the learned weights (the state dict), not the module object.
path = "Downloads/model_lstm.pth"
lstm_weights = model.state_dict()
torch.save(lstm_weights, path)

In [None]:
# Rebuild the architecture, then restore the saved weights into it.
# `map_location` remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
model = LSTM(block_size, len(vocab_dict), emb_dim).to(device)
model.load_state_dict(torch.load(path, map_location=device))