# Character Level Language Model Using GRU

The purpose of this exercise is to build a character-level language model based on a GRU with PyTorch. The model follows Karpathy's char-rnn. It is trained on the text of the book "The Lord of the Rings", and training runs on the GPU.

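The overall concept is that each individual character is embedded into a vector of size `hidden_size` (reshaped to `[1, 1, hidden_size]`, because the model processes one character at a time). The embedding is then fed into a GRU. The output of the GRU is then fed into a fully connected layer, which produces one unnormalized score per character in the vocabulary. During training these scores are passed to a cross-entropy loss (which applies the softmax internally). During generation the next character is sampled from the distribution defined by the scores, with a temperature parameter, rather than being picked by argmax.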

The model is trained on a sequence of characters. The target sequence is the input sequence shifted by one character, so at every position the model learns to predict the character that follows.

In [1]:
import string
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt

from tqdm import tqdm
from unidecode import unidecode
from typing import Tuple, List, Dict
from torch.utils.data import Dataset, DataLoader

## Preprocessing the data

In [2]:
# Define custom dataset class
class TextDataset(Dataset):
    """Expose two parallel sequences as (input, label) pairs.

    Item ``idx`` is the tuple ``(text[idx], label[idx])``; the dataset
    length is the length of ``text``.
    """

    def __init__(self, text, label):
        self.text, self.label = text, label

    def __len__(self):
        # The inputs define the size; labels are indexed in lockstep.
        n_items = len(self.text)
        return n_items

    def __getitem__(self, idx):
        sample = self.text[idx]
        target = self.label[idx]
        return sample, target

In [3]:
def random_chunk(text: str, chunk_len: int = 200):
    """Return a random slice of ``text`` that is ``chunk_len + 1`` characters long.

    The extra character lets the caller split the chunk into an input
    (``chunk[:-1]``) and a target shifted by one (``chunk[1:]``), each
    exactly ``chunk_len`` characters long.

    Args:
        text: Corpus to sample from.
        chunk_len: Length of the input/target sequences derived from the chunk.

    Returns:
        A substring of ``text`` with exactly ``chunk_len + 1`` characters.

    Raises:
        ValueError: If ``text`` has fewer than ``chunk_len + 1`` characters.
    """
    span = chunk_len + 1
    if len(text) < span:
        raise ValueError(
            f"text has {len(text)} characters but at least {span} are required"
        )
    # random.randint includes both endpoints, so the last admissible start
    # is len(text) - span; anything larger would truncate the chunk.
    start_index = random.randint(0, len(text) - span)
    return text[start_index:start_index + span]

# Turn string into list of longs
def tokenizer(string: str, vocab: List[str]):
    """Encode ``string`` as a 1-D ``torch.long`` tensor of vocabulary indices.

    Args:
        string: Text to encode.
        vocab: Vocabulary; a character's id is its first position in ``vocab``.

    Returns:
        A tensor of shape ``(len(string),)`` and dtype ``torch.long``.

    Raises:
        ValueError: If a character of ``string`` is not in ``vocab``.
    """
    # Build the lookup once instead of scanning the vocabulary for every
    # character. setdefault keeps the first position of a repeated entry,
    # which is what list.index would return.
    index_of = {}
    for position, symbol in enumerate(vocab):
        index_of.setdefault(symbol, position)

    try:
        indices = [index_of[char] for char in string]
    except KeyError as err:
        raise ValueError(f"{err.args[0]!r} is not in the vocabulary") from None
    return torch.tensor(indices, dtype=torch.long)

def detokenizer(tensor: torch.Tensor, vocab: List[str]):
    """Decode a sequence of vocabulary indices back into text.

    Every element of ``tensor`` is used to index ``vocab``; the selected
    entries are concatenated in iteration order.
    """
    return "".join(vocab[token_id] for token_id in tensor)

def random_training_set(text: str, vocab: List[str], chunk_len: int = 200):
    """Sample one random chunk and return it as an (input, target) tensor pair.

    The input is the chunk without its last character and the target is
    the chunk without its first character, so ``target[i]`` is the
    character that follows ``input[i]``.
    """
    sampled = random_chunk(text, chunk_len)
    inputs, targets = sampled[:-1], sampled[1:]
    return tokenizer(inputs, vocab), tokenizer(targets, vocab)

In [4]:
# Load the corpus and transliterate it to ASCII with unidecode (no regex is
# involved), so the characters can be looked up in string.printable.
# The context manager closes the file handle; the encoding is explicit so
# decoding does not depend on the platform default.
with open('data/rnn_dataset/text_files/lotr.txt', encoding='utf-8') as corpus_file:
    text_raw = unidecode(corpus_file.read())
vocab = list(string.printable)  # fixed vocabulary: one entry per printable ASCII character
n_characters = len(vocab)
chunk_len = 500  # characters per training chunk
batch_size = 1

In [5]:
# One random chunk of the corpus: x_raw holds its token ids, y_raw the same
# ids shifted by one position (the next-character targets).
x_raw, y_raw = random_training_set(text_raw, vocab, chunk_len=chunk_len)
data_train = TextDataset(x_raw, y_raw)
# Keep the characters in their original order: the training loop feeds the
# hidden state of one item into the next, so shuffling would present the
# recurrent model with a scrambled sequence.
dataloader_train = DataLoader(data_train, batch_size=batch_size, shuffle=False)

In [6]:
# Pull one (input, target) pair from the loader and show its shape and ids.
first_batch = next(iter(dataloader_train))
sample_x, sample_y = first_batch
print(sample_x.shape)
print(f"Random sample input: {sample_x}")
print(sample_y.shape)
print(f"Random sample target: {sample_y}")

torch.Size([1])
Random sample input: tensor([55])
torch.Size([1])
Random sample target: tensor([10])


In [7]:
# Map the sampled ids back to characters to sanity-check the round trip.
sample_x1 = detokenizer(tensor=sample_x, vocab=vocab)
print(f"Random sample input: {sample_x1}")
sample_y1 = detokenizer(tensor=sample_y, vocab=vocab)
print(f"Random sample target: {sample_y1}")

Random sample input: T
Random sample target: a


## Creating the model

In [8]:
class RNN(nn.Module):
    """Character-level GRU language model that consumes one token per call.

    Pipeline: embedding -> GRU -> linear layer producing one unnormalized
    score per output class.

    Args:
        input_size: Number of distinct input tokens (embedding rows).
        output_size: Number of output classes.
        hidden_size: Size of the embedding and of the GRU hidden state.
        n_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size: int, output_size: int, hidden_size: int, n_layers: int = 1):
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=n_layers)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Advance the model by one token.

        Args:
            input: Tensor holding a single token id.
            hidden: GRU hidden state of shape ``(n_layers, 1, hidden_size)``.

        Returns:
            ``(scores, hidden)`` where ``scores`` has shape
            ``(1, output_size)`` and ``hidden`` is the updated state.
        """
        # view(1, 1, -1): sequence length 1, batch size 1.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)  # output: (1, 1, hidden_size)
        scores = self.out(output[0])
        return scores, hidden

    def init_hidden(self, batch_size: int = 1, seq_len: int = 1):
        """Return an all-zero initial hidden state on the model's device.

        The state has shape ``(n_layers, batch_size, hidden_size)``, which is
        the layout ``nn.GRU`` expects, so stacked GRUs work as well.

        Args:
            batch_size: Batch dimension of the state.
            seq_len: Ignored; kept so existing calls stay valid. A GRU
                hidden state has no sequence dimension.
        """
        # Follow the parameters rather than CUDA availability, so the state
        # always lives where the weights do.
        device = next(self.parameters()).device
        return torch.zeros(self.n_layers, batch_size, self.hidden_size, device=device)

## Training the model

In [9]:
def train(model: nn.Module, criterion: nn.Module, optimizer: optim.Optimizer, dataloader: DataLoader):
    """Run one optimisation step over the whole sequence in ``dataloader``.

    The items are fed to the model in order, carrying the hidden state from
    one item to the next. The per-item losses are summed, the sum is
    backpropagated once, and the optimizer takes a single step.

    Args:
        model: Module exposing ``init_hidden()`` and ``forward(x, hidden)``.
        criterion: Loss applied to each ``(prediction, target)`` pair.
        optimizer: Optimizer over the model's parameters.
        dataloader: Iterable of ``(input, target)`` pairs.

    Returns:
        The mean loss per item as a Python float.

    Raises:
        ValueError: If ``dataloader`` yields no items.
    """
    model.train()
    optimizer.zero_grad()

    # Work on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    hidden = model.init_hidden().to(device)

    total_loss = None
    n_items = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        y_hat, hidden = model(x, hidden)
        step_loss = criterion(y_hat, y)
        total_loss = step_loss if total_loss is None else total_loss + step_loss
        n_items += 1

    if total_loss is None:
        raise ValueError("dataloader yielded no items")

    # Backpropagate the accumulated loss so every step contributes a
    # gradient, not just the final prediction.
    total_loss.backward()
    optimizer.step()
    return total_loss.detach().item() / n_items

def predict(model:RNN, vocabulary:List[str], primer:str='A', predict_len:int=100, temperature:float=0.8):
    """Generate text by sampling from the model one character at a time.

    The hidden state is first built up from every primer character except
    the last. Then, starting from the last primer character, each sampled
    character is fed back in as the next input.

    Side effects: the model is switched to eval mode and moved to CUDA when
    it is available (CPU otherwise).

    Args:
        model: Trained character model.
        vocabulary: Characters indexed by token id.
        primer: Non-empty seed text; it is included in the result.
        predict_len: Number of characters to generate after the primer.
        temperature: Sampling temperature passed to ``sample_outputs``.

    Returns:
        ``primer`` followed by ``predict_len`` generated characters.

    Raises:
        ValueError: If ``primer`` is empty.
    """
    if not primer:
        raise ValueError("primer must contain at least one character")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval().to(device)
    hidden_state = model.init_hidden().to(device)
    primer_tokens = tokenizer(primer, vocabulary).to(device)
    pieces = [primer]

    with torch.no_grad():
        # Warm up the hidden state on all primer characters but the last.
        for token in primer_tokens[:-1]:
            _, hidden_state = model(token, hidden_state)
        input_seq = primer_tokens[-1]

        for _ in range(predict_len):
            # Keep the updated state; otherwise every step would restart
            # from the state left behind by the primer.
            output, hidden_state = model(input_seq, hidden_state)
            sampled_character = sample_outputs(output.view(-1), temperature)
            pieces.append(vocabulary[sampled_character.item()])
            # The sampled tensor already holds the token id for the next step.
            input_seq = sampled_character
    return "".join(pieces)

def sample_outputs(output, temperature):
    """Sample one index from a vector of unnormalized scores (logits).

    The scores are divided by ``temperature`` before the softmax. Values
    below 1 sharpen the distribution towards the highest score, values
    above 1 flatten it, and 1 leaves it unchanged.

    Args:
        output: 1-D floating-point tensor of unnormalized scores.
        temperature: Positive scaling factor.

    Returns:
        A ``torch.long`` tensor of shape ``(1,)`` holding the sampled index.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    # Scale before exponentiating: dividing already-exponentiated weights by
    # a constant cancels out when the distribution is normalised. softmax
    # also avoids the overflow a bare exp() hits on large scores.
    probabilities = torch.softmax(output / temperature, dim=-1)
    return torch.multinomial(probabilities, 1)

In [10]:
# Hyper-parameters and training objects.
n_epochs = 5000
hidden_size = 200
n_layers = 1
lr = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNN(n_characters, n_characters, hidden_size, n_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# NOTE: the name is a misspelling of "criterion"; it is kept unchanged in
# case other cells refer to it.
critierion = nn.CrossEntropyLoss()

print_every = 100
training_loss = []

# The context manager closes the progress bar even when training is
# interrupted or raises.
with tqdm(total=n_epochs, position=0) as loop:
    for epoch in range(1, n_epochs + 1):
        loss_train = train(model, critierion, optimizer, dataloader_train)
        training_loss.append(loss_train)

        # Print a generated sample at regular intervals to watch progress.
        if epoch % print_every == 0:
            print(predict(model, vocab, primer='Wh', predict_len=100, temperature=0.8), '\n')

        loop.update(1)

In [None]:
# Plot the mean training loss recorded for each epoch.
index_loss = range(len(training_loss))
plt.plot(index_loss, training_loss, label='train loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()  # without a legend the label passed to plot() is never displayed
plt.show()

## Inference

In [None]:
# Generate a 100-character continuation for each of a few short primers.
start_strings = [" Th", " wh", " he", " I ", " ca", " We", " lo", " ra"]
for primer in start_strings:
    continuation = predict(model, vocab, primer=primer, predict_len=100, temperature=0.8)
    print(continuation)