In [2]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm

In [3]:
class Dictionary(object):
  def __init__(self):
    self.word2idx = {}
    self.idx2word = {}
    self.idx = 0

  def add_word(self, word):
    if word not in self.word2idx:
      self.word2idx[word] = self.idx
      self.idx2word[self.idx] = word
      self.idx += 1

  def __len__(self):
    return len(self.word2idx)

In [4]:
class TextProcess(object):
  def __init__(self):
    self.dictionary = Dictionary()

  def get_data(self, path, batch_size=20):
    with open(path, 'r') as f:
      tokens = 0
      for line in f:
        words = line.split() + ['<eos>']
        tokens += len(words)
        for word in words:
          self.dictionary.add_word(word)
   # Create a 1-D tensor that contains the index of all the words in the file
    rep_tensor = torch.LongTensor(tokens)
    index = 0
    with open(path, 'r') as f:
      for line in f:
        words = line.split() + ['<eos>']
        for word in words:
          rep_tensor[index] = self.dictionary.word2idx[word]
          index += 1

    # Batches
    num_batches = rep_tensor.shape[0] // batch_size
    rep_tensor = rep_tensor[:num_batches * batch_size]
    rep_tensor = rep_tensor.view(batch_size, -1)
    return rep_tensor


In [5]:
# Hyperparameters
embed_size = 128 # Input features
hidden_size = 1024 # Number of LSTM layers
num_layers = 1
num_epochs = 20
batch_size = 20
timesteps = 30
learning_rate = 0.2

In [6]:
corpus = TextProcess()
corpus

<__main__.TextProcess at 0x7f6a7788fd90>

In [7]:
rep_tensor = corpus.get_data('/content/drive/Shareddrives/Project/NN course/alice.txt')
rep_tensor.shape

torch.Size([20, 1484])

In [8]:
rep_tensor

tensor([[   0,    1,    2,  ...,  203,  571,    5],
        [ 572,    5,    5,  ...,  988,    5,  107],
        [ 117,    3,  609,  ..., 1364, 1010, 1106],
        ...,
        [   3, 3779,    7,  ...,    5,    5, 2412],
        [ 218,   13,    3,  ..., 1286,  112, 5066],
        [ 632,    5,  345,  ...,    3, 5287, 4779]])

In [9]:
vocab_size = len(corpus.dictionary)
vocab_size

5290

In [10]:
num_batches = rep_tensor.shape[1] // timesteps
num_batches

49

In [11]:
class TextGenerator(nn.Module):

  def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
    super(TextGenerator, self).__init__()
    self.embed = nn.Embedding(vocab_size, embed_size)
    self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
    self.linear = nn.Linear(hidden_size, vocab_size)


  def forward(self, x, h):
    x = self.embed(x) # Word embedding

    out, (h, c) = self.lstm(x, h)

    out = out.reshape(out.size(0) * out.size(1), out.size(2))
    out = self.linear(out)
    return out, (h, c)



In [12]:
model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


If we have a tensor z, 'z.detach()' returns a tensor that shares the same storage as 'z', but with the computation history forgotten. It doesn't know anything about how it was computed. In other words, we have broken the tensor z away from its past history.
Here, we want to perform truncated Backpropagation TBPTT splits the 1,000-long sequence into 50 sequences (say) each of length 20 and treats each sequence of length 20 as a separate training case. This is a sensible approach that can work well in practice, but it is blind to temporal dependencies that span more than 20 timesteps.

In [13]:
def detach(states):
  return [state.detach() for state in states]


In [None]:
for epoch in range(num_epochs):
  states = (torch.zeros(num_layers, batch_size, hidden_size), torch.zeros(num_layers, batch_size, hidden_size))

  for i in range(0, rep_tensor.size(1) - timesteps, timesteps):
    inputs = rep_tensor[:, i:i + timesteps]
    targets = rep_tensor[:, (i + 1):(i + 1) + timesteps]

    outputs, _ = model(inputs, states)
    loss = loss_fn(outputs, targets.reshape(-1))

    model.zero_grad()
    loss.backward()

    clip_grad_norm(model.parameters(), 0.5)
    optimizer.step()

    step = (i + 1) // timesteps
    if step % 100 == 0:
      print("Epoch [{} / {}], Loss: {:.4f}".format(epoch + 1, num_epochs, loss.item()))


  clip_grad_norm(model.parameters(), 0.5)


Epoch [1 / 20], Loss: 8.5775
Epoch [2 / 20], Loss: 73.0060
Epoch [3 / 20], Loss: 59.7756
Epoch [4 / 20], Loss: 54.6478
Epoch [5 / 20], Loss: 50.6953
Epoch [6 / 20], Loss: 51.3594
Epoch [7 / 20], Loss: 50.5360


In [None]:
with torch.no_grad():
  with open('results.txt', 'w') as f:
    state = (torch.zeros(num_layers, 1, hidden_size), torch.zeros(num_layers, 1, hidden_size))
    input = torch.randint(0, vocab_size, (1, )).long().unsqueeze(1)

    for i in range(500):
      output = model(input, state)
      print(output.shape)
      prob = output.exp()
      word_id = torch.multinomial(prob, num_samples=1).item()
      print(word_id)
      input.fill_(word_id)

      word = corpus.dictionary.idx2word[word_id]
      word = '\n' if word == '<eos>' else word + ' '
      f.write(word)

      if (i + 1) % 100 == 0:
        print("Sampled [{} / {}] words and save to {}".format(i + 1, 500, 'results.txt'))
