In [None]:
from torch.nn.utils import clip_grad_norm_
import torch
import torch.nn as nn
import numpy as np
import io
import math
from torchtext.data import Field
from torchtext.datasets import LanguageModelingDataset
from torchtext.data import BPTTIterator
from google.colab import files
from collections import Counter

In [None]:
# (Colab) uncomment to upload the corpus file interactively before reading it.
# uploaded = files.upload()

# Read the whole corpus into one string for the statistics and character counts.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale; this matches the default encoding torchtext's
# LanguageModelingDataset uses when it reads the same file.
with open('49010-0.txt', 'r', encoding='utf-8') as f:
    text = f.read()


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
embed_size = 128        # embedding dimension (input size of the LSTM)
hidden_size = 1024      # LSTM hidden-state size
num_layers = 1          # number of stacked LSTM layers
num_epochs = 60         # passes over train_iter in the training loop
num_samples = 1000      # number of characters drawn per sampling() call
seq_length = 30         # NOTE(review): not referenced by any code visible in this notebook
learning_rate = 0.002   # Adam learning rate
step = 0                # global batch counter; the training loop increments it and logs every 100 steps
initial_words = ['The Project']  # seed text passed to sampling() and greedy()
batch_size = 32         # BPTTIterator batch size; also the batch dimension of the training LSTM state
bptt_len = 256          # bptt_len handed to BPTTIterator (length of each training window)

In [None]:
# Quick corpus profile: distinct characters, total characters, '.'-delimited
# chunks, upper-/lower-case letter counts, and chunks delimited by five
# consecutive newlines.

print(f"# vocab : {len(set(text))}")
print(f"# chars : {len(text)}")
# Splitting on a separator always yields (number of occurrences + 1) pieces.
print(f"# sentences : {text.count('.') + 1}")
print(f"# upcases : {np.sum([ch.isupper() for ch in text])}")
print(f"# lowcases : {np.sum([ch.islower() for ch in text])}")
print("# paragraphs : {}".format(text.count('\n' * 5) + 1))

In [None]:
def split_chars(x):
    """Tokenize a string into its individual characters (whitespace is kept)."""
    return list(x)


# Character-level field with explicit start/end-of-sequence tokens.
train_field = Field(tokenize=split_chars, init_token='<sos>', eos_token='<eos>')

train_dataset = LanguageModelingDataset(path='49010-0.txt', text_field=train_field, newline_eos=True)

# build vocab, which constructs train_field.vocab
train_field.build_vocab(train_dataset)

# The previous `sort_key=lambda x: len(x.comment_text)` argument was removed:
# this dataset's only field is 'text' (there is no `comment_text` attribute),
# and BPTTIterator slices one contiguous token stream rather than sorting examples.
train_iter = BPTTIterator(dataset=train_dataset, batch_size=batch_size, bptt_len=bptt_len, device=device, repeat=False)

vocab_size = len(train_field.vocab)


In [None]:
class RNN(nn.Module):
    """Embedding -> LSTM -> linear language model.

    The LSTM is built with ``batch_first=False``, so token ids are expected
    as ``(seq_len, batch)``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        # Submodules are created in the same order as before so parameter
        # ordering and random initialisation are unaffected.
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=False)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.embed = nn.Embedding(vocab_size, embed_size)

    def forward(self, x_t, prev_h):
        """Run one chunk of token ids through the network.

        Args:
            x_t: integer token ids fed to the embedding layer.
            prev_h: ``(h, c)`` state pair handed to the LSTM.

        Returns:
            ``(logits, (h, c))`` where ``logits`` has the first two LSTM output
            dimensions merged into one, i.e. shape ``(dim0 * dim1, vocab_size)``.
        """
        embedded = self.embed(x_t)
        lstm_out, (h_state, c_state) = self.lstm(embedded, prev_h)

        # Merge the two leading dimensions so every position becomes one row.
        flat = lstm_out.reshape(-1, lstm_out.size(2))
        logits = self.linear(flat)
        return logits, (h_state, c_state)



# Network, loss and optimizer; the model lives on the selected device.
model = RNN(vocab_size, embed_size, hidden_size, num_layers)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# Character <-> integer lookup tables built from the raw text, with ids
# assigned in order of decreasing character frequency.
char_counts = Counter(text)

sorted_vocab = [ch for ch, _ in char_counts.most_common()]
int_to_vocab = dict(enumerate(sorted_vocab))
vocab_to_int = {ch: idx for idx, ch in enumerate(sorted_vocab)}


def detach(states):
    """Return a list holding a detached copy of every tensor in ``states``.

    Detaching cuts the tensors off from the autograd graph that produced them.
    """
    detached = []
    for tensor in states:
        detached.append(tensor.detach())
    return detached

In [None]:
def greedy(initial_words, iter, device=device, num_steps=100):
    """Generate text by always picking the most likely next character.

    The seed text is fed through the model to warm up the LSTM state; the
    argmax of the prediction at the last seed position becomes the first
    generated character, which is then fed back in ``num_steps`` more times.

    Args:
        initial_words: seed text, either a string or a list of strings
            (list items are concatenated).
        iter: iterator whose ``dataset.fields['text']`` supplies the
            tokenizer and the vocabulary.
        device: device on which the state and input tensors are created.
        num_steps: number of characters generated after the first prediction,
            so ``num_steps + 1`` tokens are returned in total.

    Returns:
        The generated tokens joined into one string (the seed is not included).

    Raises:
        ValueError: if the seed text is empty.

    Note:
        The global ``model`` is left in eval mode; callers that keep training
        must call ``model.train()`` afterwards.
    """
    # A list such as ['The Project'] must be turned into a string before
    # tokenizing, otherwise the whole phrase becomes a single unknown token.
    seed_text = initial_words if isinstance(initial_words, str) else ''.join(initial_words)

    iter_field = iter.dataset.fields['text']
    seed_tokens = iter_field.tokenize(seed_text)
    if not seed_tokens:
        raise ValueError("greedy() needs a non-empty seed text")

    model.eval()
    generated = []

    with torch.no_grad():
        hidden = (torch.zeros(num_layers, 1, hidden_size, device=device),
                  torch.zeros(num_layers, 1, hidden_size, device=device))

        # The LSTM is not batch_first, so the seed is laid out as
        # (seq_len, batch=1) to match the batch size of the state.
        seed_ids = torch.tensor([iter_field.vocab.stoi[tok] for tok in seed_tokens],
                                dtype=torch.long, device=device).view(-1, 1)

        output, hidden = model(seed_ids, hidden)
        # Only the prediction made at the last seed position continues the text.
        next_id = output[-1].argmax().item()
        generated.append(next_id)

        for _ in range(num_steps):
            step_input = torch.tensor([[next_id]], dtype=torch.long, device=device)
            output, hidden = model(step_input, hidden)
            next_id = output[-1].argmax().item()
            generated.append(next_id)

    return ''.join(iter_field.vocab.itos[idx] for idx in generated)

In [None]:
def sampling(initial_words):
    """Generate ``num_samples`` characters by sampling from the model.

    When seed text is given, the LSTM state is primed with it and sampling
    continues from its last character. With an empty seed the first input is
    a character id drawn uniformly from the vocabulary.

    Args:
        initial_words: seed text, either a string or a list of strings
            (list items are concatenated); may be empty or None.

    Returns:
        The sampled text as one string (the seed is not included). The same
        text is also printed under a "Sampling:" header.

    Note:
        The global ``model`` is left in eval mode; callers that keep training
        must call ``model.train()`` afterwards.
    """
    model.eval()

    if isinstance(initial_words, str):
        seed_text = initial_words
    else:
        seed_text = ''.join(initial_words or [])
    seed_tokens = train_field.tokenize(seed_text)

    itos = train_field.vocab.itos
    stoi = train_field.vocab.stoi
    pieces = []

    with torch.no_grad():
        state = (torch.zeros(num_layers, 1, hidden_size, device=device),
                 torch.zeros(num_layers, 1, hidden_size, device=device))

        if seed_tokens:
            # Layout is (seq_len, batch=1) because the LSTM is not batch_first.
            seed_ids = torch.tensor([stoi[tok] for tok in seed_tokens],
                                    dtype=torch.long, device=device).view(-1, 1)
            # Warm up the state on everything but the last seed character;
            # that last character is the first input of the sampling loop.
            if seed_ids.size(0) > 1:
                _, state = model(seed_ids[:-1], state)
            current = seed_ids[-1:].clone()
        else:
            uniform = torch.ones(vocab_size)
            current = torch.multinomial(uniform, num_samples=1).unsqueeze(1).to(device)

        for _ in range(num_samples):
            output, state = model(current, state)

            # softmax yields the same distribution as normalising exp(logits)
            # but cannot overflow for large logits.
            probs = torch.softmax(output, dim=1)
            char_id = torch.multinomial(probs, num_samples=1).item()

            # Feed the sampled character back in as the next input.
            current.fill_(char_id)

            token = itos[char_id]
            pieces.append('\n' if token == '<eos>' else token)

    sampled_text = ''.join(pieces)
    print("Sampling:")
    print(sampled_text)
    return sampled_text



In [None]:
for epoch in range(num_epochs):

    # Every epoch starts from a zeroed (hidden, cell) LSTM state.
    states = tuple(
        torch.zeros(num_layers, batch_size, hidden_size).to(device)
        for _ in range(2)
    )

    for i, data in enumerate(train_iter):
        inputs = data.text.to(device)
        targets = data.target.to(device)

        # Detach the carried-over state so gradients stop at the batch boundary.
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.reshape(-1))

        # Backward pass with gradient-norm clipping, then the parameter update.
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        perplexity = np.exp(loss.item())

        step += 1
        if not step % 100:
            print(f'Epoch {epoch + 1}, Loss: {loss.item():.3f}, Perplexity: {perplexity:5.3f}')

        # Periodically show generated text; both helpers switch the model to
        # eval mode, so training mode is restored afterwards.
        if not i % 1000:
            sample = sampling(initial_words)
            greed = greedy(initial_words, train_iter, device)
            print(f"Greedy:\n{greed} ")
            model.train()

