In [84]:
# All Credit to https://www.analyticsvidhya.com/blog/2020/08/build-a-natural-language-generation-nlg-system-using-pytorch/

import re
import random
import json

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [85]:
# Read the pokedex entries file. The context manager closes the handle even
# if parsing fails, and the explicit encoding keeps the read independent of
# the platform's default text encoding.
with open("dataset/pokedex_entries.json", 'r', encoding="utf-8") as pokedex_entries_file:

    # Convert the pokedex entries into a dictionary
    pokemon_entries_dictionary = json.load(pokedex_entries_file)

# Flatten the dictionary into one list of lower-cased entries.
# Each dictionary value is iterated as a collection of entry strings.
pokedex_entries = [
    entry.lower()
    for entries in pokemon_entries_dictionary.values()
    for entry in entries
]

In [86]:
# Create overlapping windows of seq_len + 1 tokens (seq_len inputs + 1 target)
def create_seq(text, seq_len=5):
    """Slide a window of ``seq_len + 1`` tokens over ``text``.

    Args:
        text: Whitespace-separated string of tokens.
        seq_len: Number of tokens preceding the final token of each window.

    Returns:
        A list of strings. When ``text`` has more than ``seq_len`` tokens,
        one space-joined window per position, each ``seq_len + 1`` tokens
        long. Otherwise a single-element list holding ``text`` unchanged
        (its original whitespace is kept in that case).
    """
    # Tokenise once; re-splitting inside the loop made the work quadratic
    tokens = text.split()

    # Not enough tokens to fill a single window
    if len(tokens) <= seq_len:
        return [text]

    # `end` is the index of the last token in each window
    return [
        " ".join(tokens[end - seq_len:end + 1])
        for end in range(seq_len, len(tokens))
    ]

In [87]:
# Create the sequences for each pokedex entry and flatten them into one list.
# The nested comprehension does this in a single linear pass; the previous
# sum(list_of_lists, []) built a new, longer list on every addition.
seqs = [seq for entry in pokedex_entries for seq in create_seq(entry)]

# Count the sequences
len(seqs)

58466

In [88]:
# Build the inputs (x) and targets (y): for every sequence the input is all
# tokens but the last, and the target is all tokens but the first
x, y = [], []

for sequence in seqs:
  words = sequence.split()
  x.append(" ".join(words[:-1]))
  y.append(" ".join(words[1:]))

In [89]:
# Create a mapping from integers to tokens.
# The vocabulary is sorted so that every token receives the same id on every
# run: the iteration order of a set of strings is not guaranteed to be stable
# between interpreter sessions, which would silently change the ids.
vocabulary = sorted(set(" ".join(pokedex_entries).split()))
int2token = dict(enumerate(vocabulary))

# Create a token to integer mapping
token2int = {t: i for i, t in int2token.items()}

# Find the size of our vocabulary
vocab_size = len(int2token)

In [90]:
# Translate a whitespace-separated string of words into their integer ids
def get_integer_seq(seq):
  """Return the list of ``token2int`` ids for the words in ``seq``.

  ``seq`` is split on whitespace; a word missing from ``token2int`` raises
  ``KeyError``.
  """
  ids = []
  for word in seq.split():
    ids.append(token2int[word])
  return ids

# Convert the input and target sequences into their integer representations.
# The explicit int64 dtype keeps the arrays identical across platforms and
# makes NumPy raise immediately if the sequences do not all have the same
# length, instead of failing later when the arrays are sliced into batches.
x_int = np.array([get_integer_seq(i) for i in x], dtype=np.int64)
y_int = np.array([get_integer_seq(i) for i in y], dtype=np.int64)

In [91]:
# Yield aligned (input, target) batches of exactly `batch_size` rows
def get_batches(arr_x, arr_y, batch_size):
    """Iterate over ``arr_x`` and ``arr_y`` in consecutive full batches.

    Args:
        arr_x: 2-D array of inputs, one row per sample.
        arr_y: 2-D array of targets, sliced with the same row ranges as
            ``arr_x``.
        batch_size: Number of rows in every yielded batch.

    Yields:
        ``(x, y)`` tuples where both arrays have ``batch_size`` rows. Rows
        left over after the last full batch are not yielded, so every batch
        has the same size.
    """
    n_samples = arr_x.shape[0]

    # The stop value includes the final full batch. The earlier
    # range(batch_size, n_samples, batch_size) skipped it whenever n_samples
    # was an exact multiple of batch_size.
    for start in range(0, n_samples - batch_size + 1, batch_size):
        stop = start + batch_size
        yield arr_x[start:stop, :], arr_y[start:stop, :]

In [92]:
class WordLSTM(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> dropout -> linear.

    Args:
        n_hidden: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability, used between LSTM layers and on the
            LSTM output.
        lr: Stored as ``self.lr``; the class itself never reads it.
        n_vocab: Number of distinct tokens. When omitted, the notebook-level
            ``vocab_size`` is used.
        emb_dim: Size of each token embedding.
    """

    def __init__(self, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001,
                 n_vocab=None, emb_dim=200):
        super().__init__()

        # Fall back to the notebook-level vocabulary size for existing callers
        if n_vocab is None:
            n_vocab = vocab_size

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        self.emb_layer = nn.Embedding(n_vocab, emb_dim)

        ## define the LSTM
        self.lstm = nn.LSTM(emb_dim, n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)

        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, n_vocab)

    def forward(self, x, hidden):
        """Run one forward pass.

        Args:
            x: Integer token ids; the LSTM is batch-first, so the leading
                dimension is the batch.
            hidden: ``(hidden_state, cell_state)`` tuple for the LSTM.

        Returns:
            ``(out, hidden)`` where ``out`` holds one row of vocabulary
            scores per input position (batch and sequence dimensions are
            flattened together) and ``hidden`` is the updated LSTM state.
        """
        ## pass input through embedding layer
        embedded = self.emb_layer(x)

        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)

        ## pass through a dropout layer
        out = self.dropout(lstm_output)

        # Flatten batch and sequence dimensions so the linear layer scores
        # every position independently
        out = out.reshape(-1, self.n_hidden)

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed ``(hidden_state, cell_state)`` tensors.

        Both tensors have shape ``(n_layers, batch_size, n_hidden)`` and are
        created with the dtype and device of the model's own parameters, so
        they always match the model regardless of whether a GPU is merely
        available or actually in use.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)

        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [93]:
# Instantiate the model
net = WordLSTM()

# Use the GPU when one is available; otherwise keep the model on the CPU
# instead of failing on a machine without CUDA
net.to("cuda" if torch.cuda.is_available() else "cpu")

WordLSTM(
  (emb_layer): Embedding(5569, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=5569, bias=True)
)

In [94]:
def train(net, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    """Train ``net`` on the notebook-level ``x_int`` / ``y_int`` arrays.

    Args:
        net: Model exposing ``init_hidden(batch_size)`` and a forward pass
            that takes ``(inputs, hidden)`` and returns ``(output, hidden)``.
        epochs: Number of passes over the data.
        batch_size: Rows per batch; also used to size the hidden state.
        lr: Learning rate passed to the Adam optimizer.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        print_every: Print a progress line every this many steps.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    # Train on the GPU when one is available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net.to(device)

    # optimizer
    opt = torch.optim.Adam(net.parameters(), lr=lr)

    # loss
    criterion = nn.CrossEntropyLoss()

    counter = 0

    net.train()

    for e in range(epochs):

        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(x_int, y_int, batch_size):
            counter += 1

            # convert numpy arrays to tensors on the training device
            inputs = torch.from_numpy(x).to(device)
            targets = torch.from_numpy(y).to(device).to(torch.int64)

            # detach hidden states so gradients do not flow into
            # previous batches
            h = tuple(state.detach() for state in h)

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss against the flattened targets
            loss = criterion(output, targets.reshape(-1))

            # back-propagate error
            loss.backward()

            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)

            # Update weights
            opt.step()

            if counter % print_every == 0:

                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter))

In [95]:
# Run training: 20 epochs, batches of 32, one progress line every 256 steps
train(net, epochs=20, batch_size=32, print_every=256)

Epoch: 1/20... Step: 256...
Epoch: 1/20... Step: 512...
Epoch: 1/20... Step: 768...
Epoch: 1/20... Step: 1024...
Epoch: 1/20... Step: 1280...
Epoch: 1/20... Step: 1536...
Epoch: 1/20... Step: 1792...
Epoch: 2/20... Step: 2048...
Epoch: 2/20... Step: 2304...
Epoch: 2/20... Step: 2560...
Epoch: 2/20... Step: 2816...
Epoch: 2/20... Step: 3072...
Epoch: 2/20... Step: 3328...
Epoch: 2/20... Step: 3584...
Epoch: 3/20... Step: 3840...
Epoch: 3/20... Step: 4096...
Epoch: 3/20... Step: 4352...
Epoch: 3/20... Step: 4608...
Epoch: 3/20... Step: 4864...
Epoch: 3/20... Step: 5120...
Epoch: 3/20... Step: 5376...
Epoch: 4/20... Step: 5632...
Epoch: 4/20... Step: 5888...
Epoch: 4/20... Step: 6144...
Epoch: 4/20... Step: 6400...
Epoch: 4/20... Step: 6656...
Epoch: 4/20... Step: 6912...
Epoch: 4/20... Step: 7168...
Epoch: 5/20... Step: 7424...
Epoch: 5/20... Step: 7680...
Epoch: 5/20... Step: 7936...
Epoch: 5/20... Step: 8192...
Epoch: 5/20... Step: 8448...
Epoch: 5/20... Step: 8704...
Epoch: 5/20... St

In [96]:
# Predict the next token
def predict(net, tkn, h=None, top_k=3):
  """Feed one token to ``net`` and sample the following token.

  Args:
      net: Model exposing ``init_hidden`` and a forward pass that takes
          ``(inputs, hidden)`` and returns ``(scores, hidden)``.
      tkn: Input token; must be a key of ``token2int`` (``KeyError`` otherwise).
      h: ``(hidden_state, cell_state)`` tuple. When ``None`` a zero state for
          a batch of one is created.
      top_k: Number of highest-probability tokens to sample from.

  Returns:
      ``(token, h)``: a token picked uniformly at random among the ``top_k``
      most probable ones, and the updated hidden state.

  Raises:
      ValueError: If ``top_k`` is smaller than 1.
  """
  if top_k < 1:
    raise ValueError("top_k must be at least 1")

  # Run on whatever device the model already lives on
  device = next(net.parameters()).device

  # The default h=None used to crash when iterated; start from a zero state
  if h is None:
    h = net.init_hidden(1)

  # tensor inputs: a batch of one sequence holding one token id
  inputs = torch.tensor([[token2int[tkn]]], dtype=torch.long, device=device)

  # detach hidden state from history and keep it next to the model
  h = tuple(state.detach().to(device) for state in h)

  # inference only: no gradients are needed
  with torch.no_grad():
    # get the output of the model
    out, h = net(inputs, h)

    # get the token probabilities
    p = F.softmax(out, dim=1)

  p = p.cpu().numpy().reshape(-1)

  # get indices of the top_k values, most probable first
  top_n_idx = p.argsort()[-top_k:][::-1]

  # randomly select one of them (there may be fewer than top_k classes)
  sampled_token_index = int(top_n_idx[random.randrange(len(top_n_idx))])

  # return the predicted token and the hidden state
  return int2token[sampled_token_index], h


# function to generate text
def sample(net, size, prime='it is'):
    """Generate text by repeatedly predicting the next token.

    Args:
        net: Trained model, passed through to ``predict``.
        size: Number of tokens to generate after the prime. At least one
            token is always generated.
        prime: Whitespace-separated seed text used to warm up the hidden
            state; it must contain at least one token.

    Returns:
        The prime tokens followed by the generated tokens, joined by spaces.

    Raises:
        ValueError: If ``prime`` contains no tokens.
    """
    toks = prime.split()

    # Without a prime token there is nothing to predict from; previously this
    # surfaced as an unbound-variable error further down
    if not toks:
        raise ValueError("prime must contain at least one token")

    # Use the GPU when one is available, otherwise stay on the CPU
    net.to("cuda" if torch.cuda.is_available() else "cpu")

    net.eval()

    # batch size is 1
    h = net.init_hidden(1)

    # Warm up the hidden state on the prime; only the prediction made after
    # the last prime token is kept
    for t in toks:
        token, h = predict(net, t, h)

    toks.append(token)

    # predict subsequent tokens, each from the previously generated one
    for _ in range(size - 1):
        token, h = predict(net, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

In [101]:
# Print ten independent 15-token continuations of the same prompt
for _ in range(10):
    generated = sample(net, 15, prime="mew is able to ")
    print(generated)


mew is able to live in huge colonies even if you sleep in the morning the frequency of the
mew is able to live in thunderclouds it freely finishes it with ultrasonic waves to identify for clouds as
mew is able to control minds this legendary cries to inflate them in thunderclouds it freely teleports in the
mew is able to live in thunderclouds its body to check its ears are made by the ultrasonic cries
mew is able to appear as a pet it is stricken with a dark daylight unmoving by flapping it
mew is able to live 10000 name in caves during the day and location using reflections of the skies
mew is able to appear from thunderclouds this pokemon can expand a legendary used that that has a texture
mew is able to appear as it eats the enemy in caves during the day it stays with a
mew is able to control prey and inflates its enemy it emits ultrasonic hours its mouth it uses its
mew is able to live in huge places they wont see the victim in a dark day because exposure
