<a href="https://colab.research.google.com/github/PamelaVQ/Base-ML/blob/master/Pytorch_Basics/RNN_for_Sentence_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Using a Recurrent Neural Network (RNN) for Sentence Generation

Reference Links:

http://karpathy.github.io/2015/05/21/rnn-effectiveness/

https://gist.github.com/karpathy/d4dee566867f8291f086

https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py


In [2]:
from tensorflow.keras.utils import get_file
import io
import sklearn
import sklearn.feature_extraction
import numpy as np
from torch import nn
import torchvision
import torch

In [3]:
def pattern_text(start_pattern, end_pattern, data):
  """Return the part of ``data`` that lies between two marker strings.

  The slice starts right after the first occurrence of ``start_pattern`` and
  stops right before the last occurrence of ``end_pattern``.

  Args:
    start_pattern: Marker string that precedes the wanted text.
    end_pattern: Marker string that follows the wanted text.
    data: The full text to cut from.

  Returns:
    The text between the markers. A missing start marker means the result
    starts at the beginning of ``data``; a missing end marker means it runs
    to the end of ``data``. If the last end marker sits before the start
    marker the result is an empty string.
  """
  start_at = data.find(start_pattern)
  # str.find returns -1 when the marker is absent; without this guard the
  # slice would silently begin at len(start_pattern) - 1.
  begin = 0 if start_at == -1 else start_at + len(start_pattern)

  stop = data.rfind(end_pattern)
  # Same for rfind: -1 used as a slice end would drop the final character.
  if stop == -1:
    stop = len(data)

  return data[begin:stop]

In [4]:
def load_text_lines(name, url):
  """Fetch a text file with ``get_file`` and return its lines.

  Args:
    name: File name handed to ``get_file`` for the local copy.
    url: Location the file is fetched from.

  Returns:
    A list of strings as produced by ``readlines()`` (line endings are kept).
  """
  local_path = get_file(name, origin=url)
  with io.open(local_path, encoding='utf-8') as read_file:
    return read_file.readlines()


# NOTE(review): the raw lines presumably still contain the Project Gutenberg
# header/footer ("*** START OF THIS PROJECT GUTENBERG EBOOK ... ***" /
# "*** END OF ... ***"). TODO: strip them, e.g. with pattern_text on the joined text.
data_agatha_christie = load_text_lines(
    "agatha_christie", "https://www.gutenberg.org/files/863/863-0.txt")
data_lewis_carroll = load_text_lines(
    "lewis_carroll", "https://www.gutenberg.org/files/11/11-0.txt")

Downloading data from https://www.gutenberg.org/files/863/863-0.txt
Downloading data from https://www.gutenberg.org/files/11/11-0.txt


In [5]:
def reverse_dictionary(my_map):
  """Build a new dict that maps each value of ``my_map`` back to its key.

  When several keys share a value, the key that comes last in iteration
  order is the one kept.
  """
  inverted = {}
  for key, value in my_map.items():
    inverted[value] = key
  return inverted

def create_vocab(corpus):
  """Fit a CountVectorizer on ``corpus`` and index-encode every line.

  Args:
    corpus: Iterable of text lines (strings).

  Returns:
    A tuple ``(word2idx, idx2word, output_corpus)`` where ``word2idx`` is the
    fitted vectorizer's ``vocabulary_``, ``idx2word`` is its inverse, and
    ``output_corpus`` holds one list of indices per line, tokenised with the
    vectorizer's own analyzer. Lines that yield no tokens are left out.
  """
  vectorizer = sklearn.feature_extraction.text.CountVectorizer(min_df=1)
  # Only the fitted vocabulary is needed; the document-term matrix is discarded.
  vectorizer.fit_transform(corpus)
  tokenize = vectorizer.build_analyzer()
  word2idx = vectorizer.vocabulary_

  output_corpus = []
  for line in corpus:
    encoded = [word2idx.get(token) for token in tokenize(line)]
    if encoded:
      output_corpus.append(encoded)

  idx2word = reverse_dictionary(word2idx)
  return word2idx, idx2word, output_corpus

# Build the vocabulary mappings and the index-encoded lines from the Agatha Christie text.
word2idx, idx2word, output_corpus = create_vocab(data_agatha_christie)

In [6]:
def get_vocab_details(corpus):
  """Summarise an encoded corpus.

  Args:
    corpus: Sequence of lines, each line a sequence of hashable tokens
      (for example the word indices produced by ``create_vocab``).

  Returns:
    A tuple ``(max_length, vocab_size)``: the number of tokens in the longest
    line and the number of distinct tokens across all lines. Both are 0 for
    an empty corpus.
  """
  # default=0 keeps an empty corpus from raising ValueError in max().
  max_length = max((len(line) for line in corpus), default=0)
  # Vocabulary size is the count of distinct tokens, not the count of lines.
  vocab_size = len({token for line in corpus for token in line})
  return max_length, vocab_size

# Corpus statistics computed by get_vocab_details on the encoded lines.
max_length, vocab_size = get_vocab_details(output_corpus)

In [7]:
class RNN(nn.Module):
    """Single-layer LSTM followed by a linear layer and softmax over the vocabulary.

    PyTorch port of the Keras-style ``LSTM(128, input_shape=(max_length,
    vocab_size))`` + ``Dense(vocab_size, activation='softmax')`` stack; the
    Keras keyword arguments are not accepted by ``torch.nn``.

    Args:
      max_length: Nominal sequence length. Stored for reference only; the
        LSTM itself accepts sequences of any length.
      vocab_size: Size of each input feature vector (e.g. one-hot width) and
        of the output distribution.
      hidden_size: Number of LSTM hidden units (128 by default).
    """

    def __init__(self, max_length, vocab_size, hidden_size=128):
      super(RNN, self).__init__()
      self.max_length = max_length
      self.vocab_size = vocab_size
      # batch_first=True so inputs are (batch, seq_len, vocab_size).
      self.lstm = nn.LSTM(input_size=vocab_size, hidden_size=hidden_size,
                          batch_first=True)
      self.dense = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
      """Return next-token probabilities for each sequence in the batch.

      Args:
        x: Float tensor of shape (batch, seq_len, vocab_size).

      Returns:
        Tensor of shape (batch, vocab_size) whose rows sum to 1.
      """
      # nn.LSTM returns (output, (h_n, c_n)); only the per-step output is used.
      lstm_out, _ = self.lstm(x)
      # Keep the output of the final time step only.
      scores = self.dense(lstm_out[:, -1, :])
      return torch.softmax(scores, dim=-1)

Understanding LSTM: [PyTorch LSTM Docs](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)

In [8]:
# Minimal LSTM demo on random data.
# NOTE(review): no random seed is set in the cells shown, so the printed values differ per run.
lstm = nn.LSTM(3, 3)  # input_size=3, hidden_size=3
inputs = [torch.randn(1, 3) for _ in range(5)]  # a sequence of 5 steps, each a (1, 3) tensor
print(inputs[0].shape)
# Initial state handed to the LSTM: a pair of random (1, 1, 3) tensors
# (hidden state and cell state, in that order, per the nn.LSTM docs).
hidden = (torch.randn(1, 1, 3),
          torch.randn(1, 1, 3))
print(hidden)

torch.Size([1, 3])
(tensor([[[-0.2508, -0.1271, -1.1803]]]), tensor([[[ 0.8972,  0.3283, -0.1603]]]))


In [9]:
for i in inputs:
    # Step through the sequence one element at a time, feeding the state
    # returned by the previous call back in.
    # Each (1, 3) step is reshaped to (1, 1, 3) before the call.
    out, hidden = lstm(i.view(1, 1, -1), hidden) # after the loop, out/hidden are from the last step

In [10]:
# Process the whole 5-step sequence in a single call.
# torch.cat joins the five (1, 3) tensors into (5, 3); view reshapes that to (5, 1, 3).
inputs_1 = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # fresh random initial state
output, hidden = lstm(inputs_1, hidden)
print(output.shape)  # one output row per step of the sequence
print(hidden[0].shape)

torch.Size([5, 1, 3])
torch.Size([1, 1, 3])


Learning: [An LSTM for Part-of-Speech Tagging](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#example-an-lstm-for-part-of-speech-tagging)

In [11]:
def prepare_sequence(seq, to_ix):
    """Look up every item of ``seq`` in ``to_ix`` and pack the results into a long tensor.

    Raises whatever ``to_ix[...]`` raises for an item it does not contain.
    """
    indices = []
    for word in seq:
        indices.append(to_ix[word])
    return torch.tensor(indices, dtype=torch.long)

In [12]:
# Model size settings; presumably passed to LSTMSequenceGenerator as
# embedding_dim and hidden_dim — TODO confirm at the call site.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

In [13]:
class LSTMSequenceGenerator(nn.Module):
    """Embedding -> LSTM -> linear layer, giving a per-position distribution over the vocabulary.

    ``torch.nn`` has no ``Dense`` layer and no ``activation=`` keyword, so the
    output layer is an ``nn.Linear`` and softmax is applied in ``forward``.
    The output layer maps to ``vocab_size`` so each position yields a
    probability for every word index.

    Args:
      embedding_dim: Width of the word embeddings.
      hidden_dim: Width of the LSTM hidden state.
      vocab_size: Number of distinct word indices; sizes both the embedding
        table and the output distribution.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
      super(LSTMSequenceGenerator, self).__init__()
      self.hidden_dim = hidden_dim
      self.embeddings = nn.Embedding(vocab_size, embedding_dim)
      self.lstm = nn.LSTM(embedding_dim, hidden_dim)
      self.dense = nn.Linear(hidden_dim, vocab_size)

    def forward(self, sentence):
      """Return word probabilities for every position of one sentence.

      Args:
        sentence: 1-D long tensor of word indices.

      Returns:
        Tensor of shape (len(sentence), 1, vocab_size); the last dimension
        sums to 1. Take its log before using it with ``nn.NLLLoss``.
      """
      embs = self.embeddings(sentence)
      # The LSTM expects (seq_len, batch, features); the batch size here is 1.
      lstm_out, _ = self.lstm(embs.view(len(sentence), 1, -1))
      scores = self.dense(lstm_out)
      return torch.softmax(scores, dim=-1)
