<a href="https://colab.research.google.com/github/The-KS101/AmesHousingRegression/blob/master/StateOfTheUnionGRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Using time-series deep learning tools to build a character-level language model
#### Creating an RNN model to derive a language model
##### Import the necessary libraries



In [None]:
import sys
import numpy as np
import pickle
import random
import re
import os
from nltk.corpus import gutenberg
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, Dropout
from keras.layers import BatchNormalization, SimpleRNN, GRU, LSTM
from keras.callbacks import LambdaCallback, ModelCheckpoint
from keras.utils.data_utils import get_file
from __future__ import print_function

Using TensorFlow backend.


First, we download the `state_union` package from the NLTK corpus collection, which contains the text of State of the Union addresses.

In [None]:
import nltk
# Fetch the 'state_union' corpus into NLTK's data directory so the files can
# be read from disk in the next cell.
nltk.download('state_union')

[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.


True

Here we read all the documents in the state_union folder and join their contents into a single `text` string, which we will process into our training sequences.

In [None]:
# Collect every file under the NLTK state_union corpus directory and
# concatenate their lower-cased contents into one training string, `text`.
corpora_dir = '/root/nltk_data/corpora/state_union'
file_list = []
docs = []

for root, _, files in os.walk(corpora_dir):
  for filename in files:
    # Join with `root` (the directory os.walk is currently visiting), not
    # `corpora_dir`, so files found in nested sub-directories resolve to
    # their real location.
    file_list.append(os.path.join(root, filename))

print('Read {} files'.format(len(file_list)))
skipped = []
for path in file_list:
  with open(path, 'r') as fin:
    try:
      # Lower-case to shrink the vocabulary; newlines are dropped so each
      # document becomes a single line.
      str_fin = fin.read().lower().replace('\n', '')
    except UnicodeDecodeError:
      # Best effort: a file that cannot be decoded with the default
      # encoding is left out of the corpus, but recorded so the loss is
      # visible instead of silent.
      skipped.append(path)
      continue
  docs.append(str_fin)

if skipped:
  print('Skipped {} undecodable files'.format(len(skipped)))

text = ' '.join(docs)
print('Corpus Length is ' + str(len(text)))

Read 66 files
Corpus Length is 1915949


Neural networks operate on numbers rather than letters, so we now map each character to an integer. To do this we create two dictionaries: one mapping each character to an index, and one mapping each index back to its character.

In [None]:
# Vocabulary: every distinct character in the corpus, in sorted order, plus
# lookup tables in both directions (character -> index, index -> character).
characters = sorted(set(text))
print('Total Characters: ', len(characters))
charIndices = {ch: idx for idx, ch in enumerate(characters)}
indicesChar = dict(enumerate(characters))

Total Characters:  57


Now we break up the text into smaller slices to feed to our neural network. Each slice contains 40 input characters and 1 output character (the character that immediately follows them); consecutive slices start 3 characters apart.

In [None]:
# Slide a window of `seq_length` characters over the corpus, advancing
# `stride` characters each time. Each window is one input sequence and the
# character right after it is the target to predict.
seq_length, stride = 40, 3
window_starts = range(0, len(text) - seq_length, stride)
training_seq = [text[start:start + seq_length] for start in window_starts]
output_char = [text[start + seq_length] for start in window_starts]

In [None]:
# Sanity check: show how many samples were produced and print the first two
# (input window, next character) pairs.
print('Number of training rows: ', len(training_seq))
for label, row in (('First Sequence: ', 0), ('Second Sequence: ', 1)):
  print(label, training_seq[row])
  print('Next char: ', output_char[row])

Number of training rows:  638637
First Sequence:  president lyndon b. johnson's annual mes
Next char:  s
Second Sequence:  sident lyndon b. johnson's annual messag
Next char:  e


Now we vectorize our input texts by one-hot encoding each character.

In [None]:
# One-hot encode the training data.
#   x[i, j, k] is True when character j of sequence i has vocabulary index k.
#   y[i, k]    is True when the character following sequence i has index k.
# The builtin `bool` is used as the dtype: `np.bool` was only an alias for
# it, was deprecated in NumPy 1.20 and has since been removed.
x = np.zeros((len(training_seq), seq_length, len(characters)), dtype=bool)
y = np.zeros((len(training_seq), len(characters)), dtype=bool)
for i, sequence in enumerate(training_seq):
  for j, char in enumerate(sequence):
    x[i, j, charIndices[char]] = True
  y[i, charIndices[output_char[i]]] = True
print('Data Vectorization Completed.')
print('Feature Vector Space: ', x.shape)
print('Label vector space: ', y.shape)


Data Vectorization Completed.
Feature Vector Space:  (638637, 40, 57)
Label vector space:  (638637, 57)


We now create a function that uses threshold (temperature) sampling to redistribute the softmax prediction probabilities of our model.

In [None]:
def sample(softmax_predictions, sample_threshold=0.1):
  """Draw one class index from a rescaled probability distribution.

  The probabilities are raised to the power ``1 / sample_threshold`` and
  renormalised, then a single index is drawn at random from the result.
  Thresholds below 1 sharpen the distribution towards its most likely
  entry; thresholds above 1 flatten it.

  Args:
    softmax_predictions: 1-D array-like of non-negative probabilities.
    sample_threshold: Positive scaling factor applied in log space.

  Returns:
    The index of the sampled class.

  Raises:
    ValueError: If ``sample_threshold`` is not positive.
  """
  if sample_threshold <= 0:
    raise ValueError('sample_threshold must be positive')
  # float64 keeps the renormalised vector summing close enough to 1 for
  # np.random.multinomial's validity check.
  softmax_preds = np.asarray(softmax_predictions).astype('float64')
  # log(0) is -inf, which correctly turns into probability 0 after exp();
  # only the divide-by-zero warning is suppressed.
  with np.errstate(divide='ignore'):
    log_preds = np.log(softmax_preds) / sample_threshold
  # Shift so the largest log-probability is 0. This leaves the normalised
  # result unchanged but stops every entry underflowing to 0 (and the
  # division below producing NaN) when the threshold is small.
  exp_preds = np.exp(log_preds - np.max(log_preds))
  norm_preds = exp_preds / np.sum(exp_preds)
  prob = np.random.multinomial(1, norm_preds, 1)
  return np.argmax(prob)

Now we build our neural networks. We will create multiple networks to compare their performance in generating text in the style of the State of the Union corpus.

In [None]:
# Callback that prints sample text generated by the current model after
# every training epoch.
def on_epoch_end(epoch, _):
  """Generate and print 400 characters at several sampling thresholds.

  A random `seq_length`-character window of the corpus is used as the seed.
  For each threshold the model repeatedly predicts the next character, which
  is appended to the window while the oldest character is dropped.

  Args:
    epoch: Index of the epoch that just finished (used only for the banner).
    _: Unused second callback argument.
  """
  global model, model_name
  print()
  print('----- Generating text after Epoch: {}'.format(epoch))
  start_index = random.randint(0, len(text) - seq_length - 1)
  seed = text[start_index:start_index + seq_length]
  vocab_size = len(characters)
  for thresh in (0.2, 0.5, 1.0, 1.2):
    print('----Sampling Threshold : ', thresh)
    # Every threshold starts again from the same seed text.
    window = seed
    print('Input sequence to generate from "{}" '.format(window))
    sys.stdout.write(window)
    for _step in range(400):
      # One-hot encode the current window as a batch of one sample.
      x_pred = np.zeros((1, seq_length, vocab_size))
      for pos, ch in enumerate(window):
        x_pred[0, pos, charIndices[ch]] = 1
      preds = model.predict(x_pred, verbose=0)[0]
      next_char = indicesChar[sample(preds, thresh)]
      window = window[1:] + next_char
      sys.stdout.write(next_char)
      sys.stdout.flush()
    print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

Now we create a helper function to train, sample and save a list of RNN models.


In [None]:
def test_models(lists, epochs=10, output_dir='/content/sample_data'):
  """Build, train and save each model produced by the given factories.

  For every factory the model is compiled, fitted on the module-level `x`
  and `y` arrays, checkpointed whenever the training loss improves, and its
  per-epoch training history is pickled next to the checkpoint.

  Args:
    lists: Iterable of zero-argument callables, each returning an
      uncompiled Keras model. The callable's `__name__` is used for the
      output file names.
    epochs: Number of training epochs per model.
    output_dir: Directory that receives `<name>.h5` and `<name>hist.pkl`.
  """
  # `model` is published as a global because the `on_epoch_end` callback
  # reads it to generate sample text.
  global model, model_name

  for net in lists:
    print('Initiating Compilation...')
    model = net()
    model_name = net.__name__
    fp = os.path.join(output_dir, '{}.h5'.format(model_name))
    checkpoint = ModelCheckpoint(fp, monitor='loss',
                                 verbose=0, save_best_only=True, mode='min')
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print('Compiled: ', str(model_name))
    network = model.fit(x, y, batch_size=128,
                        epochs=epochs, callbacks=[print_callback, checkpoint])
    model.summary()
    hist_path = os.path.join(output_dir, '{}hist.pkl'.format(model_name))
    with open(hist_path, 'wb') as file_pi:
      # Pickle the metrics dict held by the History object that fit()
      # returned, not `model.history`, which is the callback object itself
      # rather than the recorded values.
      pickle.dump(network.history, file_pi)

Now we create multiple RNNs and GRUs to test the output of various neural networks.

In [None]:
#Simple RNN Stacked Model
def SimpleRNN_stacked_model():
  """Return an uncompiled model of two stacked 128-unit SimpleRNN layers.

  The first layer returns the full sequence so the second can consume it;
  a softmax Dense layer then scores every character in the vocabulary.

  NOTE(review): this function previously built LSTM layers despite its name
  and the comment above; it now uses SimpleRNN so that results saved under
  this name describe the architecture they came from. Confirm that the LSTM
  was not a deliberate substitution.
  """
  model = Sequential()
  model.add(SimpleRNN(128, input_shape=(seq_length, len(characters)),
                      return_sequences=True))
  model.add(SimpleRNN(128))
  model.add(Dense(len(characters), activation='softmax'))
  return model

#GRU model
def GRU_stacked_model():
  """Return an uncompiled model of two stacked 128-unit GRU layers.

  The first GRU returns its full output sequence so the second GRU can
  consume it; a softmax Dense layer then scores every character in the
  vocabulary.
  """
  vocab_size = len(characters)
  return Sequential([
      GRU(128, input_shape=(seq_length, vocab_size), return_sequences=True),
      GRU(128),
      Dense(vocab_size, activation='softmax'),
  ])

#Bidirectional GRU
def Bi_directional_GRU():
  """Return an uncompiled model of two stacked bidirectional GRU layers.

  Each Bidirectional wrapper holds a 128-unit GRU. The first returns its
  full output sequence so the second can consume it; a softmax Dense layer
  then scores every character in the vocabulary.
  """
  model = Sequential()
  # `input_shape` is given to the Bidirectional wrapper rather than the
  # wrapped GRU: the wrapper is the layer added to the Sequential model, and
  # this is the form shown in the Keras Bidirectional documentation.
  model.add(Bidirectional(GRU(128, return_sequences=True),
                          input_shape=(seq_length, len(characters))))
  model.add(Bidirectional(GRU(128)))
  model.add(Dense(len(characters), activation='softmax'))
  return model

#Larger GRU
def larger_GRU():
  """Return an uncompiled model of three stacked 128-unit GRU layers.

  Every GRU uses 20% input dropout and 20% recurrent dropout. The first two
  return full sequences so the next GRU can consume them. A 128-unit ReLU
  Dense layer sits between the last GRU and the softmax output over the
  vocabulary.
  """
  vocab_size = len(characters)
  regularisation = dict(dropout=0.2, recurrent_dropout=0.2)
  layers = [
      GRU(128, input_shape=(seq_length, vocab_size),
          return_sequences=True, **regularisation),
      GRU(128, return_sequences=True, **regularisation),
      GRU(128, **regularisation),
      Dense(128, activation='relu'),
      Dense(vocab_size, activation='softmax'),
  ]
  return Sequential(layers)

We now put the models we want to train in a list and pass it to the test_models function (here, only the stacked GRU model).

In [None]:
# Show the window length in use, then train the selected model factories
# for 30 epochs each.
print(seq_length)
all_models = [GRU_stacked_model]
test_models(all_models, epochs=30)

40
Initiating Compilation...
Compiled:  GRU_stacked_model
Epoch 1/30

----- Generating text after Epoch: 0
----Sampling Threshold :  0.2
Input sequence to generate from "n capital fund, with a focused and noble" 
n capital fund, with a focused and noble and decame the sourd and the enterpation of the sure this people in the enterpation of the security of the americans and the security of the sourd and the security and enterpaining the sure that the continue to the sure to the americans in the people in the security of the security of the security of the sourd and defense to the sure the sourded to be the security of the sure the sourd to the pro
----Sampling Threshold :  0.5
Input sequence to generate from "n capital fund, with a focused and noble" 
n capital fund, with a focused and noble to recome the more to distries in the continue to and the must years and federal sure to the perion of the livent to end the surment in the enter our better of the most should dependent to the streng