# Assignment 7
# Implementing RNN

Contents :

* Importing libraries
* Data extraction
* Data engineering: converting characters into their respective integer representations
* Applying Sequential model
    * LSTM: number of neurons = 256
    * Dropout: rate = 0.2
* Mini-batch size = 128 (no. of samples per gradient update)
* Number of epochs = 100
* Number of characters to process in a single go = 1000

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils


In [0]:
#Read the whole corpus and lower-case it so that upper- and lower-case letters
#share a single id. The context manager closes the file handle as soon as the
#text has been read instead of leaving it open for the rest of the session.
with open("shakespeare_input_mod.txt") as corpusFile:
    data = corpusFile.read().lower()
#Sorted list of the distinct characters that occur in the data
chars = sorted(set(data))
#Total number of characters in the data
totalChars = len(data)
#Number of unique chars
numberOfUniqueChars = len(chars)

#Maps every character to its integer id (its position in the sorted list)
CharsForids = {char: Id for Id, char in enumerate(chars)}

#Inverse mapping: converts an integer id back into the corresponding character
idsForChars = dict(enumerate(chars))



In [5]:
#How many timesteps, i.e. how many characters the model sees per training sample
numberOfCharsToLearn = 1000

#Every sample needs numberOfCharsToLearn input characters plus the one character
#that follows them as its target, so only the first
#totalChars - numberOfCharsToLearn positions can start a window without running
#past the end of the data
counter = totalChars - numberOfCharsToLearn
if counter <= 0:
    #Fail early with a clear message instead of an obscure error further down
    raise ValueError(
        "Text has %d characters; more than %d are needed to build a training sample"
        % (totalChars, numberOfCharsToLearn)
    )

#Encode the whole text once; the overlapping windows below are plain slices of
#this list, so no character is looked up more than once
encodedData = [CharsForids[char] for char in data]

#Input data: one list of numberOfCharsToLearn character ids per window start.
#Each slice is its own list, so windows can be modified independently.
charX = [encodedData[start:start + numberOfCharsToLearn] for start in range(counter)]
#Output data: for the window starting at position i, the target is the id of
#the character at i + numberOfCharsToLearn, i.e. the character right after it
y = encodedData[numberOfCharsToLearn:]

#Shape is (samples, timesteps, features):
#len(charX) is the number of windows, numberOfCharsToLearn the number of
#characters per window, and there is 1 feature because every timestep is
#described by a single number (the character id)
X = np.reshape(charX, (len(charX), numberOfCharsToLearn, 1))

#Normalization: ids run from 0 to numberOfUniqueChars - 1, so this scales the
#inputs into the range [0, 1)
X = X/float(numberOfUniqueChars)

#One-hot encode the targets so they match a categorical (softmax) output
y = np_utils.to_categorical(y)
print(y)




[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


In [0]:
#Shapes are read from the prepared arrays: X is (samples, timesteps, features)
#and y is (samples, number of output classes)
timesteps, features = X.shape[1], X.shape[2]
numberOfClasses = y.shape[1]

#Sequential = a plain linear stack of layers, added in order below
model = Sequential()
#Recurrent layer with 256 units; only the per-sample shape is declared here,
#the number of samples is handled by fit()
model.add(LSTM(256, input_shape=(timesteps, features)))
#Dropout with rate 0.2 applied to the LSTM output
model.add(Dropout(0.2))
#Output layer: one softmax probability per output class
model.add(Dense(numberOfClasses, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
#100 passes over the data, 128 samples per gradient update
model.fit(X, y, epochs=100, batch_size=128)
#Persist the trained weights to a file
model.save_weights("weights.hdf5")
#model.load_weights("weights.hdf5")   #to load weights from the file

#Pick a random training window as the seed. The upper bound of randint is
#exclusive, so passing len(charX) makes every window (including the last one)
#selectable and also works when there is only a single window.
randomVal = np.random.randint(0, len(charX))
#Work on a copy of the seed so that generation never modifies the entries of charX
randomStart = list(charX[randomVal])
for _ in range(500):
    #Shape (1, timesteps, 1): one sample with one feature per timestep
    x = np.reshape(randomStart, (1, len(randomStart), 1))
    #Scale the ids by the number of unique characters before predicting
    x = x/float(numberOfUniqueChars)
    pred = model.predict(x)
    #Greedy choice: the id with the highest predicted probability
    index = int(np.argmax(pred))
    #Slide the window: drop the oldest id and append the predicted one, so the
    #window length stays constant
    randomStart = randomStart[1:] + [index]
#Decode the final window (what is left of the seed followed by the generated
#characters) back into text
print("".join([idsForChars[value] for value in randomStart]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100