In [10]:
#import dependencies
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [6]:
#load data
#read the whole input text file into memory as one string
#the encoding is given explicitly because the default encoding doesn't match the file encoding
#the with-block closes the file handle as soon as the text has been read
with open("frankenstein.txt", encoding="utf-8") as text_file:
    file = text_file.read()

In [7]:
#tokenization : the process of breaking a stream of text input into words, phrases, symbols or meaningful elements
#standardization : lowercase everything and drop English stopwords
def tokenize_words(input, separator=""):
    """Lowercase the text, split it into word tokens and remove English stopwords.

    Parameters
    ----------
    input : str
        Raw text to process.
    separator : str, optional
        String placed between the surviving tokens when they are joined back
        together. The default "" concatenates the tokens directly, so no word
        boundaries remain in the result; pass " " to keep them.

    Returns
    -------
    str
        The surviving tokens joined with ``separator``.
    """
    #lowercase everything to standardize
    lowered = input.lower()
    #instantiate the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    #tokenizing the text into tokens
    tokens = tokenizer.tokenize(lowered)
    #fetch the stopword list once and keep it in a set, instead of asking for it again for every token
    stop_words = set(stopwords.words('english'))
    #filtering out the stopwords
    filtered = (token for token in tokens if token not in stop_words)
    return separator.join(filtered)
processed_inputs = tokenize_words(file)

In [8]:
#chars to numbers
#collect every distinct character of the processed text in sorted order
chars = sorted(set(processed_inputs))
#map each character to its position in that sorted list
chars_to_num = {char: index for index, char in enumerate(chars)}

In [14]:
#sanity check of the character-to-number conversion
#report how many characters the processed text has and how many distinct characters it uses
vocab_len = len(chars)
input_len = len(processed_inputs)
print("Total number of characters: ", input_len)
print("total vocab: ", vocab_len)

Total number of characters:  233296
total vocab:  42


In [15]:
#seq length
#number of characters in one individual input sequence
seq_length = 100
#empty containers for the input sequences and their target characters
x_data, y_data = [], []

In [16]:
#loop through the sequence
#slide a window of seq_length characters over the whole input, one character at a time,
#starting with the first character; each window is one input and the character right after it is the target
#the lists are rebuilt from scratch here so that re-running this cell does not append a second copy of every pattern
x_data = []
y_data = []
#convert the whole text to integers once instead of looking every character up again for each window
encoded_inputs = [chars_to_num[char] for char in processed_inputs]
for i in range(0, input_len - seq_length, 1):
    #input is the window of seq_length integers starting at position i
    x_data.append(encoded_inputs[i:i + seq_length])
    #output is the single integer that follows that window
    y_data.append(encoded_inputs[i + seq_length])

#check to see how many total input sequences we have
n_patterns = len(x_data)
print("Total Patterns: ",n_patterns)

Total Patterns:  233196


In [17]:
#turn the integer sequences into an array of shape (n_patterns, seq_length, 1)
#and divide every value by the vocabulary size
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [18]:
#one-hot encoding
#fix the number of classes to the vocabulary size instead of letting it be inferred from
#the largest label in y_data, so y always has one column per character in chars
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [19]:
#creating the model
#a sequential stack of three LSTM layers, each followed by dropout, ending in a softmax layer
#dropout is used to prevent overfitting
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    #one output unit per column of the one-hot encoded targets
    Dense(y.shape[1], activation='softmax'),
])
#compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

Instructions for updating:
Colocations handled automatically by placer.


In [20]:
#checkpointing
#save to filepath only when the monitored training loss reaches a new minimum
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [21]:
#fit the model and let it train: 10 epochs in batches of 256, with the checkpoint callback attached
model.fit(
    X,
    y,
    batch_size=256,
    epochs=10,
    callbacks=desired_callbacks,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10

Epoch 00001: loss improved from inf to 2.93760, saving model to model_weights_saved.hdf5
Epoch 2/10

Epoch 00002: loss improved from 2.93760 to 2.91644, saving model to model_weights_saved.hdf5
Epoch 3/10

Epoch 00003: loss improved from 2.91644 to 2.90779, saving model to model_weights_saved.hdf5
Epoch 4/10

Epoch 00004: loss improved from 2.90779 to 2.87411, saving model to model_weights_saved.hdf5
Epoch 5/10

Epoch 00005: loss improved from 2.87411 to 2.84017, saving model to model_weights_saved.hdf5
Epoch 6/10

Epoch 00006: loss improved from 2.84017 to 2.78469, saving model to model_weights_saved.hdf5
Epoch 7/10

Epoch 00007: loss improved from 2.78469 to 2.72021, saving model to model_weights_saved.hdf5
Epoch 8/10

Epoch 00008: loss improved from 2.72021 to 2.65326, saving model to model_weights_saved.hdf5
Epoch 9/10

Epoch 00009: loss improved from 2.65326 to 2.58950, saving model to model_weights_saved.hdf5
Epoch 10/10

<keras.callbacks.callbacks.History at 0x28394c02f08>

In [22]:
#load the saved weights back into the model, then compile it again
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [29]:
#reverse lookup: turn the model's integer outputs back into characters
num_to_char = dict(enumerate(chars))

In [32]:
#random seed to help generate
#numpy.random.randint excludes its upper bound, so len(x_data) lets every pattern be chosen
start = numpy.random.randint(0, len(x_data))
#copy the chosen pattern so later changes to `pattern` never alter the training data in x_data
pattern = list(x_data[start])
print("Random Seed: ")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed: 
" hertotownpartlycuriositypartlyidlenesswentlecturingroomwaldmanenteredshortlyprofessorunlikecolleague "


In [35]:
#generate the text
#predict 1000 characters one at a time, feeding each prediction back in as part of the next input
for i in range(1000):
    #shape the current window as a single sample and scale it the same way as the training data
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    #take the index with the highest predicted score and map it back to its character
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    sys.stdout.write(result)
    #slide the window: drop the oldest value and add the prediction
    #a new list is built instead of appending in place, because `pattern` may still be the very list stored in x_data
    pattern = pattern[1:] + [index]

rtedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedsertedserted

In [None]:
# I tried training for more epochs, but network issues prevented it, so the generated output doesn't make much sense