In [72]:
import numpy as np
import sys
from nltk.tokenize import RegexpTokenizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [73]:
#import nltk
#nltk.download()
#nltk.download('stopwords')

In [74]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read instead of leaving it open for the
# lifetime of the kernel.
with open("frankenstein-2.txt") as corpus_file:
    file = corpus_file.read()

In [75]:
def tokenize_words(input):
    """Normalise raw text into a single stop-word-free string.

    The text is lower-cased, split into runs of word characters (``\\w+``),
    filtered against NLTK's English stop-word list, and the surviving tokens
    are concatenated with no separator.

    Args:
        input: The raw text to process. (The name shadows the builtin
            ``input``; it is kept so existing keyword callers keep working.)

    Returns:
        str: The remaining tokens joined back-to-back.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(input.lower())
    # Build the stop-word collection once. The original evaluated
    # stopwords.words('english') inside the filter for every single token and
    # searched the resulting list linearly; a set gives the same membership
    # answers with one lookup per token.
    stop_words = set(stopwords.words('english'))
    # NOTE(review): tokens are joined with "" so word boundaries are lost in
    # the result. Kept as-is because the downstream vocabulary depends on it;
    # confirm whether " ".join was intended.
    return "".join(token for token in tokens if token not in stop_words)


processed_inputs = tokenize_words(file)
    
    

In [76]:
# Vocabulary: every distinct character of the processed text, in sorted order,
# plus a lookup from each character to its position in that ordering.
chars = sorted(set(processed_inputs))
char_to_num = {symbol: position for position, symbol in enumerate(chars)}

In [77]:
# Total number of characters in the processed text, and number of distinct ones.
input_len, vocab_len = len(processed_inputs), len(chars)

In [78]:
# Sanity check: corpus length in characters, then vocabulary size.
print(input_len, vocab_len)

233296 42


In [79]:
# Number of characters fed to the network per training sample.
seq_length = 100
# Containers for the encoded input windows and their target characters.
x_data, y_data = [], []

In [80]:
# Slide a window of `seq_length` characters over the text: each window
# (encoded as integers) is one input sample and the character immediately
# after it is that sample's target.
# Both lists are rebuilt from scratch in this cell, so re-running it yields the
# same data instead of appending a second copy of every pattern to lists that
# were created in an earlier cell.
window_starts = range(input_len - seq_length)
x_data = [
    [char_to_num[char] for char in processed_inputs[start:start + seq_length]]
    for start in window_starts
]
y_data = [char_to_num[processed_inputs[start + seq_length]] for start in window_starts]
n_patterns = len(x_data)
print(n_patterns)

233196


In [81]:
# Arrange the encoded windows as (n_patterns, seq_length, 1) and divide by the
# vocabulary size so every value lies in [0, 1).
X = np.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [82]:
# One-hot encode the target indices. `num_classes` is pinned to the vocabulary
# size: left unset, to_categorical infers the width from the largest target
# index, so `y` (and the model's output layer, sized from y.shape[1]) would
# come out narrower than the vocabulary if the highest-indexed character never
# occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [83]:
# Three stacked LSTM layers, each followed by dropout, ending in a softmax
# layer with one unit per one-hot target column. The first two LSTMs return
# the full sequence so the next LSTM receives one vector per time step.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.25),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [84]:
# Configure training: Adam optimizer minimising categorical cross-entropy
# against the one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [85]:
# Write the model to disk whenever the monitored training loss reaches a new
# minimum; only the best result so far is kept at `filepath`.
filepath = 'model_Weights.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [86]:
# Train for 4 epochs in mini-batches of 256, checkpointing through the callbacks.
model.fit(
    X,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.93511, saving model to model_Weights.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.93511 to 2.91586, saving model to model_Weights.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.91586 to 2.90982, saving model to model_Weights.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.90982 to 2.87830, saving model to model_Weights.hdf5


<keras.callbacks.callbacks.History at 0x7fe4fa1d9668>

In [87]:
# Reload the weights saved by the checkpoint callback, then compile the model
# again with the same loss and optimizer used for training.
filename = "model_Weights.hdf5"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [88]:
# Reverse lookup: integer index -> character, used to decode model output.
num_to_char = dict(enumerate(chars))

In [91]:
# Pick a random training window to seed generation. randint's upper bound is
# exclusive, so passing len(x_data) makes every pattern (including the last
# one) eligible; the previous `len(x_data)-1` bound could never select it.
start = np.random.randint(0, len(x_data))
# Work on a copy: generation code that appends to `pattern` must not modify
# the row stored in `x_data`.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" ligencesearchphilosopherstoneelixirlifelattersoonobtainedundividedattentionwealthinferiorobjectglory "


In [92]:
# Generate 1000 characters greedily: feed the current window to the model,
# emit the most probable next character, then slide the window forward by one.
for _ in range(1000):
    # Same shaping and scaling as the training inputs: (1, window_len, 1),
    # divided by the vocabulary size.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Plain int so the window stays a list of Python integers.
    index = int(np.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new list rather than appending in place, so the list `pattern`
    # was taken from (a row of x_data) is never mutated. The unused per-step
    # `seq_in` decoding was dropped.
    pattern = pattern[1:] + [index]

ereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereerereer