In [1]:
import numpy as np
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.layers import CuDNNLSTM, Dropout, Dense
from keras.models import Sequential
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Fetch the Project Gutenberg plain-text file used as the training corpus.
# -nc (no-clobber) skips the download when 84-0.txt already exists, so re-running
# the notebook neither hits the network again nor leaves a stray "84-0.txt.1" copy.
!wget -nc https://www.gutenberg.org/files/84/84-0.txt

--2019-07-05 04:00:50--  https://www.gutenberg.org/files/84/84-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 450783 (440K) [text/plain]
Saving to: ‘84-0.txt’


2019-07-05 04:00:50 (1.13 MB/s) - ‘84-0.txt’ saved [450783/450783]



In [0]:
# Read the downloaded Project Gutenberg text into memory as one string.
# The context manager closes the handle as soon as the read finishes, and the
# explicit encoding keeps decoding independent of the platform's default locale.
with open('84-0.txt', encoding='utf-8') as text_file:
  file = text_file.read()

In [0]:
def tokenize_words(input):
  """Lowercase ``input``, split it into word tokens, and drop English stop words.

  Args:
    input: Raw text to normalise.

  Returns:
    One string containing the remaining tokens joined by single spaces.
  """
  # Tokens are runs of word characters, so punctuation and whitespace are discarded
  tokenizer = RegexpTokenizer(r'\w+')
  tokens = tokenizer.tokenize(input.lower())

  # Load the stop-word list once, as a set: the previous version fetched the list
  # again for every token and then scanned it linearly for the membership test
  stop_words = set(stopwords.words("english"))
  return " ".join(token for token in tokens if token not in stop_words)

In [0]:
# Normalise the raw text: lowercase it, tokenise it, and strip English stop words
processed_input = tokenize_words(input=file)

In [0]:
# Vocabulary: every distinct character of the processed text, in sorted order
chars = sorted(set(processed_input))
# Lookup table from each character to its integer position in that ordering
char_to_num = {ch: idx for idx, ch in enumerate(chars)}

In [6]:
# Size of the processed text (in characters) and of the character vocabulary
input_len, vocab_len = len(processed_input), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 269995
Total vocab: 43


In [0]:
# Number of consecutive characters that make up one training input window
seq_length = 100
# Containers for the encoded input windows and their next-character labels
x_data, y_data = [], []

In [0]:
# Turn every character into its integer code once, then slice windows from that
# sequence instead of repeating the dictionary lookups for each overlapping window
encoded = [char_to_num[char] for char in processed_input]

# One pattern per start position that still leaves a following character to predict
n_windows = input_len - seq_length

# Both lists are rebuilt from scratch here (rather than appended to lists created
# in an earlier cell) so re-running this cell cannot duplicate every pattern.
# Input: seq_length consecutive characters starting at position i
x_data = [encoded[i:i + seq_length] for i in range(n_windows)]
# Label: the single character that immediately follows that window
y_data = [[encoded[i + seq_length]] for i in range(n_windows)]

In [9]:
# Count how many (input window, label) training patterns were produced
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 269895


In [0]:
# Arrange the integer windows as (samples, time steps, 1 feature) and divide the
# character indices by the vocabulary size, all in a single expression
X = np.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [0]:
# One-hot encode the labels. num_classes is pinned to the vocabulary size so the
# encoding always has one column per known character; left to inference, the width
# would be max(label) + 1 and could come out narrower than the vocabulary.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [12]:
# Three stacked CuDNN LSTM layers (512 -> 256 -> 128 units), each followed by
# 20% dropout, ending in a softmax with one output per one-hot label column
model = Sequential([
  # The first two recurrent layers return the full sequence for the next LSTM
  CuDNNLSTM(512, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
  Dropout(0.2),
  CuDNNLSTM(256, return_sequences=True),
  Dropout(0.2),
  # The last recurrent layer emits only its final output
  CuDNNLSTM(128),
  Dropout(0.2),
  Dense(y.shape[1], activation='softmax'),
])

W0705 04:35:47.192234 140279809783680 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0705 04:35:47.210285 140279809783680 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0705 04:35:47.943999 140279809783680 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0705 04:35:48.512077 140279809783680 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0705 04:35:48.521933 

In [13]:
# Print the layer-by-layer summary (output shapes and parameter counts)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_1 (CuDNNLSTM)     (None, 100, 512)          1054720   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 512)          0         
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 100, 256)          788480    
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 256)          0         
_________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)     (None, 128)               197632    
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 43)                5547      
Total para

In [14]:
# Configure training: Adam optimizer with categorical cross-entropy loss
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
)

W0705 04:35:48.902786 140279809783680 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0705 04:35:48.930813 140279809783680 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



In [0]:
# Destination file for the checkpoint callback
filepath = "model_weights_saved.hdf5"

# Save the model whenever the monitored training loss reaches a new minimum
checkpoint = ModelCheckpoint(
  filepath,
  monitor='loss',
  mode='min',
  save_best_only=True,
  verbose=1,
)
desired_callbacks = [checkpoint]

In [16]:
# Train for 25 epochs in batches of 512, with the checkpoint callback attached
history = model.fit(
  X,
  y,
  batch_size=512,
  epochs=25,
  callbacks=desired_callbacks,
)

W0705 04:35:49.042517 140279809783680 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/25

Epoch 00001: loss improved from inf to 2.94545, saving model to model_weights_saved.hdf5
Epoch 2/25

Epoch 00002: loss improved from 2.94545 to 2.78778, saving model to model_weights_saved.hdf5
Epoch 3/25

Epoch 00003: loss improved from 2.78778 to 2.61663, saving model to model_weights_saved.hdf5
Epoch 4/25

Epoch 00004: loss improved from 2.61663 to 2.51343, saving model to model_weights_saved.hdf5
Epoch 5/25

Epoch 00005: loss improved from 2.51343 to 2.42591, saving model to model_weights_saved.hdf5
Epoch 6/25

Epoch 00006: loss improved from 2.42591 to 2.35180, saving model to model_weights_saved.hdf5
Epoch 7/25

Epoch 00007: loss improved from 2.35180 to 2.28569, saving model to model_weights_saved.hdf5
Epoch 8/25

Epoch 00008: loss improved from 2.28569 to 2.23087, saving model to model_weights_saved.hdf5
Epoch 9/25

Epoch 00009: loss improved from 2.23087 to 2.17772, saving model to model_weights_saved.hdf5
Epoch 10/25

Epoch 00010: loss improved from 2.17772 to 2.1

KeyboardInterrupt: ignored