In [1]:
import spacy
import random
import numpy as np
import keras

from pickle import dump,load
from random import randint

from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# Read the whole training corpus into a single string; the context manager
# closes the file as soon as the text has been read.
with open("moby_dick_four_chapters.txt") as f:
    doc = f.read()

Using TensorFlow backend.


In [2]:
# Load the small English spaCy pipeline for preprocessing the text.
# The parser, tagger and NER components are disabled because only the
# tokenizer output is used below.
nlp = spacy.load('en_core_web_sm',disable=['parser', 'tagger','ner'])

# spaCy raises an error for texts longer than nlp.max_length (1,000,000
# characters by default), so the limit is raised explicitly.
nlp.max_length = 1198623

In [3]:
# function to remove unnecessary tokens (punctuation, whitespace) that could cause overfitting

def clean_text(doc):
    """Tokenize ``doc`` with the module-level ``nlp`` pipeline and return lower-cased tokens.

    A token is dropped when its text occurs inside the punctuation/whitespace
    string below. Because ``in`` on a string is a substring test, this removes
    single punctuation marks as well as any multi-character run that appears
    contiguously in that string (for example ``--`` or a double newline).

    Returns:
        list of str: the surviving token texts, lower-cased, in document order.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for token in nlp(doc):
        # skip punctuation / whitespace tokens
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept

In [4]:
# tokenize and clean the full corpus
tokens = clean_text(doc)

In [5]:
# number of tokens left after cleaning
len(tokens)

11394

In [6]:
# Build overlapping windows of tokens with a stride of one word.
# Each window holds 25 input words followed by the one word to predict.
train_len = 25+1

# Window i covers tokens[i - train_len : i].
# NOTE: the range stops at len(tokens) - 1, so the window that ends on the
# very last token is not produced.
text_sequences = [
    tokens[end - train_len:end]
    for end in range(train_len, len(tokens))
]

In [7]:
# number of training windows that were built
len(text_sequences)

11368

In [8]:
# inspect the first window (train_len tokens)
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [9]:
# integer encoding words in sequences:
# fit the tokenizer on the token windows to build its word -> id vocabulary,
# then replace every word in every window with its integer id
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [10]:
# Every word has now been replaced by its integer id.
# Convert the list of id windows to a 2-D array (one window per row); the
# transposed preview shows each window shifted one word over from the previous one.
sequences = np.array(sequences)
sequences[:10].T

array([[ 964,   14,  265,   51,  263,  416,   87,  222,  129,  111],
       [  14,  265,   51,  263,  416,   87,  222,  129,  111,  962],
       [ 265,   51,  263,  416,   87,  222,  129,  111,  962,  262],
       [  51,  263,  416,   87,  222,  129,  111,  962,  262,   50],
       [ 263,  416,   87,  222,  129,  111,  962,  262,   50,   43],
       [ 416,   87,  222,  129,  111,  962,  262,   50,   43,   37],
       [  87,  222,  129,  111,  962,  262,   50,   43,   37,  321],
       [ 222,  129,  111,  962,  262,   50,   43,   37,  321,    7],
       [ 129,  111,  962,  262,   50,   43,   37,  321,    7,   23],
       [ 111,  962,  262,   50,   43,   37,  321,    7,   23,  555],
       [ 962,  262,   50,   43,   37,  321,    7,   23,  555,    3],
       [ 262,   50,   43,   37,  321,    7,   23,  555,    3,  150],
       [  50,   43,   37,  321,    7,   23,  555,    3,  150,  261],
       [  43,   37,  321,    7,   23,  555,    3,  150,  261,    6],
       [  37,  321,    7,   23,  5

In [11]:
# splitting training and target sequences:
# X keeps every column except the last (the input words),
# y is the last column (the word to predict)
X = sequences[:,:-1]
y = sequences[:,-1]

In [13]:
# number of distinct words the tokenizer counted while fitting
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2709

In [14]:
# One-hot encode the target word ids. Keras reserves index 0 for padding,
# so one class more than the vocabulary size is required.
# The targets are taken straight from `sequences` rather than from `y` itself,
# which keeps this cell idempotent: re-running it no longer one-hot encodes
# an already one-hot matrix.
y = to_categorical(sequences[:, -1], num_classes=vocabulary_size+1)

In [15]:
seq_len = X.shape[1] # number of input words per sample (25 here: train_len minus the target word)

In [16]:
def create_model(vocabulary_size, seq_len):
    """Build, compile and summarise the next-word prediction network.

    The stack is Embedding -> LSTM (returning sequences) -> LSTM ->
    Dense(relu) -> Dense(softmax). All layer widths are derived from
    ``seq_len``; the embedding input and the softmax output both have
    ``vocabulary_size + 1`` entries.

    Args:
        vocabulary_size: number of distinct words known to the tokenizer.
        seq_len: number of input words per sample.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    # one extra slot on top of the vocabulary
    n_classes = vocabulary_size+1

    stack = [
        Embedding(input_dim=n_classes, output_dim=seq_len, input_length=seq_len),
        LSTM(seq_len*2, return_sequences=True),
        LSTM(seq_len*2),
        Dense(seq_len*6, activation='relu'),
        Dense(n_classes, activation='softmax'),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [17]:
# build the network for this vocabulary and input length (also prints the summary)
model = create_model(vocabulary_size,seq_len)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            67750     
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 150)               7650      
_________________________________________________________________
dense_2 (Dense)              (None, 2710)              409210    
Total params: 520,010
Trainable params: 520,010
Non-trainable params: 0
_________________________________________________________________


In [19]:
# fit model: 200 passes over the data in mini-batches of 128 samples,
# keeping the returned History object for later inspection
history = model.fit(X, y, batch_size=128, epochs=200,verbose=1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/20

In [20]:
# save the model to file
model.save('epoch_200.h5')

# save the tokenizer; the context manager guarantees the file is flushed and
# closed (passing a bare open() call to dump() leaked the file handle)
with open('tokenizer', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [30]:
# use the first training window, joined back into one string, as the seed
seed_text = ' '.join(text_sequences[0])

In [31]:
# display the seed string
seed_text

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [36]:
# check how the seed string is encoded into integer ids by the tokenizer
tokenizer.texts_to_sequences([seed_text])

[[964,
  14,
  265,
  51,
  263,
  416,
  87,
  222,
  129,
  111,
  962,
  262,
  50,
  43,
  37,
  321,
  7,
  23,
  555,
  3,
  150,
  261,
  6,
  2704,
  14,
  24]]

In [41]:
# function to generate a specific number of words following a seed text

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    """Generate ``num_gen_words`` words that continue ``seed_text``.

    On every step the running text is encoded with ``tokenizer``, padded or
    truncated to ``seq_len`` ids (``truncating='pre'`` drops the oldest words),
    and fed to ``model``. The most probable word is appended to the running
    text and used as context for the next step.

    Args:
        model: trained network whose ``predict`` returns one row of class
            probabilities per input row.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        seq_len: number of word ids the model expects as input.
        seed_text: text the generation starts from.
        num_gen_words: how many words to generate.

    Returns:
        str: the generated words joined by single spaces (the seed text is
        not included); an empty string when ``num_gen_words`` is 0.
    """
    output_text = []

    for _ in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([seed_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # `Sequential.predict_classes` no longer exists in recent Keras
        # releases; taking the argmax of `predict` works on old and new versions.
        probs = np.asarray(model.predict(pad_encoded, verbose=0))[0]

        # Class 0 is the padding slot and has no entry in `index_word`, so it
        # is excluded from the argmax (hence the +1 offset) to avoid a KeyError.
        pred_word_ind = int(np.argmax(probs[1:])) + 1
        pred_word = tokenizer.index_word[pred_word_ind]

        # extend the context so the next prediction sees the new word
        seed_text += ' ' + pred_word
        output_text.append(pred_word)

    return ' '.join(output_text)

In [42]:
# generate 25 new words that continue the seed text
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'shore i thought i would sail about you inquire the heavy footfall of the passage and you her harpooneer would be not seen i be'