In [2]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
import numpy as np
import pickle

In [16]:
file = open('book3.txt')   #processed book
data = file.read()
data = data.replace('\ufeff', "")
data = data[1:-1]
print("first line: " , data[:100])
print("last line: ", data[-100:])
print("Total words in this: ",len(data.split()))

first line:  roject gutenberg's the adventures of sherlock holmes by arthur conan doyle this ebook is for the use
last line:  lapped the hat upon his head it came right over the forehead and settled upon the bridge of his nose
Total words in this:  52217


In [17]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])   #see we passing a list here important 

pickle.dump(tokenizer, open('M2tokenizer.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]  #see here texts to seq returns a list 

# fit_on_texts Updates internal vocabulary based on a list of texts. 
# This method creates the vocabulary index based on word frequency. 
# So if you give it something like, "The cat sat on the mat." 
# It will create a dictionary s.t. word_index["the"] = 1; word_index["cat"] = 2

In [18]:
vocab_size = len(tokenizer.word_index) + 1  #number of unique words as the list is 0 based index add 1
print(vocab_size)

5834


In [19]:
X = []
Y = []
for i in range(0 , len(sequence_data)-4):
    word = []
    for j in range(i , i+3):   
        word.append(sequence_data[j])
    X.append(word)
    Y.append(sequence_data[i+3])


print(tokenizer.sequences_to_texts([X[0]]))
print(X[0])
print(tokenizer.sequences_to_texts([[Y[0]]]))
print(Y[0])

["roject gutenberg's the"]
[2894, 2895, 1]
['adventures']
1003


In [20]:
X = np.array(X)
Y = np.array(Y)
Y = to_categorical(Y, num_classes=vocab_size)
print(Y[0])
print(Y[0][1546])

[0. 0. 0. ... 0. 0. 0.]
0.0


In [21]:
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax")
]
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 3, 10)             58340     
_________________________________________________________________
lstm_2 (LSTM)                (None, 3, 1000)           4044000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_3 (Dense)              (None, 5834)              5839834   
Total params: 18,947,174
Trainable params: 18,947,174
Non-trainable params: 0
_________________________________________________________________


In [22]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
history = model.fit(X , Y , epochs=60, batch_size=64)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [25]:
pickle.dump(history.history, open('M2history.pkl', 'wb'))

In [26]:
model.save('m2.h5')