In [1]:
#!pip install regex
#!pip install tensorflow

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [4]:
file = open('1661-0.txt', encoding="utf8")

#store file in a list
lines = []
for i in file:
    lines.append(i)

#Convert list to string
data = ""
for i in lines:
    data = ' '.join(lines)
    
#Replace unnecessary stuff with space
data = data.replace('\n','').replace('\r','').replace('\ufeff','').replace('"','')

#Remove unnecessary space
data = data.split()
data = ' '.join(data)
data[:500]

"Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: The Adventures of Sherlock Holmes Author: Arthur Conan Doyle Release Date: November 29, 2002 [EBook #1661] Last Updated: May 20, 2019 Language: English Character set en"

In [5]:
len(data)

578728

## Apply Tokenization

In [23]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

#Saving the tokenizer for predict function
pickle.dump(tokenizer, open('token.pkl','wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[145, 4789, 1, 1020, 4, 128, 34, 45, 611, 2235, 2236, 30, 1021, 15, 23]

In [24]:
len(sequence_data)

111252

In [25]:
vocabulary_size = len(tokenizer.word_index) + 1
print(vocabulary_size)

8931


In [27]:
sequences = []

for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)
    
print("The length of sequences: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The length of sequences:  111249


array([[ 145, 4789,    1, 1020],
       [4789,    1, 1020,    4],
       [   1, 1020,    4,  128],
       [1020,    4,  128,   34],
       [   4,  128,   34,   45],
       [ 128,   34,   45,  611],
       [  34,   45,  611, 2235],
       [  45,  611, 2235, 2236],
       [ 611, 2235, 2236,   30],
       [2235, 2236,   30, 1021]])

In [28]:
X = []
y = []

for i in sequences:
    X.append(i[0:3])
    y.append(i[3])
    
X = np.array(X)
y = np.array(y)

In [29]:
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[ 145 4789    1]
 [4789    1 1020]
 [   1 1020    4]
 [1020    4  128]
 [   4  128   34]
 [ 128   34   45]
 [  34   45  611]
 [  45  611 2235]
 [ 611 2235 2236]
 [2235 2236   30]]
Response:  [1020    4  128   34   45  611 2235 2236   30 1021]


In [30]:
y = to_categorical(y, num_classes=vocabulary_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Creating the model

In [31]:
model = Sequential()
model.add(Embedding(vocabulary_size, 10, input_length=3))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocabulary_size, activation="softmax"))

In [32]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 3, 10)             89310     
                                                                 
 lstm_2 (LSTM)               (None, 3, 1000)           4044000   
                                                                 
 lstm_3 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense_2 (Dense)             (None, 1000)              1001000   
                                                                 
 dense_3 (Dense)             (None, 8931)              8939931   
                                                                 
Total params: 22,078,241
Trainable params: 22,078,241
Non-trainable params: 0
_________________________________________________________________


## Building the model

In [21]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("next_words.h5", monitor="loss", verbose=1, save_best_only=True)
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))
model.fit(X, y, epochs=70, batch_size=64, callbacks=[checkpoint])

Epoch 1/70
Epoch 1: loss improved from inf to 0.76373, saving model to next_words.h5
Epoch 2/70
Epoch 2: loss improved from 0.76373 to 0.66366, saving model to next_words.h5
Epoch 3/70
Epoch 3: loss improved from 0.66366 to 0.64627, saving model to next_words.h5
Epoch 4/70
Epoch 4: loss improved from 0.64627 to 0.62844, saving model to next_words.h5
Epoch 5/70
Epoch 5: loss improved from 0.62844 to 0.61105, saving model to next_words.h5
Epoch 6/70
Epoch 6: loss improved from 0.61105 to 0.59672, saving model to next_words.h5
Epoch 7/70
Epoch 7: loss improved from 0.59672 to 0.58450, saving model to next_words.h5
Epoch 8/70
Epoch 8: loss improved from 0.58450 to 0.56658, saving model to next_words.h5
Epoch 9/70
Epoch 9: loss improved from 0.56658 to 0.56071, saving model to next_words.h5
Epoch 10/70
Epoch 10: loss improved from 0.56071 to 0.55153, saving model to next_words.h5
Epoch 11/70
Epoch 11: loss improved from 0.55153 to 0.54276, saving model to next_words.h5
Epoch 12/70
Epoch 12:

<keras.callbacks.History at 0x7f1e69055d00>

## **Prediction**

In [33]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

#Load the model and tokenizer
model = load_model("next_words.h5")
tokenizer = pickle.load(open("token.pkl", "rb"))

def predict_next_word(model, tokenizer, text):
  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)
  preds = np.argmax(model.predict(sequence))
  predicted_word = " "

  for key, value in tokenizer.word_index.items():
    if value == preds:
      predicted_word = key
      break
  print(predicted_word)
  return predicted_word

In [35]:
while(True):
  text = input("Enter the text: ")

  if text == "0":
    print("Execution Completed")
    break
  
  else:
    try:
      text = text.split(" ")
      text = text[-3:]
      print(text)

      predict_next_word(model, tokenizer, text)
    
    except Exception as e:
      print("Error Occured: ",e)
      continue

Enter the text: of Sherlock Holmes
['of', 'Sherlock', 'Holmes']
by
Enter the text: single man of
['single', 'man', 'of']
the
Enter the text: Sir Williams and 
['Williams', 'and', '']
then
Enter the text: the lady is
['the', 'lady', 'is']
correct
Enter the text: the student is
['the', 'student', 'is']
one
Enter the text: 0
Execution Completed
