# Next Word Prediction
<img src="predict.png" width="700px">

### Importing Important Libraries

In [1]:
!pip3 install tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os
from tensorflow.keras.models import load_model



### Importing Dataset

In [2]:
# Read the raw corpus into a list of lines. The context manager guarantees the
# file handle is closed even if reading fails part-way through.
with open("1661-0.txt", "r", encoding = "utf8") as file:
    lines = list(file)

print("The First Line: ", lines[0])
print("The Last Line: ", lines[-1])

The First Line:  ﻿Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle

The Last Line:  subscribe to our email newsletter to hear about new eBooks.


### Data Cleaning

In [3]:
# Merge every line into one string. A single join is enough: repeating it once
# per line (as a loop would) yields the same string at quadratic cost.
data = ' '.join(lines)

# Drop line terminators and the UTF-8 byte-order mark left at the start of the file.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data[:360]

"Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle  This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever.  You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net   Title: The Adventures of Sherlo"

In [4]:
import string

# Translation table that turns each ASCII punctuation character into one space,
# so words glued together by punctuation (e.g. "re-use") become separate tokens.
blanks = ' ' * len(string.punctuation)
translator = str.maketrans(string.punctuation, blanks)

# NOTE(review): no cell visible after this one reads `new_data` - confirm
# whether the punctuation-free text was meant to feed the following steps.
new_data = data.translate(translator)

new_data[:500]

'Project Gutenberg s The Adventures of Sherlock Holmes  by Arthur Conan Doyle  This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever   You may copy it  give it away or re use it under the terms of the Project Gutenberg License included with this eBook or online at www gutenberg net   Title  The Adventures of Sherlock Holmes  Author  Arthur Conan Doyle  Release Date  November 29  2002  EBook  1661  Last Updated  May 20  2019  Language  English  Characte'

In [5]:
# Keep only the first occurrence of every whitespace-separated word, preserving
# order. dict.fromkeys does this in linear time; testing membership against a
# growing list is quadratic on a book-sized corpus.
# NOTE(review): this drops every repeated occurrence of a word (the comparison
# is case- and punctuation-sensitive), so the text no longer reads as natural
# sentences - confirm this is intended for next-word training.
z = list(dict.fromkeys(data.split()))

data = ' '.join(z)
data[:500]

"Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle This eBook is for the use anyone anywhere at no cost and with almost restrictions whatsoever. You may copy it, give it away or re-use under terms Gutenberg License included this online www.gutenberg.net Title: Holmes Author: Release Date: November 29, 2002 [EBook #1661] Last Updated: May 20, 2019 Language: English Character set encoding: UTF-8 *** START OF THIS PROJECT GUTENBERG EBOOK THE ADVENTURES SHERLOCK HOLMES Prod"

### Data Tokenization

In [6]:
# Fit the word -> integer index mapping on the cleaned text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function.
# The context manager flushes and closes the pickle file deterministically.
with open('tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Encode the whole text as one flat list of word indices.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[37, 114, 4, 38, 18, 39, 19, 40, 115, 116]

In [7]:
# Keras' Tokenizer numbers words starting at 1 (index 0 is reserved), so the
# embedding and output layers need one slot more than the number of words.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

791


In [8]:
# Slide a window of two tokens over the encoded text: each pair is
# (current word index, following word index).
sequences = [
    sequence_data[end - 1:end + 1]
    for end in range(1, len(sequence_data))
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  995


array([[ 37, 114],
       [114,   4],
       [  4,  38],
       [ 38,  18],
       [ 18,  39],
       [ 39,  19],
       [ 19,  40],
       [ 40, 115],
       [115, 116],
       [116, 117]])

In [9]:
# First element of each pair is the model input, second is the word to predict.
X = np.array([pair[0] for pair in sequences])
y = np.array([pair[1] for pair in sequences])

print("The Data is: ", X[:5])
print("The responses are: ", y[:5])

The Data is:  [ 37 114   4  38  18]
The responses are:  [114   4  38  18  39]


In [10]:
# One-hot encode the target word indices (one column per vocabulary entry).
# The ndim guard keeps this cell idempotent: re-running it on the already
# encoded 2-D matrix would otherwise encode it a second time.
if y.ndim == 1:
    y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Data Modelling

In [11]:
# Layer stack: embedding of a single word index -> two stacked LSTMs ->
# dense ReLU layer -> softmax distribution over the vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=1),
    # return_sequences=True so the second LSTM receives a sequence as input.
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [12]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             7910      
_________________________________________________________________
lstm (LSTM)                  (None, 1, 1000)           4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 791)               791791    
Total params: 13,848,701
Trainable params: 13,848,701
Non-trainable params: 0
_________________________________________________________________


### Callbacks

In [13]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard

# Save the model to nextword1.h5 each time the training loss improves.
checkpoint = ModelCheckpoint(
    "nextword1.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

# Scale the learning rate by 0.2 when the loss has not improved for 3 epochs,
# with 0.0001 as the lower bound.
reduce = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=3,
    min_lr=0.0001,
    verbose = 1,
)

# Write training logs for TensorBoard into the `logsnextword1` directory.
logdir='logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

### Model Compiling

In [14]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=0.001),
)

### Model Fitting

In [15]:
# Train on the (word, next word) pairs; the callbacks checkpoint the best
# model, lower the learning rate on plateaus and log for TensorBoard.
model.fit(
    X,
    y,
    epochs=200,
    batch_size=70,
    callbacks=[checkpoint, reduce, tensorboard_Visualization],
)

Epoch 1/200

Epoch 00001: loss improved from inf to 6.67580, saving model to nextword1.h5
Epoch 2/200

Epoch 00002: loss improved from 6.67580 to 6.66584, saving model to nextword1.h5
Epoch 3/200

Epoch 00003: loss improved from 6.66584 to 6.63284, saving model to nextword1.h5
Epoch 4/200

Epoch 00004: loss improved from 6.63284 to 6.52332, saving model to nextword1.h5
Epoch 5/200

Epoch 00005: loss improved from 6.52332 to 6.37722, saving model to nextword1.h5
Epoch 6/200

Epoch 00006: loss improved from 6.37722 to 6.25465, saving model to nextword1.h5
Epoch 7/200

Epoch 00007: loss improved from 6.25465 to 6.05856, saving model to nextword1.h5
Epoch 8/200

Epoch 00008: loss improved from 6.05856 to 5.92299, saving model to nextword1.h5
Epoch 9/200

Epoch 00009: loss improved from 5.92299 to 5.81981, saving model to nextword1.h5
Epoch 10/200

Epoch 00010: loss improved from 5.81981 to 5.75998, saving model to nextword1.h5
Epoch 11/200

Epoch 00011: loss improved from 5.75998 to 5.7026

<keras.callbacks.History at 0x7fb65915bc50>

### Next Word Prediction Function

In [16]:
# Load the model and tokenizer

model = load_model('nextword1.h5')

# NOTE: pickle.load can execute arbitrary code embedded in the file - only load
# a tokenizer pickle that this notebook (or another trusted source) produced.
# The context manager closes the file handle once the tokenizer is loaded.
with open('tokenizer1.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

def Predict_Next_Words(model, tokenizer, text):
    """Predict and print the word most likely to follow ``text``.

    The text is encoded with ``tokenizer`` and only its last known word is fed
    to ``model``, because the network in this notebook is built with
    ``input_length=1`` (one word index per sample).

    Args:
        model: Object exposing ``predict(array)`` that returns one row of
            per-word scores for each input sample.
        tokenizer: Object exposing ``texts_to_sequences`` and ``word_index``
            (word -> integer index).
        text: Word or phrase typed by the user.

    Returns:
        The predicted next word, or ``""`` when ``text`` contains no word known
        to the tokenizer or the predicted index has no word assigned.
    """
    sequence = tokenizer.texts_to_sequences([text])[0]
    if len(sequence) == 0:
        # Nothing to feed the model: every word was unknown to the tokenizer.
        print("No known word found in the input; cannot predict a next word.")
        return ""

    # Shape (1, 1): a batch of one sample holding one word index.
    model_input = np.array([[sequence[-1]]])
    scores = model.predict(model_input)

    # Reduce to a plain int so the lookup below compares scalars, not arrays.
    predicted_index = int(np.argmax(scores, axis=1)[0])

    # Reverse lookup index -> word; falls back to "" if the index is unassigned.
    predicted_word = next(
        (word for word, index in tokenizer.word_index.items() if index == predicted_index),
        "",
    )
    print("Next predicted word for the given set of words or phrase is: ",predicted_word)
    return predicted_word

### Next Word Prediction Demo

In [18]:
print("**************************************Start of Next Word Prediction***********************************************")
print(" ")
while(True):

    text = input("Enter some words or  a phrase: ")

    # Typing this exact phrase is the only way to leave the loop.
    if text == "Stop the Prediction":
        print(" ")
        print("**************************************End of Next Word Prediction***********************************************")
        break

    # split() without arguments ignores leading/trailing/repeated whitespace,
    # so "hello " still yields "hello" as the last word.
    words = text.split()
    if not words:
        print("Please enter at least one word.")
        continue

    try:
        # The model was trained on single-word inputs, so only the last word
        # of the phrase is used for the prediction.
        Predict_Next_Words(model, tokenizer, words[-1])
    except Exception as error:
        # Report the failure and keep prompting. Catching Exception (not a
        # bare except) lets KeyboardInterrupt still stop the loop.
        print("Could not predict a next word: ", error)

**************************************Start of Next Word Prediction***********************************************
 
Enter some words or  a phrase: Hotel Cosmopolitan Jewel
Next predicted word for the given set of words or phrase is:  robbery
Enter some words or  a phrase: The Adventures of Sherlock
Next predicted word for the given set of words or phrase is:  holmes
Enter some words or  a phrase: General Information About Project
Next predicted word for the given set of words or phrase is:  gutenberg
Enter some words or  a phrase: Stop the Prediction
 
**************************************End of Next Word Prediction***********************************************


### Thus, the next word is predicted based on a set of words or phrases.