<a href="https://colab.research.google.com/github/VikasSharma707/LGMVIP-Data-Science/blob/main/NextWordPredictionModel_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LGM-VIP Data Science Internship Programme 2021**
## **ADVANCED LEVEL TASK**
# **Name: Next Word Prediction**

**Libraries**

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [None]:
# Location of the plain-text training corpus (relative path).
path = "nextword.txt"

**Preprocessing the Data**

In [None]:
# Read the corpus into a list of lines. The context manager guarantees the
# file handle is closed, even if reading raises.
with open(path, "r", encoding="utf8") as file:
    lines = file.readlines()

# Join the lines into one string with a single pass over the list.
data = " ".join(lines)

# Delete line breaks, carriage returns, the byte-order mark and curly quotes.
# The lines were joined with a space above, so deleting "\n" cannot glue the
# last word of one line to the first word of the next.
for unwanted in ("\n", "\r", "\ufeff", "“", "”"):
    data = data.replace(unwanted, "")

# Collapse every run of whitespace into a single space.
data = " ".join(data.split())
data[:200]

"Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give i"

In [None]:
# Number of characters in the cleaned corpus string.
print(len(data))

573660


In [None]:
# num_words=None keeps the full vocabulary. Passing the corpus length in
# characters here would only act as a cap on word ids, which is never the
# intent and would map real words to <OOV> on a very small corpus.
tokenizer = Tokenizer(num_words=None, oov_token='<OOV>')
tokenizer.fit_on_texts([data])

# Save the fitted tokenizer so the prediction step can reuse the same word index.
# The context manager makes sure the pickle is flushed and the handle closed.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# texts_to_sequences returns one id list per input text; there is a single
# text here, so take element 0.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[143, 4681, 2, 987, 6, 126, 34, 47, 557, 2165, 2166, 28, 988, 15, 23]

In [None]:
# Number of token ids the tokenizer produced for the whole corpus.
print(len(sequence_data))

108958


In [None]:
# One slot per entry in the tokenizer's word index, plus one extra.
# NOTE(review): the extra slot presumably covers id 0, which the Keras
# tokenizer does not assign to any word — confirm against the Keras docs.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

8625


In [None]:
# Slide a window of four consecutive token ids over the corpus: the first
# three ids form the context, the fourth is the word to be predicted.
sequences = [
    sequence_data[end - 3:end + 1]
    for end in range(3, len(sequence_data))
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  108955


array([[ 143, 4681,    2,  987],
       [4681,    2,  987,    6],
       [   2,  987,    6,  126],
       [ 987,    6,  126,   34],
       [   6,  126,   34,   47],
       [ 126,   34,   47,  557],
       [  34,   47,  557, 2165],
       [  47,  557, 2165, 2166],
       [ 557, 2165, 2166,   28],
       [2165, 2166,   28,  988]])

In [None]:
# Split every four-id window into its three-id context (X) and its last id (y).
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [None]:
# Preview the first ten context windows and the word ids they should predict.
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[ 143 4681    2]
 [4681    2  987]
 [   2  987    6]
 [ 987    6  126]
 [   6  126   34]
 [ 126   34   47]
 [  34   47  557]
 [  47  557 2165]
 [ 557 2165 2166]
 [2165 2166   28]]
Response:  [ 987    6  126   34   47  557 2165 2166   28  988]


In [None]:
# One-hot encode the target ids (one column per vocabulary entry).
# The ndim guard keeps this cell safe to re-run: encoding targets that are
# already one-hot would add a third axis of size vocab_size and exhaust memory.
if y.ndim == 1:
    y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

**MODEL**

In [None]:
# Next-word classifier: 10-dimensional embeddings of a three-token context,
# two stacked 1000-unit LSTMs, a 1000-unit ReLU layer, and a softmax over
# the vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),
    LSTM(1000, return_sequences=True),  # emit every timestep for the next LSTM
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             86250     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 8625)              8633625   
                                                                 
Total params: 21,768,875
Trainable params: 21,768,875
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

model.compile(optimizer=Adam(learning_rate=0.001), loss="categorical_crossentropy")

# Write next_words.h5 only on epochs where the monitored training loss improves.
checkpoint = ModelCheckpoint(
    filepath="next_words.h5",
    monitor="loss",
    save_best_only=True,
    verbose=1,
)
model.fit(X, y, batch_size=64, epochs=20, callbacks=[checkpoint])

Epoch 1/20
Epoch 00001: loss improved from inf to 6.38954, saving model to next_words.h5
Epoch 2/20
Epoch 00002: loss improved from 6.38954 to 5.80273, saving model to next_words.h5
Epoch 3/20
Epoch 00003: loss improved from 5.80273 to 5.47527, saving model to next_words.h5
Epoch 4/20
Epoch 00004: loss improved from 5.47527 to 5.20780, saving model to next_words.h5
Epoch 5/20
Epoch 00005: loss improved from 5.20780 to 4.97880, saving model to next_words.h5
Epoch 6/20
Epoch 00006: loss improved from 4.97880 to 4.75281, saving model to next_words.h5
Epoch 7/20
Epoch 00007: loss improved from 4.75281 to 4.51912, saving model to next_words.h5
Epoch 8/20
Epoch 00008: loss improved from 4.51912 to 4.27710, saving model to next_words.h5
Epoch 9/20
Epoch 00009: loss improved from 4.27710 to 4.02094, saving model to next_words.h5
Epoch 10/20
Epoch 00010: loss improved from 4.02094 to 3.76171, saving model to next_words.h5
Epoch 11/20
Epoch 00011: loss improved from 3.76171 to 3.50096, saving mo

<keras.callbacks.History at 0x7f357a9af250>

**Prediction**

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the saved model checkpoint and the pickled tokenizer from disk.
model = load_model('next_words.h5')

# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load a token.pkl that you produced yourself.
# The context manager closes the file handle once the tokenizer is read.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text):
    """Predict, print and return the word most likely to follow ``text``.

    Args:
        model: Object whose ``predict`` method takes an array of word ids
            (one row per sample) and returns per-word scores.
        tokenizer: Object exposing ``texts_to_sequences`` and a ``word_index``
            mapping of word -> integer id.
        text: The context to continue; handed to the tokenizer as one sample.

    Returns:
        The word whose id has the highest score. Returns ``""`` when the
        context produces no token ids, or when the winning id has no entry
        in ``tokenizer.word_index``.
    """
    sequence = np.array(tokenizer.texts_to_sequences([text]))

    if sequence.size == 0:
        # Nothing to feed the model; skip predict() instead of letting it
        # fail with an opaque shape error.
        predicted_word = ""
    else:
        # argmax over the flattened scores gives the winning word id for
        # the single sample that was passed in.
        preds = np.argmax(model.predict(sequence))
        # Reverse lookup id -> word; falls back to "" if the id is unassigned.
        predicted_word = next(
            (word for word, index in tokenizer.word_index.items() if index == preds),
            "",
        )

    print(predicted_word)
    return predicted_word

In [None]:
# Interactive prompt: keep predicting the next word until the user enters "0".
while True:
    text = input("Enter your line: ")

    # "0" is the sentinel that ends the session.
    if text == "0":
        print("Execution completed.....")
        break

    try:
        # split() with no argument ignores leading, trailing and repeated
        # whitespace, so no empty-string tokens reach the tokenizer.
        text = text.split()
        if not text:
            print("Please enter at least one word.")
            continue

        # Only the last three words are used as context, matching the
        # three-token windows the model is fed elsewhere in this notebook.
        text = text[-3:]
        print(text)

        Predict_Next_Words(model, tokenizer, text)

    except Exception as e:
        # Report the failure and keep prompting instead of ending the session.
        print("Error occurred: ", e)

['Adventures']
of
['Arthur', 'Conan']
i
