<a href="https://colab.research.google.com/github/Onkar-stac/Next-Word-Predictor/blob/main/nextwordpredictor_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import library
import re
import requests
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [None]:
# Source text: the plain-text file of Project Gutenberg ebook #5200.
# It is downloaded over HTTP by get_book().
url = "https://www.gutenberg.org/cache/epub/5200/pg5200.txt"

In [None]:
#function to extract data from the url
def get_book(url):
  """Download a Project Gutenberg text file and return its first chapter.

  The returned slice starts right after the "*** START OF ... PROJECT
  GUTENBERG EBOOK ... ***" banner and stops just before the first "II"
  that follows it (the heading of the second chapter).

  Args:
    url: address of the plain-text ebook.

  Returns:
    The text between the start banner and the "II" marker.

  Raises:
    requests.HTTPError: if the server answers with an error status.
    ValueError: if the start banner or the "II" marker cannot be found.
  """
  # A timeout keeps the notebook from hanging on a stalled connection, and
  # raise_for_status() stops an HTML error page from being parsed as the book.
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  raw = response.text

  #discarding the beginning of the data
  # Both banner wordings ("THIS" / "THE") are accepted.
  start_match = re.search(
    r"\*\*\* START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK .* \*\*\*", raw)
  if start_match is None:
    raise ValueError("Project Gutenberg start banner not found in " + url)
  start = start_match.end()

  #discarding the end of the data
  # Search only after the banner: an "II" in the header (e.g. inside "ASCII")
  # would otherwise put the stop position before the start and yield "".
  stop_match = re.compile(r"II").search(raw, start)
  if stop_match is None:
    raise ValueError("Chapter marker 'II' not found after the start banner in " + url)

  return raw[start:stop_match.start()] #relevant data

In [None]:
#processing
def preprocess(sentence):
  """Normalise raw text before tokenisation.

  Every run of characters other than ASCII letters, digits and '.' is
  replaced by a single space; the result is then lower-cased.

  Args:
    sentence: the text to clean.

  Returns:
    The cleaned, lower-case string.
  """
  collapsed = re.sub('[^A-Za-z0-9.]+',' ',sentence)
  return collapsed.lower()

In [None]:
#calling the functions
book = get_book(url)
processed_book=preprocess(book)
# Show only the opening of the corpus: printing all of it floods the cell
# output and bloats the saved notebook.
print(processed_book[:500])



In [None]:
# Size of the cleaned corpus in characters (processed_book is a str).
len(processed_book)

37409

In [None]:
# EDA
# Count the standalone word "the". The \b anchors keep matches inside longer
# words ("other", "then") out of the tally, and printing the number makes it
# visible: a bare expression that is not the last line of a cell shows nothing.
print("occurrences of 'the':", len(re.findall(r'\bthe\b',processed_book)))
# Capitalise the lone pronoun "i" when it is surrounded by whitespace.
processed_book = re.sub(r'\si\s', " I ",processed_book)
# Replace every character that is neither a word character nor whitespace
# with a space.
processed_book = re.sub(r'[^\w\s]'," ",processed_book)
# Preview only; the full corpus is too long to be useful as cell output.
print(processed_book[:500])



In [None]:
# Learn the word -> id vocabulary from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([processed_book])
#saving the tokenizer
# The context manager closes token.pkl as soon as the dump finishes, so the
# file is completely written before anything reads it back.
with open('token.pkl','wb') as token_file:
  pickle.dump(tokenizer,token_file)
sequence_data = tokenizer.texts_to_sequences([processed_book])[0] #vectorize the text corpus
sequence_data[:15]

[642, 643, 644, 421, 422, 645, 646, 647, 648, 75, 421, 422, 15, 57, 163]

In [None]:
# Number of word ids in the encoded corpus.
len(sequence_data)

7275

In [None]:
# The Keras Tokenizer never assigns id 0 to a word, so the model needs one
# more class than there are distinct words in the corpus.
num_distinct_words = len(tokenizer.word_index)
vocab_size = num_distinct_words + 1
print(vocab_size)

1355


In [None]:
# Slide a window of four consecutive word ids over the corpus: the first three
# ids of each window are the context, the fourth is the word to predict.
sequences = [sequence_data[end-3:end+1] for end in range(3,len(sequence_data))]
print("The length of sequences are: ",len(sequences))
sequences = np.array(sequences)
sequences[:10]

The length of sequences are:  7272


array([[642, 643, 644, 421],
       [643, 644, 421, 422],
       [644, 421, 422, 645],
       [421, 422, 645, 646],
       [422, 645, 646, 647],
       [645, 646, 647, 648],
       [646, 647, 648,  75],
       [647, 648,  75, 421],
       [648,  75, 421, 422],
       [ 75, 421, 422,  15]])

In [None]:
# Split every 4-id window into its 3-id context (X) and its target id (Y).
X=np.array([window[0:3] for window in sequences])
Y=np.array([window[3] for window in sequences])


In [None]:
# One-hot encode the target ids. They are read straight from the last column
# of `sequences` rather than from Y itself, so re-running this cell always
# gives the same result; encoding an already-encoded Y would add another axis.
Y=to_categorical(sequences[:,3],num_classes=vocab_size)
Y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Next-word classifier: 3 word ids -> 10-dimensional embeddings -> LSTM ->
# dropout -> softmax over the whole vocabulary.
model = Sequential([
  Embedding(vocab_size,10,input_length=3),
  LSTM(1000),
  Dropout(0.2),
  Dense(vocab_size,activation="softmax"),
])

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             13550     
                                                                 
 lstm (LSTM)                 (None, 1000)              4044000   
                                                                 
 dropout (Dropout)           (None, 1000)              0         
                                                                 
 dense (Dense)               (None, 1355)              1356355   
                                                                 
Total params: 5,413,905
Trainable params: 5,413,905
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer=Adam(learning_rate=0.001),loss="categorical_crossentropy")

# Every checkpoint file name carries the epoch number and the training loss;
# a file is written only when the training loss reaches a new minimum.
file_name_path="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
  file_name_path,
  monitor='loss',
  mode='min',
  save_best_only=True,
  verbose=1,
)
callbacks = [checkpoint]

In [None]:
# Train for 50 epochs on the (context, one-hot target) pairs. The checkpoint
# callback saves the model each time the training loss improves.
model.fit(X, Y, epochs=50, batch_size=64, callbacks=callbacks) 

Epoch 1/50
Epoch 00001: loss improved from inf to 6.29201, saving model to weights-improvement-01-6.2920.hdf5
Epoch 2/50
Epoch 00002: loss improved from 6.29201 to 5.89983, saving model to weights-improvement-02-5.8998.hdf5
Epoch 3/50
Epoch 00003: loss improved from 5.89983 to 5.85193, saving model to weights-improvement-03-5.8519.hdf5
Epoch 4/50
Epoch 00004: loss improved from 5.85193 to 5.78508, saving model to weights-improvement-04-5.7851.hdf5
Epoch 5/50
Epoch 00005: loss improved from 5.78508 to 5.67312, saving model to weights-improvement-05-5.6731.hdf5
Epoch 6/50
Epoch 00006: loss improved from 5.67312 to 5.50373, saving model to weights-improvement-06-5.5037.hdf5
Epoch 7/50
Epoch 00007: loss improved from 5.50373 to 5.32038, saving model to weights-improvement-07-5.3204.hdf5
Epoch 8/50
Epoch 00008: loss improved from 5.32038 to 5.14452, saving model to weights-improvement-08-5.1445.hdf5
Epoch 9/50
Epoch 00009: loss improved from 5.14452 to 4.96295, saving model to weights-impro

<keras.callbacks.History at 0x7fd7c2ff7410>

In [None]:
from tensorflow.keras.models import load_model
import glob
import os
import numpy as np
import pickle

# Checkpoint names embed the epoch and the loss, so they differ from one
# training run to the next; a hard-coded name only works for a single run.
checkpoint_paths = glob.glob('weights-improvement-*.hdf5')
if not checkpoint_paths:
  raise FileNotFoundError(
    "No 'weights-improvement-*.hdf5' checkpoint found; run the training cell first.")
# Checkpoints are written only when the loss improves, so the most recently
# written file is the best model of the latest run.
latest_checkpoint = max(checkpoint_paths, key=os.path.getmtime)
model = load_model(latest_checkpoint)

# NOTE: unpickling can execute arbitrary code. Only load a token.pkl that this
# notebook produced itself.
with open('token.pkl','rb') as token_file:
  tokenizer = pickle.load(token_file)

def Predict_Next_Words(model,tokenizer,text):
  """Predict the word most likely to follow ``text`` and print it.

  Args:
    model: object whose ``predict`` takes a 2-D array of word ids (one row per
      sample) and returns one row of per-class scores per sample.
    tokenizer: fitted tokenizer providing ``texts_to_sequences`` and the
      ``index_word`` (id -> word) mapping.
    text: the context, as a string or as a list of words.

  Returns:
    The predicted word. An empty string is returned when none of the context
    words is known to the tokenizer, or when the predicted id has no word
    attached to it.
  """
  sequence = tokenizer.texts_to_sequences([text])
  # Words the tokenizer has never seen are dropped. With nothing left there is
  # no context to feed the model, so stop here instead of calling predict on
  # an empty array.
  # NOTE(review): a context with fewer known words than the model's window is
  # still passed through unchanged - confirm the model accepts that.
  if not sequence or len(sequence[0]) == 0:
    print("No known words in input")
    return ""
  sequence = np.array(sequence)
  # One sample in, one row of scores out: take the best class of that row.
  preds = int(np.argmax(model.predict(sequence), axis=-1)[0])
  # Direct id -> word lookup; ids without an entry map to "".
  predicted_word = tokenizer.index_word.get(preds, "")
  print(predicted_word)
  return predicted_word

In [None]:
# Interactive demo: keep predicting until the user enters "0".
while True:
  text = input("Enter: ")
  if text=="0":
    print("Execution completed")
    break
  # split() without an argument collapses runs of whitespace, so repeated or
  # trailing spaces do not produce empty "words".
  words = text.split()
  if not words:
    # Blank input: nothing to predict from, ask again.
    continue
  # The model was trained on three-word contexts, so keep the last three words.
  words = words[-3:]
  print(words)
  Predict_Next_Words(model,tokenizer,words)

Enter: A collection of textile
['collection', 'of', 'textile']
samples
Enter: Samsa was a travelling
['was', 'a', 'travelling']
and
Enter: he had recently cut out of an
['out', 'of', 'an']
illustrated
Enter: 0
Execution completed
