In [4]:
# Read the whole training corpus into memory as a single string.
# The encoding is pinned so the text decodes the same way on every platform
# instead of depending on the locale default (assumes the dataset file is
# UTF-8 -- TODO confirm).
with open('../input/next-word-prediction-dataset/data.txt', 'r', encoding='utf-8') as file:
    data = file.read()

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import re

In [6]:
# Replace every character that is not an ASCII letter or digit with a space,
# then learn the vocabulary from the cleaned corpus and encode the corpus as
# one flat list of integer word ids.
non_alnum = re.compile(r"[^a-zA-Z0-9]")
data = non_alnum.sub(" ", data)

tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# A single text goes in, so a single id sequence comes out.
(preprocessed_data,) = tokenizer.texts_to_sequences([data])

In [7]:
# Show only the beginning of the cleaned corpus; displaying the whole string
# floods the cell output.
data[:500]

In [8]:
# Show the first 20 (word -> id) entries instead of the entire vocabulary.
dict(list(tokenizer.word_index.items())[:20])

In [9]:
# Number of distinct ids the model must handle: one per known word plus one
# extra slot, because the Keras Tokenizer numbers words from 1 and keeps
# index 0 reserved.
vocab_size = 1 + len(tokenizer.word_index)

In [10]:
import numpy as np
# Shape of the encoded corpus (a flat sequence of word ids).
np.asarray(preprocessed_data).shape

In [11]:
def get_training_data(preprocessed_data, seq_len=5):
    """Build sliding-window (context, next-token) training pairs.

    Every run of ``seq_len`` consecutive token ids becomes one input row, and
    the id that immediately follows that run becomes the matching target.

    Parameters
    ----------
    preprocessed_data : sequence of int
        Token ids of the corpus, in reading order.
    seq_len : int, optional
        Number of context tokens per sample. Defaults to 5, the window this
        function previously hard-coded.

    Returns
    -------
    x : np.ndarray, shape (n_samples, seq_len)
        Context windows.
    y : np.ndarray, shape (n_samples,)
        Token id following each window.
        ``n_samples`` is ``max(len(preprocessed_data) - seq_len, 0)``.

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    total = len(preprocessed_data)
    windows = [preprocessed_data[end - seq_len:end] for end in range(seq_len, total)]
    targets = [preprocessed_data[end] for end in range(seq_len, total)]

    # reshape keeps x two-dimensional even when the corpus is too short to
    # yield a single window, so callers can always rely on x.shape[1].
    x = np.array(windows).reshape(-1, seq_len)
    y = np.array(targets)
    return x, y

In [12]:
# Slice the encoded corpus into (context window, next word id) pairs.
(x_train, y_train) = get_training_data(preprocessed_data=preprocessed_data)

In [13]:
# One-hot encode the target word ids for the categorical cross-entropy loss.
# The ndim guard makes the cell safe to re-run: without it a second run would
# one-hot encode the already encoded matrix. vocab_size is reused rather than
# recomputed so the label width always matches the model's output layer.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, num_classes=vocab_size)

In [14]:
# Peek at the first five context windows.
x_train[0:5]

## Model creation

In [15]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,Dropout,LSTM, Embedding
from tensorflow.keras.callbacks import EarlyStopping

In [16]:
# (number of samples, context length)
np.shape(x_train)

In [17]:
# The Embedding layer that opens the model consumes integer ids shaped
# (samples, timesteps) and produces the feature axis itself, so x_train must
# stay two-dimensional -- the same shape Predict_Next_Words feeds at inference.
# The -1 also flattens a stale (samples, timesteps, 1) array left over from an
# earlier run, which keeps this cell safe to re-run.
x_train = np.reshape(x_train, (x_train.shape[0], -1))

In [18]:
# Next-word classifier: 5 word ids -> 10-dimensional embeddings -> two stacked
# LSTMs -> dense hidden layer -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=5))
# return_sequences=True hands the second LSTM one vector per timestep.
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

In [19]:
from tensorflow import keras
# Import plot_model from the public tensorflow.keras.utils namespace: the
# private `keras.utils.vis_utils` path belongs to the standalone keras package
# and is not a stable import location.
from tensorflow.keras.utils import plot_model

# Render the layer graph to plot.png.
plot_model(model, to_file='plot.png', show_layer_names=True)

In [20]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

# Configure the optimisation objective first ...
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)

# ... then keep on disk only the weights with the lowest training loss so far.
checkpoint = ModelCheckpoint(
    filepath="next_words.h5",
    monitor='loss',
    save_best_only=True,
    verbose=1,
)

model.fit(
    x_train,
    y_train,
    batch_size=100,
    epochs=70,
    callbacks=[checkpoint],
)

## Testing

In [31]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer
#model = load_model('next_words.h5')
#tokenizer = pickle.load(open('token.pkl', 'rb'))

def Predict_Next_Words(model, tokenizer, text, seq_len=5):
    """Predict, print and return the word most likely to follow ``text``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(array)`` that returns one score per word
        id for the given batch of id sequences.
    tokenizer : object
        Anything exposing ``texts_to_sequences(list_of_str)`` and a
        ``word_index`` mapping of word -> id.
    text : str or list of str
        The context, either as a raw string or as a list of words.
    seq_len : int, optional
        Positive number of trailing tokens handed to the model. Defaults to
        5, the context length the model in this notebook is built for.

    Returns
    -------
    str
        The predicted word, or ``""`` when the predicted id has no entry in
        ``tokenizer.word_index``.

    Raises
    ------
    ValueError
        If none of the words in ``text`` is known to the tokenizer, so there
        is no context to predict from.
    """
    # Callers pass a list of words from str.split(). The Keras Tokenizer
    # treats a list as already tokenised and skips its own filtering, so a
    # token like "area," would be dropped as unknown. Joining back into one
    # string lets the tokenizer split and filter it like the training text.
    if not isinstance(text, str):
        text = " ".join(text)
    # Mirror the clean-up applied to the training corpus before tokenising.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)

    # Keep only the most recent seq_len known words.
    token_ids = tokenizer.texts_to_sequences([text])[0][-seq_len:]
    if not token_ids:
        raise ValueError("None of the given words is in the tokenizer vocabulary")

    sequence = np.array([token_ids])
    preds = int(np.argmax(model.predict(sequence)))

    # Reverse lookup id -> word; falls back to "" for ids without a word.
    predicted_word = next(
        (word for word, index in tokenizer.word_index.items() if index == preds),
        "",
    )

    print(predicted_word)
    return predicted_word

In [None]:
# Interactive demo: keep asking for a line and predict the word that follows
# its last five words. Entering "0" ends the loop.
while True:
    text = input("Enter your line: ")

    if text == "0":
        print("Execution completed.....")
        break

    try:
        # split() without an argument collapses runs of whitespace, so empty
        # strings no longer take up slots in the five-word context window.
        text = text.split()[-5:]
        print(text)

        Predict_Next_Words(model, tokenizer, text)
    except Exception as e:
        # Report the problem and keep prompting instead of ending the session.
        print("Error occurred: ",e)

In [22]:
# Sentence fragments used to spot-check the trained model.
test_list = [
    'It is the seventh largest country by area  the second',
    'Modern humans arrived on the Indian subcontinent from',
    'Indus Valley Civilisation of the third millennium',
    'proclaiming social orders unlinked to',
]

In [33]:
# Run the model on every sample sentence.
for sentence in test_list:
    try:
        # split() without an argument collapses runs of whitespace, so the
        # double space in the first sample no longer produces an empty token
        # that wastes one of the context slots.
        # The [-6:-1] slice takes the words just before the final one, i.e.
        # the last word is left out of the context.
        # NOTE(review): the interactive loop uses [-5:] instead -- confirm
        # that holding out the final word here is intended.
        text = sentence.split()[-6:-1]
        print(text)

        Predict_Next_Words(model, tokenizer, text)
    except Exception as e:
        # Report the failure and move on to the next sample.
        print("Error occurred: ",e)