<a href="https://colab.research.google.com/github/Raiyankhan640/Next-Word-Predictor-using-LSTM/blob/main/Next_Word_Prediction_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# Read the text from the file
with open('dataset.txt', 'r', encoding='utf-8') as file:
    faqs = file.read()

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([faqs])
len(tokenizer.word_index)

1088

In [6]:
#tokenizer.word_index

In [7]:
#converting the dataset for applying supervised algorithm
input_sequences = []
for sentence in faqs.split('\n'):
        tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

        for i in range(1, len(tokenized_sentence)):
            n_gram_sequence = tokenized_sentence[:i+1]
            input_sequences.append(n_gram_sequence)

In [8]:
# input_sequences

In [9]:
#applying zero padding to have an equal number in every vector
print(f"Total sequences: {len(input_sequences)}")
max_len = max([len(x) for x in input_sequences])
print(f"Maximum sequence length: {max_len}")


Total sequences: 3401
Maximum sequence length: 13


In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding = 'pre')

In [11]:
padded_input_sequences

array([[   0,    0,    0, ...,    0,  567,  568],
       [   0,    0,    0, ...,    0,  387,  299],
       [   0,    0,    0, ...,  387,  299,   35],
       ...,
       [   0,    0,    0, ...,    0,  284,  195],
       [   0,    0,    0, ...,  284,  195,  380],
       [   0,    0,    0, ...,  195,  380, 1088]], dtype=int32)

In [12]:
#Now sepearated the labels
X = padded_input_sequences[:, :-1]
y = padded_input_sequences[:, -1]

In [13]:
#Now apply one hot encoding on y for multiclass classification
from tensorflow.keras.utils import to_categorical
y = to_categorical(y, num_classes = len(tokenizer.word_index) + 1)

In [14]:
y.shape

(3401, 1089)

In [15]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [16]:
#Apply LSTM
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [17]:
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 1089


In [18]:
max_len = max([len(x) for x in input_sequences])  # longest sequence length
max_len

13

In [19]:
#building the model
model = Sequential()
model.add(Embedding(vocab_size, 100))
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150))
model.add(Dense(vocab_size, activation='softmax'))

In [20]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_len -1)) # Build the model with input shape
model.summary()

In [23]:
#train the model
model.fit(X, y, epochs=1000, verbose=0)

<keras.src.callbacks.history.History at 0x7ec1237a2120>

In [26]:
#Predict
import time, numpy as np
text = "Artificial intelligence is"

for i in range(10):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding
  padded_token_text = pad_sequences([token_text], maxlen=56, padding='pre')
  # predict
  pos = np.argmax(model.predict(padded_token_text))

  for word,index in tokenizer.word_index.items():
    if index == pos:
      text = text + " " + word
      print(text)
      time.sleep(2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Artificial intelligence is transforming
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Artificial intelligence is transforming the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Artificial intelligence is transforming the world
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Artificial intelligence is transforming the world of
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Artificial intelligence is transforming the world of technology
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Artificial intelligence is transforming the world of technology and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Artificial intelligence is transforming the world of technology and business
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Artificial intelligence is t