# Next Word Prediction using LSTM
Next Word Prediction means predicting the most likely word or phrase that will come next in a sentence or text. It is like a built-in feature of an application that suggests the next word as you type or speak. Next Word Prediction models are used in applications like messaging apps, search engines, virtual assistants, and autocorrect features on smartphones.
To build a Next Word Prediction model:
start by collecting a diverse dataset of text documents,
preprocess the data by cleaning and tokenizing it,
prepare the data by creating input-output pairs,
engineer features such as word embeddings,
select an appropriate model like an LSTM or GPT,
train the model on the dataset while adjusting hyperparameters,
and improve the model by experimenting with different techniques and architectures.

*   start by collecting a diverse dataset of text documents
*   preprocess the data by cleaning and tokenizing it


*   preprocess the data by cleaning and tokenizing it
*   prepare the data by creating input-output pairs


*   engineer features such as word embeddings
*   select an appropriate model like an LSTM


*   train the model on the dataset while adjusting hyperparameters
*   improve the model by experimenting with different techniques and architectures








In [1]:
#import libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [2]:
# Load the whole training corpus from disk into a single string
with open('Data.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [3]:
tokenizer = Tokenizer()  # only creates the tokenizer; its vocabulary is built by the fit_on_texts() call in the next cell

In [4]:
# Build the word -> integer index vocabulary (tokenizer.word_index) from the whole corpus
tokenizer.fit_on_texts([text])

In [5]:
# Vocabulary size used for the Embedding and softmax layers; the +1 leaves room
# for index 0, which the padded sequences use as the filler value
total_words = len(tokenizer.word_index) + 1

In [6]:
total_words

1099

In [7]:
# Build the training examples: for every line of the corpus, emit each prefix
# of two or more tokens. Within a prefix the last token is the word to predict
# and everything before it is the context.
input_sequences = []
for raw_line in text.split('\n'):
    encoded = tokenizer.texts_to_sequences([raw_line])[0]
    input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

In [8]:
input_sequences

[[2, 12],
 [2, 12, 83],
 [2, 12, 83, 62],
 [2, 12, 83, 62, 63],
 [2, 12, 83, 62, 63, 35],
 [2, 12, 83, 62, 63, 35, 84],
 [2, 12, 83, 62, 63, 35, 84, 5],
 [2, 12, 83, 62, 63, 35, 84, 5, 407],
 [2, 12, 83, 62, 63, 35, 84, 5, 407, 408],
 [2, 12, 83, 62, 63, 35, 84, 5, 407, 408, 175],
 [2, 12, 83, 62, 63, 35, 84, 5, 407, 408, 175, 5],
 [2, 12, 83, 62, 63, 35, 84, 5, 407, 408, 175, 5, 108],
 [2, 12, 83, 62, 63, 35, 84, 5, 407, 408, 175, 5, 108, 255],
 [2, 12, 83, 62, 63, 35, 84, 5, 407, 408, 175, 5, 108, 255, 109],
 [2, 12, 83, 62, 63, 35, 84, 5, 407, 408, 175, 5, 108, 255, 109, 32],
 [2, 12, 83, 62, 63, 35, 84, 5, 407, 408, 175, 5, 108, 255, 109, 32, 7],
 [2, 12, 83, 62, 63, 35, 84, 5, 407, 408, 175, 5, 108, 255, 109, 32, 7, 9],
 [2, 12, 83, 62, 63, 35, 84, 5, 407, 408, 175, 5, 108, 255, 109, 32, 7, 9, 51],
 [2,
  12,
  83,
  62,
  63,
  35,
  84,
  5,
  407,
  408,
  175,
  5,
  108,
  255,
  109,
  32,
  7,
  9,
  51,
  256],
 [2,
  12,
  83,
  62,
  63,
  35,
  84,
  5,
  407,
  408,
  

In [9]:
# Length of the longest n-gram; every sequence is left-padded with zeros up to
# this length so they stack into one 2-D array (pad_sequences already returns
# a NumPy array)
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

In [10]:
input_sequences

array([[   0,    0,    0, ...,    0,    2,   12],
       [   0,    0,    0, ...,    2,   12,   83],
       [   0,    0,    0, ...,   12,   83,   62],
       ...,
       [   0,    0,    0, ..., 1097, 1098,  238],
       [   0,    0,    0, ..., 1098,  238,    3],
       [   0,    0,    0, ...,  238,    3,  206]], dtype=int32)

In [11]:
# Each row is [context tokens..., target token]: every column but the last is
# the model input, the last column is the word to predict
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [12]:
X

array([[   0,    0,    0, ...,    0,    0,    2],
       [   0,    0,    0, ...,    0,    2,   12],
       [   0,    0,    0, ...,    2,   12,   83],
       ...,
       [   0,    0,    0, ...,   51, 1097, 1098],
       [   0,    0,    0, ..., 1097, 1098,  238],
       [   0,    0,    0, ..., 1098,  238,    3]], dtype=int32)

In [13]:
y

array([ 12,  83,  62, ..., 238,   3, 206], dtype=int32)

In [14]:
# One-hot encode the target word ids, one column per vocabulary entry
# (to_categorical already returns a NumPy array)
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [15]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [17]:
# Next-word classifier: embed each token id, summarise the context with an
# LSTM, then score every vocabulary entry with a softmax layer
model = Sequential([
    # total_words distinct token ids -> 100-dimensional vectors; each input row
    # holds the max_sequence_len - 1 context tokens
    Embedding(total_words, 100, input_length=max_sequence_len - 1),
    # 150 LSTM units
    LSTM(150),
    # softmax turns the scores into a probability for each word being the next one
    Dense(total_words, activation='softmax'),
])
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 230, 100)          109900    
                                                                 
 lstm_1 (LSTM)               (None, 150)               150600    
                                                                 
 dense_1 (Dense)             (None, 1099)              165949    
                                                                 
Total params: 426449 (1.63 MB)
Trainable params: 426449 (1.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [18]:
# Compile with categorical cross-entropy (the targets are one-hot vectors) and
# the Adam optimizer, then train for 100 epochs with per-epoch progress output
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X, y, verbose=1, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7aaabc241a50>

In [21]:
seed_text = "I have mentioned"
next_words = 10

# Greedy generation: repeatedly encode the text so far, predict the single most
# likely next word, and append it to the text.
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Same left-padding and length as the training inputs
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict() returns one probability row per input row; take the best index
    # of the only row as a plain int rather than a one-element array
    predicted_index = int(np.argmax(model.predict(token_list), axis=-1)[0])
    # Direct index -> word lookup instead of scanning word_index; an index with
    # no word (e.g. the padding id 0) yields an empty string
    output_word = tokenizer.index_word.get(predicted_index, "")
    seed_text += " " + output_word

print(seed_text)  # the seed text followed by the generated words

I have mentioned that mrs gisburn was rich and it was immediately perceptible


In [22]:
model.save('LSTM_model.h5')  # HDF5 copy of the trained model; this is the file the Gradio cell later loads with load_model()

  saving_api.save_model(


In [23]:
model.save('LSTM.keras')   # the .keras extension selects the native Keras archive format (not HDF5); no visible cell in this notebook loads this copy

In [24]:
import pickle

# Persist the fitted tokenizer so the same word <-> index mapping can be
# reloaded for inference
with open('tokenizer.pickle', 'wb') as out_file:
    pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

print("Tokenizer saved as 'tokenizer.pickle'")

Tokenizer saved as 'tokenizer.pickle'


In [25]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.36.1-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.0.1 (from gradio)
  Downloading gradio_client-1.0.1-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.1/318.1 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [26]:
!pip install --upgrade gradio



In [29]:
import gradio as gr
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the pre-trained LSTM model
model = load_model('LSTM_model.h5')

# Load the tokenizer used during model training.
# NOTE: unpickling can execute arbitrary code; only load a tokenizer.pickle
# that you created yourself.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Context length fed to the model. Take it from the loaded model so inference
# pads inputs to the same length the network was built with, instead of
# hard-coding a value that can drift from training. Fall back to the previous
# default of 50 if the model does not declare a fixed sequence length.
max_sequence_length = model.input_shape[1] or 50

def predict_next_words(input_text, num_words_to_predict):
    """Greedily generate ``num_words_to_predict`` words that continue ``input_text``.

    The text is encoded with the module-level ``tokenizer``. On every step the
    context is left-padded (or cut) to ``max_sequence_length`` token ids and
    passed to ``model``; the highest-scoring index becomes the next word and is
    appended to the context for the following step.

    Returns only the newly predicted words joined by single spaces; the input
    text itself is not included. An index that has no entry in
    ``tokenizer.index_word`` contributes an empty string.
    """
    context_ids = tokenizer.texts_to_sequences([input_text])[0]
    generated = []

    for _ in range(num_words_to_predict):
        batch = pad_sequences([context_ids], maxlen=max_sequence_length, padding='pre')
        scores = model.predict(batch)
        best_index = tf.argmax(scores, axis=-1).numpy()[0]
        generated.append(tokenizer.index_word.get(best_index, ''))
        # Slide the window: add the new token and keep only the newest ids
        context_ids = (context_ids + [best_index])[-max_sequence_length:]

    return ' '.join(generated)

# Define the Gradio input and output components
inputs = gr.Textbox(label="Enter your query", placeholder="Type here")
output_text = gr.Textbox(label="Completed Sentence", interactive=False)

# Set the default of 4 through the constructor's `value` argument rather than
# by assigning to the attribute after the component has been built
number_of_words = gr.Number(label="Number of words to predict", value=4, minimum=1, maximum=20)

def predict_and_return(input_text, number_of_words):
    """Gradio callback: complete ``input_text`` with ``number_of_words`` predicted words.

    Returns the original text followed by the predicted words, which is what
    the "Completed Sentence" output box and the interface description promise.

    Raises:
        gr.Error: if the number field was left empty, so the UI shows a
            readable message instead of an opaque ``TypeError`` from ``int(None)``.
    """
    if number_of_words is None:
        # Gradio passes None for an empty Number field
        raise gr.Error("Please enter the number of words to predict.")

    predicted = predict_next_words(input_text, int(number_of_words))
    # Skip blank pieces so the result has no leading or trailing space when
    # either the input or the prediction is empty
    return ' '.join(part for part in (input_text.strip(), predicted) if part)

# Wire the callback and the components defined above into one Gradio app
iface = gr.Interface(
    title="Next Word Prediction using LSTM",
    description="Enter a sentence and the number of words to predict, and get the completed sentence.",
    fn=predict_and_return,
    inputs=[inputs, number_of_words],
    outputs=output_text,
)

# Start the web UI for the interface
iface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://37764ae3d5fe2ee206.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


