# <font color=white><center><b>Next Word Prediction Using LSTM </center><br></font>
This notebook demonstrates a next-word prediction model trained on text data using an LSTM-based neural network. The steps include preprocessing the text, creating input-output sequences for training, building and training the model, and evaluating it using ROUGE scores.

## Import libraries

In [28]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from rouge import Rouge

## Read the text file

In [2]:
with open("The Modern Prometheus.txt", 'r', encoding='utf-8') as file:
    text = file.read()

## Tokenizing the text

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1 # Total vocabulary size

## Creating input sequences for training

In [4]:
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

## Padding the sequences to uniform length

In [5]:
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

## Splitting the data into features (X) and labels (y)

In [7]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))

## Building and training the LSTM model

In [9]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17, 100)           760600    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 7606)              1148506   
                                                                 
Total params: 2,059,706
Trainable params: 2,059,706
Non-trainable params: 0
_________________________________________________________________


In [11]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
model.fit(X, y, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1ce64843ee0>

## Predicting the next n words for a given seed text

In [18]:
seed_text = "The master is"
next_words = 6

def predict_next_n_words(seed_text, next_words):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

predict_next_n_words(seed_text, next_words)



'The master is a person of an excellent disposition'

## Generating test data from the text file

In [30]:
sentences = text.split('\n')

test_sentences = [s.strip() for s in sentences[-6:] if len(s.strip()) > 0]  # Select last 6 sentences for evaluation
reference_sentences = [" ".join(test_sentences[i].split()[:10]) for i in range(len(test_sentences))]

In [31]:
predictions = [predict_next_n_words(sentence, 5) for sentence in test_sentences]
predictions



['This website includes information about Project Gutenberg™, work b is the foundation',
 'including how to make donations to the Project Gutenberg Literary archive foundation “the name of',
 'Archive Foundation, how to help produce our new eBooks, and how to you to make the work',
 'subscribe to our email newsletter to hear about new eBooks. in amusement to be torn']

## Evaluating the model with ROUGE scores

In [33]:
rouge = Rouge()
scores = rouge.get_scores(predictions, reference_sentences, avg=True)

scores

{'rouge-1': {'r': 1.0, 'p': 0.6581959706959708, 'f': 0.7927414833293042},
 'rouge-2': {'r': 1.0, 'p': 0.6077922077922078, 'f': 0.755274931365785},
 'rouge-l': {'r': 1.0, 'p': 0.6581959706959708, 'f': 0.7927414833293042}}