<h1><center>Next Word Prediction using Deep Learning</h1>

In [16]:
# Import relevant libraries
import numpy as np
import os
import re
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Directory path
directory_path = 'Data'
# Get the files in directory
files = os.listdir(directory_path)

# Text corpus
text_corpus = ""
# Sentences from corpus
sentences_corpus = []

# Loop over the files
for file in files:
    # Print file name
    print(f'File: {file}')
    # Get the file path
    file_path = os.path.join(directory_path, file)
    # Open the file for reading
    with open(file_path, 'r', encoding='utf-8') as f:
        # Read the file
        text = f.read()
        # Remove extra spaces
        text = text.replace('\n', '').replace('\r', '').replace('\ufeff', '')
        # Add file text to text corpus
        text_corpus += text
        # Split the text into sentences
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        # Append sentences to corpus
        sentences_corpus.append(sentences)
        # Display first 10 sentences
        for sentence in sentences[:10]:
            print(sentence)


File: Metamorphosis_clean.txt
One morning, when Gregor Samsa woke from troubled dreams, he foundhimself transformed in his bed into a horrible vermin.
He lay on hisarmour-like back, and if he lifted his head a little he could see hisbrown belly, slightly domed and divided by arches into stiff sections.The bedding was hardly able to cover it and seemed ready to slide offany moment.
His many legs, pitifully thin compared with the size of therest of him, waved about helplessly as he looked.“What’s happened to me?” he thought.
It wasn’t a dream.
His room, aproper human room although a little too small, lay peacefully betweenits four familiar walls.
A collection of textile samples lay spread outon the table—Samsa was a travelling salesman—and above it there hung apicture that he had recently cut out of an illustrated magazine andhoused in a nice, gilded frame.
It showed a lady fitted out with a furhat and fur boa who sat upright, raising a heavy fur muff that coveredthe whole of her lower a

In [None]:
# Instantiate the Tokenizer from keras
tokenizer = Tokenizer()
# Fit on the data
tokenizer.fit_on_texts(sentences)
# Get the total words and print
total_words = len(tokenizer.word_index) + 1
print(total_words)

3836


In [None]:
# Create input sequences list
input_sequences = []
# Loop over lines in all the sentences
for line in sentences:
    # Convert sentences to tokens
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Loop over the token list
    for i in range(1, len(token_list)):
        # Append the token list to input sequences list
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Print first 5 sequences
print(input_sequences[:5])

[[53, 149], [53, 149, 49], [53, 149, 49, 15], [53, 149, 49, 15, 97], [53, 149, 49, 15, 97, 884]]


In [None]:
# Get the maximum sequence length
max_sequence_len = max([len(seq) for seq in input_sequences])
# Convert the input sequences to matrix
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
# Get the inputs and labels
X, y = input_sequences[:, :-1], input_sequences[:, -1]
# Convert target data to one-hot encoding
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [None]:
# Define the model
# Use Sequential model
model = Sequential()
# Add Embedding layer
model.add(Embedding(input_dim=total_words, output_dim=10, input_length=max_sequence_len-1))
# Add LSTM layer
model.add(LSTM(128))
# Add output layer
model.add(Dense(total_words, activation='softmax'))
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Define model checkpoint
checkpoint = ModelCheckpoint('model_weights.keras', monitor='loss', save_best_only=True, mode='min')
# Train the model
model.fit(X, y, epochs=120, verbose=1, callbacks=[checkpoint])

Epoch 1/120
[1m624/624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.0408 - loss: 6.9736
Epoch 2/120
[1m624/624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.0499 - loss: 6.3325
Epoch 3/120
[1m624/624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.0538 - loss: 6.2337
Epoch 4/120
[1m624/624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.0676 - loss: 6.0027
Epoch 5/120
[1m624/624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.0767 - loss: 5.8439
Epoch 6/120
[1m624/624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.0887 - loss: 5.6268
Epoch 7/120
[1m624/624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.0899 - loss: 5.4100
Epoch 8/120
[1m624/624[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.1056 - loss: 5.1798
Epoch 9/120
[1m624/624[0

<keras.src.callbacks.history.History at 0x791948757d90>

In [None]:
# Load the model
model = load_model('model_weights.keras')

In [None]:
# Input text which will be used to predict next words
input_text = "So why did his sister not"
# Number of next words to predict
next_words = 5

# Loop over the number of words
for _ in range(next_words):
    # Get the sequences from text
    token_list = tokenizer.texts_to_sequences([input_text])[0]
    # Get the pad sequences
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Predict the probabilities from model
    predicted_probs = model.predict(token_list)
    # Get the predicted words
    predicted_word = tokenizer.index_word[np.argmax(predicted_probs)]
    # Add the predicted word to input text
    input_text += " " + predicted_word

# Print the input and predicted words
print("Next predicted words:", input_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Next predicted words: So why did his sister not even able to make the
