# Project: Next Word Prediction Using LSTM

#### 1 - Data Collection:
We use the text of Shakespeare's 'Hamlet' dataset.

#### 2 - Data Preprocessing:
The text data is tokenized, converted to sequences and padded to ensure uniform input lengths.

#### 4 - Model Training:
The model is trained using the prepared sequences with early stopping.

#### 5 - Model Evaluation:
The model is evaluated using the test data.

#### 6 - Deployment:
A streamlit web app is developed.

## Data Collection

In [3]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
import pandas as pd

## Load the Dataset
data = brown.raw(categories='news')

data = data[:100000]

## save to a file
with open('brown.txt', 'w') as file:
    file.write(data)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\smitg\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


## Data Preprocessing

In [4]:
import tensorflow as tf

In [5]:
import keras

In [6]:
from tensorflow import keras

In [7]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

## load the dataset
with open('brown.txt', 'r') as file:
    text = file.read().lower()

In [8]:
text



In [9]:
import re

## Remove punctuation (keeping only alphanumeric characters and spaces)
text = re.sub(r'[^a-zA-Z\s]', '', text)

In [10]:
## Tokenize the text also creating index for each word
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index)+1
total_words

2889

In [11]:
tokenizer.word_index

{'theat': 1,
 'ofin': 2,
 'aat': 3,
 'inin': 4,
 'andcc': 5,
 'toto': 6,
 'cd': 7,
 'forin': 8,
 'toin': 9,
 'thatcs': 10,
 'hepps': 11,
 'saidvbd': 12,
 'wouldmd': 13,
 'bebe': 14,
 'onin': 15,
 'byin': 16,
 'wasbedz': 17,
 'isbez': 18,
 'hashvz': 19,
 'ascs': 20,
 'itpps': 21,
 'willmd': 22,
 'thisdt': 23,
 'not': 24,
 'hispp': 25,
 'within': 26,
 'anat': 27,
 'whichwdt': 28,
 'beenben': 29,
 'atin': 30,
 'mrnp': 31,
 'whowps': 32,
 'fromin': 33,
 'havehv': 34,
 'butcc': 35,
 'itspp': 36,
 'thereex': 37,
 'administrationnn': 38,
 'presidentnntl': 39,
 'onecd': 40,
 'moreap': 41,
 'yearnn': 42,
 'nns': 43,
 'areber': 44,
 'orcc': 45,
 'electionnn': 46,
 'jurynn': 47,
 'ofintl': 48,
 'hadhvd': 49,
 'citynn': 50,
 'otherap': 51,
 'theyppss': 52,
 'housenntl': 53,
 'somedti': 54,
 'werebed': 55,
 'statenntl': 56,
 'statenn': 57,
 'plannn': 58,
 'billnn': 59,
 'shouldmd': 60,
 'alsorb': 61,
 'federaljj': 62,
 'underin': 63,
 'newjj': 64,
 'millioncd': 65,
 'lastap': 66,
 'noat': 67,
 'the

In [12]:
## Create input sequences
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [13]:
input_sequences

[[1, 113],
 [1, 113, 74],
 [1, 113, 74, 1105],
 [1, 113, 74, 1105, 1106],
 [1, 113, 74, 1105, 1106, 12],
 [1, 113, 74, 1105, 1106, 12, 315],
 [1, 113, 74, 1105, 1106, 12, 315, 27],
 [1, 113, 74, 1105, 1106, 12, 315, 27, 440],
 [1, 113, 74, 1105, 1106, 12, 315, 27, 440, 2],
 [1, 113, 74, 1105, 1106, 12, 315, 27, 440, 2, 649],
 [1, 113, 74, 1105, 1106, 12, 315, 27, 440, 2, 649, 441],
 [1, 113, 74, 1105, 1106, 12, 315, 27, 440, 2, 649, 441, 316],
 [1, 113, 74, 1105, 1106, 12, 315, 27, 440, 2, 649, 441, 316, 46],
 [1, 113, 74, 1105, 1106, 12, 315, 27, 440, 2, 649, 441, 316, 46, 1107],
 [1, 113, 74, 1105, 1106, 12, 315, 27, 440, 2, 649, 441, 316, 46, 1107, 67],
 [1,
  113,
  74,
  1105,
  1106,
  12,
  315,
  27,
  440,
  2,
  649,
  441,
  316,
  46,
  1107,
  67,
  251],
 [1,
  113,
  74,
  1105,
  1106,
  12,
  315,
  27,
  440,
  2,
  649,
  441,
  316,
  46,
  1107,
  67,
  251,
  10],
 [1,
  113,
  74,
  1105,
  1106,
  12,
  315,
  27,
  440,
  2,
  649,
  441,
  316,
  46,
  1107,
 

In [14]:
## Pad Sequences
max_sequence_len = max([len(x) for x in input_sequences])
max_sequence_len

60

In [15]:
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
input_sequences

array([[   0,    0,    0, ...,    0,    1,  113],
       [   0,    0,    0, ...,    1,  113,   74],
       [   0,    0,    0, ...,  113,   74, 1105],
       ...,
       [   0,    0,    0, ..., 2886, 2887,    6],
       [   0,    0,    0, ..., 2887,    6,   97],
       [   0,    0,    0, ...,    6,   97, 2888]])

In [16]:
## create predictors and labels
import tensorflow as tf
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [17]:
X

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,  113],
       [   0,    0,    0, ...,    1,  113,   74],
       ...,
       [   0,    0,    0, ..., 2885, 2886, 2887],
       [   0,    0,    0, ..., 2886, 2887,    6],
       [   0,    0,    0, ..., 2887,    6,   97]])

In [18]:
y

array([ 113,   74, 1105, ...,    6,   97, 2888])

In [19]:
print('X shape: ',X.shape)
print('y shape: ',y.shape)

X shape:  (9504, 59)
y shape:  (9504,)


In [20]:
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [21]:
print('X shape: ',X.shape)
print('y shape: ',y.shape)

X shape:  (9504, 59)
y shape:  (9504, 2889)


## Splitting the data into training data and testing data

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
print('X_train.shape: ',X_train.shape)
print('X_test.shape: ',X_test.shape)
print('y_train.shape: ',y_train.shape)
print('y_test.shape: ',y_test.shape)

X_train.shape:  (7603, 59)
X_test.shape:  (1901, 59)
y_train.shape:  (7603, 2889)
y_test.shape:  (1901, 2889)


## Train the LSTM RNN

In [24]:
total_words

2889

In [33]:
max_sequence_len

60

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping

## Define the model
model = Sequential()
model.add(Embedding(total_words, 300))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

In [26]:
## Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         866700    
                                                                 
 lstm (LSTM)                 (None, None, 150)         270600    
                                                                 
 dropout (Dropout)           (None, None, 150)         0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dense (Dense)               (None, 2889)              291789    
                                                                 
Total params: 1,529,489
Trainable params: 1,529,489
Non-trainable params: 0
_________________________________________________________________


In [28]:
# early_stopping = EarlyStopping(monitor='val_loss' ,patience=10, restore_best_weights=True)

In [29]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Nov_30_19:15:10_Pacific_Standard_Time_2020
Cuda compilation tools, release 11.2, V11.2.67
Build cuda_11.2.r11.2/compiler.29373293_0


In [56]:
## Training the model
history = model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test), verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## Function to predict the next word

In [30]:
from tensorflow.keras.models import load_model
model = load_model('model_50epoch_300embedding.h5')

In [34]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    
    # Adjust the token_list to the correct sequence length
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]
        
    # Pad the sequence to ensure it matches the input shape expected by the model
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    
    # Predict the next word
    predicted = model.predict(token_list, verbose=0)
    
    # Get the index of the word with the highest probability
    predicted_word_index = np.argmax(predicted, axis=1)[0]
    
    # Map the index back to the word
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    
    return None

In [38]:
input_text = 'Should I or Should I not'
print(f'Input Text: {input_text}')

# Set max_sequence_len based on training setup
max_sequence_len = 60  # replace with your actual max_sequence_len used during training
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)

print(f'Next Word Prediction: {next_word}')

Input Text: Should I or Should I not
Next Word Prediction: providencenptl


## Save the model

In [59]:
model.save('model.h5')

## Save the tokenizer

In [None]:
import pickle

with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)