# Next Word Prediction

In this Notebook I have created the model for predicting the next word on the basis of input that the user will give

* **My linkedin id** : [saurabh-maulekhi](https://www.linkedin.com/in/saurabh-maulekhi-326584241/)

* [**MY Hand written Notes on RNN,LSTM,GRU,BiLSTM,BiGRU**](https://www.linkedin.com/posts/saurabh-maulekhi-326584241_rnn-lstm-gru-brnn-notes-activity-7270759679981428736-DjcA/)

* [**Model Web Deployment**](https://huggingface.co/spaces/saurabh091/Next_Word_Prediction)

* [**Github Repo**](https://github.com/Saurabh-Maulekhi/Next-Word-Prediction)

* [**Get my Next Word Prediction model on Kaggle**](https://www.kaggle.com/models/saurabhmaulekhi/next_word_prediction)

# Importing Libraries

In [1]:
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import pickle
import time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,GRU, LSTM, Dense, Bidirectional, Dropout

In [2]:
# getting our text file
with open('/kaggle/input/next-word-prediction/1661-0.txt') as f:
    faqs = f.read()

# Converting Text into Vectors

In [3]:
tokenizer = Tokenizer() # Create a tokenizer

In [4]:
tokenizer.fit_on_texts([faqs])  ## Creates a vocabulary list of all unique words in the sentences

In [5]:
# Save the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle)

In [6]:
keys_list = tokenizer.word_index  ## creates a word to index mapping
print("No. of words = " , len(keys_list))

No. of words =  8931


In [7]:
input_sequences = []

for sentence in faqs.split('\n'):
    #print(sentence)
                        ## converting text sequence into vector
#                           | 
#                           V
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]  ## the [0] index is putting all sequences in one list
#                                        ↑
#  Returns a list of sequences, where each sequence is a list of integers representing the words in the corresponding input text.
    
#                       runing this iteration for [1, len(tokenized_sentence)]
#                           |
#                           V
    for i in range( 1, len(tokenized_sentence)):
        input_sequences.append(tokenized_sentence[:i+1]) ## apppendind tokenized sentences to input_sequences list

In [8]:
## length of the biggest line
max_len = max(len(x) for x in input_sequences)

**`Padding`**:

            The process of adding placeholder values to sequences to make them all the same length.

             This is often necessary because many machine learning models, especially neural networks, require input data to have consistent dimensions.   

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_input_sequences = pad_sequences(input_sequences, ## vector who's vector we need to padd
                                       maxlen=max_len,  ## length of sequence's vectors 
                                       padding='pre'    ## padding from the starting 
                                      )

In [10]:
padded_input_sequences

array([[   0,    0,    0, ...,    0,  145, 4790],
       [   0,    0,    0, ...,  145, 4790,    1],
       [   0,    0,    0, ..., 4790,    1, 1020],
       ...,
       [   0,    0,    0, ...,    3,  360,   83],
       [   0,    0,    0, ...,  360,   83,  358],
       [   0,    0,    0, ...,   83,  358, 1673]], dtype=int32)

In [11]:
## X will contain all elements instead of last one in list
X = padded_input_sequences[:,:-1]

## y will only contain last element of list
y = padded_input_sequences[:,-1]

In [12]:
X.shape

(101619, 19)

In [13]:
y.shape

(101619,)

In [14]:
print(" Total number of word: " ,len(tokenizer.word_index))

INPUT_LENGTH = len(tokenizer.word_index)+1

 Total number of word:  8931


In [15]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=INPUT_LENGTH)

In [16]:
y.shape

(101619, 8932)

# Architecture

In [17]:
model = Sequential()
model.add(Embedding(INPUT_LENGTH, 100))
model.add(Bidirectional(GRU(units=80, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(GRU(units=80, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(GRU(units=80)))
model.add(Dense(INPUT_LENGTH, activation='softmax'))

In [18]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [19]:
model.summary()

# Training

In [20]:
N_EPOCHS = 135

history = model.fit(X, y, epochs=N_EPOCHS)

Epoch 1/135
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 17ms/step - accuracy: 0.0616 - loss: 6.5479
Epoch 2/135
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 17ms/step - accuracy: 0.1128 - loss: 5.6000
Epoch 3/135
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 17ms/step - accuracy: 0.1328 - loss: 5.2598
Epoch 4/135
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 17ms/step - accuracy: 0.1492 - loss: 5.0073
Epoch 5/135
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 17ms/step - accuracy: 0.1613 - loss: 4.7873
Epoch 6/135
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 17ms/step - accuracy: 0.1721 - loss: 4.5955
Epoch 7/135
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 17ms/step - accuracy: 0.1811 - loss: 4.4169
Epoch 8/135
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 17ms/step - accuracy: 0.1985 - loss: 4.2214


In [21]:
model.save('/kaggle/working/nxt_word_prediction.h5')

# Words Prediction / Model Testing

For Word prediction i have saved model and tokenizer file in the output section 

In [22]:
text = 'what'  ## initial word

no_of_words = 10 ## no. of words to predict

for i in range(no_of_words):
    # tokenize 
    token_text = tokenizer.texts_to_sequences([text])[0]
    # padding
    padded_token_text = pad_sequences([token_text], maxlen=56, padding='pre')
    # predict 
    pos = np.argmax(model.predict(padded_token_text))  ## predicting new word
    
    for word, index in tokenizer.word_index.items():
        if index == pos:
            text = text + " " + word
            print(text)
            time.sleep(0.5) ## time in seconds to predict new word

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 383ms/step
what to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
what to do
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
what to do i
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
what to do i cannot
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
what to do i cannot submit
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
what to do i cannot submit to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
what to do i cannot submit to us
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
what to do i cannot submit to us ”
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
what to do i cannot submit to us ” he
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
what to do i cannot submit to us ” he caught
