<a href="https://colab.research.google.com/github/Suhana830/next_word_prediction_using_LSTM/blob/main/LSTM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("qoute_dataset.csv")
df.head()

Unnamed: 0,quote,Author
0,“The world as we have created it is a process ...,Albert Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling
2,“There are only two ways to live your life. On...,Albert Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe


In [3]:
df.shape

(3038, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3038 entries, 0 to 3037
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   quote   3038 non-null   object
 1   Author  3038 non-null   object
dtypes: object(2)
memory usage: 47.6+ KB


In [5]:
quotes = df['quote']


In [6]:
quotes = quotes.str.lower()

In [7]:
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

quotes = quotes.apply(remove_punctuation)

In [8]:
unique_words = set(
    " ".join(quotes.astype(str)).split()
)

len(unique_words)


8978

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=len(unique_words))
tokenizer.fit_on_texts(quotes)

sequence = tokenizer.texts_to_sequences(quotes)
sequence[0]

[713,
 62,
 29,
 19,
 16,
 946,
 10,
 7,
 5,
 1156,
 8,
 70,
 293,
 10,
 145,
 12,
 809,
 104,
 752,
 70,
 2461]

In [10]:
for i in range(3):
  print(quotes[i]);

for i in range(3):
  print(sequence[i])

“the world as we have created it is a process of our thinking it cannot be changed without changing our thinking”
“it is our choices harry that show what we truly are far more than our abilities”
“there are only two ways to live your life one is as though nothing is a miracle the other is as though everything is a miracle”
[713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809, 104, 752, 70, 2461]
[947, 7, 70, 871, 373, 9, 433, 21, 19, 465, 14, 294, 52, 54, 70, 3676]
[1337, 14, 53, 201, 714, 3, 81, 15, 36, 37, 7, 29, 329, 93, 7, 5, 1157, 1, 101, 7, 29, 329, 126, 7, 5, 3677]


In [11]:
X = []
Y =[]

for i in range(len(quotes)):
  for j in range(1,len(sequence[i])):
    x_input = sequence[i][:j]
    y_output = sequence[i][j];
    X.append(x_input)
    Y.append(y_output)

In [12]:
len(Y)

85270

In [13]:
maxlen = max(len(x) for x in X)

In [14]:
maxlen

745

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_padded = pad_sequences(X, maxlen=maxlen, padding="pre")

In [16]:
X_padded

array([[   0,    0,    0, ...,    0,    0,  713],
       [   0,    0,    0, ...,    0,  713,   62],
       [   0,    0,    0, ...,  713,   62,   29],
       ...,
       [   0,    0,    0, ...,    9,   19, 1125],
       [   0,    0,    0, ...,   19, 1125,    3],
       [   0,    0,    0, ..., 1125,    3,  169]], dtype=int32)

In [17]:
from tensorflow.keras.utils import to_categorical
Y_categorical = to_categorical(Y, num_classes=len(unique_words))

In [18]:
Y_categorical

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SimpleRNN

In [20]:
embedding_dim = 50
rnn_units = 128
vocab_size = len(unique_words)

rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=len(unique_words), output_dim=embedding_dim, input_length=maxlen))
rnn_model.add(SimpleRNN(units=rnn_units))
rnn_model.add(Dense(units=vocab_size,activation="softmax"))
rnn_model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])




In [21]:
rnn_model.summary()

In [22]:

history_rnn = rnn_model.fit(
    X_padded, Y_categorical,epochs=10, batch_size=32,validation_split=0.1
)

KeyboardInterrupt: 

In [24]:
lstm_model = Sequential()
lstm_model.add(
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
)
lstm_model.add(LSTM(units=rnn_units))
lstm_model.add(Dense(units=vocab_size, activation="softmax"))

In [25]:
lstm_model.compile(optimizer="adam",loss="categorical_crossentropy", metrics=['accuracy'])

In [26]:
lstm_model.summary()

In [27]:
lstm_history = lstm_model.fit(X_padded, Y_categorical, batch_size=40, epochs=100, verbose=1, validation_split=0.1)

Epoch 1/100
[1m1919/1919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 32ms/step - accuracy: 0.0391 - loss: 6.8899 - val_accuracy: 0.0646 - val_loss: 6.5054
Epoch 2/100
[1m1919/1919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 32ms/step - accuracy: 0.0770 - loss: 6.0965 - val_accuracy: 0.0907 - val_loss: 6.3939
Epoch 3/100
[1m1919/1919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 32ms/step - accuracy: 0.1030 - loss: 5.6881 - val_accuracy: 0.1037 - val_loss: 6.3572
Epoch 4/100
[1m1919/1919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 32ms/step - accuracy: 0.1235 - loss: 5.3581 - val_accuracy: 0.1109 - val_loss: 6.4005
Epoch 5/100
[1m1919/1919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 33ms/step - accuracy: 0.1407 - loss: 5.0543 - val_accuracy: 0.1166 - val_loss: 6.4847
Epoch 6/100
[1m1919/1919[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 32ms/step - accuracy: 0.1564 - loss: 4.7838 - val_accuracy: 0.1149 - val_loss: 6.601

In [28]:
lstm_model.save("lstm_model.h5")



In [31]:
index_to_word = {}

for word, index in tokenizer.word_index.items():
  index_to_word[index] = word

In [32]:
index_to_word

{1: 'the',
 2: 'you',
 3: 'to',
 4: 'and',
 5: 'a',
 6: 'i',
 7: 'is',
 8: 'of',
 9: 'that',
 10: 'it',
 11: 'in',
 12: 'be',
 13: 'not',
 14: 'are',
 15: 'your',
 16: 'have',
 17: 'for',
 18: 'but',
 19: 'we',
 20: 'if',
 21: 'what',
 22: 'with',
 23: 'all',
 24: 'love',
 25: 'can',
 26: 'my',
 27: 'when',
 28: 'will',
 29: 'as',
 30: 'who',
 31: 'do',
 32: 'or',
 33: 'me',
 34: 'he',
 35: 'they',
 36: 'life',
 37: 'one',
 38: 'was',
 39: 'like',
 40: 'there',
 41: 'people',
 42: 'on',
 43: 'its',
 44: 'at',
 45: 'so',
 46: 'never',
 47: 'no',
 48: 'them',
 49: 'dont',
 50: 'know',
 51: 'just',
 52: 'more',
 53: 'only',
 54: 'than',
 55: 'because',
 56: 'this',
 57: 'want',
 58: 'up',
 59: 'how',
 60: 'his',
 61: 'things',
 62: 'world',
 63: 'by',
 64: 'think',
 65: 'make',
 66: 'about',
 67: 'time',
 68: 'from',
 69: 'always',
 70: 'our',
 71: 'an',
 72: 'out',
 73: 'us',
 74: 'good',
 75: 'said',
 76: 'she',
 77: 'her',
 78: 'way',
 79: 'go',
 80: 'am',
 81: 'live',
 82: 'has',
 83:

In [33]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [39]:
def predictor(model, tokenizer, text_input, max_len):
  text = text_input.lower()

  seq = tokenizer.texts_to_sequences([text])[0]
  seq = pad_sequences([seq], maxlen=max_len, padding="pre")
  pred = model.predict(seq, verbose=0)

  pred_index = np.argmax(pred)
  return index_to_word[pred_index]

In [40]:
seed_text = "life is"
next_word = predictor(lstm_model, tokenizer, seed_text, maxlen)
next_word

'not'

In [41]:
seed_text = "you will"
next_word = predictor(lstm_model, tokenizer, seed_text, maxlen)
next_word

'lose'

In [42]:
seed_text = "what are you"
next_word = predictor(lstm_model, tokenizer, seed_text, maxlen)
next_word

'all'

In [None]:
seed_text = "what are you"
next_word = predictor(lstm_model, tokenizer, seed_text, maxlen)
next_word

In [47]:
def generate_text(model, tokenizer, seed_text, max_len, n_word):
  for _ in range(n_word):
    next_word = predictor(model, tokenizer, seed_text, max_len)
    seed_text += " " + next_word
  return seed_text

In [48]:
seed_text = "the meaning of life"
print(generate_text(lstm_model, tokenizer, seed_text, maxlen, 10))

the meaning of life is not to let those who have not completely we


In [49]:
seed_text = "you are my"
print(generate_text(lstm_model, tokenizer, seed_text, maxlen, 10))

you are my best friend as well as you — and much i


In [50]:
import pickle

with open("tokenizer.pkl", "wb") as f:
  pickle.dump(tokenizer, f)

In [53]:
with open("max_len.pkl", "wb") as f:
  pickle.dump(maxlen, f)