In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [31]:
df = pd.read_csv('qoute_dataset.csv')

In [32]:
df.head()

Unnamed: 0,quote,Author
0,“The world as we have created it is a process ...,Albert Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling
2,“There are only two ways to live your life. On...,Albert Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe


In [33]:
df.shape

(3038, 2)

In [34]:
quotes = df['quote']
quotes.head()

Unnamed: 0,quote
0,“The world as we have created it is a process ...
1,"“It is our choices, Harry, that show what we t..."
2,“There are only two ways to live your life. On...
3,"“The person, be it gentleman or lady, who has ..."
4,"“Imperfection is beauty, madness is genius and..."


In [35]:
quotes = quotes.str.lower()
quotes.head()

Unnamed: 0,quote
0,“the world as we have created it is a process ...
1,"“it is our choices, harry, that show what we t..."
2,“there are only two ways to live your life. on...
3,"“the person, be it gentleman or lady, who has ..."
4,"“imperfection is beauty, madness is genius and..."


In [36]:
import string
translator = str.maketrans('', '', string.punctuation)
quotes = quotes.apply(lambda x: x.translate(translator))
print(translator)

{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}


In [37]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [38]:
vocab_size = 10000  #size which will takes top 10000 words which were used in the quotes

tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(quotes)

In [39]:
word_index = tokenizer.word_index
print(len(word_index))
list(word_index.items())[:10]

8978


[('the', 1),
 ('you', 2),
 ('to', 3),
 ('and', 4),
 ('a', 5),
 ('i', 6),
 ('is', 7),
 ('of', 8),
 ('that', 9),
 ('it', 10)]

In [40]:
sequence = tokenizer.texts_to_sequences(quotes)

In [41]:
print(quotes[0])
print(sequence[0])

“the world as we have created it is a process of our thinking it cannot be changed without changing our thinking”
[713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809, 104, 752, 70, 2461]


In [42]:
X = []
y = []

for seq in sequence:
    for i in range(1, len(seq)):
        input_seq = seq[:i]
        output_seq = seq[i]
        X.append(input_seq)
        y.append(output_seq)

In [43]:
X

[[713],
 [713, 62],
 [713, 62, 29],
 [713, 62, 29, 19],
 [713, 62, 29, 19, 16],
 [713, 62, 29, 19, 16, 946],
 [713, 62, 29, 19, 16, 946, 10],
 [713, 62, 29, 19, 16, 946, 10, 7],
 [713, 62, 29, 19, 16, 946, 10, 7, 5],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809, 104],
 [713,
  62,
  29,
  19,
  16,
  946,
  10,
  7,
  5,
  1156,
  8,
  70,
  293,
  10,
  145,
  12,
  809,
  104,
  752],
 [713,
  62,
  29,
  19,
  16,
  946,
  10,
  7,
  5,
  1156,
  8,
  70,
  293,
  10,
  145,
  12,
  809,
  

In [44]:
len(X)

85271

In [45]:
len(y)

85271

In [46]:
max_len = max(len(x) for x in X)
print(max_len)

745


In [47]:
#padding the sequence X
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_padded = pad_sequences(X, maxlen = max_len, padding = 'pre')

In [48]:
y = np.array(y)

In [49]:
X_padded.shape

(85271, 745)

In [50]:
y.shape

(85271,)

In [53]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [54]:
embedding_dim = 50
rnn_units = 128

In [55]:
lstm_model = Sequential()
lstm_model.add(
    Embedding(input_dim = vocab_size, output_dim = embedding_dim, input_length = max_len)
)
lstm_model.add(
    LSTM(units = rnn_units)
)
lstm_model.add(
    Dense(units = vocab_size, activation = 'softmax')
)



In [56]:
lstm_model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy']
)

In [57]:
lstm_model.summary()

In [59]:
epochs = 100
batch_size = 128

history_lstm = lstm_model.fit(
    X_padded,
    y,
    epochs = epochs,
    batch_size = batch_size,
    validation_split = 0.1
)

Epoch 1/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 50ms/step - accuracy: 0.0381 - loss: 7.1073 - val_accuracy: 0.0474 - val_loss: 6.6737
Epoch 2/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 50ms/step - accuracy: 0.0550 - loss: 6.3284 - val_accuracy: 0.0702 - val_loss: 6.5271
Epoch 3/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 49ms/step - accuracy: 0.0790 - loss: 6.0311 - val_accuracy: 0.0889 - val_loss: 6.4446
Epoch 4/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 50ms/step - accuracy: 0.0996 - loss: 5.7940 - val_accuracy: 0.0955 - val_loss: 6.4103
Epoch 5/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 50ms/step - accuracy: 0.1105 - loss: 5.6032 - val_accuracy: 0.1010 - val_loss: 6.4053
Epoch 6/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 49ms/step - accuracy: 0.1205 - loss: 5.4287 - val_accuracy: 0.1058 - val_loss: 6.4071
Epoch 7/10

In [60]:
lstm_model.save("lstm_model.h5")



In [61]:
index_to_word = {}
for word, index in word_index.items():
  index_to_word[index] = word

In [63]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [64]:
def predictor(model, tokenizer, text, max_len):
    text = text.lower()

    seq = tokenizer.texts_to_sequences([text])[0]
    seq = pad_sequences([seq], maxlen=max_len, padding='pre')

    pred = model.predict(seq, verbose=0)
    pred_index = np.argmax(pred)
    return index_to_word[pred_index]

In [70]:
seed_text = "Hello babies welcome to earth"
next_word = predictor(lstm_model, tokenizer, seed_text, max_len)
print(next_word)

its


In [78]:
def generate_text(model, tokenizer, seed_text, max_len, n_words):
    for _ in range(n_words):
        next_word = predictor(model, tokenizer, seed_text, max_len)
        if next_word == "":
            break
        seed_text += " " + next_word
    return seed_text

In [81]:
seed = "Hi my name is"
generated_text = generate_text(lstm_model, tokenizer, seed, max_len, 15)
print(generated_text)

Hi my name is celaena sardothien but it makes them and children to think them sometimes cry and it


In [82]:
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [83]:
with open('max_len.pkl', 'wb') as f:
    pickle.dump(max_len, f)