In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:

# Load the raw quotes table from the CSV file that sits next to the notebook.
# NOTE(review): the file name is spelled "qoute" — confirm it matches the file on disk.
dataset_path = "qoute_dataset.csv"
df = pd.read_csv(dataset_path)

In [None]:

# Pull the quote text column out of the frame; the rest of the notebook
# works on this Series only.
quotes = df['quote']

# Preview the first five rows as the cell output.
quotes.iloc[:5]

In [None]:
# Lower-case every quote so that differently-cased spellings of a word
# end up as the same token later on.
quotes = quotes.str.lower()

In [None]:
import string

# Translation table that deletes every ASCII punctuation character
# (nothing is mapped, everything in string.punctuation is removed).
translator = str.maketrans('', '', string.punctuation)

# Use the vectorised .str accessor rather than a Python-level lambda:
# the result is identical for string rows, but missing values (NaN)
# are propagated instead of raising AttributeError on a float.
quotes = quotes.str.translate(translator)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Vocabulary cap shared by the tokenizer, the one-hot targets and the
# model's embedding / output layers further down.
vocab_size = 10000

# Build the word -> integer index from the cleaned quotes.
# num_words limits which indices texts_to_sequences will emit; the full
# word_index is still kept on the tokenizer (see Keras Tokenizer docs).
tokinizer = Tokenizer(num_words=vocab_size)
tokinizer.fit_on_texts(quotes)

In [None]:
# Mapping of word -> integer id learned by the tokenizer.
word_index = tokinizer.word_index

# Materialise the (word, id) pairs once so we can both count and preview them.
vocab_items = list(word_index.items())
print(len(vocab_items))
vocab_items[:10]

In [None]:
# Encode each quote as a list of integer word ids using the fitted tokenizer.
sequence = tokinizer.texts_to_sequences(quotes)

In [None]:
# Sanity check: show the cleaned text of the first three quotes.
for row in (0, 1, 2):
    print(quotes[row])


In [None]:
# Sanity check: show the integer encoding of the same first three quotes.
for row in (0, 1, 2):
    print(sequence[row])

In [None]:
# Turn every encoded quote into next-word training pairs:
# for each position i >= 1, the input is the prefix tokens[:i] and the
# target is the token at position i. Quotes with fewer than two tokens
# therefore contribute nothing.
X, y = [], []

for tokens in sequence:
    for split_at, next_token in enumerate(tokens[1:], start=1):
        X.append(tokens[:split_at])
        y.append(next_token)

In [None]:
# Length of the longest input prefix; used as the padded sequence length.
max_len = max(map(len, X))
print(max_len)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad every prefix on the left ('pre') up to max_len so all inputs share one
# length and the most recent words sit at the end of each row.
X_padded = pad_sequences(X, maxlen=max_len, padding='pre')

In [None]:
# Convert the list of target word ids into a NumPy array.
# Note: this rebinds `y` from a list to an ndarray.
y = np.array(y)

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer targets into vocab_size classes, matching the
# 'categorical_crossentropy' loss the models are compiled with.
# NOTE(review): this materialises a dense (len(y), vocab_size) matrix; if
# memory becomes a problem, integer targets with
# 'sparse_categorical_crossentropy' avoid it — that would also require
# changing the compile and fit cells.
y_one_hot = to_categorical(y, num_classes=vocab_size)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,SimpleRNN,LSTM, Dense

In [None]:
# Shared model hyper-parameters:
#   embedding_dim - size of each word embedding vector
#   rnn_units     - number of units in the recurrent (SimpleRNN / LSTM) layer
embedding_dim, rnn_units = 50, 128

In [None]:
# Baseline next-word model: embedding -> SimpleRNN -> softmax over the vocabulary.
rnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    SimpleRNN(units=rnn_units),
    Dense(units=vocab_size, activation='softmax'),
])

In [None]:
# Configure training for the SimpleRNN model: Adam optimiser, categorical
# cross-entropy against the one-hot targets, accuracy as the reported metric.
rnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer overview of the SimpleRNN model.
rnn_model.summary()

In [None]:
# Same architecture as the SimpleRNN model, with an LSTM as the recurrent layer:
# embedding -> LSTM -> softmax over the vocabulary.
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    LSTM(units=rnn_units),
    Dense(units=vocab_size, activation='softmax'),
])

In [None]:
# Configure training for the LSTM model: Adam optimiser, categorical
# cross-entropy against the one-hot targets, accuracy as the reported metric.
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer overview of the LSTM model.
lstm_model.summary()

In [None]:
import pickle

# Persist the fitted tokenizer so the same word -> id mapping can be reused
# when generating text from the saved model.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokinizer, tokenizer_file)

In [None]:
# Persist the padded sequence length alongside the tokenizer; new inputs
# must be padded to this same length.
with open("max_len.pkl", "wb") as max_len_file:
    pickle.dump(max_len, max_len_file)

In [None]:
# Train the LSTM model on the padded prefixes and one-hot targets for 50 epochs.
# NOTE(review): in the cells visible here only lstm_model is fitted
# (rnn_model is built and compiled but never trained), and no validation
# data is passed — confirm both are intentional.
lstm_model.fit(X_padded, y_one_hot, epochs=50, verbose=1)

In [None]:

# Write the trained LSTM model to disk as an HDF5 (.h5) file.
lstm_model.save("lstm_model.h5")