In [1]:
import numpy as np
import pandas as pd

# Import Data

In [2]:
df = pd.read_csv("/kaggle/input/quotes-dataset/Quotes Dataset.csv")

In [3]:
df.head()

Unnamed: 0,Number,Quote,Author
0,1,The only thing we have to fear is fear itself.,Franklin D. Roosevelt
1,2,The truth will set you free.,The Bible
2,3,To be yourself in a world that is constantly t...,Ralph Waldo Emerson
3,4,"Success is not final, failure is not fatal: It...",Winston S. Churchill
4,5,The only way to do great work is to love what ...,Steve Jobs


In [4]:
df.tail()

Unnamed: 0,Number,Quote,Author
720,721,Believe you can and you're halfway there.,Theodore Roosevelt
721,722,The mind is everything. What you think you bec...,Buddha
722,723,"I have not failed. I've just found 10,000 ways...",Thomas Edison
723,724,A journey of a thousand miles begins with a si...,Lao Tzu
724,725,It always seems impossible until it's done.,Nelson Mandela


In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Number,725.0,363.0,209.43376,1.0,182.0,363.0,544.0,725.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 725 entries, 0 to 724
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Number  725 non-null    int64 
 1   Quote   725 non-null    object
 2   Author  725 non-null    object
dtypes: int64(1), object(2)
memory usage: 17.1+ KB


# Visulize

In [7]:
from plotly import express
express.pie(data_frame=df, names='Author', color='Author')

In [8]:
express.histogram(x=df['Quote'].str.len(), log_y=True)

# EDA

In [9]:
df.columns

Index(['Number', 'Quote', 'Author'], dtype='object')

In [10]:
del df['Number']

In [11]:
df.columns

Index(['Quote', 'Author'], dtype='object')

In [12]:
# df['Author'].value_counts()

In [13]:
df.isnull().sum()

Quote     0
Author    0
dtype: int64

In [14]:
max_len = max(df['Quote'].str.len())
max_len

125

In [15]:
quotes = df['Quote']

In [16]:
quotes[:4]

0       The only thing we have to fear is fear itself.
1                         The truth will set you free.
2    To be yourself in a world that is constantly t...
3    Success is not final, failure is not fatal: It...
Name: Quote, dtype: object

# Preprocessing

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-04-10 08:17:31.411999: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-10 08:17:31.412133: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-10 08:17:31.578066: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [18]:
tokenizer = Tokenizer()

In [19]:
tokenizer.fit_on_texts(quotes)

In [20]:
voc = tokenizer.word_index
voc_len = len(tokenizer.word_index)
voc_len

249

In [21]:
tokized_sequnece = tokenizer.texts_to_sequences(quotes)
tokized_sequnece[:4]

[[2, 23, 46, 22, 10, 3, 115, 8, 115, 116],
 [2, 144, 106, 145, 1, 146],
 [3, 9, 26, 104, 6, 105, 20, 8, 117, 118, 3, 119, 1, 107, 27, 8, 2, 120, 121],
 [25, 8, 14, 147, 148, 8, 14, 149, 4, 8, 2, 150, 3, 151, 20, 152]]

In [22]:
input_seq = []

for sentence in quotes.str.split('\n'):
#     print(sentence)
    tokenized_sen = tokenizer.texts_to_sequences(sentence)[0]
#     print(tokenized_sen)
    for i in range(1,len(tokenized_sen)):
        input_seq.append(tokenized_sen[:i+1])
    

In [23]:
len(input_seq)

7707

In [24]:
max_len = max([len(x) for x in input_seq])
max_len

24

In [25]:
x_padded = pad_sequences(input_seq,maxlen=max_len,padding='pre')

In [26]:
x_padded

array([[ 0,  0,  0, ...,  0,  2, 23],
       [ 0,  0,  0, ...,  2, 23, 46],
       [ 0,  0,  0, ..., 23, 46, 22],
       ...,
       [ 0,  0,  0, ..., 67, 45, 40],
       [ 0,  0,  0, ..., 45, 40, 38],
       [ 0,  0,  0, ..., 40, 38, 68]], dtype=int32)

In [27]:
x = x_padded[:,:-1]
y = x_padded[:,-1]

In [28]:
x.shape,y.shape

((7707, 23), (7707,))

In [29]:
from tensorflow.keras.utils import to_categorical

In [30]:
y = to_categorical(y,num_classes=voc_len+1)
y.shape

(7707, 250)

# Model

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM

In [32]:
voc_len

249

In [33]:
model = Sequential()
model.add(Embedding(input_dim=250,output_dim=100,input_shape=(23,)))
model.add(LSTM(150,return_sequences=True))
model.add(LSTM(128,return_sequences=True))
model.add(LSTM(128))
# model.add(BatchNormalization())
model.add(Dense(250,activation='softmax'))


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



In [34]:
model.compile(loss = 'categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [35]:
model.summary()

In [36]:
model.fit(x,y,epochs=15)

Epoch 1/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 68ms/step - accuracy: 0.0695 - loss: 4.7382
Epoch 2/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 69ms/step - accuracy: 0.1117 - loss: 3.9333
Epoch 3/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 69ms/step - accuracy: 0.3371 - loss: 2.6787
Epoch 4/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 67ms/step - accuracy: 0.6526 - loss: 1.6574
Epoch 5/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 69ms/step - accuracy: 0.8009 - loss: 1.0535
Epoch 6/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 69ms/step - accuracy: 0.8702 - loss: 0.7316
Epoch 7/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 70ms/step - accuracy: 0.8869 - loss: 0.5547
Epoch 8/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 68ms/step - accuracy: 0.9025 - loss: 0.4598
Epoch 9/15
[1m241/241[

<keras.src.callbacks.history.History at 0x7a8e8831e080>

In [37]:
model.save('quote_writer.h5')

In [38]:
import pickle

pickle.dump(tokenizer,open("tokenizer_of_quotes.pkl","wb"))

# Testing on Model

In [39]:
import tensorflow

In [40]:
saved_model = tensorflow.keras.models.load_model("/kaggle/working/quote_writer.h5")

saved_tokenizer = pickle.load(open("/kaggle/working/tokenizer_of_quotes.pkl","rb"))
import numpy as np

In [41]:
# user_text = input("enter your 2,3 word of qoute to complete")

user_text = "the truth"

for i in range(4):
    text_token = saved_tokenizer.texts_to_sequences([user_text])[0]
    print(text_token)
    input_x = pad_sequences([text_token],maxlen=5,padding='pre')
    predictions = saved_model.predict(input_x)
    pos=np.argmax(predictions)
    print(pos)
    for word, index in saved_tokenizer.word_index.items():
        if index == pos:
            user_text = user_text+' '+word
            print(user_text)
        

[2, 144]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 423ms/step
2
the truth the
[2, 144, 2]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
91
the truth the anyone
[2, 144, 2, 91]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
203
the truth the anyone for
[2, 144, 2, 91, 203]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
237
the truth the anyone for successful
