In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('/kaggle/input/quotes-dataset/qoute_dataset.csv')
df

Unnamed: 0,quote,Author
0,“The world as we have created it is a process ...,Albert Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling
2,“There are only two ways to live your life. On...,Albert Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe
...,...,...
3033,The past beats inside me like a second heart.,"John Banville,"
3034,"Damn, Claire. Warn a guy before you do a face-...","Rachel Caine,"
3035,"Can you be a girl for a few seconds?""""I'm alwa...","Veronica Roth,"
3036,That's what fiction is for. It's for getting a...,Tim O'Brien


In [3]:
qts = df['quote']
qts.head()

0    “The world as we have created it is a process ...
1    “It is our choices, Harry, that show what we t...
2    “There are only two ways to live your life. On...
3    “The person, be it gentleman or lady, who has ...
4    “Imperfection is beauty, madness is genius and...
Name: quote, dtype: object

## Steps
Tokenization

Vectorization

Embedding

##  Preprocessing

In [4]:
qts = qts.str.lower()

In [5]:
import string

translator = str.maketrans('','',string.punctuation)

In [6]:
qts = qts.apply(lambda x:x.translate(translator))
qts.head()

0    “the world as we have created it is a process ...
1    “it is our choices harry that show what we tru...
2    “there are only two ways to live your life one...
3    “the person be it gentleman or lady who has no...
4    “imperfection is beauty madness is genius and ...
Name: quote, dtype: object

## Tokenization

In [7]:
qts[0]

'“the world as we have created it is a process of our thinking it cannot be changed without changing our thinking”'

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

# using Tokenizer , not .split

vocab_size=10000

token = Tokenizer(num_words=vocab_size)
token.fit_on_texts(qts)

2026-02-04 08:37:29.892303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770194249.917518     666 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770194249.925451     666 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770194249.945359     666 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770194249.945397     666 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770194249.945400     666 computation_placer.cc:177] computation placer alr

In [9]:
word_index = token.word_index
print(len(word_index))

8978


In [10]:
list(word_index.items())[:10]

[('the', 1),
 ('you', 2),
 ('to', 3),
 ('and', 4),
 ('a', 5),
 ('i', 6),
 ('is', 7),
 ('of', 8),
 ('that', 9),
 ('it', 10)]

In [11]:
sequence = token.texts_to_sequences(qts)

In [12]:
print(f'Sentence : {qts[0]}\n to Vector : {sequence[0]}')

Sentence : “the world as we have created it is a process of our thinking it cannot be changed without changing our thinking”
 to Vector : [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809, 104, 752, 70, 2461]


### Making X,y from sentence tokens...

In [13]:
X = []
y = []

for seq in sequence:
    for i in range(1,len(seq)):
        
        input_seq = seq[:i]
        output_seq = seq[i]
        
        X.append(input_seq)
        y.append(output_seq)

In [14]:
X[:20]

[[713],
 [713, 62],
 [713, 62, 29],
 [713, 62, 29, 19],
 [713, 62, 29, 19, 16],
 [713, 62, 29, 19, 16, 946],
 [713, 62, 29, 19, 16, 946, 10],
 [713, 62, 29, 19, 16, 946, 10, 7],
 [713, 62, 29, 19, 16, 946, 10, 7, 5],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809, 104],
 [713,
  62,
  29,
  19,
  16,
  946,
  10,
  7,
  5,
  1156,
  8,
  70,
  293,
  10,
  145,
  12,
  809,
  104,
  752],
 [713,
  62,
  29,
  19,
  16,
  946,
  10,
  7,
  5,
  1156,
  8,
  70,
  293,
  10,
  145,
  12,
  809,
  

In [15]:
len(X)

85271

In [16]:
len(y)

85271

## Padding

In [17]:
max_len = max(len(x) for x in X)
max_len

745

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_pad = pad_sequences( X,
    maxlen=max_len,
    padding='pre',)
X_pad

array([[   0,    0,    0, ...,    0,    0,  713],
       [   0,    0,    0, ...,    0,  713,   62],
       [   0,    0,    0, ...,  713,   62,   29],
       ...,
       [   0,    0,    0, ...,    9,   19, 1125],
       [   0,    0,    0, ...,   19, 1125,    3],
       [   0,    0,    0, ..., 1125,    3,  169]], dtype=int32)

In [19]:
X_pad.shape

(85271, 745)

In [20]:
y = np.array(y)
y

array([ 62,  29,  19, ...,   3, 169, 101])

## making sparse y,OneHot enc

In [21]:
from tensorflow.keras.utils import to_categorical

y_enc = to_categorical(y,num_classes=vocab_size)

In [22]:
y_enc.shape

(85271, 10000)

## Embedding & Model built using LSTM

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

In [24]:
embed_dim = 50
rnn_units = 128

In [25]:
lstm = Sequential([
    layers.Embedding(input_dim = vocab_size, output_dim = embed_dim),

    layers.LSTM(units = rnn_units), #units=128

    layers.Dense(units=vocab_size, activation='softmax')
])

I0000 00:00:1770194254.918340     666 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13757 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1770194254.920345     666 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13757 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [26]:
lstm.compile(optimizer = 'adam',
            loss = 'categorical_crossentropy',
            metrics = ['accuracy'])

In [27]:
lstm.summary()

In [28]:
history_lstm = lstm.fit(x = X_pad, y = y_enc, batch_size=128 , epochs=100, validation_split=0.1)

Epoch 1/100


I0000 00:00:1770194268.433510     726 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 52ms/step - accuracy: 0.0351 - loss: 7.1159 - val_accuracy: 0.0448 - val_loss: 6.6767
Epoch 2/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 51ms/step - accuracy: 0.0556 - loss: 6.3540 - val_accuracy: 0.0652 - val_loss: 6.5595
Epoch 3/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 50ms/step - accuracy: 0.0777 - loss: 6.0678 - val_accuracy: 0.0879 - val_loss: 6.4533
Epoch 4/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 50ms/step - accuracy: 0.0999 - loss: 5.8175 - val_accuracy: 0.0944 - val_loss: 6.4416
Epoch 5/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 51ms/step - accuracy: 0.1070 - loss: 5.6318 - val_accuracy: 0.1001 - val_loss: 6.4216
Epoch 6/100
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 51ms/step - accuracy: 0.1169 - loss: 5.4719 - val_accuracy: 0.1051 - val_loss: 6.4389
Epoch 7/100
[1m600/60

In [30]:
lstm.save("lstm_model.h5")




In [33]:
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(token, f)
