In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv("qoute_dataset.csv")

In [3]:
df.head(4)

Unnamed: 0,quote,Author
0,‚ÄúThe world as we have created it is a process ...,Albert Einstein
1,"‚ÄúIt is our choices, Harry, that show what we t...",J.K. Rowling
2,‚ÄúThere are only two ways to live your life. On...,Albert Einstein
3,"‚ÄúThe person, be it gentleman or lady, who has ...",Jane Austen


In [4]:
df.shape

(3038, 2)

In [5]:
quotes=df['quote']

In [6]:
quotes.head()

0    ‚ÄúThe world as we have created it is a process ...
1    ‚ÄúIt is our choices, Harry, that show what we t...
2    ‚ÄúThere are only two ways to live your life. On...
3    ‚ÄúThe person, be it gentleman or lady, who has ...
4    ‚ÄúImperfection is beauty, madness is genius and...
Name: quote, dtype: object

## text processing

### Lowering case

In [7]:
quotes=quotes.apply(lambda x: x.lower())

In [8]:
quotes.head()

0    ‚Äúthe world as we have created it is a process ...
1    ‚Äúit is our choices, harry, that show what we t...
2    ‚Äúthere are only two ways to live your life. on...
3    ‚Äúthe person, be it gentleman or lady, who has ...
4    ‚Äúimperfection is beauty, madness is genius and...
Name: quote, dtype: object

### Removing Punctuation

In [9]:
import string
def remove_punctuation(txt):
    return txt.translate(str.maketrans('','',string.punctuation))

In [10]:
quotes=quotes.apply(remove_punctuation)

In [11]:
quotes.head()

0    ‚Äúthe world as we have created it is a process ...
1    ‚Äúit is our choices harry that show what we tru...
2    ‚Äúthere are only two ways to live your life one...
3    ‚Äúthe person be it gentleman or lady who has no...
4    ‚Äúimperfection is beauty madness is genius and ...
Name: quote, dtype: object

### Removing digits

In [12]:
def remove_numbers(txt):
  new=""
  for i in txt:
        if not i.isdigit():
            new=new+i
  return new 

In [13]:
quotes=quotes.apply(remove_numbers)

In [14]:
quotes.head()

0    ‚Äúthe world as we have created it is a process ...
1    ‚Äúit is our choices harry that show what we tru...
2    ‚Äúthere are only two ways to live your life one...
3    ‚Äúthe person be it gentleman or lady who has no...
4    ‚Äúimperfection is beauty madness is genius and ...
Name: quote, dtype: object

### Removing url/links

In [15]:
import re

def remove_urls(text):
    # Regex to match URLs (http, https, www, etc.)
    url_pattern = r'http\S+|www\.\S+'
    # Replace all URLs with an empty string
    return re.sub(url_pattern, '', text)

In [16]:
quotes=quotes.apply(remove_urls)

In [17]:
quotes.head()

0    ‚Äúthe world as we have created it is a process ...
1    ‚Äúit is our choices harry that show what we tru...
2    ‚Äúthere are only two ways to live your life one...
3    ‚Äúthe person be it gentleman or lady who has no...
4    ‚Äúimperfection is beauty madness is genius and ...
Name: quote, dtype: object

### Removing Html tags

In [18]:
import re

def remove_html_tags(text):
    # Pattern to match anything between < and >
    html_pattern = r'<[^>]+>'
    # Replace tags with empty string
    return re.sub(html_pattern, '', text)

In [19]:
quotes=quotes.apply(remove_html_tags)

In [20]:
quotes.head()

0    ‚Äúthe world as we have created it is a process ...
1    ‚Äúit is our choices harry that show what we tru...
2    ‚Äúthere are only two ways to live your life one...
3    ‚Äúthe person be it gentleman or lady who has no...
4    ‚Äúimperfection is beauty madness is genius and ...
Name: quote, dtype: object

### removing emojis

In [21]:
def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons üòÄ üòÉ üòÑ
        "\U0001F300-\U0001F5FF"  # symbols & pictographs üåÄ üåà üéâ
        "\U0001F680-\U0001F6FF"  # transport & map symbols üöó üöÄ ‚úàÔ∏è
        "\U0001F1E0-\U0001F1FF"  # flags üáÆüá≥ üá∫üá∏
        "\U00002500-\U00002BEF"  # Chinese/Japanese symbols, etc.
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642" 
        "\u2600-\u2B55"
        "\u200d"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # dingbats
        "\u3030"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub('', text)

    ## second method

    def emoji_remove(txt):
        new=""
        for i in txt:
           if i.isascii():
               new=new+i
        return new


In [22]:
quotes=quotes.apply(remove_emojis)

In [23]:
quotes.head()

0    ‚Äúthe world as we have created it is a process ...
1    ‚Äúit is our choices harry that show what we tru...
2    ‚Äúthere are only two ways to live your life one...
3    ‚Äúthe person be it gentleman or lady who has no...
4    ‚Äúimperfection is beauty madness is genius and ...
Name: quote, dtype: object

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer


In [25]:
vocab_size = 10000

tokinizer = Tokenizer(num_words=vocab_size)
tokinizer.fit_on_texts(quotes)

In [26]:
word_index = tokinizer.word_index
print(len(word_index))
list(word_index.items())[:10]

8944


[('the', 1),
 ('you', 2),
 ('to', 3),
 ('and', 4),
 ('a', 5),
 ('i', 6),
 ('is', 7),
 ('of', 8),
 ('that', 9),
 ('it', 10)]

In [27]:
sequence = tokinizer.texts_to_sequences(quotes)


In [28]:
for i in range(3):
  print(quotes[i])

‚Äúthe world as we have created it is a process of our thinking it cannot be changed without changing our thinking‚Äù
‚Äúit is our choices harry that show what we truly are far more than our abilities‚Äù
‚Äúthere are only two ways to live your life one is as though nothing is a miracle the other is as though everything is a miracle‚Äù


In [29]:
for i in range(3):
  print(sequence[i])

[713, 62, 29, 19, 16, 945, 10, 7, 5, 1155, 8, 70, 293, 10, 145, 12, 809, 104, 752, 70, 2451]
[946, 7, 70, 871, 373, 9, 433, 21, 19, 465, 14, 294, 52, 54, 70, 3665]
[1334, 14, 53, 201, 714, 3, 81, 15, 36, 37, 7, 29, 329, 93, 7, 5, 1156, 1, 101, 7, 29, 329, 126, 7, 5, 3666]


In [30]:
X = []
y = []

for seq in sequence:
  for i in range(1,len(seq)):
    input_seq = seq[:i]
    output_seq = seq[i]
    X.append(input_seq)
    y.append(output_seq)

In [31]:
len(X)


85201

In [32]:
X

[[713],
 [713, 62],
 [713, 62, 29],
 [713, 62, 29, 19],
 [713, 62, 29, 19, 16],
 [713, 62, 29, 19, 16, 945],
 [713, 62, 29, 19, 16, 945, 10],
 [713, 62, 29, 19, 16, 945, 10, 7],
 [713, 62, 29, 19, 16, 945, 10, 7, 5],
 [713, 62, 29, 19, 16, 945, 10, 7, 5, 1155],
 [713, 62, 29, 19, 16, 945, 10, 7, 5, 1155, 8],
 [713, 62, 29, 19, 16, 945, 10, 7, 5, 1155, 8, 70],
 [713, 62, 29, 19, 16, 945, 10, 7, 5, 1155, 8, 70, 293],
 [713, 62, 29, 19, 16, 945, 10, 7, 5, 1155, 8, 70, 293, 10],
 [713, 62, 29, 19, 16, 945, 10, 7, 5, 1155, 8, 70, 293, 10, 145],
 [713, 62, 29, 19, 16, 945, 10, 7, 5, 1155, 8, 70, 293, 10, 145, 12],
 [713, 62, 29, 19, 16, 945, 10, 7, 5, 1155, 8, 70, 293, 10, 145, 12, 809],
 [713, 62, 29, 19, 16, 945, 10, 7, 5, 1155, 8, 70, 293, 10, 145, 12, 809, 104],
 [713,
  62,
  29,
  19,
  16,
  945,
  10,
  7,
  5,
  1155,
  8,
  70,
  293,
  10,
  145,
  12,
  809,
  104,
  752],
 [713,
  62,
  29,
  19,
  16,
  945,
  10,
  7,
  5,
  1155,
  8,
  70,
  293,
  10,
  145,
  12,
  809,
  

In [33]:
y

[62,
 29,
 19,
 16,
 945,
 10,
 7,
 5,
 1155,
 8,
 70,
 293,
 10,
 145,
 12,
 809,
 104,
 752,
 70,
 2451,
 7,
 70,
 871,
 373,
 9,
 433,
 21,
 19,
 465,
 14,
 294,
 52,
 54,
 70,
 3665,
 14,
 53,
 201,
 714,
 3,
 81,
 15,
 36,
 37,
 7,
 29,
 329,
 93,
 7,
 5,
 1156,
 1,
 101,
 7,
 29,
 329,
 126,
 7,
 5,
 3666,
 116,
 12,
 10,
 2452,
 32,
 1042,
 30,
 82,
 13,
 601,
 11,
 5,
 74,
 1335,
 119,
 12,
 2453,
 3667,
 7,
 313,
 753,
 7,
 638,
 4,
 43,
 144,
 3,
 12,
 682,
 1336,
 54,
 682,
 3669,
 13,
 3,
 202,
 5,
 90,
 8,
 434,
 279,
 202,
 5,
 90,
 8,
 3671,
 7,
 144,
 3,
 12,
 1337,
 17,
 21,
 2,
 14,
 54,
 3,
 12,
 175,
 17,
 21,
 2,
 14,
 3672,
 16,
 13,
 1338,
 191,
 51,
 415,
 714,
 9,
 363,
 3673,
 180,
 7,
 39,
 5,
 810,
 1339,
 2,
 46,
 50,
 59,
 322,
 10,
 7,
 168,
 43,
 11,
 639,
 3674,
 111,
 104,
 1044,
 7,
 39,
 2,
 50,
 3675,
 36,
 7,
 21,
 2,
 65,
 10,
 47,
 181,
 21,
 96,
 130,
 3,
 754,
 58,
 123,
 43,
 5,
 1932,
 174,
 18,
 1,
 74,
 208,
 7,
 2,
 94,
 3,
 466,
 59,
 96,

In [34]:
len(y)


85201

In [35]:
max_len = max(len(x) for x in X)
print(max_len)

745


In [36]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_padded = pad_sequences(X, maxlen=max_len, padding='pre')


In [37]:
y = np.array(y)


In [38]:
X_padded.shape


(85201, 745)

In [39]:
from tensorflow.keras.utils import to_categorical
y_one_hot = to_categorical(y, num_classes=vocab_size)

In [40]:
y.shape


(85201,)

In [41]:
y_one_hot.shape


(85201, 10000)

In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,SimpleRNN,LSTM, Dense

In [43]:
embedding_dim = 50
rnn_units = 128

In [44]:
rnn_model = Sequential()

rnn_model.add(
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len)
)
rnn_model.add(SimpleRNN(units=rnn_units))
rnn_model.add(Dense(units=vocab_size, activation='softmax'))



In [45]:
rnn_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
     


In [46]:
rnn_model.summary()


In [47]:
epochs=10
batch_size=128

In [49]:
# history_rnn=rnn_model.fit(X_padded,y_one_hot,
#                           epochs=epochs,
#                           batch_size=batch_size,
#                           validation_split=0.1
#                                     )

In [50]:
lstm_model = Sequential()
lstm_model.add(
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len)
)
lstm_model.add(LSTM(units=rnn_units))
lstm_model.add(Dense(units=vocab_size, activation='softmax'))



In [51]:
lstm_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [52]:
lstm_model.summary()


In [53]:
epochs=100
batch_size=128

In [54]:
# history_lstm=lstm_model.fit(X_padded,y_one_hot,
#                           epochs=epochs,
#                           batch_size=batch_size,
#                           validation_split=0.1
#                                     )

In [55]:
# lstm_model.save("lstm__model.h5model.h5")


In [56]:
from tensorflow.keras.models import load_model

lstm_model = load_model("lstm__model.h5")



In [57]:

index_to_word = {}
for word, index in word_index.items():
  index_to_word[index] = word

In [58]:
def predictor(model,tokenizer,text,max_len):
  text = text.lower()

  seq = tokenizer.texts_to_sequences([text])[0]
  seq = pad_sequences([seq], maxlen=max_len, padding='pre')

  pred = model.predict(seq,verbose = 0)
  pred_index = np.argmax(pred)
  return index_to_word[pred_index]

In [63]:

seed_text = "life is "
next_word = predictor(lstm_model,tokinizer,seed_text,max_len)
print(next_word)

not


In [60]:

import pickle
with open("tokenizer.pkl", "wb") as f:
  pickle.dump(tokinizer, f)

In [61]:

with open("max_len.pkl", "wb") as f:
  pickle.dump(max_len, f)