In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
df = pd.read_csv('qoute_dataset.csv')
df

Unnamed: 0,quote,Author
0,“The world as we have created it is a process ...,Albert Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling
2,“There are only two ways to live your life. On...,Albert Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe
...,...,...
3033,The past beats inside me like a second heart.,"John Banville,"
3034,"Damn, Claire. Warn a guy before you do a face-...","Rachel Caine,"
3035,"Can you be a girl for a few seconds?""""I'm alwa...","Veronica Roth,"
3036,That's what fiction is for. It's for getting a...,Tim O'Brien


In [6]:
qts = df['quote']
qts.head()

0    “The world as we have created it is a process ...
1    “It is our choices, Harry, that show what we t...
2    “There are only two ways to live your life. On...
3    “The person, be it gentleman or lady, who has ...
4    “Imperfection is beauty, madness is genius and...
Name: quote, dtype: object

## Preprocessing

In [7]:
qts = qts.str.lower()

In [8]:
import string

In [9]:
translator = str.maketrans('','',string.punctuation)

In [13]:
qts = qts.apply(lambda x:x.translate(translator))
qts.head()

0    “the world as we have created it is a process ...
1    “it is our choices harry that show what we tru...
2    “there are only two ways to live your life one...
3    “the person be it gentleman or lady who has no...
4    “imperfection is beauty madness is genius and ...
Name: quote, dtype: object

## Tokenization

In [14]:
qts[0]

'“the world as we have created it is a process of our thinking it cannot be changed without changing our thinking”'

In [15]:
str.split('the world as we have created it is a process of our thinking it cannot be changed without changing our thinking',sep=' ')

['the',
 'world',
 'as',
 'we',
 'have',
 'created',
 'it',
 'is',
 'a',
 'process',
 'of',
 'our',
 'thinking',
 'it',
 'cannot',
 'be',
 'changed',
 'without',
 'changing',
 'our',
 'thinking']

In [17]:
# we will use keras for tokenizing...

from tensorflow.keras.preprocessing.text import Tokenizer

In [18]:
vocab_size=10000

token = Tokenizer(num_words=vocab_size)
token.fit_on_texts(qts)

In [20]:
word_index = token.word_index
print(len(word_index))

8978


In [21]:
list(word_index.items())[:10]

[('the', 1),
 ('you', 2),
 ('to', 3),
 ('and', 4),
 ('a', 5),
 ('i', 6),
 ('is', 7),
 ('of', 8),
 ('that', 9),
 ('it', 10)]

In [22]:
sequence = token.texts_to_sequences(qts)

In [24]:
qts[0]

'“the world as we have created it is a process of our thinking it cannot be changed without changing our thinking”'

In [23]:
sequence[0]

[713,
 62,
 29,
 19,
 16,
 946,
 10,
 7,
 5,
 1156,
 8,
 70,
 293,
 10,
 145,
 12,
 809,
 104,
 752,
 70,
 2461]

## Making X,y from sentences...

In [25]:
X = []
y = []

for seq in sequence:
    for i in range(1,len(seq)):
        
        input_seq = seq[:i]
        output_seq = seq[i]
        
        X.append(input_seq)
        y.append(output_seq)

In [26]:
X

[[713],
 [713, 62],
 [713, 62, 29],
 [713, 62, 29, 19],
 [713, 62, 29, 19, 16],
 [713, 62, 29, 19, 16, 946],
 [713, 62, 29, 19, 16, 946, 10],
 [713, 62, 29, 19, 16, 946, 10, 7],
 [713, 62, 29, 19, 16, 946, 10, 7, 5],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809],
 [713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809, 104],
 [713,
  62,
  29,
  19,
  16,
  946,
  10,
  7,
  5,
  1156,
  8,
  70,
  293,
  10,
  145,
  12,
  809,
  104,
  752],
 [713,
  62,
  29,
  19,
  16,
  946,
  10,
  7,
  5,
  1156,
  8,
  70,
  293,
  10,
  145,
  12,
  809,
  