# Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
import re
from nltk import ngrams
from collections import defaultdict

In [3]:
import random
import json
import os

In [4]:
pd.options.display.max_colwidth = -1

# Data

In [5]:
data = pd.DataFrame(columns=['Quote'])
for folderName, subfolders, filenames in os.walk('E:/Scrapped-Data/Quotes-Goodreads/'):
    for filename in filenames:
        data = pd.concat([pd.read_csv('E:/Scrapped-Data/Quotes-Goodreads/'+filename), data], axis=0)
print(len(data))

92970


In [6]:
data.head()

Unnamed: 0,Quote
0,"“I love deadlines. I love the whooshing noise they make as they go by.” ― Douglas Adams, The Salmon of Doubt //"
1,"“There is no greater agony than bearing an untold story inside you.” ― Maya Angelou, I Know Why the Caged Bird Sings //"
2,"“What really knocks me out is a book that, when you're all done reading it, you wish the author that wrote it was a terrific friend of yours and you could call him up on the phone whenever you felt like it. That doesn't happen much, though.” ― J.D. Salinger, The Catcher in the Rye //"
3,"“If there's a book that you want to read, but it hasn't been written yet, then you must write it.” ― Toni Morrison"
4,“There is nothing to writing. All you do is sit down at a typewriter and bleed.” ― Ernest Hemingway


**Cleaning**

In [7]:
def clean1(x):
    x = x.split('―')[0]
    x = x[1:-2]
    return x

In [8]:
data['Quote'] = data['Quote'].apply(clean1)
data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)
print(len(data))

73624


In [9]:
data.head()

Unnamed: 0,Quote
0,I love deadlines. I love the whooshing noise they make as they go by.
1,There is no greater agony than bearing an untold story inside you.
2,"What really knocks me out is a book that, when you're all done reading it, you wish the author that wrote it was a terrific friend of yours and you could call him up on the phone whenever you felt like it. That doesn't happen much, though."
3,"If there's a book that you want to read, but it hasn't been written yet, then you must write it."
4,There is nothing to writing. All you do is sit down at a typewriter and bleed.


# More Data

In [10]:
for folderName, subfolders, filenames in os.walk('E:/Scrapped-Data/BrainyQuote/'):
    for filename in filenames:
        data = pd.concat([pd.read_csv('E:/Scrapped-Data/BrainyQuote/'+filename), data], axis=0)
print(len(data))

82204


In [11]:
data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)
print(len(data))

81722


# Some More Data

In [12]:
data_ = pd.read_csv('E:/Scrapped-Data/KeepInspiringMe.csv')

In [13]:
def clean2(x):
    x = str(x).split('–')[0]
    x = x[1:-2]
    return x

In [14]:
data_['Quote'] = data_['Quote'].apply(clean2)

In [15]:
data = pd.concat([data_, data], axis=0)

**Preprocess**

In [16]:
data['Quote'] = data['Quote'].apply(lambda x: re.sub(r'[^\w\s]|_', '', x))
data['Quote'] = data['Quote'].apply(lambda x : x + ' endquote')

In [17]:
data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)
print(len(data))

87188


# Training

In [18]:
vocab = data['Quote'].apply(lambda x: x.split()[0]).values.tolist()

In [19]:
n = 4
model = defaultdict(lambda: defaultdict(lambda: 0))
word_grams = []
for i in range(len(data)):
    word_grams += list(ngrams(data['Quote'][i].lower().split(), n+1, 
                              pad_left=True, pad_right=True,  left_pad_symbol='', right_pad_symbol=''))
for w in word_grams:
    model[w[:-1]][w[-1]] += 1
for w1 in model:
    total_count = float(sum(model[w1].values()))
    for w2 in model[w1]:
        model[w1][w2] /= total_count
model = dict(model)        

In [20]:
q = lambda x : list(ngrams(x.lower().split(), n, pad_left=True, pad_right=False,  left_pad_symbol=''))[-1]

In [21]:
text = str(random.choice(vocab))
print('Me:', text)
query = q(text)
print(query)
print(dict(model[query]))

Me: The
('', '', '', 'the')
{'real': 0.0072252327358621644, 'trick': 0.0011115742670557178, 'root': 0.0005557871335278589, 'more': 0.021119911074058636, 'deepest': 0.000972627483673753, 'thankful': 0.00013894678338196472, 'unthankful': 0.00013894678338196472, 'world': 0.02209253855773239, 'way': 0.00972627483673753, 'purpose': 0.0072252327358621644, 'public': 0.000972627483673753, 'thing': 0.0070862859524802, 'painter': 0.00013894678338196472, 'painting': 0.00013894678338196472, 'only': 0.03959983326385994, 'earth': 0.0029178824510212586, 'artists': 0.00027789356676392944, 'artist': 0.0016673614005835765, 'modern': 0.0005557871335278589, 'great': 0.006113658468806447, 'greater': 0.0018063081839655411, 'role': 0.0008336807002917883, 'job': 0.0012505210504376823, 'true': 0.0068083923857162705, 'day': 0.005835764902042517, 'difference': 0.0063915520355703765, 'chief': 0.0011115742670557178, 'pathway': 0.00027789356676392944, 'quieter': 0.00013894678338196472, 'object': 0.00111157426705571

In [22]:
text = str(random.choice(vocab))
print('Me:', text)
sentence_finished = False
 
while not sentence_finished:
    r = random.random()
    accumulator = .0
    if model[q(text)].keys() is None or (len(model[q(text)].keys()) == 1 and list(model[q(text)].keys())[0] == ''):
        break
    for word in model[q(text)].keys():
        accumulator = model[q(text)][word]
        if word != 'endquote' and accumulator >= r:
            text += ' '+word
            break
        if word == 'endquote':
            sentence_finished = True
print('Inspyrobot:', end = ' ')      
print(' '.join([t for t in text.split() if t]))

Me: Wherever
Inspyrobot: Wherever you are be there totally if you find youre here and now intolerable and it makes you unhappy you have three options remove yourself from the situation change it or accept it totally if you want to be


# Pickling

In [23]:
DIR = 'E:/Models/Inspyrobot-Probabilistic-v3'
os.mkdir(DIR)

In [24]:
with open(f'{DIR}/model.json', 'w') as f:
    k = model.keys() 
    v = model.values() 
    k1 = [str(i) for i in k]
    json.dump(json.dumps(dict(zip(*[k1,v]))),f)     

In [25]:
with open(f'{DIR}/model.json', 'r') as f:
    data = json.load(f)
    dic = json.loads(data)
    k = dic.keys() 
    v = dic.values() 
    k1 = [eval(i) for i in k] 
    model =  dict(zip(*[k1,v]))

In [26]:
with open(f'{DIR}/vocab.json', 'w') as f:
    json.dump(json.dumps(vocab),f)  

In [27]:
with open(f'{DIR}/vocab.json', 'r') as f:
    vocab = json.loads(json.load(f))

In [28]:
text = str(random.choice(vocab))
print('Me:', text)
sentence_finished = False
 
while not sentence_finished:
    r = random.random()
    accumulator = .0
    if model[q(text)].keys() is None or (len(model[q(text)].keys()) == 1 and list(model[q(text)].keys())[0] == ''):
        break
    for word in model[q(text)].keys():
        accumulator = model[q(text)][word]
        if word != 'endquote' and accumulator >= r:
            text += ' '+word
            break
        if word == 'endquote':
            sentence_finished = True
print('Inspyrobot:', end = ' ')      
print(' '.join([t for t in text.split() if t]))

Me: Hazel
Inspyrobot: Hazel always used to say dont raise your voice improve your argument good sense does not always lie with the loudest shouters nor can we say that a large unruly crowd is always the best teacher


# Wrapper Functions

In [29]:
import json, random
from nltk import ngrams

DIR = 'E:/Models/Inspyrobot-Probabilistic-v3'
n = 4
q = lambda x : list(ngrams(x.lower().split(), n, pad_left=True, pad_right=False,  left_pad_symbol=''))[-1]

def inspyre(text=''):
    with open(f'{DIR}/model.json', 'r') as f:
        data = json.load(f)
        dic = json.loads(data)
        k = dic.keys() 
        v = dic.values() 
        k1 = [eval(i) for i in k] 
        model =  dict(zip(*[k1,v]))
    if text == '':
        with open(f'{DIR}/vocab.json', 'r') as f:
            vocab = json.loads(json.load(f))
        text = str(random.choice(vocab))
    sentence_finished = False
    while not sentence_finished:
        r = random.random()
        accumulator = .0
        if model[q(text)].keys() is None or (len(model[q(text)].keys()) == 1 and list(model[q(text)].keys())[0] == ''):
            break
        for word in model[q(text)].keys():
            accumulator = model[q(text)][word]
            if word != 'endquote' and accumulator >= r:
                text += ' '+word
                break
            if word == 'endquote':
                sentence_finished = True
    print('Inspyrobot:', end = ' ')      
    print(' '.join([t for t in text.split() if t]))

In [30]:
inspyre()

Inspyrobot: A good book is an event in my life
