# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import re
from nltk import ngrams
from collections import defaultdict

In [3]:
import ast
import json
import os
import random

In [4]:
# Show complete quote text in DataFrame output. `None` is the supported way to
# lift the column-width limit; the former `-1` sentinel has been deprecated
# since pandas 1.0 and is not accepted by newer releases.
pd.options.display.max_colwidth = None

# Data

In [5]:
# Load every scraped Goodreads CSV into a single DataFrame.
frames = []
for folderName, subfolders, filenames in os.walk('E:/Scrapped-Data/Quotes-Goodreads/'):
    for filename in filenames:
        # Join with the directory currently being walked so that files located
        # in sub-folders resolve correctly (the root path alone would miss them).
        frames.append(pd.read_csv(os.path.join(folderName, filename)))
# One concat instead of regrowing the frame per file. Last-read file goes
# first, and an empty 'Quote' frame goes last so the column exists even when
# no file was found.
data = pd.concat(frames[::-1] + [pd.DataFrame(columns=['Quote'])], axis=0)
print(len(data))

92970


In [6]:
# Preview the first five raw rows.
data.head(n=5)

Unnamed: 0,Quote
0,"“I love deadlines. I love the whooshing noise they make as they go by.” ― Douglas Adams, The Salmon of Doubt //"
1,"“There is no greater agony than bearing an untold story inside you.” ― Maya Angelou, I Know Why the Caged Bird Sings //"
2,"“What really knocks me out is a book that, when you're all done reading it, you wish the author that wrote it was a terrific friend of yours and you could call him up on the phone whenever you felt like it. That doesn't happen much, though.” ― J.D. Salinger, The Catcher in the Rye //"
3,"“If there's a book that you want to read, but it hasn't been written yet, then you must write it.” ― Toni Morrison"
4,“There is nothing to writing. All you do is sit down at a typewriter and bleed.” ― Ernest Hemingway


**Cleaning**

In [7]:
def clean1(x):
    """Return the part of `x` before the first '―', with outer whitespace removed."""
    quote_part, _, _ = x.partition('―')
    return quote_part.strip()

In [8]:
# Strip the attribution from every quote, then drop repeated rows and renumber
# the index from zero.
data['Quote'] = data['Quote'].map(clean1)
data = data.drop_duplicates().reset_index(drop=True)
print(data.shape[0])

73625


In [9]:
# Preview the first five rows after cleaning.
data.head(n=5)

Unnamed: 0,Quote
0,“I love deadlines. I love the whooshing noise they make as they go by.”
1,“There is no greater agony than bearing an untold story inside you.”
2,"“What really knocks me out is a book that, when you're all done reading it, you wish the author that wrote it was a terrific friend of yours and you could call him up on the phone whenever you felt like it. That doesn't happen much, though.”"
3,"“If there's a book that you want to read, but it hasn't been written yet, then you must write it.”"
4,“There is nothing to writing. All you do is sit down at a typewriter and bleed.”


# More Data

In [10]:
# Add every scraped BrainyQuote CSV on top of the quotes collected so far.
brainy_frames = []
for folderName, subfolders, filenames in os.walk('E:/Scrapped-Data/BrainyQuote/'):
    for filename in filenames:
        # Join with the directory currently being walked so that files located
        # in sub-folders resolve correctly (the root path alone would miss them).
        brainy_frames.append(pd.read_csv(os.path.join(folderName, filename)))
# One concat instead of regrowing the frame per file. Last-read file goes
# first and the existing rows stay at the end.
data = pd.concat(brainy_frames[::-1] + [data], axis=0)
print(len(data))

82205


In [11]:
# Drop repeated rows and renumber the index from zero.
data = data.drop_duplicates().reset_index(drop=True)
print(data.shape[0])

81924


# Some More Data

In [12]:
# Third source: a single CSV, kept in its own frame (`data_`) until it has been cleaned.
data_ = pd.read_csv('E:/Scrapped-Data/KeepInspiringMe.csv')

In [13]:
def clean2(x):
    """Return the text of `str(x)` before the first '–', with outer whitespace removed."""
    quote_part, _, _ = str(x).partition('–')
    return quote_part.strip()

In [14]:
# Strip the trailing attribution from every KeepInspiringMe quote.
data_['Quote'] = data_['Quote'].map(clean2)

In [15]:
# Stack the KeepInspiringMe rows on top of the quotes gathered so far.
data = pd.concat((data_, data))

**Preprocess**

In [16]:
def _preprocess_quote(raw):
    """Normalise one quote into the lower-case, punctuation-free form used for training.

    Steps, in order: runs of dots become a space; every character that is
    neither a word character nor whitespace is removed, as are underscores;
    whitespace runs collapse to one space; tokens matching the
    'valezwritetag…ezslot' pattern are blanked (presumably ad-script residue
    from the KeepInspiringMe scrape — TODO confirm); the result is trimmed,
    lower-cased and suffixed with the ' endquote' end-of-quote marker.
    """
    text = re.sub(r'\.+', ' ', raw)
    text = re.sub(r'[^\w\s]|_', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"(e)?valezwritetag(\d+)keepinspiringme(\w+)(\d+)ezslot(\d+)", ' ', text)
    return text.strip().lower() + ' endquote'


data['Quote'] = data['Quote'].apply(_preprocess_quote)

In [17]:
# Normalisation can make previously distinct quotes identical, so de-duplicate again.
data = data.drop_duplicates().reset_index(drop=True)
print(data.shape[0])

86982


In [18]:
# Spot-check five random preprocessed quotes.
data.sample(n=5)

Unnamed: 0,Quote
14872,if a writer stops observing he is finished experience is communicated by small details intimately observed endquote
52464,we will scrutinize the ingredients for the food we put in our body but we dont even know what the people we put in our lives are made of endquote
78469,the search for god is a reversal of the normal mundane worldly order in search for god you revert from what attracts you and swim toward that which is difficult you abandon your comforting and familiar habits with the hope the mere hope that something greater will be offered you in return for what you have given up if we truly knew all the answers in advance as to the meaning of life and the nature of god and the destiny of our souls our belief would not be a leap of faith and it would not be a courageous act of humanity it would just be a prudent insurance policy endquote
21162,since then your sere majesty and your lordships seek a simple answer i will give it in this manner neither horned nor toothed unless i am convinced by the testimony of the scriptures or by clear reason for i do not trust either in the pope or in councils alone since it is well known that they have often erred and contradicted themselves i am bound by the scriptures i have quoted and my conscience is captive to the word of god i cannot and i will not recant anything since it is neither safe nor right to go against conscience may god help me amen reply to the diet of worms april 18 1521 endquote
11012,the simplest of simplest things makes me happy i love small gestures and nature in life endquote


**Noise Removal**

In [19]:
# A quote is "noise" when it contains anything besides ASCII letters, digits and spaces.
data['Noise'] = data['Quote'].map(lambda quote: re.fullmatch(r'[A-Z a-z 0-9]*', quote) is None)

In [20]:
# Keep only the rows that were not flagged as noise.
data = data.loc[~data['Noise']]

In [21]:
# Drop repeated rows and renumber the index from zero.
data = data.drop_duplicates().reset_index(drop=True)
print(data.shape[0])

84815


In [22]:
# Spot-check five random quotes that survived the noise filter.
data.sample(n=5)

Unnamed: 0,Quote,Noise
81286,remember where you came from where youre going and why you created the mess you got yourself into in the first place youre going to die a horrible death remember its all good training and youll enjoy it more if you keep the facts in mind take your dying with some seriousness however laughing on the way to your execution is not generally understood by lessadvanced lifeforms and theyll call you crazy endquote,False
41908,you will always be fond of me i represent to you all the sins you never had the courage to commit endquote,False
71194,hope is a flicker a candle flame kept burning by the simple act of breathing endquote,False
83414,no one reads if someone does read he doesnt understand if he understands he immediately forgets endquote,False
44089,looks like what drives me crazydont have no effect on youbut im gonna keep on at ittill it drives you crazy too endquote,False


In [23]:
# Length of the longest whitespace-separated token in each quote.
data['Max word len'] = data['Quote'].map(lambda quote: max(len(word) for word in quote.split()))

In [24]:
# Discard quotes containing a token longer than 25 characters.
data = data.loc[data['Max word len'].le(25)]

In [25]:
# Drop repeated rows and renumber the index from zero.
data = data.drop_duplicates().reset_index(drop=True)
print(data.shape[0])

84755


In [26]:
# Spot-check five random quotes from the final training set.
data.sample(n=5)

Unnamed: 0,Quote,Noise,Max word len
44442,all bad poetry springs from genuine feeling to be natural is to be obvious and to be obvious is to be inartistic endquote,False,10
20067,being wingless doesnt mean you cant fly it just means you have to invent your wings first the malwatch endquote,False,8
59568,my grandpa always said asking a question is embarrass for a moment but not asking a question is embarrasing for a life time endquote,False,11
50587,some people believe that being the loudest in the room makes them look strong it may appear that way to a few uninformed individuals what they dont know is the more they talk the more information they reveal about themselves directly and indirectly they become an easy target and it makes them very vulnerable to attacks do not underestimate the power of silence endquote,False,13
82209,a heart can be broken but it keep on beating just the same endquote,False,8


# Training

In [27]:
# Opening word of every quote, one entry per quote (repeats are kept).
vocab = [quote.split()[0] for quote in data['Quote']]

In [28]:
# Context size: the model predicts a word from the n words before it.
n = 4

# context tuple (n words) -> next word -> count; counts become probabilities below.
counts = defaultdict(lambda: defaultdict(int))
for quote in data['Quote']:
    # (n+1)-grams padded with '' on both sides, so the first words of a quote
    # also get a full-length context.
    padded_grams = ngrams(quote.split(), n + 1,
                          pad_left=True, pad_right=True,
                          left_pad_symbol='', right_pad_symbol='')
    for gram in padded_grams:
        counts[gram[:-1]][gram[-1]] += 1

# Normalise each context's counts so they sum to 1.
for followers in counts.values():
    total = float(sum(followers.values()))
    for word in followers:
        followers[word] /= total

# Plain dict on the outside, so looking up an unseen context raises KeyError
# instead of silently creating an entry.
model = dict(counts)

In [29]:
def q(x):
    """Return the last n-gram of the lower-cased, left-padded words of `x`.

    The result is the context tuple used to look up the next word in `model`.
    """
    tokens = x.lower().split()
    padded_grams = ngrams(tokens, n, pad_left=True, pad_right=False, left_pad_symbol='')
    return list(padded_grams)[-1]

In [30]:
# Pick a random starter word and show the model's next-word distribution for it.
text = str(random.choice(vocab))
print('Me:', text)
# Context tuple for the starter word (left-padded with '' up to n entries).
query = q(text)
print(query)
# Mapping of candidate next word -> probability for that context.
print(dict(model[query]))

Me: the
('', '', '', 'the')
{'real': 0.007248396989127405, 'trick': 0.001115137998327293, 'root': 0.0005575689991636465, 'more': 0.021187621968218568, 'deepest': 0.0008363534987454698, 'thankful': 0.00013939224979091162, 'unthankful': 0.00013939224979091162, 'world': 0.022163367716754947, 'way': 0.009478672985781991, 'purpose': 0.007248396989127405, 'public': 0.0009757457485363814, 'thing': 0.0071090047393364926, 'painter': 0.00013939224979091162, 'painting': 0.00013939224979091162, 'only': 0.03972679119040981, 'earth': 0.0027878449958182324, 'artists': 0.00027878449958182325, 'artist': 0.0016727069974909396, 'modern': 0.0005575689991636465, 'great': 0.0059938667410092, 'greater': 0.0018120992472818512, 'role': 0.0008363534987454698, 'job': 0.0012545302481182046, 'true': 0.006969612489545581, 'day': 0.005715082241427377, 'difference': 0.006551435740172846, 'chief': 0.001115137998327293, 'pathway': 0.00027878449958182325, 'quieter': 0.00013939224979091162, 'object': 0.001254530248118204

In [31]:
# Generate one quote: start from a random opening word and keep sampling the
# next word from the model until the end-of-quote marker is drawn.
text = str(random.choice(vocab))
print('Me:', text)
sentence_finished = False

while not sentence_finished:
    # Next-word distribution for the current context; None when the context
    # never occurred in training.
    candidates = model.get(q(text))
    if not candidates:
        break
    # Weighted draw over the whole distribution. Comparing each word's own
    # probability with a random threshold (without accumulating) would favour
    # early keys and could never select some words at all.
    word = random.choices(list(candidates), weights=list(candidates.values()))[0]
    if word in ('endquote', ''):
        # 'endquote' marks the end of a quote and '' is right-padding;
        # neither belongs in the output.
        sentence_finished = True
    else:
        text += ' ' + word
print('Inspyrobot:', end = ' ')
print(' '.join(text.split()))

Me: sometimes
Inspyrobot: sometimes we need to remind ourselves that thankfulness is indeed a virtue


# Pickling

In [32]:
DIR = 'E:/Models/Inspyrobot-Probabilistic-v4'
# makedirs with exist_ok=True creates missing parent folders and does not fail
# when the cell is re-run after the folder already exists.
os.makedirs(DIR, exist_ok=True)

In [33]:
with open(f'{DIR}/model.json', 'w') as f:
    # JSON object keys must be strings, so every context tuple is stored as its str() form.
    serialisable = {str(context): followers for context, followers in model.items()}
    # The JSON text is itself dumped as a JSON string; the loading cell
    # decodes twice (json.load followed by json.loads).
    json.dump(json.dumps(serialisable), f)

In [34]:
with open(f'{DIR}/model.json', 'r') as f:
    # The file holds a JSON string whose content is JSON again, hence two
    # decode steps. The intermediate value gets its own name so the `data`
    # DataFrame is not overwritten.
    encoded = json.load(f)
dic = json.loads(encoded)
# Keys were written with str(tuple). literal_eval parses them back into tuples
# without executing arbitrary code from the file, unlike eval.
model = {ast.literal_eval(key): followers for key, followers in dic.items()}

In [35]:
with open(f'{DIR}/vocab.json', 'w') as f:
    # Same double encoding as the model file: a JSON string containing JSON text.
    vocab_json = json.dumps(vocab)
    json.dump(vocab_json, f)

In [36]:
with open(f'{DIR}/vocab.json', 'r') as f:
    # First decode yields the inner JSON text; the second yields the list of words.
    vocab_json = json.load(f)
    vocab = json.loads(vocab_json)

In [37]:
# Generate one quote with the model and vocab that were just read back from
# disk: start from a random opening word and keep sampling the next word until
# the end-of-quote marker is drawn.
text = str(random.choice(vocab))
print('Me:', text)
sentence_finished = False

while not sentence_finished:
    # Next-word distribution for the current context; None when the context
    # never occurred in training.
    candidates = model.get(q(text))
    if not candidates:
        break
    # Weighted draw over the whole distribution. Comparing each word's own
    # probability with a random threshold (without accumulating) would favour
    # early keys and could never select some words at all.
    word = random.choices(list(candidates), weights=list(candidates.values()))[0]
    if word in ('endquote', ''):
        # 'endquote' marks the end of a quote and '' is right-padding;
        # neither belongs in the output.
        sentence_finished = True
    else:
        text += ' ' + word
print('Inspyrobot:', end = ' ')
print(' '.join(text.split()))

Me: when
Inspyrobot: when we are young and then spend the rest of your life


# Wrapper Functions

In [38]:
import json, random
from nltk import ngrams

# Folder holding model.json and vocab.json.
DIR = 'E:/Models/Inspyrobot-Probabilistic-v4'
# Context size the stored model was built with.
n = 4


def q(x):
    """Return the last n-gram of the lower-cased, left-padded words of `x`."""
    tokens = x.lower().split()
    padded_grams = ngrams(tokens, n, pad_left=True, pad_right=False, left_pad_symbol='')
    return list(padded_grams)[-1]

def _load_model(path):
    """Read the double-encoded JSON model file and rebuild its tuple keys."""
    import ast  # local import keeps this wrapper cell self-contained

    with open(path, 'r') as f:
        stored = json.loads(json.load(f))
    # Keys were saved as str(tuple). literal_eval restores them without
    # executing arbitrary code from the file, unlike eval.
    return {ast.literal_eval(key): followers for key, followers in stored.items()}


def _context(text, size):
    """Return the last `size` lower-cased words of `text`, left-padded with ''."""
    padded = [''] * (size - 1) + text.lower().split()
    return tuple(padded[-size:])


def inspyre(text='', model=None, vocab=None):
    """Generate a quote from the n-gram model and print it as 'Inspyrobot: ...'.

    Parameters
    ----------
    text : str, optional
        Seed text to continue. When empty, a random entry of `vocab` is used.
    model : dict, optional
        Mapping ``context tuple -> {next word: probability}``. When omitted it
        is read from ``{DIR}/model.json`` on every call; pass an already
        loaded model to avoid that cost.
    vocab : list of str, optional
        Candidate starter words, used only when `text` is empty. When omitted
        it is read from ``{DIR}/vocab.json``.

    Returns
    -------
    None
        The quote is printed, not returned.

    Notes
    -----
    Generation stops when 'endquote' (or the '' padding symbol) is drawn, or
    when the current context is not in the model. Seed text the model has
    never seen is therefore printed back unchanged rather than raising KeyError.
    """
    if model is None:
        model = _load_model(f'{DIR}/model.json')
    if text == '':
        if vocab is None:
            with open(f'{DIR}/vocab.json', 'r') as f:
                vocab = json.loads(json.load(f))
        text = str(random.choice(vocab))

    # Every key of the model is a context tuple of the same length, so the
    # context size can be read from any one of them.
    context_size = len(next(iter(model))) if model else 0

    while context_size:
        candidates = model.get(_context(text, context_size))
        if not candidates:
            break
        # Weighted draw over the whole distribution. Comparing each word's own
        # probability with a random threshold (without accumulating) would
        # favour early keys and could never select some words at all.
        word = random.choices(list(candidates), weights=list(candidates.values()))[0]
        if word in ('endquote', ''):
            break
        text += ' ' + word

    print('Inspyrobot:', end = ' ')
    print(' '.join(text.split()))

In [39]:
# No seed text is passed, so a random starter word from the saved vocab is used.
inspyre()

Inspyrobot: I dont think that the industry agreed with me but ive always had a bit of a romantic to a fault its led me to some great things and also some sad things its made me a better person
