# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import re
from nltk import ngrams
from collections import defaultdict

In [3]:
import random, json, os

In [4]:
# Show full quote text in DataFrame displays instead of truncating long cells.
pd.set_option('display.max_colwidth', None)

# Data

In [5]:
# Load every CSV found under the Goodreads scrape folder into a single frame.
GOODREADS_DIR = 'E:/Scrapped-Data/Quotes-Goodreads/'
goodreads_frames = []
for folderName, subfolders, filenames in os.walk(GOODREADS_DIR):
    for filename in filenames:
        # os.walk descends into subfolders, so the file must be opened relative to the
        # folder currently being visited, not relative to the root folder.
        goodreads_frames.append(pd.read_csv(os.path.join(folderName, filename)))
# Concatenate once instead of re-copying the growing frame for every file. The list is
# reversed and the empty 'Quote' frame goes last so the row order matches what repeated
# `pd.concat([new, data])` calls would have produced.
data = pd.concat(goodreads_frames[::-1] + [pd.DataFrame(columns=['Quote'])], axis=0)
print(len(data))

92970


In [6]:
# Preview the first rows of `data` right after loading (quotes still carry their attribution).
data.head()

Unnamed: 0,Quote
0,"“I love deadlines. I love the whooshing noise they make as they go by.” ― Douglas Adams, The Salmon of Doubt //"
1,"“There is no greater agony than bearing an untold story inside you.” ― Maya Angelou, I Know Why the Caged Bird Sings //"
2,"“What really knocks me out is a book that, when you're all done reading it, you wish the author that wrote it was a terrific friend of yours and you could call him up on the phone whenever you felt like it. That doesn't happen much, though.” ― J.D. Salinger, The Catcher in the Rye //"
3,"“If there's a book that you want to read, but it hasn't been written yet, then you must write it.” ― Toni Morrison"
4,“There is nothing to writing. All you do is sit down at a typewriter and bleed.” ― Ernest Hemingway


**Cleaning**

In [7]:
def clean1(x):
    """Return the part of ``x`` before the first '―', stripped of surrounding whitespace.

    When ``x`` contains no '―' the whole string is returned, stripped.
    ``x`` must be a string.
    """
    quote_text, _, _ = x.partition('―')
    return quote_text.strip()

In [8]:
# Strip the attribution from every quote, then drop duplicate rows and renumber the index.
data = (
    data.assign(Quote=data['Quote'].apply(clean1))
        .drop_duplicates()
        .reset_index(drop=True)
)
print(len(data))

73625


In [9]:
# Preview the first rows of `data` after the attribution has been stripped.
data.head()

Unnamed: 0,Quote
0,“I love deadlines. I love the whooshing noise they make as they go by.”
1,“There is no greater agony than bearing an untold story inside you.”
2,"“What really knocks me out is a book that, when you're all done reading it, you wish the author that wrote it was a terrific friend of yours and you could call him up on the phone whenever you felt like it. That doesn't happen much, though.”"
3,"“If there's a book that you want to read, but it hasn't been written yet, then you must write it.”"
4,“There is nothing to writing. All you do is sit down at a typewriter and bleed.”


# More Data

In [10]:
# Add every CSV found under the BrainyQuote scrape folder in front of the existing rows.
BRAINYQUOTE_DIR = 'E:/Scrapped-Data/BrainyQuote/'
brainyquote_frames = []
for folderName, subfolders, filenames in os.walk(BRAINYQUOTE_DIR):
    for filename in filenames:
        # os.walk descends into subfolders, so the file must be opened relative to the
        # folder currently being visited, not relative to the root folder.
        brainyquote_frames.append(pd.read_csv(os.path.join(folderName, filename)))
# Concatenate once instead of re-copying the growing frame for every file. The list is
# reversed and the existing rows go last so the row order matches what repeated
# `pd.concat([new, data])` calls would have produced.
data = pd.concat(brainyquote_frames[::-1] + [data], axis=0)
print(len(data))

82205


In [11]:
# Drop duplicate rows introduced by the extra source and renumber the index from 0.
data = data.drop_duplicates().reset_index(drop=True)
print(len(data))

81924


# Some More Data

In [12]:
# Read the KeepInspiringMe scrape into its own frame (`data_`), separate from `data`.
data_ = pd.read_csv('E:/Scrapped-Data/KeepInspiringMe.csv')

In [13]:
def clean2(x):
    """Return ``str(x)`` cut at the first '–', stripped of surrounding whitespace.

    Non-string input is converted with ``str`` first, so a missing value (NaN)
    comes back as the text 'nan'.
    """
    head, _, _ = str(x).partition('–')
    return head.strip()

In [14]:
# Cut the attribution off every KeepInspiringMe quote.
data_['Quote'] = data_['Quote'].map(clean2)

In [15]:
# Stack the KeepInspiringMe rows on top of the rows collected so far (row-wise concat).
data = pd.concat([data_, data])

**Preprocess**

In [16]:
def _normalise_quote(x):
    """Turn one raw quote string into lower-case, punctuation-free training text.

    Steps: runs of dots become a space, every other non-word/non-space character
    and underscores are dropped, the scraped ad-script residue is removed,
    whitespace runs are collapsed, the result is stripped and lower-cased, and the
    ' endquote' end marker is appended.
    """
    x = re.sub(r'\.+', ' ', x)
    x = re.sub(r'[^\w\s]|_', '', x)
    # Remove the ad-script residue BEFORE collapsing whitespace: the removal inserts a
    # space, and doing it afterwards left multi-space runs in the text, so otherwise
    # identical quotes were not recognised as duplicates by drop_duplicates().
    x = re.sub(r"(e)?valezwritetag(\d+)keepinspiringme(\w+)(\d+)ezslot(\d+)", ' ', x)
    x = re.sub(r'\s+', ' ', x)
    return x.strip().lower() + ' endquote'


# One pass over the column instead of seven separate .apply() passes.
data['Quote'] = data['Quote'].apply(_normalise_quote)

In [17]:
# Normalisation can make previously distinct rows identical: drop them and renumber the index.
data = data.drop_duplicates().reset_index(drop=True)
print(len(data))

86982


In [18]:
# Spot-check five randomly chosen rows after preprocessing (different rows on every run).
data.sample(5)

Unnamed: 0,Quote
77668,but god does give us responsibility and it takes biblical faith to do those things in dependence on god endquote
82974,everyone you have ever loved in your life becomes a part of your soul they never leave theyre always inside you and you can bring them out whenever you want endquote
55358,do you care about my lonely heart tonight the stars still shining bright only you can save my heart can save my love endquote
67931,read books care about things get excited try not to be too down on youself enjoy the ever present game of knowing endquote
38288,i think the attempt to defend belief can unsettle it in fact because there is always an inadequacy in argument about ultimate things endquote


**Noise Removal**

In [19]:
# Flag a quote as noise when it contains anything besides ASCII letters, digits and spaces.
data['Noise'] = data['Quote'].apply(lambda x: re.fullmatch(r'[A-Z a-z 0-9]*', x) is None)

In [20]:
# Keep only the rows that were not flagged as noise.
data = data.loc[data['Noise'].eq(False)]

In [21]:
# Drop duplicate rows and renumber the index after the noise filter removed rows.
data = data.drop_duplicates().reset_index(drop=True)
print(len(data))

84815


In [22]:
# Spot-check five randomly chosen rows after noise removal (different rows on every run).
data.sample(5)

Unnamed: 0,Quote,Noise
74430,for it isnt just moments of happiness which is all i thought we got its happiness that spreads endquote,False
47128,laughter is the hand of god on the shoulder of a troubled world endquote,False
71437,i could smack her punch her in the face but then i see what she cant hide from me ive seen it beforethe desperation the agony the need to find a reason to go on and the inability to find it endquote,False
25260,time sometimes becomes very merciless heartless it switches off the brightness of the eyes that you once loved endquote,False
73709,money cant buy love but it improves your bargaining position endquote,False


In [23]:
# Length of the longest whitespace-separated token in each quote.
data['Max word len'] = data['Quote'].apply(lambda x: max(len(word) for word in x.split()))

In [24]:
# Keep only quotes whose longest token has at most 25 characters.
data = data.loc[data['Max word len'].le(25)]

In [25]:
# Drop duplicate rows and renumber the index from 0; training indexes rows by position.
data = data.drop_duplicates().reset_index(drop=True)
print(len(data))

84755


In [26]:
# Spot-check five randomly chosen rows of the final training data (different rows on every run).
data.sample(5)

Unnamed: 0,Quote,Noise,Max word len
36735,if complete and utter chaos was lightning then hed be the sort to stand on a hilltop in a thunderstorm wearing wet copper armour and shouting all gods are bastards endquote,False,12
58836,life is a slope as long as youre going up youre always looking towards the top and you feel happy but when you reach it suddenly you can see the road going downhill and death at the end of it all its slow going up and quick going down endquote,False,8
81522,id heard that if you saw a reaper you saw what you expected to see what you thought the agents of death would look like personally i wanted to see little fuzzy pink bunnies but apparently my subconscious visualized tall scary and skeletal my subconscious and i needed to have a long talk endquote,False,12
57057,i love you more than songs can say but i cant keep running after yesterday endquote,False,9
31601,nature and natures laws lay hid in nightgod said let newton be and all was light endquote,False,8


# Training

In [27]:
# First word of every quote, duplicates kept, so a random pick favours common openers.
vocab = [quote.split()[0] for quote in data['Quote']]

In [28]:
# Context length: each next word is predicted from the n words before it.
n = 4
# model[context][next_word] -> count, later normalised to a probability.
model = defaultdict(lambda: defaultdict(lambda: 0))
# Every (n+1)-gram of every quote, padded with '' on both sides.
word_grams = [
    gram
    for quote in data['Quote']
    for gram in ngrams(quote.split(), n+1,
                       pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')
]
# Count how often each word follows each n-word context.
for gram in word_grams:
    context, following = gram[:-1], gram[-1]
    model[context][following] += 1
# Turn the counts of each context into relative frequencies.
for successors in model.values():
    total_count = float(sum(successors.values()))
    for following in successors:
        successors[following] /= total_count
# Plain outer dict: looking up an unseen context no longer creates an entry.
model = dict(model)

In [29]:
def q(x):
    """Return the model key for ``x``: its last ``n`` lower-cased words as a tuple.

    Shorter texts are left-padded with '' (e.g. 'my' -> ('', '', '', 'my') for n = 4).
    Raises IndexError when ``x`` contains no words.
    """
    tokens = x.lower().split()
    padded_grams = ngrams(tokens, n, pad_left=True, pad_right=False,  left_pad_symbol='')
    return list(padded_grams)[-1]

In [30]:
# Pick a random quote-opening word as the seed text.
text = str(random.choice(vocab))
print('Me:', text)
# Build the model key (left-padded tuple of the last n words) for the seed.
query = q(text)
print(query)
# Show every next-word probability stored for that key.
print(dict(model[query]))

Me: my
('', '', '', 'my')
{'goal': 0.004301075268817204, 'greatest': 0.0064516129032258064, 'family': 0.005376344086021506, 'father': 0.04946236559139785, 'arms': 0.002150537634408602, 'most': 0.0032258064516129032, 'wife': 0.008602150537634409, 'folks': 0.001075268817204301, 'success': 0.001075268817204301, 'religion': 0.004301075268817204, 'opinions': 0.001075268817204301, 'favorite': 0.010752688172043012, 'grandmother': 0.005376344086021506, 'grandfather': 0.004301075268817204, 'pessimism': 0.002150537634408602, 'theory': 0.004301075268817204, 'doctor': 0.004301075268817204, 'psychiatrist': 0.001075268817204301, 'best': 0.007526881720430108, 'policies': 0.001075268817204301, 'mother': 0.05591397849462366, 'observation': 0.001075268817204301, 'ideal': 0.001075268817204301, 'heroes': 0.001075268817204301, 'attitude': 0.021505376344086023, 'challenge': 0.001075268817204301, 'body': 0.002150537634408602, 'alma': 0.001075268817204301, 'black': 0.002150537634408602, 'work': 0.001075268817

In [31]:
# Seed the quote with a random opening word, then extend it one word at a time.
text = str(random.choice(vocab))
print('Me:', text)

while True:
    # Next-word distribution for the last n words; stop if this context was never seen.
    candidates = model.get(q(text))
    if not candidates:
        break
    # Draw the next word with probability proportional to its stored weight. The earlier
    # loop compared each word's own probability with the random number (the accumulator
    # was assigned, never summed) and ended the quote as soon as 'endquote' was merely
    # present among the candidates, so output did not follow the learned distribution.
    word = random.choices(list(candidates.keys()), weights=list(candidates.values()))[0]
    # 'endquote' is the end-of-quote marker; '' is the padding that follows it.
    if word in ('endquote', ''):
        break
    text += ' ' + word
print('Inspyrobot:', end = ' ')
print(' '.join(text.split()))

Me: individuals
Inspyrobot: individuals who want to believe that what i do is quite traditional anyway


# Pickling

In [32]:
# Folder that receives the serialised model and vocabulary.
DIR = 'E:/Models/Inspyrobot'
# makedirs(exist_ok=True) also creates missing parent folders and does not fail when the
# folder already exists, so the notebook can be re-run from a fresh kernel.
os.makedirs(DIR, exist_ok=True)

In [33]:
# JSON object keys must be strings, so every context tuple is stored as its str() form.
serialisable_model = {str(context): successors for context, successors in model.items()}
with open(f'{DIR}/model.json', 'w') as f:
    # The mapping is encoded twice (a JSON string that holds JSON text); readers of this
    # file must therefore decode twice.
    json.dump(json.dumps(serialisable_model), f)

In [34]:
import ast

with open(f'{DIR}/model.json', 'r') as f:
    # The file holds a JSON string that itself contains the JSON object, so decode twice.
    dic = json.loads(json.load(f))
# Keys were saved as str(tuple). ast.literal_eval rebuilds the tuples but only accepts
# Python literals, so a tampered file cannot run code the way eval() would.
# The intermediate text is no longer bound to `data`, which would overwrite the DataFrame.
model = {ast.literal_eval(key): successors for key, successors in dic.items()}

In [35]:
# Persist the list of seed words. It is JSON-encoded twice (dumps inside dump), so the
# file contains a JSON string holding the JSON list.
with open(f'{DIR}/vocab.json', 'w') as f:
    json.dump(json.dumps(vocab),f)  

In [36]:
# Read the seed words back: load() yields the inner JSON text, loads() turns it into the list.
with open(f'{DIR}/vocab.json', 'r') as f:
    vocab = json.loads(json.load(f))

In [37]:
# Same generation as before, now driven by the model and vocabulary reloaded from disk.
text = str(random.choice(vocab))
print('Me:', text)

while True:
    # Next-word distribution for the last n words; stop if this context was never seen.
    candidates = model.get(q(text))
    if not candidates:
        break
    # Draw the next word with probability proportional to its stored weight. The earlier
    # loop compared each word's own probability with the random number (the accumulator
    # was assigned, never summed) and ended the quote as soon as 'endquote' was merely
    # present among the candidates, so output did not follow the learned distribution.
    word = random.choices(list(candidates.keys()), weights=list(candidates.values()))[0]
    # 'endquote' is the end-of-quote marker; '' is the padding that follows it.
    if word in ('endquote', ''):
        break
    text += ' ' + word
print('Inspyrobot:', end = ' ')
print(' '.join(text.split()))

Me: whoever
Inspyrobot: whoever is happy will make others happy too


# Wrapper Functions

In [38]:
import json, random
from nltk import ngrams

# Folder holding model.json and vocab.json.
DIR = 'E:/Models/Inspyrobot'
# Context length used to build the model keys.
n = 4


def q(x):
    """Return the model key for ``x``: its last ``n`` lower-cased words as a tuple.

    Shorter texts are left-padded with ''. Raises IndexError when ``x`` has no words.
    """
    tokens = x.lower().split()
    padded_grams = ngrams(tokens, n, pad_left=True, pad_right=False,  left_pad_symbol='')
    return list(padded_grams)[-1]

def _load_double_encoded_json(path):
    """Read a file written as ``json.dump(json.dumps(obj), f)`` and return ``obj``."""
    with open(path, 'r') as f:
        return json.loads(json.load(f))


def _sample_next_word(successors):
    """Draw one key of ``successors`` with probability proportional to its value."""
    return random.choices(list(successors.keys()), weights=list(successors.values()))[0]


def inspyre(text=''):
    """Print a generated quote that starts with ``text``.

    Parameters
    ----------
    text : str, optional
        Opening words of the quote. When empty or whitespace-only, a random
        opening word is taken from ``vocab.json``.

    The model is read from ``{DIR}/model.json`` on every call. Words are appended
    until the end marker is drawn or the last ``n`` words form a context the model
    has never seen. Nothing is returned; the quote is printed.
    """
    import ast  # local import keeps this cell runnable on its own

    stored = _load_double_encoded_json(f'{DIR}/model.json')
    # Keys were saved as str(tuple). literal_eval rebuilds them but only accepts Python
    # literals, so a tampered model file cannot run code the way eval() would.
    model = {ast.literal_eval(key): successors for key, successors in stored.items()}

    # Whitespace-only text has no words to build a key from, so treat it like ''.
    if not text or not text.strip():
        vocab = _load_double_encoded_json(f'{DIR}/vocab.json')
        text = str(random.choice(vocab))

    while True:
        # .get(): user-supplied text may end in a context that never occurred in training.
        successors = model.get(q(text))
        if not successors:
            break
        # Proportional draw. The earlier loop compared each word's own probability with
        # the random number (the accumulator was assigned, never summed) and finished the
        # quote whenever 'endquote' was merely present among the candidates.
        word = _sample_next_word(successors)
        # 'endquote' is the end-of-quote marker; '' is the padding that follows it.
        if word in ('endquote', ''):
            break
        text += ' ' + word

    print('Inspyrobot:', end = ' ')
    print(' '.join(text.split()))

In [39]:
# Generate and print one quote; with no argument the opening word is chosen at random.
inspyre()

Inspyrobot: the only thing that can save this country
