# Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
import re
from nltk import ngrams
from collections import defaultdict

In [3]:
import ast
import json
import os
import random

In [4]:
# Show full quote text instead of truncating long cells.
# `None` means "no limit" on current pandas; the old `-1` spelling is
# deprecated there, so it is only used as a fallback for releases that
# still reject `None`.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Data

In [5]:
def read_quote_csvs(root):
    """Read every file found under `root` (recursively) as a CSV.

    Parameters
    ----------
    root : str
        Directory to walk with `os.walk`.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, in the order `os.walk` reports them. Empty when
        `root` does not exist or contains no files.
    """
    frames = []
    for folder_name, _subfolders, filenames in os.walk(root):
        for filename in filenames:
            # Join with the folder being walked, not with `root`, so files
            # that live in subdirectories are resolved correctly.
            frames.append(pd.read_csv(os.path.join(folder_name, filename)))
    return frames


quote_frames = read_quote_csvs('E:/Scrapped-Data/Quotes-Goodreads/')
# Concatenate once instead of once per file. Last-read file first, with an
# empty 'Quote' frame at the end, matches prepending each file inside the
# loop and still yields an (empty) frame when no file was found.
data = pd.concat(quote_frames[::-1] + [pd.DataFrame(columns=['Quote'])], axis=0)
print(len(data))

92970


In [6]:
# Preview the first five raw rows.
data.head(n=5)

Unnamed: 0,Quote
0,"“I love deadlines. I love the whooshing noise they make as they go by.” ― Douglas Adams, The Salmon of Doubt //"
1,"“There is no greater agony than bearing an untold story inside you.” ― Maya Angelou, I Know Why the Caged Bird Sings //"
2,"“What really knocks me out is a book that, when you're all done reading it, you wish the author that wrote it was a terrific friend of yours and you could call him up on the phone whenever you felt like it. That doesn't happen much, though.” ― J.D. Salinger, The Catcher in the Rye //"
3,"“If there's a book that you want to read, but it hasn't been written yet, then you must write it.” ― Toni Morrison"
4,“There is nothing to writing. All you do is sit down at a typewriter and bleed.” ― Ernest Hemingway


**Cleaning**

In [7]:
def clean(x):
    """Return the quote text without its attribution and curly quote marks.

    Everything from the first '―' onwards is dropped. Surrounding whitespace
    is then trimmed and one leading '“' and one trailing '”' are removed when
    present, so rows without an attribution or without the usual trailing
    space do not lose characters of the quote itself.

    Parameters
    ----------
    x : str
        Raw scraped text such as '“Some quote.” ― Author, Title //'.

    Returns
    -------
    str
        The bare quote text.
    """
    quote = x.split('―')[0].strip()
    if quote.startswith('“'):
        quote = quote[1:]
    if quote.endswith('”'):
        quote = quote[:-1]
    return quote

In [8]:
# Strip attribution/quote marks from every row, then drop exact duplicates
# and renumber the rows from zero.
data = (
    data.assign(Quote=data['Quote'].apply(clean))
        .drop_duplicates()
        .reset_index(drop=True)
)
print(len(data))

73624


In [9]:
# Preview the first five cleaned rows.
data.head(n=5)

Unnamed: 0,Quote
0,I love deadlines. I love the whooshing noise they make as they go by.
1,There is no greater agony than bearing an untold story inside you.
2,"What really knocks me out is a book that, when you're all done reading it, you wish the author that wrote it was a terrific friend of yours and you could call him up on the phone whenever you felt like it. That doesn't happen much, though."
3,"If there's a book that you want to read, but it hasn't been written yet, then you must write it."
4,There is nothing to writing. All you do is sit down at a typewriter and bleed.


# More Data

In [10]:
# Read every file under the BrainyQuote folder. Each path is joined with the
# folder currently being walked so files in subdirectories resolve correctly.
brainy_frames = [
    pd.read_csv(os.path.join(folder_name, filename))
    for folder_name, _subfolders, filenames in os.walk('E:/Scrapped-Data/BrainyQuote/')
    for filename in filenames
]
# One concat instead of one per file; last-read file first, existing rows
# last, matches prepending each file inside the loop.
data = pd.concat(brainy_frames[::-1] + [data], axis=0)
print(len(data))

82204


In [11]:
# Remove rows duplicated across the two sources and renumber from zero.
data = data.drop_duplicates().reset_index(drop=True)
print(len(data))

81722


**Preprocess**

In [12]:
# Delete every character that is not a word character or whitespace (and
# underscores), then append the 'endquote' marker to each quote.
punctuation = re.compile(r'[^\w\s]|_')
data['Quote'] = data['Quote'].apply(lambda quote: punctuation.sub('', quote) + ' endquote')

In [13]:
# Quotes that differed only in punctuation are now identical; drop them and
# renumber from zero.
data = data.drop_duplicates().reset_index(drop=True)
print(len(data))

81577


# Training

In [14]:
# First whitespace-separated token of every quote (duplicates kept).
vocab = [quote.split()[0] for quote in data['Quote']]

In [15]:
# Context length: each word is predicted from the n words before it.
n = 4
# model[context][next_word] -> count, later normalised to a probability.
model = defaultdict(lambda: defaultdict(int))

# Collect every (n+1)-gram of every lower-cased quote, padding both ends
# with '' so the first and last words also get full-length windows.
word_grams = []
for row in range(len(data)):
    tokens = data['Quote'][row].lower().split()
    word_grams.extend(ngrams(tokens, n + 1,
                             pad_left=True, pad_right=True,
                             left_pad_symbol='', right_pad_symbol=''))

# Count how often each word follows each n-word context.
for *context, following in word_grams:
    model[tuple(context)][following] += 1

# Turn the counts of each context into relative frequencies.
for followers in model.values():
    total_count = float(sum(followers.values()))
    for word in followers:
        followers[word] /= total_count

# Plain dict on the outside so unseen contexts are not silently created.
model = dict(model)

In [16]:
def q(x):
    """Return the model context for `x`: its last `n` lower-cased tokens.

    The tuple is left-padded with '' when `x` has fewer than `n` tokens, which
    is the same tuple the last left-padded n-gram of `x` would be. Text with no
    tokens yields `n` empty strings (the all-padding context) instead of
    raising IndexError.

    Parameters
    ----------
    x : str
        Text typed so far.

    Returns
    -------
    tuple of str
        Exactly `n` items.
    """
    tokens = x.lower().split()
    # Only the tail is needed, so slice it directly rather than building
    # every n-gram of the text.
    return tuple(([''] * n + tokens)[-n:])

In [17]:
# Pick a random opening word and show its context tuple together with the
# next-word probabilities stored for that context.
text = str(random.choice(vocab))
print(f'Me: {text}')
query = q(text)
print(query, dict(model[query]), sep='\n')

Me: Love
('', '', '', 'love')
{'is': 0.44835680751173707, 'yourself': 0.015258215962441314, 'who': 0.005868544600938967, 'friendship': 0.0011737089201877935, 'and': 0.02699530516431925, 'the': 0.015258215962441314, 'hurtsthink': 0.0011737089201877935, 'of': 0.0011737089201877935, 'only': 0.002347417840375587, 'begets': 0.002347417840375587, 'taught': 0.0011737089201877935, 'between': 0.0011737089201877935, 'consists': 0.004694835680751174, 'as': 0.009389671361502348, 'will': 0.009389671361502348, 'keeps': 0.0011737089201877935, 'can': 0.011737089201877934, 'isnt': 0.009389671361502348, 'begins': 0.002347417840375587, 'truth': 0.0011737089201877935, 'doesnt': 0.018779342723004695, 'speaks': 0.0011737089201877935, 'without': 0.005868544600938967, 'does': 0.011737089201877934, 'wasnt': 0.002347417840375587, 'cost': 0.0011737089201877935, 'he': 0.002347417840375587, 'life': 0.002347417840375587, 'goes': 0.0035211267605633804, 'time': 0.0011737089201877935, 'comes': 0.0035211267605633804, '

In [18]:
# Start from a random opening word and extend it one word at a time.
text = str(random.choice(vocab))
print('Me:', text)
sentence_finished = False

while not sentence_finished:
    # Next-word probabilities for the current context; an unseen context has
    # nothing to sample from, so generation simply stops.
    distribution = model.get(q(text))
    if not distribution:
        break
    # Draw the next word in proportion to its probability.
    word = random.choices(list(distribution), weights=list(distribution.values()))[0]
    # 'endquote' is the end-of-quote marker and '' is padding: both end the
    # quote and are never appended to the text.
    if word in ('endquote', ''):
        sentence_finished = True
    else:
        text += ' ' + word
print('Inspyrobot:', end=' ')
print(' '.join(text.split()))

Me: I
Inspyrobot: I am a teacher and i think when you make mistakes and you recover from them and you treat them as valuable learning experiences then youve got something to share


# Saving and Reloading the Model (JSON)

In [19]:
# Folder the model and vocabulary files are written to.
DIR = 'E:/Models/Inspyrobot-Probabilistic-v2'
# Create missing parent folders too, and do not fail when the folder already
# exists, so this cell can be re-run.
os.makedirs(DIR, exist_ok=True)

In [20]:
# JSON object keys must be strings, so each context tuple is stored as its
# str() form. The payload is JSON-encoded twice (a JSON string containing the
# JSON object), which is the layout the loading code expects.
with open(f'{DIR}/model.json', 'w') as f:
    stringified = {str(context): followers for context, followers in model.items()}
    json.dump(json.dumps(stringified), f)

In [21]:
# Reload the model. The file holds a JSON string that itself contains the
# JSON object, so it is decoded twice. A dedicated name is used for the
# intermediate string so the `data` DataFrame is not overwritten.
with open(f'{DIR}/model.json', 'r') as f:
    encoded_model = json.load(f)
dic = json.loads(encoded_model)
# Keys were saved as str(tuple). literal_eval rebuilds the tuples but only
# accepts Python literals, so file contents are never executed as code
# (which plain eval would do).
model = {ast.literal_eval(key): followers for key, followers in dic.items()}

In [22]:
# Store the opening-word list, JSON-encoded twice to match the model file.
with open(f'{DIR}/vocab.json', 'w') as vocab_file:
    encoded_vocab = json.dumps(vocab)
    json.dump(encoded_vocab, vocab_file)

In [23]:
# Read the opening-word list back; the file content is decoded twice because
# it was encoded twice when written.
with open(f'{DIR}/vocab.json', 'r') as vocab_file:
    encoded_vocab = json.load(vocab_file)
vocab = json.loads(encoded_vocab)

In [24]:
# Same generation as before, now driven by the model and vocabulary that
# were read back from disk.
text = str(random.choice(vocab))
print('Me:', text)
sentence_finished = False

while not sentence_finished:
    # Next-word probabilities for the current context; an unseen context has
    # nothing to sample from, so generation simply stops.
    distribution = model.get(q(text))
    if not distribution:
        break
    # Draw the next word in proportion to its probability.
    word = random.choices(list(distribution), weights=list(distribution.values()))[0]
    # 'endquote' is the end-of-quote marker and '' is padding: both end the
    # quote and are never appended to the text.
    if word in ('endquote', ''):
        sentence_finished = True
    else:
        text += ' ' + word
print('Inspyrobot:', end=' ')
print(' '.join(text.split()))

Me: Yesterday
Inspyrobot: Yesterday is not ours to recover but tomorrow is ours to win or lose


# Wrapper Functions

In [25]:
import json, random
from nltk import ngrams

# Folder holding model.json and vocab.json.
DIR = 'E:/Models/Inspyrobot-Probabilistic-v2'
# Number of preceding words that make up a model context.
n = 4


def q(x):
    """Return the model context for `x`: its last `n` lower-cased tokens.

    The tuple is left-padded with '' when `x` has fewer than `n` tokens, which
    is the same tuple the last left-padded n-gram of `x` would be. Text with no
    tokens yields `n` empty strings (the all-padding context) instead of
    raising IndexError.

    Parameters
    ----------
    x : str
        Text typed so far.

    Returns
    -------
    tuple of str
        Exactly `n` items.
    """
    tokens = x.lower().split()
    # Only the tail is needed, so slice it directly rather than building
    # every n-gram of the text.
    return tuple(([''] * n + tokens)[-n:])

def _load_model():
    """Read `model.json` from DIR and rebuild the tuple-keyed model dict."""
    # Imported here so this cell does not depend on imports made elsewhere.
    import ast

    with open(f'{DIR}/model.json', 'r') as f:
        # The file holds a JSON string that itself contains the JSON object,
        # hence the two decoding steps.
        dic = json.loads(json.load(f))
    # Keys were saved as str(tuple). literal_eval rebuilds the tuples but only
    # accepts Python literals, so file contents are never executed as code
    # (which plain eval would do).
    return {ast.literal_eval(key): followers for key, followers in dic.items()}


def _load_vocab():
    """Read the list of opening words from `vocab.json` in DIR."""
    with open(f'{DIR}/vocab.json', 'r') as f:
        return json.loads(json.load(f))


def inspyre(text=''):
    """Print a generated quote that starts with `text`.

    Parameters
    ----------
    text : str, optional
        Opening words. When empty or only whitespace, a random opening word
        is drawn from the saved vocabulary.

    Returns
    -------
    None
        The quote is printed, prefixed with 'Inspyrobot:'.

    Notes
    -----
    The model is read from disk on every call. If the current context is not
    in the model (for example, unseen opening words), generation stops and the
    text collected so far is printed.
    """
    model = _load_model()
    if not text.strip():
        text = str(random.choice(_load_vocab()))
    while True:
        # Next-word probabilities for the current context.
        distribution = model.get(q(text))
        if not distribution:
            break
        # Draw the next word in proportion to its probability.
        word = random.choices(list(distribution), weights=list(distribution.values()))[0]
        # 'endquote' is the end-of-quote marker and '' is padding: both end
        # the quote and are never appended to the text.
        if word in ('endquote', ''):
            break
        text += ' ' + word
    print('Inspyrobot:', end=' ')
    print(' '.join(text.split()))

In [26]:
# No seed text: the opening word is drawn from the saved vocabulary.
inspyre(text='')

Inspyrobot: Do not wait the time will never be just right start where you stand and work with whatever tools you may have at your command and better tools will be found as you go along
