## Building a chatbot

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial
import numpy as np
import nltk, string

In [2]:
# Example user question used to walk through the preprocessing steps below.
query = "I want to learn about geometry algorithms"
# Step 1: lower-case the text.
print(query.lower())

i want to learn about geometry algorithms


In [3]:
# Translation table for str.translate(): every character in string.punctuation
# is mapped (by code point) to None, which makes translate() delete it.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

In [4]:
# Step 2: translate() removes every character that remove_punctuation_map maps to None.
print(query.lower().translate(remove_punctuation_map))

i want to learn about geometry algorithms


In [5]:
# Step 3: split the lower-cased, punctuation-free text into word tokens with NLTK.
print(nltk.word_tokenize(query.lower().translate(remove_punctuation_map)))

['i', 'want', 'to', 'learn', 'about', 'geometry', 'algorithms']


In [6]:
# Single Porter stemmer instance shared by every call to stem_tokens().
stemmer = nltk.stem.porter.PorterStemmer()
def stem_tokens(tokens):
    """Return a list holding the Porter stem of each token in ``tokens``.

    The order of the input tokens is preserved.
    """
    return list(map(stemmer.stem, tokens))

In [7]:
# Step 4: reduce every token to its Porter stem (e.g. "algorithms" -> "algorithm").
print(stem_tokens(nltk.word_tokenize(query.lower().translate(remove_punctuation_map))))

['i', 'want', 'to', 'learn', 'about', 'geometri', 'algorithm']


In [36]:
def normalize(text):
    """Turn raw ``text`` into a list of stemmed word tokens.

    The text is lower-cased, stripped of punctuation, tokenized with NLTK
    and finally stemmed with ``stem_tokens``.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    words = nltk.word_tokenize(without_punctuation)
    return stem_tokens(words)

## Text vectorization

In [44]:
# TF-IDF vectorizer restricted to unigrams (ngram_range=(1, 1)). Text is split into
# terms by normalize(), and scikit-learn's built-in English stop-word list is used.
vctrz = TfidfVectorizer(ngram_range = (1,1), tokenizer=normalize, stop_words = 'english')

In [45]:
print(vctrz)

TfidfVectorizer(stop_words='english',
                tokenizer=<function normalize at 0x000001B3002B74C0>)


In [46]:
# Corpus the chatbot searches: one summary string per book chapter.
alldocuments = [
    'Chapter 1. The algorithmic approach to problem solving, including Galileo and baseball.',
    'Chapter 2. Algorithms in history, including magic squares, Russian peasant multiplication, and Egyptian methods.',
    'Chapter 3. Optimization, including maximization, minimization, and the gradient ascent algorithm.',
    'Chapter 4. Sorting and searching, including merge sort, and algorithm runtime.',
    'Chapter 5. Pure math, including algorithms for continued fractions and random numbers and other mathematical ideas.',
    'Chapter 6. More advanced optimization, including simulated annealing and how to use it to solve the traveling salesman problem.',
    'Chapter 7. Geometry, the postmaster problem, and Voronoi triangulations.',
    'Chapter 8. Language, including how to insert spaces and predict phrase completions.',
    'Chapter 9. Machine learning, focused on decision trees and how to predict happiness and heart attacks.',
    'Chapter 10. Artificial intelligence, and using the minimax algorithm to win at dots and boxes.',
    'Chapter 11. Where to go and what to study next, and how to build a chatbot.',
]


In [47]:
alldocuments

['Chapter 1. The algorithmic approach to problem solving, including Galileo and baseball.',
 'Chapter 2. Algorithms in history, including magic squares, Russian peasant multiplication, and Egyptian methods.',
 'Chapter 3. Optimization, including maximization, minimization, and the gradient ascent algorithm.',
 'Chapter 4. Sorting and searching, including merge sort, and algorithm runtime.',
 'Chapter 5. Pure math, including algorithms for continued fractions and random numbers and other mathematical ideas.',
 'Chapter 6. More advanced optimization, including simulated annealing and how to use it to solve the traveling salesman problem.',
 'Chapter 7. Geometry, the postmaster problem, and Voronoi triangulations.',
 'Chapter 8. Language, including how to insert spaces and predict phrase completions.',
 'Chapter 9. Machine learning, focused on decision trees and how to predict happiness and heart attacks.',
 'Chapter 10. Artificial intelligence, and using the minimax algorithm to win at d

In [48]:
# Fit the vectorizer on the chapter summaries (learns the vocabulary and IDF weights).
vctrz.fit(alldocuments)

In [49]:
# New example question to match against the chapter summaries.
query = 'I want to read about how to search for items.'
# Vectorize the corpus and the question with the same fitted vectorizer so both
# share one vocabulary; todense() converts the sparse results to dense matrices.
tfidf_reports = vctrz.transform(alldocuments).todense()
tfidf_question = vctrz.transform([query]).todense()

In [51]:
tfidf_reports

matrix([[0.40888456, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.22540334, 0.        , 0.40888456,
         0.        , 0.        , 0.        , 0.40888456, 0.        ,
         0.        , 0.14646124, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.40888456, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.20584617, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.30736536, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.34949964, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [52]:
tfidf_question

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

## Vector similarity

In [53]:
# Cosine similarity (1 - cosine distance) between the question vector and each
# document row, in the same order as the rows of tfidf_reports.
row_similarities = [
    1 - spatial.distance.cosine(np.array(report_row).flatten(), np.array(tfidf_question).flatten())
    for report_row in tfidf_reports
]

In [54]:
row_similarities

[0.0, 0.0, 0.0, 0.3393118510377361, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [55]:
# np.argmax returns the position of the highest similarity; print the document at that position.
print(alldocuments[np.argmax(row_similarities)])

Chapter 4. Sorting and searching, including merge sort, and algorithm runtime.


## Create chatbot

In [58]:
def chatbot(query, allreports):
    """Return the entry of ``allreports`` that best matches ``query``.

    A TF-IDF vectorizer (unigrams, ``normalize`` as tokenizer, English stop
    words) is fitted on ``allreports``. The query is projected into the same
    vector space and compared with every report using cosine similarity.

    Parameters
    ----------
    query : str
        Free-text question from the user.
    allreports : list of str
        Candidate documents. The vectorizer is fitted on this list and the
        answer is looked up in it.

    Returns
    -------
    str
        The element of ``allreports`` with the highest cosine similarity to
        ``query``. When no report shares a term with the query (every
        similarity is 0) the first element is returned.
    """
    clf = TfidfVectorizer(ngram_range=(1,1), tokenizer=normalize, stop_words='english')
    # Vectorize the corpus that was passed in rather than the notebook-level
    # `alldocuments` global, so the function works for any list of reports.
    tfidf_reports = clf.fit_transform(allreports).toarray()
    tfidf_question = clf.transform([query]).toarray().ravel()
    row_similarities = [
        1 - spatial.distance.cosine(report_vector, tfidf_question)
        for report_vector in tfidf_reports
    ]
    # cosine() can produce NaN when either vector is all zeros (e.g. the query
    # contains no known term). np.argmax treats NaN as the maximum, so map NaN
    # to 0 to keep an empty vector from being chosen as the "best" match.
    row_similarities = np.nan_to_num(row_similarities)
    return allreports[np.argmax(row_similarities)]

In [59]:
# Try the chatbot end to end with a sample question about the chapter summaries.
print(chatbot('Please tell me which chapter I can go to if I want to read about mathematics algorithms.', alldocuments))

Chapter 5. Pure math, including algorithms for continued fractions and random numbers and other mathematical ideas.
