## Diabetes Chatbot using Deep Learning and Natural Language Processing using Retrieval Based - Deep Neural Networks
### Wanjiru Catherine
### COM/0027/2015

In [61]:
#things we need for NLP
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

In [62]:
#things we need for tensorflow
import numpy as np
import tflearn
import tensorflow as tf
import random

In [63]:
print(tf.__version__)

1.13.1


In [64]:
# Load the chatbot intents file (each intent carries a tag, patterns and responses).
import json
# JSON text is UTF-8 by specification; pin the encoding so loading does not
# depend on the platform's default locale encoding.
with open('intents_data.json', encoding='utf-8') as json_data:
    intents = json.load(json_data)

In [66]:
# Build three corpus structures from the intents file:
#   documents - one (token list, tag) pair per training pattern
#   words     - sorted, de-duplicated vocabulary of lower-cased stems
#   classes   - sorted list of the intent tags that have at least one pattern
ignore_words = ['?']
documents = []
raw_tokens = []
seen_tags = set()

for intent in intents['intents']:
    for pattern in intent['patterns']:
        # split the pattern sentence into individual tokens
        tokens = nltk.word_tokenize(pattern)
        raw_tokens += tokens
        # remember which tag this tokenized pattern belongs to
        documents.append((tokens, intent['tag']))
        seen_tags.add(intent['tag'])

# drop ignored tokens, then lower-case + stem; the set removes duplicates
words = sorted({stemmer.stem(token.lower())
                for token in raw_tokens
                if token not in ignore_words})

classes = sorted(seen_tags)

# summary of what was extracted
print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique stemmed words", words)

72 documents
17 classes ['Type 1', 'Type 1 Causes', 'Type 1 Prevention', 'Type 1 signs and symptoms', 'Type 1 treatment ', 'Type 2', 'Type 2 Causes', 'Type 2 Signs and Symptoms', 'Type 2 Treatment', 'defination', 'diabetes signs and symptoms', 'diabetes types', 'diet', 'diseases', 'goodbye', 'greeting', 'thanks']
47 unique stemmed words ["'s", '1', '2', 'a', 'about', 'and', 'anyon', 'ar', 'avoid', 'bye', 'caus', 'cur', 'day', 'defin', 'diabet', 'diet', 'do', 'good', 'goodby', 'hello', 'help', 'hi', 'how', 'insulin-dependent', 'is', 'juvinil', 'know', 'lat', 'me', 'melit', 'mellit', 'of', 'paty', 'prev', 'see', 'sign', 'symptom', 'tel', 'thank', 'that', 'the', 'ther', 'to', 'tre', 'typ', 'what', 'you']


In [58]:
# Create the training data: one [bag-of-words, one-hot tag] pair per document.
training = []
output = []  # not used below; kept so the cell still defines the same names

# template for the one-hot output row (one slot per class)
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for pattern_tokens, tag in documents:
    # Stem the pattern's tokens the same way the vocabulary in `words` was
    # stemmed; a set makes each vocabulary lookup below O(1).
    pattern_stems = {stemmer.stem(token.lower()) for token in pattern_tokens}

    # 1 where the vocabulary word occurs in this pattern, 0 otherwise
    bag = [1 if w in pattern_stems else 0 for w in words]

    # output is a '0' for each tag and '1' for the current tag
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1

    training.append([bag, output_row])

# shuffle so that consecutive samples are not grouped by intent
random.shuffle(training)

# Split features and labels straight from the shuffled list. Slicing a NumPy
# array for this is fragile: `bag` and `output_row` have different lengths, and
# np.array() on such ragged nested lists raises ValueError on NumPy >= 1.24.
train_x = [bag for bag, _ in training]
train_y = [output_row for _, output_row in training]

# Keep `training` available as an array as before; dtype=object is required
# for the ragged [bag, output_row] rows.
training = np.array(training, dtype=object)

In [72]:
len(train_x[0])

47

In [60]:
# Reset TensorFlow's default graph so that re-running this cell does not stack
# a second network on top of the one built by a previous run.
tf.reset_default_graph()
# Build the neural network.
# Input width = length of one bag-of-words vector (the vocabulary size).
net = tflearn.input_data(shape=[None, len(train_x[0])])
# Two fully connected hidden layers with 8 units each.
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
# Output layer: one unit per intent class, softmax so the outputs form a
# probability distribution over the classes.
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
# Attach the training op; optimizer and loss are left at tflearn's defaults
# (the training log of this run reports Adam).
net = tflearn.regression(net)

# Define the model and write TensorBoard logs under 'tflearn_logs'.
model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
# Train for 1000 epochs in mini-batches of 8. No validation_set is passed, so
# the accuracy shown by show_metric is measured on the training data itself.
model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)
# Persist the trained model under the 'model.tflearn' checkpoint name.
model.save('model.tflearn')

Training Step: 8999  | total loss: [1m[32m0.24875[0m[0m | time: 0.043s
| Adam | epoch: 1000 | loss: 0.24875 - acc: 0.8638 -- iter: 64/72
Training Step: 9000  | total loss: [1m[32m0.25700[0m[0m | time: 0.047s
| Adam | epoch: 1000 | loss: 0.25700 - acc: 0.8638 -- iter: 72/72
--
INFO:tensorflow:/home/shiru/PersonalProjects/DiabetesChatbot/model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


Change the data from documents of words into tensors of numbers.

In [9]:
def clean_up_sentence(sentence):
    """Tokenize a sentence and normalise every token.

    The sentence is split with nltk's word tokenizer; each token is then
    lower-cased and reduced to its stem with the notebook's `stemmer`.

    Returns:
        list of stemmed, lower-cased tokens in their original order.
    """
    return [stemmer.stem(token.lower()) for token in nltk.word_tokenize(sentence)]

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=False):
    """Encode `sentence` as a bag-of-words vector over the vocabulary `words`.

    Args:
        sentence: raw text to encode.
        words: vocabulary list; position i of the result corresponds to words[i].
        show_details: when True, print every vocabulary word that was matched.

    Returns:
        np.array of 0/1 values with one entry per vocabulary word; 1 means the
        word's stem occurs in the sentence.
    """
    # normalise the sentence the same way the vocabulary was built
    stems = clean_up_sentence(sentence)
    vector = [0] * len(words)
    for stem in stems:
        for position, vocab_word in enumerate(words):
            if vocab_word != stem:
                continue
            vector[position] = 1
            if show_details:
                print ("found in bag: %s" % vocab_word)

    return np.array(vector)

In [10]:
clean_up_sentence("My name is Catherine and i loe beansee and fiere")

['my', 'nam', 'is', 'catherin', 'and', 'i', 'loe', 'beans', 'and', 'fier']

In [11]:
p = bow("What are signs and symptoms of diabetes?", words)
print (p)
print (classes)

[0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1
 0 0 0 0 0 0 0 0 1 0]
['Type 1', 'Type 1 Causes', 'Type 1 Prevention', 'Type 1 signs and symptoms', 'Type 1 treatment ', 'Type 2', 'Type 2 Causes', 'Type 2 Signs and Symptoms', 'Type 2 Treatment', 'defination', 'diabetes signs and symptoms', 'diabetes types', 'diet', 'diseases', 'goodbye', 'greeting', 'thanks']


In [12]:
print(model.predict([p]))

[[2.6186922e-34 4.4254638e-19 1.8785139e-24 5.7421467e-04 1.7229289e-22
  2.6637251e-10 1.6310716e-03 4.1963092e-05 5.5128661e-15 1.3249927e-14
  3.3068269e-01 3.4211218e-01 2.3972406e-09 3.2495725e-01 5.4032080e-07
  5.6904131e-15 8.7632392e-13]]


In [73]:
# Per-user conversation state: maps a userID to the 'context_set' value of the
# most recent matched intent that defines one (written by response()).
context = {}

# Minimum predicted probability an intent must exceed for classify() to keep
# it; predictions at or below this value are discarded.
ERROR_THRESHOLD = 0.25
def classify(sentence):
    """Predict which intents `sentence` most likely belongs to.

    Returns:
        list of (intent tag, probability) tuples for every class whose
        predicted probability is above ERROR_THRESHOLD, ordered from most to
        least probable. The list is empty when no class clears the threshold.
    """
    # class probabilities for the sentence's bag-of-words vector
    probabilities = model.predict([bow(sentence, words)])[0]
    # keep only the confident predictions, remembering their class index
    confident = [
        (class_index, probability)
        for class_index, probability in enumerate(probabilities)
        if probability > ERROR_THRESHOLD
    ]
    # strongest prediction first
    ranked = sorted(confident, key=lambda pair: pair[1], reverse=True)
    return [(classes[class_index], probability) for class_index, probability in ranked]

def response(sentence, userID='123', show_details=False):
    """Print a reply for `sentence` chosen from the best applicable intent.

    The predictions from classify() are tried from most to least probable.
    For each one, the intents with the same tag are examined:
      * an intent with a 'context_set' key stores that value in
        `context[userID]`;
      * an intent applies if it has no 'context_filter', or if its
        'context_filter' equals the context currently stored for this user.
    A random entry of the first applicable intent's 'responses' is printed.

    Args:
        sentence: the user's message.
        userID: key under which conversation context is stored.
        show_details: when True, also print the context being set and the
            tag that was matched.

    Returns:
        None. The reply is printed, not returned; nothing is printed when no
        prediction yields an applicable intent.
    """
    for prediction in classify(sentence):
        predicted_tag = prediction[0]
        for intent in intents['intents']:
            if intent['tag'] != predicted_tag:
                continue

            # record the conversation context this intent establishes, if any
            if 'context_set' in intent:
                if show_details:
                    print ('context:', intent['context_set'])
                context[userID] = intent['context_set']

            # a contextual intent only applies when the user's stored context matches
            applies = (
                'context_filter' not in intent
                or (userID in context and intent['context_filter'] == context[userID])
            )
            if applies:
                if show_details:
                    print ('tag:', intent['tag'])
                # a random response from the intent
                print(random.choice(intent['responses']))
                return None
    return None

In [77]:
classify('type 1')

[('Type 1', 0.9371673)]

In [78]:
response('what is diabetes')

Diabetes is a chronic (long-lasting) health condition that affects how your body turns food into energy. 


In [79]:
response('signs and symptoms?')

Diabetes dramatically increases the risk of various cardiovascular problems, including coronary artery disease with chest pain (angina), heart attack, stroke and narrowing of arteries (atherosclerosis). If you have diabetes, you're more likely to have heart disease or stroke. Nerve damage (neuropathy)


In [82]:
response('Do you know diabetes?')

 Diabetes is a chronic condition associated with abnormally high levels of sugar (glucose) in the blood.
