In [1]:
import tensorflow as tf   
import numpy as np
import json
import random
import tflearn
import nltk
nltk.download("punkt")
from nltk.stem.lancaster import LancasterStemmer

Instructions for updating:
non-resource variables are not supported in the long term
curses is not supported on this machine (please install/reinstall curses for an optimal experience)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhaol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Load the corpus of questions.
# JSON files are UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default (which is not UTF-8 on Windows).
with open("questions.json", encoding="utf-8") as json_corpus:
    questions = json.load(json_corpus)

In [17]:
# Display the first 5 questions.
# Slicing (rather than indexing 0..4) also works when the corpus holds
# fewer than 5 entries, where indexing would raise IndexError.
for entry in questions["questions"][:5]:
    print(entry)

{'tag': 'greeting', 'patterns': ['Hi', 'How are you', 'Is anyone there?', 'Hello', 'Good day'], 'responses': ['Hello, thanks for visiting', 'Good to see you again', 'Hi there, how can I help?'], 'context_set': ''}
{'tag': 'goodbye', 'patterns': ['Bye', 'See you later', 'Goodbye'], 'responses': ['See you later, thanks for visiting', 'Have a nice day', 'Bye! Come back again soon.']}
{'tag': 'thanks', 'patterns': ['Thanks', 'Thank you', "That's helpful"], 'responses': ['Happy to help!', 'Any time!', 'My pleasure']}
{'tag': 'hours', 'patterns': ['What hours are you open?', 'What are your hours?', 'When are you open?', 'When is the time to contact ?', 'At what time do you provide services ?'], 'responses': ["We're open every day from 9AM to 9PM", 'Our working hours are 9AM to 9PM every day']}
{'tag': 'location', 'patterns': ['What is your location?', 'Where are you located?', 'What is your address?', 'Where is your company situated?', 'How can we contact you ?', 'How can I contact you?'], '

In [23]:
# Accumulators filled while scanning the corpus
tags = []        # distinct tags, in first-seen order
documents = []   # one (token list, tag) pair per pattern
words = []       # every token of every pattern, duplicates included

# Visit every pattern of every entry in questions["questions"]
for question in questions["questions"]:
    for pattern in question["patterns"]:

        # Split the pattern into tokens with NLTK's word tokenizer
        tokens = nltk.word_tokenize(pattern)

        # Collect the tokens into the flat word list
        words += tokens

        # Remember which tag this tokenized pattern belongs to
        tag = question["tag"]
        documents.append((tokens, tag))

        # Record the tag the first time it is seen
        if tag not in tags:
            tags.append(tag)

In [24]:
# Preview the collections built above: the first 5 documents,
# the first 5 tags and the first 10 words, one labelled line each.
for label, preview in (
    ("documents: ", documents[:5]),
    ("tags: ", tags[:5]),
    ("words: ", words[:10]),
):
    print(label, preview)

documents:  [(['Hi'], 'greeting'), (['How', 'are', 'you'], 'greeting'), (['Is', 'anyone', 'there', '?'], 'greeting'), (['Hello'], 'greeting'), (['Good', 'day'], 'greeting')]
tags:  ['greeting', 'goodbye', 'thanks', 'hours', 'location']
words:  ['Hi', 'How', 'are', 'you', 'Is', 'anyone', 'there', '?', 'Hello', 'Good']


In [26]:
# Lower-case each word and stem it with the Lancaster stemmer
stemmer = LancasterStemmer()
# Punctuation tokens that are left out of the vocabulary
ignore = ["?"]

# Build the vocabulary from the raw tokens stored in `documents` instead of
# from `words` itself: `documents` is never modified, so re-running this cell
# always yields the same vocabulary rather than stemming already-stemmed
# entries a second time. The set comprehension also removes duplicates.
words = sorted({
    stemmer.stem(token.lower())
    for tokens, _tag in documents
    for token in tokens
    if token not in ignore
})
tags = sorted(set(tags))

# Print the sizes of the three collections
print("Length of Documents: ", len(documents))
print("Length of Tags: ", len(tags))
print("Length of Stemmed Words: ", len(words))

Length of Documents:  106
Length of Tags:  37
Length of Stemmed Words:  118


In [39]:
# Build the training set: one [bag-of-words, one-hot tag row] pair per document
train_data = []

# Template for the output row: one zero per tag
output_empty = [0 for _ in range(len(tags))]

for doc in documents:
    # Stem the tokenized pattern (doc[0]); a set makes the membership test
    # below constant-time per vocabulary word
    pattern_words = {stemmer.stem(word.lower()) for word in doc[0]}

    # Bag of words: 1 where the vocabulary word occurs in the pattern, else 0
    bag = [1 if w in pattern_words else 0 for w in words]

    # One-hot row: 1 at the index of this document's tag (doc[1]), 0 elsewhere
    output_row = list(output_empty)
    output_row[tags.index(doc[1])] = 1
    train_data.append([bag, output_row])

# Shuffle the pairs so inputs and labels stay aligned
random.shuffle(train_data)

# Split into inputs and labels straight from the list of pairs. The bag and
# the output row have different lengths, so the pairs are ragged; slicing a
# plain np.array(train_data) relied on NumPy silently building an object
# array, which is deprecated and rejected by newer NumPy releases.
train_x = [features for features, _ in train_data]
train_y = [label for _, label in train_data]

# Keep `train_data` available as an array, as before; dtype=object states the
# ragged layout explicitly.
train_data = np.array(train_data, dtype=object)

  train_data = np.array(train_data)


In [42]:
# Reset the global default graph before building the network
tf.compat.v1.reset_default_graph()

# Layer widths taken from the training data
n_inputs = len(train_x[0])
n_classes = len(train_y[0])

# Network: input layer, two fully connected layers of 10 units,
# a softmax output layer with one unit per tag, then the regression layer
nn = tflearn.input_data(shape=[None, n_inputs])
for _hidden_layer in range(2):
    nn = tflearn.fully_connected(nn, 10)
nn = tflearn.fully_connected(nn, n_classes, activation="softmax")
nn = tflearn.regression(nn)

# Wrap the network in a DNN model that logs to "tflearn_logs"
model = tflearn.DNN(nn, tensorboard_dir="tflearn_logs")

# Fit on train_x / train_y for 1000 epochs in batches of 8
model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)

# Persist the trained model
model.save("model.tflearn")

Training Step: 13999  | total loss: [1m[32m0.00047[0m[0m | time: 0.037s
| Adam | epoch: 1000 | loss: 0.00047 - acc: 1.0000 -- iter: 104/106
Training Step: 14000  | total loss: [1m[32m0.00050[0m[0m | time: 0.040s
| Adam | epoch: 1000 | loss: 0.00050 - acc: 1.0000 -- iter: 106/106
--
INFO:tensorflow:C:\Users\zhaol\JupyterNotebook\CST8507\model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


In [45]:
import pickle

# Serialize the vocabulary, tags and training lists into the binary file
# "training_data". The `with` block closes (and therefore flushes) the file
# once the dump is done; the bare open() call left the handle open.
with open("training_data", "wb") as training_file:
    pickle.dump(
        {"words": words, "tags": tags, "train_x": train_x, "train_y": train_y},
        training_file,
    )

In [47]:
# Read the saved data structures back from "training_data".
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# file that this notebook wrote itself.
# The `with` block closes the file handle that the bare open() call leaked.
with open("training_data", "rb") as training_file:
    data = pickle.load(training_file)
words = data['words']
tags = data['tags']
train_x = data['train_x']
train_y = data['train_y']

In [49]:
# Load the corpus of questions (needed by response() below).
# JSON files are UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default (which is not UTF-8 on Windows).
with open("questions.json", encoding="utf-8") as json_corpus:
    questions = json.load(json_corpus)

In [50]:
# Load the saved model from "./model.tflearn" into the existing `model` object.
# `model` must already be defined (it is created in the training cell).
model.load("./model.tflearn")

INFO:tensorflow:Restoring parameters from C:\Users\zhaol\JupyterNotebook\CST8507\model.tflearn


In [51]:
#Clean User Input
def clean_up_sentence(sentence):
    """Tokenize `sentence` and return its tokens lower-cased and stemmed.

    Uses NLTK's word tokenizer and the notebook-level `stemmer`.
    Returns a list with one stemmed string per token, in sentence order.
    """
    return [stemmer.stem(token.lower()) for token in nltk.word_tokenize(sentence)]

In [70]:
# Get bag of words array
def bow(sentence, words):
    """Encode `sentence` as a bag-of-words vector over the vocabulary `words`.

    Args:
        sentence: raw user input; it is tokenized and stemmed by
            clean_up_sentence().
        words: sequence of vocabulary entries; position i of the result
            corresponds to words[i].

    Returns:
        A NumPy array of len(words) integers: 1 where the vocabulary entry
        occurs among the sentence's stemmed tokens, 0 otherwise.
    """
    # A set gives constant-time membership tests, replacing the nested scan
    # of every token against every vocabulary entry.
    sentence_words = set(clean_up_sentence(sentence))

    return np.array([1 if w in sentence_words else 0 for w in words])

In [94]:
# Add some context to the conversation for better results.

# Maps a user ID to the latest 'context_set' value stored by response()
context = {} 

# A predicted probability must exceed this value for classify() to keep the tag
ERROR_THRESHOLD = 0.25

def classify(sentence):
    """Predict the tags that match `sentence`.

    The sentence is encoded with bow() over the notebook-level `words` and
    passed to `model.predict`. Only predictions above ERROR_THRESHOLD are kept.

    Returns:
        A list of (tag, probability) tuples ordered from the highest
        probability to the lowest; empty when nothing exceeds the threshold.
    """
    # Probabilities for the single input row, one per tag index
    probabilities = model.predict([bow(sentence, words)])[0]

    # Keep [index, probability] pairs that clear the threshold
    candidates = [
        [index, prob]
        for index, prob in enumerate(probabilities)
        if prob > ERROR_THRESHOLD
    ]

    # Strongest prediction first
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Translate tag indices into tag names
    return [(tags[index], prob) for index, prob in candidates]

def response(sentence, userID='123', show_details=False):
    """Print a randomly chosen answer for the best applicable tag of `sentence`.

    Args:
        sentence: raw user input, classified with classify().
        userID: key under which this user's conversation context is kept in
            the notebook-level `context` dict.
        show_details: when true, also print the probability, the context
            being set and the matched tag.

    Returns:
        None. The answer is printed; nothing is printed when no entry in
        `questions` applies.
    """
    # classify() returns candidates sorted by descending probability, so the
    # first applicable entry found is the strongest one.
    for tag, probability in classify(sentence):
        for intent in questions['questions']:
            if intent['tag'] != tag:
                continue

            if show_details:
                print('probability of answer:', probability)

            # An entry with a 'context_filter' applies only when this user's
            # stored context equals that filter.
            if 'context_filter' in intent and not (
                userID in context and intent['context_filter'] == context[userID]
            ):
                continue

            # Update the user's context only once the entry is known to
            # apply; setting it before the filter check let a rejected entry
            # overwrite the conversation state.
            if 'context_set' in intent:
                if show_details:
                    print('Context:', intent['context_set'])
                context[userID] = intent['context_set']

            if show_details:
                print('Tag:', intent['tag'])

            # Print one of the entry's responses, chosen at random
            print('Answer: ', random.choice(intent['responses']))
            return

In [97]:
%time
response("what are black hat hackers?", show_details=True)

CPU times: total: 0 ns
Wall time: 0 ns
probability of answer: 0.9997749
Tag: black hat hackers
Answer:  Black hat hackers are known for having vast knowledge about breaking into computer networks. They can write malware which can be used to gain access to these systems. This type of hackers misuse their skills to steal information or use the hacked system for malicious purpose.


In [102]:
%time
response("what is encryption?", show_details=True)

CPU times: total: 0 ns
Wall time: 0 ns
probability of answer: 0.99985397
Tag: encryption
Answer:  Encryption is a way of scrambling data so that only authorized parties can understand the information. In technical terms, it is the process of converting plaintext to ciphertext. In simpler terms, encryption takes readable data and alters it so that it appears random. Encryption requires the use of an encryption key: a set of mathematical values that both the sender and the recipient of an encrypted message know.


In [103]:
%time
response("what can you help me?", show_details=True)

CPU times: total: 0 ns
Wall time: 0 ns
probability of answer: 0.999923
Tag: services
Answer:  We provide Web Penetration Testing,Android Penetration Testing,Docker Penetration Testing,Vulnerability Assessment,Cyber Crime investigation and many more services.
