# EXPLORATORY DATA ANALYSIS 

In [2]:
import pandas as pd
# Libraries needed for NLP
import nltk
from nltk.stem import PorterStemmer  # reduces words to their root (stem) form

# nltk.word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK
# releases load it from the 'punkt' package, while NLTK >= 3.8.2 looks for
# 'punkt_tab' instead, so fetch both to keep tokenization working either way.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Shared stemmer instance used by every cell that normalizes tokens.
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VARUN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [3]:
# Libraries needed for Tensorflow processing 
import tensorflow as tf
import numpy as np
import random 
import json

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Load the intent definitions from chat.json (resolved relative to the
# current working directory). JSON text is UTF-8, so the encoding is passed
# explicitly instead of relying on the platform default (cp1252 on Windows).
with open('chat.json', encoding='utf-8') as json_data:
    intents = json.load(json_data)

In [None]:
# Display the parsed JSON so its structure can be inspected in the notebook.
intents

# PRE-PROCESSING THE TEXT DATA 

In [None]:
words =[]   # every token from every pattern; later stemmed, lowercased and de-duplicated into the vocabulary
classes =[]   # unique intent tags, added the first time a pattern of that intent is seen
documents =[] # one (tokens, tag) tuple per pattern sentence
ignore =['?'] # tokens to drop when the vocabulary is built

In [None]:
# Walk every pattern sentence of every intent and collect three things:
# the raw tokens, the (tokens, tag) training pairs, and the distinct tags.
for intent in intents['intents']:
    for pattern in intent['patterns']:
        tag = intent['tag']
        tokens = nltk.word_tokenize(pattern)

        # raw tokens feed the vocabulary
        words.extend(tokens)
        # each pattern becomes one labelled document
        documents.append((tokens, tag))
        # record the tag the first time it shows up
        if tag not in classes:
            classes.append(tag)

In [None]:
# Build the vocabulary: drop ignored tokens, lowercase and stem the rest,
# then de-duplicate (via a set) and sort for a stable ordering.
stemmed_vocabulary = {
    stemmer.stem(token.lower())
    for token in words
    if token not in ignore
}
words = sorted(stemmed_vocabulary)

# Tags get the same de-duplicate-and-sort treatment.
classes = sorted(set(classes))

print(len(documents),"documents")
print(len(classes),"classes",classes)
print(len(words),"unique stemmed words",words)

#  CREATING AND TRAINING THE MODEL FOR CHATBOT 

In [None]:
# Build the training set: for every document, a bag-of-words vector over the
# vocabulary (X) paired with a one-hot vector over the intent tags (Y).
training = []   # list of [bag, output_row] pairs
output = []     # unused; kept so the name stays defined for any later cell
# template one-hot row, copied for each document
output_empty = [0] * len(classes)

for pattern_tokens, tag in documents:
    # Normalize the pattern's tokens exactly like the vocabulary was built.
    # A set gives O(1) membership checks when filling the bag below.
    pattern_stems = {stemmer.stem(token.lower()) for token in pattern_tokens}

    # 1 where the vocabulary word occurs in this pattern, 0 otherwise
    bag = [1 if vocab_word in pattern_stems else 0 for vocab_word in words]

    # 1 for the document's own tag, 0 for every other tag
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1

    training.append([bag, output_row])

# Shuffle the (bag, output_row) pairs together so X and Y stay aligned.
random.shuffle(training)

# Split into features and labels directly from the list of pairs. The pairs
# are NOT converted with np.array(): a bag and its one-hot row generally have
# different lengths, and NumPy >= 1.24 raises ValueError for such ragged
# input. `training` therefore stays a plain list here.
train_x = [bag for bag, _ in training]
train_y = [output_row for _, output_row in training]

In [None]:
# Feed-forward classifier: bag-of-words vector in, one probability per
# intent tag out.
# NOTE(review): the two 10-unit Dense layers pass no `activation`, so they
# use Keras' default (linear) activation -- confirm this is intended.
vocabulary_size = len(train_x[0])
tag_count = len(train_y[0])

model = tf.keras.Sequential([
    # first hidden layer, 10 neurons
    tf.keras.layers.Dense(10, input_shape=(vocabulary_size,)),
    # second hidden layer, 10 neurons
    tf.keras.layers.Dense(10),
    # softmax output because this is multi-class classification
    tf.keras.layers.Dense(tag_count, activation='softmax'),
])
model.compile(
    tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
model.fit(np.array(train_x),np.array(train_y),epochs=100,batch_size=8,verbose=1)
model.save("model.pkl")

# MAKING PREDICTIONS USING CHATBOT

In [None]:
import pickle 
pickle.dump({"words":words,'classes':classes},open("training_data",'wb'))

In [None]:
from keras.models import load_model
model = load_model("model.pkl")

In [None]:
# Restore the vocabulary and tag list written by the pickle.dump() cell.
# The context manager closes the file handle once loading is done.
# NOTE: pickle.load executes arbitrary code from the file -- only load a
# "training_data" file that this notebook produced itself.
with open("training_data", "rb") as training_file:
    data = pickle.load(training_file)
words = data['words']
classes = data['classes']

In [None]:
# Re-read the intent definitions so the prediction cells can run without the
# training cells. JSON text is UTF-8, so the encoding is passed explicitly
# instead of relying on the platform default.
with open('chat.json', encoding='utf-8') as json_data:
    intents = json.load(json_data)

In [None]:
def clean_up_sentence(sentence):
    """Tokenize *sentence* and return its tokens lowercased and stemmed.

    Tokens come from ``nltk.word_tokenize``; each one is lowercased and then
    passed through the module-level ``stemmer``. The result is a list in the
    original token order.
    """
    return [
        stemmer.stem(token.lower())
        for token in nltk.word_tokenize(sentence)
    ]

# Bag-of-words encoder: one 0/1 entry per vocabulary word.
def bow(sentence,words):
    """Return a NumPy array marking which entries of *words* occur in *sentence*.

    The sentence is tokenized and stemmed with ``clean_up_sentence``; position
    ``i`` of the result is 1 when ``words[i]`` equals one of those stems and
    0 otherwise.
    """
    present = set(clean_up_sentence(sentence))
    flags = [1 if vocab_word in present else 0 for vocab_word in words]
    return np.array(flags)

In [None]:
# Predictions whose probability is not above this value are discarded.
ERROR_THRESHOLD = 0.30


def classify(sentence):
    """Return ``(tag, probability)`` tuples for *sentence*, strongest first.

    The sentence is encoded with ``bow`` against the global ``words`` and fed
    to the global ``model``. Only classes whose predicted probability exceeds
    ``ERROR_THRESHOLD`` are kept; the list is empty when none do.
    """
    encoded = bow(sentence, words)
    probabilities = model.predict(np.array([encoded]))[0]

    # keep (class index, probability) pairs that clear the threshold
    confident = [
        (index, probability)
        for index, probability in enumerate(probabilities)
        if probability > ERROR_THRESHOLD
    ]
    # strongest prediction first
    ranked = sorted(confident, key=lambda pair: pair[1], reverse=True)

    return [(classes[index], probability) for index, probability in ranked]

def response(sentence):
    """Print a reply for *sentence* and return ``None``.

    The sentence is classified with ``classify``. Candidate tags are tried
    from most to least probable; for the first one that matches an intent in
    the global ``intents``, a random entry of that intent's ``responses`` is
    printed. If no prediction clears the threshold, or none of the predicted
    tags exists in ``intents``, a fallback message is printed so the user
    always gets an answer.
    """
    for tag, _probability in classify(sentence):
        for intent in intents['intents']:
            # the first intent carrying this tag wins
            if intent['tag'] == tag:
                print(random.choice(intent['responses']))
                return
    # Nothing matched: say so rather than staying silent.
    print("Sorry, I didn't understand that. Could you rephrase?")
            
    

In [None]:
# Demo: a one-word greeting.
response("hi")

In [None]:
# Demo: a full-sentence question.
response('Where is the nearest police station located')

In [None]:
# Demo: a two-word input without any question phrasing.
response("aarey road")

In [None]:
# Demo: mixed-case input; tokens are lowercased before matching, so casing should not matter.
response('WhAT is the nearest help centre address')

In [None]:
# Demo: mixed-case, multi-word input.
response('YAshodham vidyal marg')

In [None]:
# Demo: a single-word input.
response("kanyapada")

In [None]:
# Demo: a two-word input passed as a string literal.
response("film city")

In [None]:
# Hold a query in a variable so it can be handed to response() by name.
answer = "film city"

In [None]:
# Demo: pass the query through the `answer` variable instead of a literal.
response(answer)

In [None]:
# Demo: a longer free-form question; any token that is not in `words` is
# simply left out of the bag-of-words vector.
response("can you tell me how many questons i asked")
