In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
try:
    import nltk
except:
    !pip install nltk
    import nltk

In [3]:
import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rounaksarkar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
import tensorflow as tf
import random
import json

In [5]:
with open('intents.json') as json_data:
    data = json.load(json_data)

In [6]:
data

{'intents': [{'tag': 'greeting',
   'patterns': ['Hi', 'How are you', 'Is anyone there?', 'Hello', 'Good day'],
   'responses': ['Hello, thanks for visiting',
    'Good to see you again',
    'Hi there, how can I help?'],
   'context_set': ''},
  {'tag': 'goodbye',
   'patterns': ['Bye', 'See you later', 'Goodbye'],
   'responses': ['See you later, thanks for visiting',
    'Have a nice day',
    'Bye! Come back again soon.']},
  {'tag': 'thanks',
   'patterns': ['Thanks', 'Thank you', "That's helpful"],
   'responses': ['Happy to help!', 'Any time!', 'My pleasure']},
  {'tag': 'chatbot',
   'patterns': ['Who built this chatbot?',
    'Tell me about Chatbot',
    'What is this chatbot name?'],
   'responses': ['Hi, I am Chatbot designed by Mayank.',
    'Thanks for asking. I am designed by Mayank Bajaj.',
    'I am a chatbot.']},
  {'tag': 'location',
   'patterns': ['What is your location?',
    'Where are you located?',
    'What is your address?'],
   'responses': ["We are from Worl

In [7]:
words=[]
classes=[]
documents=[]
ignore_words=['?','!']

# loop through each sentence in our intents patterns
for intent in data['intents']:
    for pattern in intent['patterns']:
        # tokenize each word in the sentence
        w=nltk.word_tokenize(pattern)
        # add to our words list
        words.extend(w)
        # add to documents in our corpus
        documents.append((w,intent['tag']))
        # add to our classes list
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

In [8]:
# Perform stemming and lemmatization on the data
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('wordnet')

words= [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
lemmatizer = WordNetLemmatizer()
words= [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]

# remove duplicates and sort
words= sorted(list(set(words)))

# sort classes
classes= sorted(list(set(classes)))

# documents= combination between patterns and intents
print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique lemmatized words", words)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rounaksarkar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


27 documents
8 classes ['about', 'chatbot', 'connect', 'goodbye', 'greeting', 'location', 'movies', 'thanks']
52 unique lemmatized words ["'s", 'about', 'account', 'address', 'ani', 'anyon', 'are', 'built', 'bye', 'can', 'chatbot', 'connect', 'day', 'favourit', 'give', 'good', 'goodby', 'hello', 'help', 'hi', 'how', 'i', 'is', 'later', 'link', 'locat', 'me', 'medium', 'movi', 'name', 'out', 'reach', 'recommend', 'see', 'social', 'some', 'suggest', 'tell', 'thank', 'that', 'there', 'thi', 'to', 'way', 'we', 'what', 'where', 'which', 'who', 'you', 'your', 'yourself']


In [9]:
import numpy as np

# Create training data (input and output pairs)
training = []
output = []

# Create an empty array for output
output_empty = [0] * len(classes)

# Create training set, bag of words for each sentence
for doc in documents:
    # Initialize bag of words
    bag = [0] * len(words)
    # List of tokenized words for the pattern
    pattern_words = doc[0]
    # Stem each word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    # Create bag of words array with 1, if word match found in current pattern
    for w in pattern_words:
        if w in words:
            bag[words.index(w)] = 1
    # Output is 1 for current tag and 0 for rest of other tags
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    
    # Append bag of words and output row to training list
    training.append([np.array(bag, dtype=np.float32), np.array(output_row, dtype=np.float32)])

# Shuffle features and turn into np.array
random.shuffle(training)
training = np.array(training, dtype=object)

# Create training and testing lists. X - patterns, Y - intents
train_x = list(training[:, 0])
train_y = list(training[:, 1])


In [10]:
model= tf.keras.Sequential()
model.add(tf.keras.layers.Dense(10, input_shape=(len(train_x[0]),)))
model.add(tf.keras.layers.Dense(10))
model.add(tf.keras.layers.Dense(len(train_y[0]), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [11]:
model.fit(np.array(train_x), np.array(train_y), epochs=100, batch_size=50, verbose=1)
model.save('chatbot_model.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

  saving_api.save_model(


In [12]:
import pickle
pickle.dump({'words':words, 'classes':classes},open('chatbot_model.h5','wb'))


In [13]:
from keras.models import load_model
import os

model_path = 'chatbot_model.h5'
if os.path.exists(model_path):
    try:
        model = load_model(model_path)  # Load the trained model
    except OSError as e:
        print("Unable to load model:", e)
else:
    print("Model file not found at path:", model_path)


Unable to load model: Unable to open file (file signature not found)


In [14]:
# Restoring all data structures
import pickle
data = pickle.load(open("chatbot_model.h5", "rb"))
words = data['words']
classes = data['classes']

In [15]:
with open('intents.json') as file:
    data = json.load(file)

In [16]:
def clean_up_sentence(sentence):
    # tokenize the pattern - split words into array
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word - create short form for word
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words

In [17]:
# Returning the bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=True):
    # Tokenize the pattern
    sentence_words = clean_up_sentence(sentence)
    # Bag of words - matrix of N words, vocabulary matrix
    bag = [0]*len(words)  
    for s in sentence_words:
        for i,word in enumerate(words):
            if word == s: 
                # assign 1 if current word is in the vocabulary position
                bag[i] = 1
                if show_details:
                    print ("found in bag: %s" % word)
    return(np.array(bag))

In [18]:
ERROR_THRESHOLD = 0.30
def classify(sentence):
    # generate probabilities from the model
    results = model.predict([bag_of_words(sentence, words)])[0]
    # filter out predictions below a threshold
    results = [[i,r] for i,r in enumerate(results) if r>ERROR_THRESHOLD]
    # sort by strength of probability
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append({"intent": classes[r[0]], "probability": str(r[1])})
    return return_list

In [19]:
def response(sentence):
    results.classify(sentence)
    # If we have a classification then find the matching intent tag
    if results:
        # loop as long as there re matches to process
        while results:
            # for each intent key, find the matching tag
            for i in intents['intents']:
                # find a tag matching the first result
                if i['tag'] == results[0][0]:
                    # a random response from the intent
                    return print(random.choice(i['responses']))

            results.pop(0)