In [1]:
pip install --upgrade pip

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/4e/5f/528232275f6509b1fff703c9280e58951a81abe24640905de621c9f81839/pip-20.2.3-py2.py3-none-any.whl (1.5MB)
[K     |▏                               | 10kB 16.2MB/s eta 0:00:01[K     |▍                               | 20kB 6.5MB/s eta 0:00:01[K     |▋                               | 30kB 7.5MB/s eta 0:00:01[K     |▉                               | 40kB 8.0MB/s eta 0:00:01[K     |█                               | 51kB 6.7MB/s eta 0:00:01[K     |█▎                              | 61kB 7.3MB/s eta 0:00:01[K     |█▌                              | 71kB 8.2MB/s eta 0:00:01[K     |█▊                              | 81kB 8.8MB/s eta 0:00:01[K     |██                              | 92kB 8.2MB/s eta 0:00:01[K     |██▏                             | 102kB 8.7MB/s eta 0:00:01[K     |██▍                             | 112kB 8.7MB/s eta 0:00:01[K     |██▋                             | 122kB 8.7MB/s eta 0:00:

In [2]:
# Standard library: JSON parsing for the intents file, pickle for saving the
# vocabulary and class lists.
import json
import pickle

# NLTK provides the tokenizer and the WordNet lemmatizer.
import nltk
from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instance, reused by the preprocessing cells.
lemmatizer = WordNetLemmatizer()

In [3]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import random

**Import and load the data file**

In [6]:
# Containers populated while preprocessing the intent patterns.
words = []                 # every token seen across all patterns
documents = []             # (tokenized pattern, intent tag) pairs
classes = []               # distinct intent tags
ignore_words = ['?', '!']  # punctuation tokens excluded from the vocabulary

# Load json file. The context manager closes the handle as soon as the text
# has been read, and the encoding is explicit so parsing does not depend on
# the platform's default locale.
with open('intents.json', encoding='utf-8') as intents_fh:
    data_file = intents_fh.read()
intents = json.loads(data_file)

**Preprocess data**

In [7]:
# Download the Punkt models, then tokenize every pattern with
# nltk.word_tokenize().
nltk.download('punkt')

for intent in intents['intents']:
  tag = intent['tag']

  for pattern in intent['patterns']:
    # Split the pattern sentence into individual tokens.
    tokens = nltk.word_tokenize(pattern)

    # Grow the global vocabulary and record the (tokens, tag) training pair.
    words += tokens
    documents.append((tokens, tag))

    # Register the tag the first time one of its patterns is seen.
    if tag not in classes:
      classes.append(tag)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
# The WordNet corpus backs the WordNetLemmatizer used below.
nltk.download('wordnet')

# Lowercase and lemmatize each token, skip the ignored punctuation, then
# de-duplicate and sort so every word has a fixed position in the vocabulary.
words = sorted({lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words})

# De-duplicate and sort the intent tags so class indices are fixed as well.
classes = sorted(set(classes))

# documents = combination between patterns and intents
print(len(documents), 'documents')

# classes = intents
print(len(classes), 'classes', classes)

# words = all words, vocabulary
print(len(words), 'uniqe lemmatized words', words)


# Persist the vocabulary and class list for use at prediction time. The
# context managers guarantee the files are flushed and closed.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
47 documents
9 classes ['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']
88 uniqe lemmatized words ["'s", ',', 'a', 'adverse', 'all', 'anyone', 'are', 'awesome', 'be', 'behavior', 'blood', 'by', 'bye', 'can', 'causing', 'chatting', 'check', 'could', 'data', 'day', 'detail', 'do', 'dont', 'drug', 'entry', 'find', 'for', 'give', 'good', 'goodbye', 'have', 'hello', 'help', 'helpful', 'helping', 'hey', 'hi', 'history', 'hola', 'hospital', 'how', 'i', 'id', 'is', 'later', 'list', 'load', 'locate', 'log', 'looking', 'lookup', 'management', 'me', 'module', 'nearby', 'next', 'nice', 'of', 'offered', 'open', 'patient', 'pharmacy', 'pressure', 'provide', 'reaction', 'related', 'result', 'search', 'searching', 'see', 'show', 'suitable', 'support', 'task', 'thank', 'thanks', 'that', 'there', 'till', 't

**Create training data**

In [9]:
# create our training data: one [bag_of_words, one_hot_label] entry per document
training = []

# template label row: a 0 for every intent class
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for doc in documents:
  # doc is (tokenized pattern, intent tag); lemmatize the tokens the same way
  # the vocabulary was built so lookups match. A set makes membership tests cheap.
  pattern_words = {lemmatizer.lemmatize(word.lower()) for word in doc[0]}

  # bag of words: 1 where the vocabulary word occurs in this pattern, else 0.
  # This must be built inside the per-document loop so every document gets
  # its own vector.
  bag = [1 if w in pattern_words else 0 for w in words]

  # output is a '0' for each tag and '1' for current tag (for each pattern)
  output_row = list(output_empty)
  output_row[classes.index(doc[1])] = 1

  # exactly one training example per document
  training.append([bag, output_row])

# shuffle the examples so their order does not follow the intents file
random.shuffle(training)

# split into inputs (X - bag-of-words patterns) and labels (Y - one-hot intents)
train_x = [bag for bag, _ in training]
train_y = [output_row for _, output_row in training]

# Keep `training` as an array, as before. The two columns have different
# lengths, so dtype=object is required (NumPy rejects ragged input otherwise).
training = np.array(training, dtype=object)

print("Training data created")

Training data created


**Build the model**

In [None]:
# Feed-forward classifier with 3 Dense layers:
#   1st layer: 128 neurons, input width = length of a bag-of-words vector
#   2nd layer: 64 neurons
#   3rd (output) layer: one neuron per intent, softmax over the intents
# Dropout(0.5) follows each hidden layer.


model = Sequential()
model.add(Dense(128, 
                input_shape=(len(train_x[0]),),
                activation='relu'))

model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))


# Compile model. Stochastic gradient descent with Nesterov accelerated gradient 
# gives good results for this model
# NOTE(review): newer Keras releases spell `lr` as `learning_rate` and may not
# accept `decay` - confirm against the installed Keras version.
sgd = SGD(lr = 0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer = sgd, metrics=['accuracy'])

# fit the model; `hist` keeps the History object returned by fit()
hist = model.fit(np.array(train_x), 
                 np.array(train_y), 
                 epochs=500, 
                 batch_size=5, 
                 verbose=1)

# Save the trained model. The second positional parameter of save() is
# `overwrite`, so the History object must not be passed there.
model.save('Chatbot_model.h5')
print('Model created')