In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import nltk
import pickle
from torch import FloatTensor, optim, nn, unique

Source: https://github.com/katanaml/katana-assistant/blob/master/mlbackend/intents.json

In [2]:
lemmatizer = nltk.stem.WordNetLemmatizer()

base_path = os.getcwd()
raw_data_path = os.path.join(base_path, '../raw_data/intents.json')

with open(raw_data_path, 'r') as f:
    data = json.loads(open(raw_data_path).read())

data.keys()

dict_keys(['intents'])

In [14]:
data['intents']

[{'tag': 'greeting',
  'patterns': ['Hi there',
   'How are you',
   'Is anyone there?',
   'Hello',
   'Good day'],
  'responses': ['Hello, thanks for asking',
   'Good to see you again',
   'Hi there, how can I help?'],
  'context': ['']},
 {'tag': 'goodbye',
  'patterns': ['Bye',
   'See you later',
   'Goodbye',
   'Nice chatting to you, bye',
   'Till next time'],
  'responses': ['See you!', 'Have a nice day', 'Bye! Come back again soon.'],
  'context': ['']},
 {'tag': 'thanks',
  'patterns': ['Thanks',
   'Thank you',
   "That's helpful",
   'Awesome, thanks',
   'Thanks for helping me'],
  'responses': ['Happy to help!', 'Any time!', 'My pleasure'],
  'context': ['']},
 {'tag': 'noanswer',
  'patterns': [],
  'responses': ["Sorry, can't understand you",
   'Please give me more info',
   'Not sure I understand'],
  'context': ['']},
 {'tag': 'options',
  'patterns': ['How you could help me?',
   'What you can do?',
   'What help you provide?',
   'How you can be helpful?',
   'Wh

In [3]:
words = []
tags = []
word_tag_pairs = []
ignore = ['?', '!', '.', ',', '\'s']

for intent in data['intents']:
    for pattern in intent['patterns']:
        w = nltk.tokenize.word_tokenize(pattern)
        words.extend(w)
        word_tag_pairs.append((w, intent['tag'])) # list of tuples containing list of words and tag
        if intent['tag'] not in tags:
            tags.append(intent['tag']) # unique list of tags

# find base form of word and remove ignore words
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore]
words = sorted(list(set(words))) # remove duplicates and sort

tags = sorted(list(set(tags)))

print(len(word_tag_pairs), "pairs of words and tags")

45 pairs of words and tags


In [11]:
print(len(tags), "tags", tags)

9 tags ['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']


In [12]:
print(len(words), "unique lemmatized words", words)

84 unique lemmatized words ['a', 'adverse', 'all', 'anyone', 'are', 'awesome', 'be', 'behavior', 'blood', 'by', 'bye', 'can', 'causing', 'chatting', 'check', 'could', 'data', 'day', 'detail', 'do', 'dont', 'drug', 'entry', 'find', 'for', 'give', 'good', 'goodbye', 'have', 'hello', 'help', 'helpful', 'helping', 'hi', 'history', 'hospital', 'how', 'i', 'id', 'is', 'later', 'list', 'load', 'locate', 'log', 'looking', 'lookup', 'management', 'me', 'module', 'nearby', 'next', 'nice', 'of', 'offered', 'open', 'patient', 'pharmacy', 'pressure', 'provide', 'reaction', 'related', 'result', 'search', 'searching', 'see', 'show', 'suitable', 'support', 'task', 'thank', 'thanks', 'that', 'there', 'till', 'time', 'to', 'transfer', 'up', 'want', 'what', 'which', 'with', 'you']


In [5]:
pickle.dump(words, open('words.pkl', 'wb'))
pickle.dump(tags, open('tags.pkl', 'wb'))

In [31]:
for pair in word_tag_pairs:
    print(pair)

(['Hi', 'there'], 'greeting')
(['How', 'are', 'you'], 'greeting')
(['Is', 'anyone', 'there', '?'], 'greeting')
(['Hello'], 'greeting')
(['Good', 'day'], 'greeting')
(['Bye'], 'goodbye')
(['See', 'you', 'later'], 'goodbye')
(['Goodbye'], 'goodbye')
(['Nice', 'chatting', 'to', 'you', ',', 'bye'], 'goodbye')
(['Till', 'next', 'time'], 'goodbye')
(['Thanks'], 'thanks')
(['Thank', 'you'], 'thanks')
(['That', "'s", 'helpful'], 'thanks')
(['Awesome', ',', 'thanks'], 'thanks')
(['Thanks', 'for', 'helping', 'me'], 'thanks')
(['How', 'you', 'could', 'help', 'me', '?'], 'options')
(['What', 'you', 'can', 'do', '?'], 'options')
(['What', 'help', 'you', 'provide', '?'], 'options')
(['How', 'you', 'can', 'be', 'helpful', '?'], 'options')
(['What', 'support', 'is', 'offered'], 'options')
(['How', 'to', 'check', 'Adverse', 'drug', 'reaction', '?'], 'adverse_drug')
(['Open', 'adverse', 'drugs', 'module'], 'adverse_drug')
(['Give', 'me', 'a', 'list', 'of', 'drugs', 'causing', 'adverse', 'behavior'], 'ad

In [4]:
train_x = []
train_y = []

for pair in word_tag_pairs:
    words_encoded = []
    pattern_words = pair[0] # list of words
    pattern_words = [lemmatizer.lemmatize(word.lower()) for word in pattern_words]
    for w in words:
        words_encoded.append(1 if w in pattern_words else 0)

    tags_encoded = [0] * len(tags)
    tags_encoded[tags.index(pair[1])] = 1 # pair[1] is the tag

    train_x.append(words_encoded)
    train_y.append(tags_encoded)

train_x = np.array(train_x)
train_y = np.array(train_y)

In [18]:
print(np.array(train_x).shape)
print(np.array(train_y).shape)

(45,)
(45,)


In [6]:
import tensorflow as tf

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(len(train_y[0]), activation='softmax'))

sgd = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
model.fit(train_x, train_y, epochs=200, batch_size=5, verbose=1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x7f7f5465a7a0>

In [7]:
def clean_up_sentence(sentence):
    # tokenize the pattern - split words into array
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word - create short form for word
    sentence_words = [lemmatizer.lemmatize(word.lower()) for word in sentence_words]
    return sentence_words

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=True):
    # tokenize the pattern
    sentence_words = clean_up_sentence(sentence)
    # bag of words - matrix of N words, vocabulary matrix
    bag = [0]*len(words)  
    for s in sentence_words:
        for i,w in enumerate(words):
            if w == s: 
                # assign 1 if current word is in the vocabulary position
                bag[i] = 1
                if show_details:
                    print ("found in bag: %s" % w)

    return(np.array(bag))

In [8]:
p = bow("Load blood pessure for patient", words)
print (p)
print (tags)

found in bag: load
found in bag: blood
found in bag: for
found in bag: patient
[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]
['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']


In [17]:
clean_data_path = os.path.join(base_path, '../clean_data/')
# save model and data
model.save(f"{clean_data_path}model.keras")
pickle.dump( {'words':words, 'tags':tags, 'train_x':train_x, 'train_y':train_y}, open( f"{clean_data_path}data.pkl", "wb"))

In [9]:
data = pickle.load( open( f"{clean_data_path}data.pkl", "rb" ) )
words = data['words']
classes = data['tags']

In [11]:
global graph
graph = tf.compat.v1.get_default_graph()

with open('../clean_data/model.pkl', 'rb') as f:
    model = pickle.load(f)

In [16]:
print(model)

None


In [12]:
def classify_local(sentence):
    ERROR_THRESHOLD = 0.25
    
    # generate probabilities from the model
    input_data = pd.DataFrame([bow(sentence, words)], dtype=float, index=['input'])
    results = model.predict([input_data])[0]
    # filter out predictions below a threshold, and provide intent index
    results = [[i,r] for i,r in enumerate(results) if r>ERROR_THRESHOLD]
    # sort by strength of probability
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append({'intent':classes[r[0]], 'probability':str(r[1])})
    # return tuple of intent and probability
    
    return return_list

In [13]:
classify_local('Hello, good day!')

found in bag: hello
found in bag: good
found in bag: day


AttributeError: 'NoneType' object has no attribute 'predict'

In [37]:
data.keys()

dict_keys(['words', 'tags', 'train_x', 'train_y'])

In [40]:
import random
def get_response(ints, intents_json):
    tag = ints[0]['intent']
    list_of_intents = intents_json['intents']
    for i in list_of_intents:
        if(i['tag'] == tag):
            result = random.choice(i['responses'])
            break
    return result

In [41]:
with open(raw_data_path, 'r') as f:
    data = json.loads(open(raw_data_path).read())
    
while True:
    message = input("")
    ints = classify_local(message)
    res = get_response(ints, data)
    print(res)
    if message == 'exit':
        break

found in bag: hello
found in bag: there
Hello, thanks for asking
found in bag: thanks
Happy to help!
found in bag: find
found in bag: me
found in bag: patient
Please provide Patient ID
See you!


In [49]:
tensor_x = FloatTensor(train_x)
tensor_y = FloatTensor(train_y)
tensor_x.shape, tensor_y.shape

(torch.Size([45]), torch.Size([45]))

In [58]:
len(unique(tensor_y))

2

In [50]:
# Define the model
class ChatbotModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(ChatbotModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.drop(out)
        out = self.fc2(out)
        out = self.softmax(out)
        return out

# Model parameters
input_size = 45  # Size of the input layer (number of features in x)
hidden_size = 64  # Can be adjusted
output_size = 45  # Assuming y contains integer labels for classes

# Instantiate the model
model = ChatbotModel(input_size, hidden_size, output_size)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # Using Negative Log-Likelihood Loss for a multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [51]:
criterion(tensor_x, tensor_y)

tensor(18.4013)

In [53]:
x_tensor_reshaped = tensor_x.view(-1, 45)
x_tensor_reshaped.shape

torch.Size([1, 45])

In [56]:
outputs = model(x_tensor_reshaped)
outputs

tensor([[-0.6353, -0.7545]], grad_fn=<LogSoftmaxBackward0>)

In [57]:
criterion(outputs, tensor_y)

ValueError: Expected input batch_size (1) to match target batch_size (45).

In [46]:
# Training loop
epochs = 1000
for epoch in range(epochs):
    # Forward pass
    outputs = model(x_tensor_reshaped)
    loss = criterion(outputs, tensor_y)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}')

RuntimeError: mat1 and mat2 shapes cannot be multiplied (45x1 and 45x64)

In [None]:
model = Sequential()
model.add_module('layer1', Linear(45, ))

In [2]:
base_path = os.getcwd()
raw_data_path = os.path.join(base_path, '../raw_data/intents.json')

# Load JSON data
with open(raw_data_path) as file:
    data = json.load(file)

# Flatten the data into a list of dictionaries, each representing a row in the DataFrame
rows = []
for intent in data['intents']:
    tag = intent['tag']
    for pattern in intent.get('patterns', []):
        rows.append({'tag': tag, 'text': pattern, 'type': 'pattern'})
    for response in intent.get('responses', []):
        rows.append({'tag': tag, 'text': response, 'type': 'response'})

# Create a DataFrame
df = pd.DataFrame(rows)

# Display the DataFrame structure
print(df.head())

        tag              text     type
0  greeting          Hi there  pattern
1  greeting       How are you  pattern
2  greeting  Is anyone there?  pattern
3  greeting             Hello  pattern
4  greeting          Good day  pattern


In [5]:

data_raw = pd.read_json(raw_data_path)

In [6]:
data_raw.head()

Unnamed: 0,intents
0,"{'tag': 'greeting', 'patterns': ['Hi there', '..."
1,"{'tag': 'goodbye', 'patterns': ['Bye', 'See yo..."
2,"{'tag': 'thanks', 'patterns': ['Thanks', 'Than..."
3,"{'tag': 'noanswer', 'patterns': [], 'responses..."
4,"{'tag': 'options', 'patterns': ['How you could..."


In [8]:
data_raw.size

14

In [None]:
# Example dictionary of terms and common misspellings
healthcare_terms = {
    "aspirin": ["asprin", "aspirn", "aspiren"],
    "diabetes": ["diabtes", "diabete", "diabetis"],
    "hypertension": ["hypertention", "high blood pre", "highblood pressure"],
    # Add more terms and their common misspellings
    "vaccination": ["vaccination", "vaccin", "vaccinat"],
    "covid": ["covid", "covid-19", "covid19"],
    "cancer": ["cancer", "canser", "cansir"],
    "pregnant": ["pregnint", "pregnent", "pregnet"]
}

# Flatten the dictionary for easier lookup
misspelling_to_correct = {misspelling: correct for correct, misspellings in healthcare_terms.items() for misspelling in misspellings}

def correct_input(input_sentence):
    corrected_words = []
    for word in input_sentence.split():
        # Correct the word if it's a known misspelling
        corrected_word = misspelling_to_correct.get(word.lower(), word)
        corrected_words.append(corrected_word)
    return " ".join(corrected_words)

# Example usage
user_input = "I have diabtes and highblood pressure"
corrected_input = correct_input(user_input)
print("Corrected Input:", corrected_input)
