In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from pathlib import Path
import string
import re
import joblib
import json
from collections import Counter
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import io
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, Conv1D, MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def file(jsonFile):
    with open(jsonFile) as file:
        data = json.loads(file.read())
    return data

In [3]:
data=file('intents.json')

In [4]:
def get_data(feature1,feature2,is_pattern):
    is_pattern = is_pattern
    df = pd.DataFrame(columns=[feature1,feature2])
    for intent in data['intents']:
        if is_pattern:
            for pattern in intent['patterns']:
                w = pattern
                _df= pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(_df,ignore_index=True)
        else:
            for response in intent['responses']:
                w = response
                _df = pd.Series([w,intent['tag']], index = df.columns)
                df = df.append(_df,ignore_index=True)
    return df

In [5]:
df1 = get_data('questions','labels',True)
df1

Unnamed: 0,questions,labels
0,Hi there,greeting
1,How are you,greeting
2,Is anyone there?,greeting
3,Hey,greeting
4,Hola,greeting
...,...,...
63,nearest,search_hospital_by_params
64,nearest,search_hospital_by_params
65,fortis,search_hospital_by_params
66,fortis,search_hospital_by_params


In [6]:
df1.labels.value_counts(sort=False)

search_pharmacy_by_name                3
thanks                                 5
adverse_drug                           5
hospital_search                        5
options                                5
search_hospital_by_type                7
greeting                               7
noanswer                               1
blood_pressure_search                  5
goodbye                                5
search_hospital_by_params              6
search_blood_pressure_by_patient_id    4
pharmacy_search                        5
blood_pressure                         5
Name: labels, dtype: int64

In [7]:
df2 = get_data('response','labels',False)
df2.head()

Unnamed: 0,response,labels
0,"Hello, thanks for asking",greeting
1,Good to see you again,greeting
2,"Hi there, how can I help?",greeting
3,See you!,goodbye
4,Have a nice day,goodbye


In [8]:
df2.labels.value_counts(sort=False)

search_pharmacy_by_name                1
thanks                                 3
adverse_drug                           1
hospital_search                        1
options                                2
search_hospital_by_type                1
greeting                               3
noanswer                               3
blood_pressure_search                  2
goodbye                                3
search_hospital_by_params              1
search_blood_pressure_by_patient_id    1
pharmacy_search                        1
blood_pressure                         1
Name: labels, dtype: int64

In [9]:
lemmatizer = WordNetLemmatizer()

vocab = Counter()
labels = []
def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

In [10]:
 def remove_stop_words(tokenizer,df,feature):
    doc_without_stopwords = []
    for entry in df[feature]:
        tokens = tokenizer(entry)
        joblib.dump(tokens,'tokens.pkl')
        doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return 

In [11]:
def create_vocab(tokenizer,df,feature):
    for entry in df[feature]:
        tokens = tokenizer(entry)   
        vocab.update(tokens)
    joblib.dump(vocab,'vocab.pkl')
    return 

In [12]:
create_vocab(tokenizer,df1,'questions')
remove_stop_words(tokenizer,df1,'questions')

In [13]:
df1.head(20)

Unnamed: 0,questions,labels
0,hi there,greeting
1,how are you,greeting
2,is anyone there,greeting
3,hey,greeting
4,hola,greeting
5,hello,greeting
6,good day,greeting
7,see you later,goodbye
8,goodbye,goodbye
9,nice chatting to you bye,goodbye


In [14]:
test_list = list(df1.groupby(by='labels',as_index=False).first()['questions'])
test_list

['how to check adverse drug reaction',
 'open blood pressure module',
 'want to search for blood pressure result history',
 'see you later',
 'hi there',
 'lookup for hospital',
 '',
 'what you can do',
 'find me medical store',
 'patient id is',
 'nearest',
 'brain',
 'pharmeasy',
 'thanks']

In [15]:
test_index = []
for i,_ in enumerate(test_list):
    idx = df1[df1.questions == test_list[i]].index[0]
    test_index.append(idx)
test_index

[23, 28, 33, 7, 0, 50, 22, 17, 42, 38, 62, 55, 47, 12]

In [16]:
train_index = [i for i in df1.index if i not in test_index]

In [17]:
' '.join(list(vocab.keys()))

'hi there how are you is anyone hey hola hello good day see later goodbye nice chatting to bye till next time thanks thank thats helpful awesome for helping me what can do could help provide be support offered check adverse drug reaction open module give list of causing behavior all suitable patient with which dont have blood pressure task related data entry want log result management search history load show find by id medical store pharm nearby locate pharmacy pharmeasy pharacy lookup hospital searching transfer looking up detail brain ent cardiology dentist bone liver nearest fortis jaypee'

In [18]:
def encoder(df,feature):
    t = Tokenizer()
    entries = [entry for entry in df[feature]]
    t.fit_on_texts(entries)
    joblib.dump(t,'tokenizer_t.pkl')
    vocab_size = len(t.word_index) + 1
    entries = [entry for entry in df[feature]]
    max_length = max([len(s.split()) for s in entries])
    encoded = t.texts_to_sequences(entries)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded, vocab_size

In [19]:
X,vocab_size = encoder(df1,'questions')

In [20]:
df_encoded = pd.DataFrame(X)

In [21]:
df_encoded['labels'] = df1.labels
df_encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,labels
0,38,24,0,0,0,0,0,0,0,greeting
1,11,39,3,0,0,0,0,0,0,greeting
2,13,40,24,0,0,0,0,0,0,greeting
3,41,0,0,0,0,0,0,0,0,greeting
4,42,0,0,0,0,0,0,0,0,greeting


In [22]:
for i in range(0,1):
    dt = [0]*9
    dt.append('confused')
    dt = [dt]
    pd.DataFrame(dt).rename(columns = {9:'labels'})
    df_encoded = df_encoded.append(pd.DataFrame(dt).rename(columns = {9:'labels'}),ignore_index=True)

In [23]:
df_encoded.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,labels
64,23,0,0,0,0,0,0,0,0,search_hospital_by_params
65,37,0,0,0,0,0,0,0,0,search_hospital_by_params
66,37,0,0,0,0,0,0,0,0,search_hospital_by_params
67,98,0,0,0,0,0,0,0,0,search_hospital_by_params
68,0,0,0,0,0,0,0,0,0,confused


In [24]:
train_index.append(68)

In [25]:
from sklearn.preprocessing import LabelEncoder
lable_enc = LabelEncoder()

In [26]:
labl = lable_enc.fit_transform(df_encoded.labels)
labl

array([ 5,  5,  5,  5,  5,  5,  5,  4,  4,  4,  4,  4, 14, 14, 14, 14, 14,
        8,  8,  8,  8,  8,  7,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  2,
        2,  2,  2,  2, 10, 10, 10, 10,  9,  9,  9,  9,  9, 13, 13, 13,  6,
        6,  6,  6,  6, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11,
        3])

In [27]:
mapper = {}
for index,key in enumerate(df_encoded.labels):
    if key not in mapper.keys():
        mapper[key] = labl[index]
mapper

{'adverse_drug': 0,
 'blood_pressure': 1,
 'blood_pressure_search': 2,
 'confused': 3,
 'goodbye': 4,
 'greeting': 5,
 'hospital_search': 6,
 'noanswer': 7,
 'options': 8,
 'pharmacy_search': 9,
 'search_blood_pressure_by_patient_id': 10,
 'search_hospital_by_params': 11,
 'search_hospital_by_type': 12,
 'search_pharmacy_by_name': 13,
 'thanks': 14}

In [28]:
df2.head(20)

Unnamed: 0,response,labels
0,"Hello, thanks for asking",greeting
1,Good to see you again,greeting
2,"Hi there, how can I help?",greeting
3,See you!,goodbye
4,Have a nice day,goodbye
5,Bye! Come back again soon.,goodbye
6,Happy to help!,thanks
7,Any time!,thanks
8,My pleasure,thanks
9,I can guide you through Adverse drug reaction ...,options


In [29]:
df2.labels = df2.labels.map(mapper).astype({'labels': 'int32'})
df2.head()

Unnamed: 0,response,labels
0,"Hello, thanks for asking",5
1,Good to see you again,5
2,"Hi there, how can I help?",5
3,See you!,4
4,Have a nice day,4


In [30]:
df2.to_csv('response.csv',index=False)

In [31]:
train = df_encoded.loc[train_index]
test = df_encoded.loc[test_index]

In [32]:
train = df_encoded.loc[train_index]
test = df_encoded.loc[test_index]
X_train = train.drop(columns=['labels'],axis=1)
y_train = train.labels
X_test = test.drop(columns=['labels'],axis=1)
y_test = test.labels

In [33]:
y_train =pd.get_dummies(y_train).values
y_test =pd.get_dummies(y_test).values

In [34]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8
23,11,5,64,7,8,16,0,0,0
28,29,1,2,30,0,0,0,0,0
33,19,5,20,4,1,2,9,78,0
7,46,3,47,0,0,0,0,0,0
0,38,24,0,0,0,0,0,0,0
50,36,4,10,0,0,0,0,0,0
22,0,0,0,0,0,0,0,0,0
17,15,3,27,58,0,0,0,0,0
42,21,12,34,35,0,0,0,0,0
38,6,33,13,0,0,0,0,0,0


In [35]:
y_test

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=uint8)

In [36]:
max_length = X_train.shape[1]
output=14
max_length

9

In [37]:
y_train[0].shape,y_test[0].shape

((14,), (14,))

In [38]:
early_stopping = EarlyStopping(monitor='val_loss',patience=10)
checkpoint = ModelCheckpoint("model-v1.h5",
                             monitor="val_loss",
                             mode="min",
                             save_best_only = True,
                             verbose=1)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 3, verbose = 1, min_delta = 0.0001)
callbacks = [early_stopping,checkpoint,reduce_lr]

In [39]:
def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size,300, input_length=max_length))
    model.add(Conv1D(filters=53, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=6))
    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(14, activation='softmax'))
    
    
    # compile network
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.compile(loss = 'categorical_crossentropy',
              # optimizer = Adam(lr=0.001),
              optimizer = 'adam',
              metrics = ['accuracy'])
    
    # summarize defined model
    model.summary()
#     plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [40]:
model = define_model(vocab_size, max_length)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 9, 300)            29700     
_________________________________________________________________
conv1d (Conv1D)              (None, 7, 53)             47753     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 1, 53)             0         
_________________________________________________________________
flatten (Flatten)            (None, 53)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                1728      
_________________________________________________________________
dense_1 (Dense)              (None, 14)                462       
Total params: 79,643
Trainable params: 79,643
Non-trainable params: 0
____________________________________________________

In [41]:
history = model.fit(X_train, y_train, epochs=400, verbose=1,validation_data=(X_test,y_test),callbacks=callbacks)

Epoch 1/400

Epoch 00001: val_loss improved from inf to 2.62968, saving model to model-v1.h5
Epoch 2/400

Epoch 00002: val_loss improved from 2.62968 to 2.61365, saving model to model-v1.h5
Epoch 3/400

Epoch 00003: val_loss improved from 2.61365 to 2.59783, saving model to model-v1.h5
Epoch 4/400

Epoch 00004: val_loss improved from 2.59783 to 2.58298, saving model to model-v1.h5
Epoch 5/400

Epoch 00005: val_loss improved from 2.58298 to 2.56706, saving model to model-v1.h5
Epoch 6/400

Epoch 00006: val_loss improved from 2.56706 to 2.54943, saving model to model-v1.h5
Epoch 7/400

Epoch 00007: val_loss improved from 2.54943 to 2.52918, saving model to model-v1.h5
Epoch 8/400

Epoch 00008: val_loss improved from 2.52918 to 2.50765, saving model to model-v1.h5
Epoch 9/400

Epoch 00009: val_loss improved from 2.50765 to 2.48489, saving model to model-v1.h5
Epoch 10/400

Epoch 00010: val_loss improved from 2.48489 to 2.46168, saving model to model-v1.h5
Epoch 11/400

Epoch 00011: val_lo

In [42]:
def get_text():
    input_text  = input();
    if(input_text=="end"):
        return 0
    input_text=[input_text];
    df_input = pd.DataFrame(input_text,columns=['questions'])
    df_input
    return df_input

In [43]:
from tensorflow.keras.models import load_model
model = load_model('model-v1.h5')
tokenizer_t = joblib.load('tokenizer_t.pkl')
vocab = joblib.load('vocab.pkl')

In [44]:
def tokenizer(entry):
    tokens = entry.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens

In [45]:

def remove_stop_words_for_input(tokenizer,df,feature):
    doc_without_stopwords = []
    entry = df[feature][0]
    tokens = tokenizer(entry)
    doc_without_stopwords.append(' '.join(tokens))
    df[feature] = doc_without_stopwords
    return df

In [46]:
def encode_input_text(tokenizer_t,df,feature):
    t = tokenizer_t
    entry =  [df[feature][0]]
    encoded = t.texts_to_sequences(entry)
    padded = pad_sequences(encoded, maxlen=9, padding='post')
    return padded

In [47]:
def get_pred(model,encoded_input):
    pred = np.argmax(model.predict(encoded_input))
    return pred

In [48]:
def bot_precausion(df_input,pred):
    words = df_input.questions[0].split()
    if len([w for w in words if w in vocab])==0 :
        pred = 1
    return pred

In [49]:
def get_response(df2,pred):
    upper_bound = df2.groupby('labels').get_group(pred).shape[0]
    r = np.random.randint(0,upper_bound)
    responses = list(df2.groupby('labels').get_group(pred).response)
    return responses[r]

In [50]:
def bot_response(response):
    print("BOT:",response)

In [None]:
while(1):
    print("YOU:")
    df_input = get_text()
 
    tokenizer_t = joblib.load('tokenizer_t.pkl')
    vocab = joblib.load('vocab.pkl')

    df_input = remove_stop_words_for_input(tokenizer,df_input,'questions')
    encoded_input = encode_input_text(tokenizer_t,df_input,'questions')

    pred = get_pred(model,encoded_input)
    pred = bot_precausion(df_input,pred)

    response = get_response(df2,pred)
    bot_response(response)
    
print('THANK YOU')

YOU:
hello
BOT: Hi there, how can I help?
YOU:
which drugs dont have adverse reaction
BOT: Navigating to Adverse drug reaction module
YOU:
blood pressure data entry
BOT: Navigating to Blood Pressure module
YOU:
show blood pressure results for patient
BOT: Patient ID?
YOU:
1
BOT: Navigating to Blood Pressure module
YOU:
lookup for hospitals
BOT: Please provide hospital type or location
YOU:
brain
BOT: Hello, thanks for asking
YOU:
nearest
BOT: Loading Blood pressure result for Patient
YOU:
bye
BOT: Have a nice day
YOU:
