In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from pathlib import Path
import string
import re
import joblib
import json
from collections import Counter
import nltk
import tensorflow 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [None]:
  def load_doc(jsonFile):
      """Read a JSON file and return the parsed Python object.

      Parameters
      ----------
      jsonFile : str or path-like
          Location of the JSON document.

      Returns
      -------
      object
          Whatever the document's top-level value decodes to (dict, list, ...).

      Raises
      ------
      OSError
          If the file cannot be opened.
      json.JSONDecodeError
          If the content is not valid JSON.
      """
      # JSON is UTF-8 by specification; without an explicit encoding open()
      # falls back to the platform locale and can mis-decode non-ASCII text.
      with open(jsonFile, encoding='utf-8') as file:
          return json.load(file)

In [None]:
# Parsed intents definition; frame_data() below reads data['intents'].
data = load_doc('intents.json')

In [None]:
def frame_data(feat_1,feat_2,is_pattern,intents_doc=None):
    """Flatten the intents document into a two-column DataFrame.

    Parameters
    ----------
    feat_1 : str
        Name of the column that receives the text (pattern or response).
    feat_2 : str
        Name of the column that receives the intent tag.
    is_pattern : bool
        Truthy -> one row per entry of each intent's 'patterns';
        falsy  -> one row per entry of each intent's 'responses'.
    intents_doc : dict, optional
        Document with an 'intents' list. Defaults to the notebook-level
        ``data`` variable when omitted.

    Returns
    -------
    pandas.DataFrame
        Columns ``[feat_1, feat_2]``, rows in document order, index 0..n-1.
    """
    source = data if intents_doc is None else intents_doc
    text_key = 'patterns' if is_pattern else 'responses'
    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    rows = [
        (text, intent['tag'])
        for intent in source['intents']
        for text in intent[text_key]
    ]
    return pd.DataFrame(rows, columns=[feat_1, feat_2])

In [None]:
# User utterances (the intents' 'patterns') paired with their intent tag.
df1 = frame_data('questions','labels',True)
df1

Unnamed: 0,questions,labels
0,Hi there,start_conversation
1,Is anyone there?,start_conversation
2,Hey,start_conversation
3,Hola,start_conversation
4,Hello,start_conversation
...,...,...
68,How many cases of COVID-19 are there?,query
69,Cases of COVID-19,query
70,Number of cases,query
71,How many are infected by COVID-19,query


In [None]:
# WordNet lemmatizer used inside tokenizer().
lemmatizer = WordNetLemmatizer()

# Token frequency counter; create_vocab() fills it and saves it to vocab.pkl.
vocab = Counter()
# NOTE(review): `labels` is not referenced again in the visible notebook -- confirm it can be removed.
labels = []
def tokenizer(entry):
    """Turn a raw sentence into a list of cleaned, lemmatized tokens.

    Per whitespace-separated word: strip ASCII punctuation, discard the word
    unless what remains is purely alphabetic, lowercase and lemmatize it with
    the notebook-level ``lemmatizer``, and keep it only if it is longer than
    one character.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw in entry.split():
        word = raw.translate(strip_punct)
        if not word.isalpha():
            continue
        lemma = lemmatizer.lemmatize(word.lower())
        if len(lemma) > 1:
            cleaned.append(lemma.lower())
    return cleaned

In [None]:
def remove_stop_words(tokenizer,df,feature):
    """Replace every text in ``df[feature]`` by its space-joined tokens, in place.

    Parameters
    ----------
    tokenizer : callable
        Maps one text to a list of string tokens. Any filtering (including
        stop-word removal, if wanted) is whatever this callable does.
    df : pandas.DataFrame
        Frame that is modified in place.
    feature : str
        Name of the text column to rewrite.

    Side effects
    ------------
    Writes the token list of the *last* entry to 'tokens.pkl'. Nothing is
    written when the column is empty.

    Returns
    -------
    None
    """
    tokenized = [tokenizer(entry) for entry in df[feature]]
    if tokenized:
        # Only the final entry's tokens ever survive in tokens.pkl, so the
        # file is written once here rather than rewritten for every row.
        joblib.dump(tokenized[-1],'tokens.pkl')
    df[feature] = [' '.join(tokens) for tokens in tokenized]
    return

In [None]:
def create_vocab(tokenizer,df,feature):
    """Add the token counts of every entry in ``df[feature]`` to ``vocab``.

    ``vocab`` is the notebook-level Counter; counts accumulate on top of
    whatever it already holds, so calling this twice doubles them. The
    counter is then saved to 'vocab.pkl'. Returns None.
    """
    for text in df[feature]:
        vocab.update(tokenizer(text))
    joblib.dump(vocab,'vocab.pkl')

In [None]:
# The WordNet corpus must be present for the WordNetLemmatizer used by tokenizer().
nltk.download('wordnet')
# create_vocab counts tokens from the raw questions; remove_stop_words then
# overwrites df1['questions'] in place with the normalised text.
create_vocab(tokenizer,df1,'questions')
remove_stop_words(tokenizer,df1,'questions')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Sanity check: the 20 most frequent tokens and their counts.
print(vocab.most_common(20))

[('what', 17), ('you', 16), ('do', 16), ('how', 15), ('of', 12), ('for', 7), ('symptom', 7), ('to', 6), ('know', 6), ('are', 6), ('about', 6), ('doe', 6), ('there', 5), ('me', 5), ('can', 5), ('the', 5), ('long', 5), ('is', 4), ('protect', 4), ('mask', 4)]


In [None]:
# Hold out the first question of every label (labels in sorted order).
# The row id is read straight from the index instead of being searched for by
# question text: after normalisation two questions can be identical, and a
# text search would then return whichever row comes first in df1, possibly
# one that belongs to a different label.
test_index = (
    df1.index.to_series()
    .groupby(df1['labels'])
    .first()
    .tolist()
)
test_index

[25, 48, 35, 7, 55, 51, 64, 17, 40, 68, 0, 60, 29, 12]

In [None]:
# Every question row that was not held out goes to training.
train_index = [row for row in df1.index if row not in test_index]
# Display the whole vocabulary as one space-separated string.
' '.join(vocab)

'hi there is anyone hey hola hello good day bye see you later goodbye nice chatting to till next time thanks thank thats helpful awesome for helping me how can help what do provide be support offered know are about tell really lot exactly the symptom of list happens when get doe spread catch transmit protect myself prevent transmission from im afraid preventive measure reduce cure vaccine treat should wear mask others why people wearing purpose long incubation period take show appear survive on surface lifespan last not shouldnt cant many case number infected by'

In [None]:
def encoder(df,feature):
    """Fit a Keras ``Tokenizer`` on ``df[feature]`` and encode the column.

    The fitted tokenizer is saved to 'tokenizer_t.pkl'.

    Returns
    -------
    padded : array
        Integer sequences, zero-padded at the end to the whitespace word
        count of the longest entry.
    vocab_size : int
        ``len(word_index) + 1`` of the fitted tokenizer.
    """
    texts = list(df[feature])
    keras_tok = Tokenizer()
    keras_tok.fit_on_texts(texts)
    joblib.dump(keras_tok,'tokenizer_t.pkl')
    longest = max(len(text.split()) for text in texts)
    sequences = keras_tok.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=longest, padding='post')
    return padded, len(keras_tok.word_index) + 1

In [None]:
# Encode the normalised questions as padded integer sequences.
X,vocab_size = encoder(df1,'questions')
df_encoded = pd.DataFrame(X)
# Column assignment aligns on the index, so row i receives the label of df1 row i.
df_encoded['labels'] = df1.labels
df_encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,labels
0,33,13,0,0,0,0,0,start_conversation
1,18,44,13,0,0,0,0,start_conversation
2,45,0,0,0,0,0,0,start_conversation
3,46,0,0,0,0,0,0,start_conversation
4,47,0,0,0,0,0,0,start_conversation


In [None]:
# Append two all-zero (padding-only) rows labelled 'confused' -- presumably so
# that an utterance containing no known word has a class of its own.
# The feature columns are taken from df_encoded instead of assuming a width of 7.
feature_cols = [col for col in df_encoded.columns if col != 'labels']
confused_rows = pd.DataFrame(0, index=range(2), columns=feature_cols)
confused_rows['labels'] = 'confused'
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
df_encoded = pd.concat([df_encoded, confused_rows], ignore_index=True)

In [None]:
# Put the first 'confused' row into the training split. The row is located by
# its label; a hard-coded position (previously 73) is only right for one
# specific number of rows in df_encoded.
train_index.append(df_encoded.index[df_encoded['labels'] == 'confused'][0])

In [None]:
# Put the last 'confused' row into the held-out split. The row is located by
# its label: with the 72 questions shown above plus two appended rows the
# frame ends at index 73, so the previously hard-coded 74 does not exist.
test_index.append(df_encoded.index[df_encoded['labels'] == 'confused'][-1])

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder that turns label strings into integer class ids (fitted in the next cell).
lable_enc = LabelEncoder()

In [None]:
# Integer class id for every row of df_encoded, in row order.
labl = lable_enc.fit_transform(df_encoded.labels)


In [None]:
# Build {class id: label name} so a predicted class can be translated back
# into its label at inference time, then save it as 'mapper.p'.
mapper = {}
for code, label in zip(labl, df_encoded.labels):
    # Keep the id seen on the first occurrence of each label.
    mapper.setdefault(label, code)
mapper = {code: label for label, code in mapper.items()}
import pickle
with open('mapper.p', 'wb') as fp:
    pickle.dump(mapper, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Bot replies per intent tag; get_response() samples from this frame.
df2 = frame_data('response','labels',False)
df2.head()


Unnamed: 0,response,labels
0,"Hello, are you well?",start_conversation
1,Happy to have you here,start_conversation
2,Good to see you again,start_conversation
3,"Hi there, how can I help?",start_conversation
4,Stay safe!,end_conversation


In [None]:
# Save the reply table to disk (response.csv is not read back in the visible notebook).
df2.to_csv('response.csv',index=False)

In [None]:
# Label-based row selection using the index lists built above.
train = df_encoded.loc[train_index]
test = df_encoded.loc[test_index]

In [None]:
# Separate the padded token ids (features) from the label column (target).
X_train, y_train = train.drop(columns='labels'), train['labels']
X_test, y_test = test.drop(columns='labels'), test['labels']

In [None]:
# One-hot encode both splits against the fitted encoder's full class list, so
# column k always stands for class id k (the ids stored in `mapper`).
# Encoding each split on its own drops the column of any label that happens
# to be absent from that split, which silently shifts the remaining columns.
y_train = pd.get_dummies(pd.Categorical(y_train, categories=lable_enc.classes_)).values
y_test = pd.get_dummies(pd.Categorical(y_test, categories=lable_enc.classes_)).values

In [None]:
# Input sequence length for the network = number of feature columns.
max_length = X_train.shape[1]
# NOTE(review): `output` is not referenced elsewhere in the visible notebook;
# define_model() is called without it -- confirm whether it is still needed.
output = 15

In [None]:
# Stop training once val_loss has gone 10 epochs without improving.
early_stopping = EarlyStopping(monitor='val_loss',patience=10)
# Save to model-v1.h5 only when val_loss reaches a new minimum.
checkpoint = ModelCheckpoint("model-v1.h5",
                             monitor="val_loss",
                             mode="min",
                             save_best_only = True,
                             verbose=1)
# Scale the learning rate by 0.2 after 3 epochs without a val_loss improvement.
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 3, verbose = 1, min_delta = 0.0001)
callbacks = [early_stopping,checkpoint,reduce_lr]
    
    

In [None]:

def define_model(vocab_size, max_length, num_classes=15, embedding_dim=300):
    """Build and compile the 1-D CNN intent classifier.

    Architecture: Embedding -> Conv1D(64 filters, kernel 4, ReLU) ->
    MaxPooling1D(4) -> Flatten -> Dense(num_classes, softmax).

    Parameters
    ----------
    vocab_size : int
        Size of the embedding's input vocabulary.
    max_length : int
        Length of every (padded) input sequence.
    num_classes : int, default 15
        Width of the softmax output layer.
    embedding_dim : int, default 300
        Size of each embedding vector.

    Returns
    -------
    tensorflow.keras.Model
        Compiled with categorical cross-entropy, Adam and accuracy. The
        layer summary is printed as a side effect.

    Raises
    ------
    ValueError
        If ``max_length`` is too short to survive the convolution and pooling.
    """
    kernel_size = 4
    pool_size = 4
    # Conv1D ('valid') leaves max_length - kernel_size + 1 steps and the pool
    # needs at least pool_size of them to emit a single output step.
    min_length = kernel_size + pool_size - 1
    if max_length < min_length:
        raise ValueError(
            'max_length must be at least %d for this architecture, got %d'
            % (min_length, max_length)
        )

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Flatten())
    model.add(Dense(num_classes, activation='softmax'))

    # compile network
    model.compile(loss = 'categorical_crossentropy',
                  optimizer = 'adam',
                  metrics = ['accuracy'])

    # summarize defined model
    model.summary()
    return model

In [None]:
# Build and compile the network for the fitted vocabulary size and padded length.
model = define_model(vocab_size, max_length)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 7, 300)            28800     
_________________________________________________________________
conv1d (Conv1D)              (None, 4, 64)             76864     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 1, 64)             0         
_________________________________________________________________
flatten (Flatten)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 15)                975       
Total params: 106,639
Trainable params: 106,639
Non-trainable params: 0
_________________________________________________________________


In [None]:
# NOTE(review): X_test/y_test are passed as validation data and every callback
# monitors val_loss, so these rows decide early stopping, learning-rate decay
# and which checkpoint is kept -- they are not an independent test set.
history = model.fit(X_train, y_train, epochs=500, verbose=1,validation_data=(X_test,y_test),callbacks=callbacks)

Epoch 1/500
Epoch 00001: val_loss improved from inf to 2.67829, saving model to model-v1.h5
Epoch 2/500
Epoch 00002: val_loss improved from 2.67829 to 2.65425, saving model to model-v1.h5
Epoch 3/500
Epoch 00003: val_loss improved from 2.65425 to 2.63102, saving model to model-v1.h5
Epoch 4/500
Epoch 00004: val_loss improved from 2.63102 to 2.60689, saving model to model-v1.h5
Epoch 5/500
Epoch 00005: val_loss improved from 2.60689 to 2.58226, saving model to model-v1.h5
Epoch 6/500
Epoch 00006: val_loss improved from 2.58226 to 2.55685, saving model to model-v1.h5
Epoch 7/500
Epoch 00007: val_loss improved from 2.55685 to 2.53018, saving model to model-v1.h5
Epoch 8/500
Epoch 00008: val_loss improved from 2.53018 to 2.50160, saving model to model-v1.h5
Epoch 9/500
Epoch 00009: val_loss improved from 2.50160 to 2.47323, saving model to model-v1.h5
Epoch 10/500
Epoch 00010: val_loss improved from 2.47323 to 2.44363, saving model to model-v1.h5
Epoch 11/500
Epoch 00011: val_loss improved

In [None]:
def get_text(text='what are you'):
    """Wrap one user utterance in a single-row DataFrame.

    Parameters
    ----------
    text : str, default 'what are you'
        The utterance to classify. The default keeps the notebook's
        original demo sentence.

    Returns
    -------
    pandas.DataFrame
        One row, one column named 'questions'.
    """
    return pd.DataFrame([text], columns=['questions'])

In [None]:
# Load the artifacts written by the training cells above.
from tensorflow.keras.models import load_model
# model-v1.h5 is the file ModelCheckpoint writes with save_best_only=True.
model = load_model('model-v1.h5')
tokenizer_t = joblib.load('tokenizer_t.pkl')
vocab = joblib.load('vocab.pkl')

# NOTE(security): unpickling (pickle and joblib alike) can execute code stored
# in the file -- only load artifacts this notebook produced itself.
with open('mapper.p', 'rb') as fp:
    mapper = pickle.load(fp)


In [None]:
def tokenizer(entry):
    """Normalise one utterance into lemmatized tokens for inference.

    Applies the same steps as the training-time ``tokenizer`` defined earlier
    in this notebook: split on whitespace, remove ASCII punctuation, keep
    alphabetic words only, lowercase + lemmatize (notebook-level
    ``lemmatizer``), drop one-character results.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw in entry.split():
        word = raw.translate(strip_punct)
        if not word.isalpha():
            continue
        lemma = lemmatizer.lemmatize(word.lower())
        if len(lemma) > 1:
            cleaned.append(lemma.lower())
    return cleaned

In [None]:
def remove_stop_words_for_input(tokenizer,df,feature):
    """Replace every text in ``df[feature]`` by its space-joined tokens.

    Works for any number of rows and any index; a one-row frame gives the
    same result as before.

    Parameters
    ----------
    tokenizer : callable
        Maps one text to a list of string tokens.
    df : pandas.DataFrame
        Frame holding the user input; modified in place.
    feature : str
        Name of the text column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    df[feature] = [' '.join(tokenizer(entry)) for entry in df[feature]]
    return df

In [None]:
def encode_input_text(tokenizer_t,df,feature,max_length=7):
    """Encode the first text of ``df[feature]`` as one padded id sequence.

    Parameters
    ----------
    tokenizer_t : fitted Keras Tokenizer
        Must provide ``texts_to_sequences``.
    df : pandas.DataFrame
        Frame holding the (already normalised) user input.
    feature : str
        Name of the text column.
    max_length : int, default 7
        Target sequence length; it has to equal the length the model was
        trained with.

    Returns
    -------
    array of shape (1, max_length)
        Word ids, zero-padded at the end.
    """
    # Positional access: take the first row regardless of the frame's index labels.
    first_text = df[feature].iloc[0]
    encoded = tokenizer_t.texts_to_sequences([first_text])
    return pad_sequences(encoded, maxlen=max_length, padding='post')

In [None]:
def get_pred(model,encoded_input):
    """Return the predicted class id for the first encoded input.

    Parameters
    ----------
    model : object with ``predict``
        ``predict(encoded_input)`` must return per-class scores, one row per
        input.
    encoded_input : array-like
        Encoded utterance(s) accepted by ``model.predict``.

    Returns
    -------
    numpy integer
        Index of the highest score in the first row.
    """
    scores = np.atleast_2d(model.predict(encoded_input))
    # argmax over one row only: without this, a batch of several rows would be
    # flattened and the result would no longer be a class id.
    return np.argmax(scores[0])

In [None]:
def bot_precausion(df_input,pred):
    """Override the prediction when the input contains no known word.

    Splits ``df_input.questions[0]`` on whitespace and checks each word
    against the notebook-level ``vocab``. If at least one word is known,
    ``pred`` is returned unchanged; otherwise class id 1 is returned.
    """
    known = any(word in vocab for word in df_input.questions[0].split())
    # NOTE(review): 1 is a hard-coded class id, presumably the 'confused'
    # label -- verify against `mapper`.
    return pred if known else 1

In [None]:
def get_response(df2,pred):
    """Pick a random reply for the predicted class.

    ``pred`` is translated to a label through the notebook-level ``mapper``;
    one entry of ``df2.response`` among the rows carrying that label is then
    chosen uniformly with ``np.random.randint``. Raises ``KeyError`` if the
    class id is not in ``mapper`` or the label has no rows in ``df2``.
    """
    group = df2.groupby('labels').get_group(mapper[pred])
    pick = np.random.randint(0, group.shape[0])
    choices = list(group.response)
    return choices[pick]
    

In [None]:
  def bot_response(response,):
      """Print the bot's reply to standard output."""
      print(response)

In [None]:
# End-to-end inference for the demo utterance returned by get_text().
df_input = get_text()

# Reload the tokenizer and vocabulary (an earlier cell loads the same files).
tokenizer_t = joblib.load('tokenizer_t.pkl')
vocab = joblib.load('vocab.pkl')

# Normalise the text with tokenizer(), then encode it with the fitted Keras tokenizer.
df_input = remove_stop_words_for_input(tokenizer,df_input,'questions')
encoded_input = encode_input_text(tokenizer_t,df_input,'questions')

# Predicted class id; bot_precausion() replaces it when no input word is in vocab.
pred = get_pred(model,encoded_input)
pred = bot_precausion(df_input,pred)

# Choose a random reply for the predicted label and print it.
response = get_response(df2,pred)
bot_response(response)

I can help you with most COVID-19 Frequently Asked Questions
