In [403]:
import os
import numpy as np
import tensorflow as tf
import keras
from keras_crf import CRFModel

import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns  # for statistical data visualization


In [404]:
import json

folder_name = 'cfg'

with open("{}/{}.json".format(folder_name, "word_list")) as json_file:
    wordIdList = json.load(json_file)

with open("{}/{}.json".format(folder_name, "ner_config")) as json_file:
    ner_config = json.load(json_file)

with open("{}/{}.json".format(folder_name, "tags")) as json_file:
    tags = json.load(json_file)

with open("{}/{}.json".format(folder_name, "symptom_list")) as json_file:
    symptom_list = json.load(json_file)

with open("{}/{}.json".format(folder_name, "disease_list")) as json_file:
    disease_list = json.load(json_file)


In [405]:
data = './data/sentences_by_respondent.csv'

data = pd.read_csv(data)
#data = data.dropna(axis=0)

# Creating a dataframe with 75%
# values of original dataframe
df_train = data.sample(frac=0.67)

# Creating dataframe with
# rest of the 25% values
df_test = data.drop(df_train.index)

dataset = df_test.drop(['result'], axis=1)


In [406]:
dataset.columns


Index(['respondent_idx', 'sentence'], dtype='object')

In [407]:
# Put each sentence in their own group based on respondent ID
respondent_groups = dataset.groupby(['respondent_idx']).sentence.apply(list).reset_index()

respondent_list = []
for val in respondent_groups.values:
    respondent_list.append(val[1])

respondent_list


[['Ako ay nakakaranas ng hirap sa paghinga pagkatapos tumakbo o kaya pagtapos ng mahabang lakaran.'],
 ['Nilalamig dahil sa lagnat at masakit ang katawan.',
  'Masakit ang ulo, nagsusuka at madalas nahihilo.'],
 ['Mataas ang lagnat, inuubong may plema, nahihilo, at nagsusuka.'],
 ['Nahihirapan din sa pagtulog.'],
 ['Masakit ang dibdib at mabilis ang pagtakbo ng puso.'],
 ['Madalas ang pagdumi ng basa o parang tubig.'],
 ['Laging inuubo at naninikip ang dibdib ko.'],
 ['Nagpapawis tuwing gabi tapos nawalan din ng gana kumain.'],
 ['Masakit ang batok at nahihilo.'],
 ['Pananakit ng ulo at nilalagnat.',
  'Masakit na pag-nguya at pamamaga ng gilagid.'],
 ['Mataas na lagnat, pamamantal, at madaling pag-papasa.',
  'Malubhang pananakit ng ulo, pananakit ng kalamnan, at pananakit ng kasu-kasuan.'],
 ['Matamlay, nanghihina, at madalas umihi.',
  'Matagal na pag-galing ng sugat at nabawasn ang timbang.'],
 ['Pumipintig ang ulo at matinding pananakit ng ulo.'],
 ['Madalas din po ang pagdumi ko 

In [408]:
from TglStemmer import stemmer

def preprocess_data():
    preprocessed_respondent_list = []
    for respondent in respondent_list:
        preprocessed_respondent = []
        for sentence in respondent:
            preprocessed_respondent.append(stemmer('2', sentence, '1'))
        preprocessed_respondent_list.append(preprocessed_respondent)
    return preprocessed_respondent_list

#respondent_list = preprocess_data()
#respondent_list


In [409]:
import json

with open("{}/{}.json".format('cfg', "stopwords-tl")) as json_file:
    stopwords = json.load(json_file)

def remove_stopwords(tokenizedSentence):
    for stopword in stopwords:
        for word in tokenizedSentence:
            if (word == stopword):
                print("a")
                #tokenizedSentence.remove(word)
    return tokenizedSentence

# cleaned_respondent_list = []
# for respondent in respondent_list:
#     cleaned_respondent = []
#     for sentence in respondent:
#         cleaned_respondent.append(remove_stopwords(sentence))
#     cleaned_respondent_list.append(cleaned_respondent)

#respondent_list = cleaned_respondent_list
#respondent_list


In [410]:
# Tokenize each sentence in all groups
from nltk.tokenize import word_tokenize

tokenized_sentences = []
for respondent in respondent_list:
    sentence_group = []
    for sentence in respondent:
        sentence_group.append(word_tokenize(sentence))
    tokenized_sentences.append(sentence_group)
    
tokenized_sentences[0][0]


['Ako',
 'ay',
 'nakakaranas',
 'ng',
 'hirap',
 'sa',
 'paghinga',
 'pagkatapos',
 'tumakbo',
 'o',
 'kaya',
 'pagtapos',
 'ng',
 'mahabang',
 'lakaran',
 '.']

In [411]:
END_IDX = ner_config["n_words"] - 2
UNK_IDX = ner_config["n_words"] - 1

def convert_sentence_to_idx(tokenizedSentence):
    sentence2idx = []
    for word in tokenizedSentence:
        wordFound = False
        for key, val in wordIdList.items():
            if (word == key):
                wordFound = True
                sentence2idx.append(val)
        if (not wordFound):
            sentence2idx.append(UNK_IDX)
    while (len((sentence2idx)) < ner_config["maxlen"]):
        sentence2idx.append(END_IDX)
    return sentence2idx

In [412]:
# Convert each tokenized sentence to its word id counterparts
idx_sentences = []
for respondent in tokenized_sentences:
    sentence_group = []
    for sentence in respondent:
        sentence_group.append(convert_sentence_to_idx(sentence))
    idx_sentences.append(sentence_group)
idx_sentences


[[[31,
   2,
   125,
   64,
   19,
   52,
   117,
   109,
   20,
   177,
   45,
   136,
   64,
   83,
   61,
   34,
   188,
   188,
   188]],
 [[92,
   176,
   52,
   164,
   124,
   54,
   57,
   127,
   34,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188],
  [79,
   57,
   4,
   101,
   85,
   124,
   144,
   168,
   34,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188]],
 [[131,
   57,
   164,
   101,
   60,
   55,
   49,
   101,
   168,
   101,
   124,
   85,
   34,
   188,
   188,
   188,
   188,
   188,
   188]],
 [[143,
   154,
   52,
   104,
   34,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188]],
 [[79,
   57,
   81,
   124,
   93,
   57,
   172,
   64,
   14,
   34,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188,
   188]],
 [[69,
   57,
   56,
   64,
   116,
   177,
   119,
   87,
   34,
   188,
   188,
   188,
   188,
   188,
   188,
  

In [413]:
ner_model = tf.keras.models.load_model('bilstm.h5')
ner_model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 19, 300)           57000     
                                                                 
 bidirectional_4 (Bidirectio  (None, 19, 600)          1442400   
 nal)                                                            
                                                                 
 lstm_5 (LSTM)               (None, 19, 600)           2882400   
                                                                 
 time_distributed_4 (TimeDis  (None, 19, 4)            2404      
 tributed)                                                       
                                                                 
 activation_4 (Activation)   (None, 19, 4)             0         
                                                                 
Total params: 4,384,204
Trainable params: 4,384,204
No

In [414]:
len(idx_sentences[0][0])

19

In [415]:
I_INDEX = 0

# For a given sentence, predict the words which are related to symptom information
def recognize_symptoms_in_sentence(sentence2idx, tokenizedSentence):
    p = ner_model.predict(np.array([sentence2idx]))
    p = np.argmax(p, axis=-1)

    input_symptom_list = []

    # Iterate through the entire sentence
    for idx, (w, pred) in enumerate(zip(sentence2idx, p[0])):
        if (tags[pred] == 'B-SYMPTOM'):
            symptom_word = tokenizedSentence[idx]
            
            # Check for additional words for a symptom
            temp_idx = idx + 1
            if (temp_idx < (len(tokenizedSentence) - 1) and p[0][temp_idx] == I_INDEX):
                while (p[0][temp_idx] == I_INDEX):
                    symptom_word = symptom_word + " " + tokenizedSentence[temp_idx]
                    if (temp_idx != len(tokenizedSentence) - 1): 
                        temp_idx += 1
                    else:
                        break
            input_symptom_list.append(symptom_word)
        if (idx == len(tokenizedSentence) - 1):
            break
    return input_symptom_list


In [416]:
from difflib import SequenceMatcher

STRING_MATCH_PERCENTAGE = 0.75

# Match each symptom with the trained symptom list
def symptoms_to_boolean(input_symptom_list):
    input_to_boolean = []

    for symptom in symptom_list:
        symptomInList = False
        for input_symptom in input_symptom_list:
            if (input_symptom in symptom or SequenceMatcher(None, input_symptom, symptom).ratio() >= STRING_MATCH_PERCENTAGE):
                symptomInList = True
        if (symptomInList):
            input_to_boolean.append(1)
        else:
            input_to_boolean.append(0)
    return input_to_boolean

In [417]:
# Convert each tokenized sentence to its word id counterparts
boolean_symptoms = []

i = 0

for respondent in idx_sentences:
    j = 0
    symptom_group = []
    for sentence in respondent:
        symptom_group.append(symptoms_to_boolean(recognize_symptoms_in_sentence(sentence, tokenized_sentences[i][j])))
        j += 1
    boolean_symptoms.append(symptom_group)
    i += 1
boolean_symptoms



[[[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]],
 [[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   1,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   1,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]],
 [[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]],
 [[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   

In [418]:
# For each group, merge all existing symptoms into one array
def merge_symptom_info(symptom_arrays):
    # Initialize null array
    merged_symptoms = []
    for symptom in symptom_list:
        merged_symptoms.append(0)

    for symptom_group in symptom_arrays:
        for i, symptom in enumerate(symptom_group):
            if symptom == 1:
                merged_symptoms[i] = 1
    return merged_symptoms


In [419]:
# Merge symptoms per respondent
respondent_symptoms = []
for respondent in boolean_symptoms:
    respondent_symptoms.append(merge_symptom_info(respondent))
respondent_symptoms


[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,


In [420]:
import pandas as pd

symptom_columns = []
for symptom in symptom_list:
    symptom_columns.append([symptom,],)

cols = pd.MultiIndex.from_arrays(symptom_columns)
print(cols)

input_frame = pd.DataFrame(respondent_symptoms)


MultiIndex([('hirap sa paghinga', 'pagsikip sa dibdib', 'ubo', ...)],
           )


In [421]:
import pickle

with open('naiveBayes.pkl', 'rb') as f:
    naiveBayes = pickle.load(f)


In [422]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaled_frame = scaler.fit_transform(input_frame)

prediction_results = naiveBayes.predict(input_frame)
prediction_results


array(['migraine', 'flu', 'migraine', 'migraine', 'migraine', 'migraine',
       'migraine', 'migraine', 'altapresyon', 'toothache', 'dengue',
       'migraine', 'migraine', 'migraine', 'migraine'], dtype='<U11')

In [423]:
# DEPRECATED: Convert diseases to respective IDs for comparison
def diseases_to_idx(list_of_disease_strings):
    list_of_disease_idx = []
    for disease in list_of_disease_strings:
        for key, val in disease_list.items():
            if (disease == key):
                list_of_disease_idx.append(val)
    return list_of_disease_idx

In [424]:
from sklearn.metrics import precision_score, recall_score, f1_score, multilabel_confusion_matrix

true_results = df_test.drop(['sentence'], axis=1)
true_results = true_results.drop_duplicates()
true_results = true_results['result']

#y_pred = diseases_to_idx(prediction_results)
#y_true = diseases_to_idx(true_results.values.tolist())

print("Precision:", precision_score(true_results,
      prediction_results, average='macro', zero_division=0))
print("Recall:", recall_score(true_results, prediction_results, average='macro', zero_division=0))
print("F1-score:", f1_score(true_results,
      prediction_results, average='macro', zero_division=0))

print(multilabel_confusion_matrix(true_results, prediction_results))


Precision: 0.371900826446281
Recall: 0.45454545454545453
F1-score: 0.37878787878787884
[[[14  0]
  [ 0  1]]

 [[12  0]
  [ 3  0]]

 [[14  0]
  [ 0  1]]

 [[14  0]
  [ 1  0]]

 [[13  0]
  [ 2  0]]

 [[14  0]
  [ 0  1]]

 [[ 4 10]
  [ 0  1]]

 [[14  0]
  [ 1  0]]

 [[13  0]
  [ 2  0]]

 [[14  0]
  [ 0  1]]

 [[14  0]
  [ 1  0]]]
