In [47]:
import os
import numpy as np
import tensorflow as tf
import keras
from keras_crf import CRFModel

import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns  # for statistical data visualization


In [48]:
import json

folder_name = 'cfg'

with open("{}/{}.json".format(folder_name, "word_list")) as json_file:
    wordIdList = json.load(json_file)

with open("{}/{}.json".format(folder_name, "ner_config")) as json_file:
    ner_config = json.load(json_file)

with open("{}/{}.json".format(folder_name, "tags")) as json_file:
    tags = json.load(json_file)

with open("{}/{}.json".format(folder_name, "symptom_list")) as json_file:
    symptom_list = json.load(json_file)

with open("{}/{}.json".format(folder_name, "disease_list")) as json_file:
    disease_list = json.load(json_file)


In [49]:
data_path = './data/sentences_by_respondent.csv'

dataset = pd.read_csv(data_path)
dataset = dataset.dropna(axis=0)


In [50]:
dataset.columns

Index(['respondent_idx', 'sentence', 'result'], dtype='object')

In [51]:
from sklearn.model_selection import train_test_split


# Put each sentence in their own group based on respondent ID
x = dataset.groupby(['respondent_idx', 'result']).sentence.apply(list).reset_index()
y = x['result']
x = x.drop('result', axis=1)

respondent_groups_train, respondent_groups, respondent_result_train, respondent_result = train_test_split(
    x, y, test_size=0.3, random_state=1)

respondent_list = []
for val in respondent_groups.values:
    respondent_list.append(val[1])

respondent_result

20    tubercolosis
17       pneumonia
3        pneumonia
13             flu
19        diarrhea
16          dengue
10          dengue
Name: result, dtype: object

In [52]:
from TglStemmer import stemmer

def preprocess_data():
    preprocessed_respondent_list = []
    for respondent in respondent_list:
        preprocessed_respondent = []
        for sentence in respondent:
            preprocessed_respondent.append(stemmer('2', sentence, '1'))
        preprocessed_respondent_list.append(preprocessed_respondent)
    return preprocessed_respondent_list

#respondent_list = preprocess_data()
#respondent_list


In [53]:
import json

with open("{}/{}.json".format('cfg', "stopwords-tl")) as json_file:
    stopwords = json.load(json_file)

def remove_stopwords(tokenizedSentence):
    for stopword in stopwords:
        for word in tokenizedSentence:
            if (word == stopword):
                print("a")
                #tokenizedSentence.remove(word)
    return tokenizedSentence

# cleaned_respondent_list = []
# for respondent in respondent_list:
#     cleaned_respondent = []
#     for sentence in respondent:
#         cleaned_respondent.append(remove_stopwords(sentence))
#     cleaned_respondent_list.append(cleaned_respondent)

#respondent_list = cleaned_respondent_list
#respondent_list


In [54]:
# Tokenize each sentence in all groups
from nltk.tokenize import word_tokenize

tokenized_sentences = []
for respondent in respondent_list:
    sentence_group = []
    for sentence in respondent:
        sentence_group.append(word_tokenize(sentence))
    tokenized_sentences.append(sentence_group)
    
tokenized_sentences[0][0]


['nitong', 'raan', 'ramdam', 'pagod', 'sikip', 'dibdib']

In [55]:
END_IDX = ner_config["n_words"] - 2
UNK_IDX = ner_config["n_words"] - 1

def convert_sentence_to_idx(tokenizedSentence):
    sentence2idx = []
    for word in tokenizedSentence:
        wordFound = False
        for key, val in wordIdList.items():
            if (word == key):
                wordFound = True
                sentence2idx.append(val)
        if (not wordFound):
            sentence2idx.append(UNK_IDX)
    while (len((sentence2idx)) < ner_config["maxlen"]):
        sentence2idx.append(END_IDX)
    return sentence2idx

In [56]:
# Convert each tokenized sentence to its word id counterparts
idx_sentences = []
for respondent in tokenized_sentences:
    sentence_group = []
    for sentence in respondent:
        sentence_group.append(convert_sentence_to_idx(sentence))
    idx_sentences.append(sentence_group)
idx_sentences


[[[16, 0, 39, 42, 19, 49, 75, 75], [28, 62, 1, 34, 59, 75, 75, 75]],
 [[59, 62, 48, 38, 75, 75, 75, 75], [20, 6, 44, 37, 39, 50, 15, 13]],
 [[59, 62, 57, 75, 75, 75, 75, 75], [47, 6, 50, 15, 13, 75, 75, 75]],
 [[24, 39, 20, 26, 55, 75, 75, 75], [26, 22, 6, 25, 29, 75, 75, 75]],
 [[2, 71, 26, 66, 75, 75, 75, 75], [39, 44, 46, 24, 75, 75, 75, 75]],
 [[47, 3, 33, 75, 75, 75, 75, 75], [52, 57, 34, 23, 75, 75, 75, 75]],
 [[76, 57, 60, 26, 22, 75, 75, 75],
  [53, 56, 41, 33, 75, 75, 75, 75],
  [25, 44, 38, 75, 75, 75, 75, 75]]]

In [57]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, GRU, Embedding, Bidirectional
from tf2crf import CRF, ModelWithCRFLoss

def initialize_model(model_type):
    inputs = tf.keras.layers.Input(shape=(None,), dtype='int32')
    output = Embedding(ner_config['n_words'], ner_config['word_embedding_size'],
                    trainable=True, mask_zero=True)(inputs)
    if (model_type == 'LSTM'):
        bi_rnn = Bidirectional(LSTM(units=ner_config['word_embedding_size'],
                                    return_sequences=True,
                                    dropout=0.5,
                                    recurrent_dropout=0.5,
                                    kernel_initializer=tf.keras.initializers.he_normal()))(output)
        rnn = LSTM(units=ner_config['word_embedding_size'] * 2,
                    return_sequences=True,
                    dropout=0.5,
                    recurrent_dropout=0.5,
                    kernel_initializer=tf.keras.initializers.he_normal())(bi_rnn)
    else:
        bi_rnn = Bidirectional(GRU(units=ner_config['word_embedding_size'],
                                    return_sequences=True,
                                    dropout=0.5,
                                    recurrent_dropout=0.5,
                                    kernel_initializer=tf.keras.initializers.he_normal()))(output)
        rnn = GRU(units=ner_config['word_embedding_size'] * 2,
                    return_sequences=True,
                    dropout=0.5,
                    recurrent_dropout=0.5,
                    kernel_initializer=tf.keras.initializers.he_normal())(bi_rnn)
    crf = CRF(units=ner_config['n_tags'], dtype='float32')
    output = crf(rnn)
    base_model = Model(inputs, output)
    ner_model = ModelWithCRFLoss(base_model, sparse_target=True)
    ner_model.build(ner_config['shape'])

    if (model_type == 'LSTM'):
        ner_model.load_weights("bilstm")
    else:
        ner_model.load_weights("bigru")
    return ner_model

In [58]:
#ner_model = tf.keras.models.load_model('bigru.h5')
ner_model = initialize_model('GRU')
ner_model.summary()

Model: "model_with_crf_loss_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model_2 (Functional)        ((None, None),            3272112   
                              (None, None, 3),                   
                              (None,),                           
                              (3, 3))                            
                                                                 
Total params: 3,272,116
Trainable params: 3,272,112
Non-trainable params: 4
_________________________________________________________________


In [59]:
len(idx_sentences[0][0])

8

In [60]:
I_INDEX = 0

# For a given sentence, predict the words which are related to symptom information
def recognize_symptoms_in_sentence(sentence2idx, tokenizedSentence):
    p = ner_model.predict(np.array([sentence2idx]))
    #p = np.argmax(p, axis=-1)

    input_symptom_list = []

    # Iterate through the entire sentence
    
    for idx, (w, pred) in enumerate(zip(sentence2idx, p[0])):
        if (tags[pred] == 'B-SYMPTOM'):
            symptom_word = tokenizedSentence[idx]
            
            # Check for additional words for a symptom
            temp_idx = idx + 1
            if (temp_idx < (len(tokenizedSentence) - 1) and p[0][temp_idx] == I_INDEX):
                while (p[0][temp_idx] == I_INDEX):
                    symptom_word = symptom_word + " " + tokenizedSentence[temp_idx]
                    if (temp_idx != len(tokenizedSentence) - 1): 
                        temp_idx += 1
                    else:
                        break
            input_symptom_list.append(symptom_word)
        if (idx == len(tokenizedSentence) - 1):
            break
    return input_symptom_list


In [61]:
from difflib import SequenceMatcher

STRING_MATCH_PERCENTAGE = 0.75

# Match each symptom with the trained symptom list
def symptoms_to_boolean(input_symptom_list):
    input_to_boolean = []

    for symptom in symptom_list:
        symptomInList = False
        for input_symptom in input_symptom_list:
            if (symptom in input_symptom or SequenceMatcher(None, input_symptom, symptom).ratio() >= STRING_MATCH_PERCENTAGE):
                symptomInList = True
        if (symptomInList):
            input_to_boolean.append(1)
        else:
            input_to_boolean.append(0)
    return input_to_boolean

In [62]:
# Convert each tokenized sentence to its word id counterparts
boolean_symptoms = []

i = 0

for respondent in idx_sentences:
    j = 0
    symptom_group = []
    for sentence in respondent:
        symptom_group.append(symptoms_to_boolean(recognize_symptoms_in_sentence(sentence, tokenized_sentences[i][j])))
        j += 1
    boolean_symptoms.append(symptom_group)
    i += 1
boolean_symptoms



[[[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]],
 [[0,
   0,
   1,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]],
 [[1,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   

In [63]:
# For each group, merge all existing symptoms into one array
def merge_symptom_info(symptom_arrays):
    # Initialize null array
    merged_symptoms = []
    for symptom in symptom_list:
        merged_symptoms.append(0)

    for symptom_group in symptom_arrays:
        for i, symptom in enumerate(symptom_group):
            if symptom == 1:
                merged_symptoms[i] = 1
    return merged_symptoms


In [64]:
# Merge symptoms per respondent
respondent_symptoms = []
for respondent in boolean_symptoms:
    respondent_symptoms.append(merge_symptom_info(respondent))
respondent_symptoms


[[0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0]]

In [65]:
import pandas as pd

symptom_columns = []
for symptom in symptom_list:
    symptom_columns.append([symptom,],)

cols = pd.MultiIndex.from_arrays(symptom_columns)
print(cols)

input_frame = pd.DataFrame(respondent_symptoms)


MultiIndex([('lagnat', 'sakit ulo', 'suka', 'hilo', 'ubo', 'hina', ...)],
           )


In [66]:
import pickle

with open('naiveBayes.pkl', 'rb') as f:
    naiveBayes = pickle.load(f)


In [67]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaled_frame = scaler.fit_transform(input_frame)

prediction_results = naiveBayes.predict(input_frame)
prediction_results


array(['tubercolosis', 'pneumonia', 'pneumonia', 'flu', 'diarrhea',
       'dengue', 'dengue'], dtype='<U12')

In [68]:
# DEPRECATED: Convert diseases to respective IDs for comparison
def diseases_to_idx(list_of_disease_strings):
    list_of_disease_idx = []
    for disease in list_of_disease_strings:
        for key, val in disease_list.items():
            if (disease == key):
                list_of_disease_idx.append(val)
    return list_of_disease_idx

In [69]:
from sklearn.metrics import precision_score, recall_score, f1_score, multilabel_confusion_matrix

true_results = respondent_result
print(true_results)
print(prediction_results)
#true_results = true_results.drop_duplicates()
# true_results = true_results['result']

#y_pred = diseases_to_idx(prediction_results)
#y_true = diseases_to_idx(true_results.values.tolist())

print("Precision:", precision_score(true_results,
      prediction_results, average='macro', zero_division=0))
print("Recall:", recall_score(true_results, prediction_results, average='macro', zero_division=0))
print("F1-score:", f1_score(true_results,
      prediction_results, average='macro', zero_division=0))

print(multilabel_confusion_matrix(true_results, prediction_results))


20    tubercolosis
17       pneumonia
3        pneumonia
13             flu
19        diarrhea
16          dengue
10          dengue
Name: result, dtype: object
['tubercolosis' 'pneumonia' 'pneumonia' 'flu' 'diarrhea' 'dengue' 'dengue']
Precision: 1.0
Recall: 1.0
F1-score: 1.0
[[[5 0]
  [0 2]]

 [[6 0]
  [0 1]]

 [[6 0]
  [0 1]]

 [[5 0]
  [0 2]]

 [[6 0]
  [0 1]]]
