In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization purposes
import seaborn as sns # for statistical data visualization
%matplotlib inline

# List the files available in the data directory

import os
# Walk ./data/ recursively and print the path of every file found,
# in the order os.walk yields them.
data_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('./data/')
    for name in names
)
for path in data_file_paths:
    print(path)


./data/disease.csv
./data/disease_description.csv
./data/disease_test.csv
./data/ner.csv
./data/ner_test.csv
./data/README.txt
./data/sentences_by_respondent.csv
./data/sentences_by_respondent_test.csv


In [2]:
# Path of the disease/symptom table; it is one of the files listed under ./data/.
data = './data/disease.csv'

# Read the CSV into a DataFrame using pandas' default parsing options.
dataset = pd.read_csv(data)

In [3]:
# (rows, columns) of the table as loaded, before any rows are dropped.
dataset.shape

(26, 40)

In [4]:
# Keep only the rows that have no missing value in any column.
# NOTE(review): the table has 26 rows on load, but the preview after this step
# starts at index 13 and the later train/test predictions cover only 9 + 4 rows,
# so about half of the data is discarded here. Confirm the NaN rows are not
# meant to be filled (e.g. with 0) instead of dropped.
dataset = dataset.dropna(axis=0)

In [5]:
# Preview the first five rows that remain after dropping incomplete rows.
dataset.head()

Unnamed: 0,result,hirap sa paghinga,pagsikip sa dibdib,ubo,pag-aagahas,pamumula sa balat,kati,pantal,kumapal yung balat,sensitibo,...,masakit na pag-nguya,pamamaga nggilagid,hirap sa pag-utot,pananakit sa kanang tagiliran,pananakit ng kalamnan,pananakit ng kasu-kasuan,matamlay,matagal na pag-galing ng sugat,madalas umihi,madalas ang pagdumi
13,flu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,altapresyon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,diarrhea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,toothache,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,appendicitis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Column labels of the cleaned table: 'result' plus one column per symptom.
dataset.columns

Index(['result', 'hirap sa paghinga', 'pagsikip sa dibdib', 'ubo',
       'pag-aagahas', 'pamumula sa balat', 'kati', 'pantal',
       'kumapal yung balat', 'sensitibo', 'magaspang ang balat',
       'dugo sa plema', 'nabawasan ang timbang', 'nanghina ang katawan',
       'nagpapawis tuwing gabi', 'nawalan ng gana kumain', 'nilalamig',
       'lagnat', 'masakit ang katawan', 'masakit ang ulo', 'nagsusuka', 'hilo',
       'ubong may plema', 'pagkawala ng panlasa', 'pagkawala ng pang amoy',
       'barado ang ilong', 'masakit ang batok', 'paglabo ng paningin',
       'sakit ng tiyan', 'mabilis na tibok ng puso', 'masakit na pag-nguya',
       'pamamaga nggilagid', 'hirap sa pag-utot',
       'pananakit sa kanang tagiliran', 'pananakit ng kalamnan',
       'pananakit ng kasu-kasuan', 'matamlay',
       'matagal na pag-galing ng sugat', 'madalas umihi',
       'madalas ang pagdumi'],
      dtype='object')

In [7]:
# Collect the names of all columns whose dtype is not `object`
# (i.e. everything except text columns such as the label).
numerical = [name for name, dtype in dataset.dtypes.items() if dtype != 'O']

# Count missing values per numerical column.
dataset[numerical].isna().sum()


hirap sa paghinga                 0
pagsikip sa dibdib                0
ubo                               0
pag-aagahas                       0
pamumula sa balat                 0
kati                              0
pantal                            0
kumapal yung balat                0
sensitibo                         0
magaspang ang balat               0
dugo sa plema                     0
nabawasan ang timbang             0
nanghina ang katawan              0
nagpapawis tuwing gabi            0
nawalan ng gana kumain            0
nilalamig                         0
lagnat                            0
masakit ang katawan               0
masakit ang ulo                   0
nagsusuka                         0
hilo                              0
ubong may plema                   0
pagkawala ng panlasa              0
pagkawala ng pang amoy            0
barado ang ilong                  0
masakit ang batok                 0
paglabo ng paningin               0
sakit ng tiyan              

In [8]:
# Separate the label column from the feature columns.
# y: the 'result' column; x: every other column.
y = dataset['result']
x = dataset.drop(columns=['result'])

In [9]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
# NOTE(review): the recorded outputs show 'diabetes' and 'appendicitis' among
# the test labels but not among the training predictions (which score 100% on
# the training set), so the fitted model never sees those classes. With this
# few rows per class a single random split is not a reliable evaluation.

from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.3, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [10]:
# Distinct disease labels, in order of first appearance in `y`.
disease_list = y.drop_duplicates()

# Map each label to its position in that list.
# NOTE(review): scikit-learn classifiers store `classes_` in sorted order,
# which differs from this first-appearance order. Confirm nothing downstream
# uses these indices to index into `predict_proba` output.
disease2idx = dict(zip(disease_list, range(len(disease_list))))
disease2idx

{'flu': 0,
 'altapresyon': 1,
 'diarrhea': 2,
 'toothache': 3,
 'appendicitis': 4,
 'dengue': 5,
 'diabetes': 6,
 'migraine': 7,
 'asthma': 8,
 'covid 19': 9}

In [11]:
import json
import os

# Directory that receives the exported lookup files.
folder_name = 'cfg'

# open(..., "w") does not create missing parent directories, so make sure the
# target folder exists; exist_ok=True keeps the cell safe to re-run.
os.makedirs(folder_name, exist_ok=True)

# Symptom (feature) names, in the column order found in the dataset.
with open("{}/{}.json".format(folder_name, "symptom_list"), "w") as symptom_file:
    json.dump(numerical, symptom_file)

# Disease label -> integer index mapping.
with open("{}/{}.json".format(folder_name, "disease_list"), "w") as disease_file:
    json.dump(disease2idx, disease_file)


In [12]:
from sklearn.preprocessing import RobustScaler

# Remember the feature names: the scaler returns bare NumPy arrays.
cols = x_train.columns

scaler = RobustScaler()

# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test split so no test statistics leak into training.
x_train = scaler.fit_transform(x_train)

x_test = scaler.transform(x_test)

# Rebuild DataFrames with the original feature names.
# Pass the Index itself: wrapping it in a list (`columns=[cols]`) makes pandas
# build MultiIndex columns, so `x_train['ubo']` would return a DataFrame rather
# than a Series, and re-running the cell would nest the labels further.
x_train = pd.DataFrame(x_train, columns=cols)

x_test = pd.DataFrame(x_test, columns=cols)


In [13]:
# Preview the first five rows of the scaled training features.
x_train.head()

Unnamed: 0,hirap sa paghinga,pagsikip sa dibdib,ubo,pag-aagahas,pamumula sa balat,kati,pantal,kumapal yung balat,sensitibo,magaspang ang balat,...,masakit na pag-nguya,pamamaga nggilagid,hirap sa pag-utot,pananakit sa kanang tagiliran,pananakit ng kalamnan,pananakit ng kasu-kasuan,matamlay,matagal na pag-galing ng sugat,madalas umihi,madalas ang pagdumi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Feature names captured from x_train before scaling (every column except 'result').
cols

Index(['hirap sa paghinga', 'pagsikip sa dibdib', 'ubo', 'pag-aagahas',
       'pamumula sa balat', 'kati', 'pantal', 'kumapal yung balat',
       'sensitibo', 'magaspang ang balat', 'dugo sa plema',
       'nabawasan ang timbang', 'nanghina ang katawan',
       'nagpapawis tuwing gabi', 'nawalan ng gana kumain', 'nilalamig',
       'lagnat', 'masakit ang katawan', 'masakit ang ulo', 'nagsusuka', 'hilo',
       'ubong may plema', 'pagkawala ng panlasa', 'pagkawala ng pang amoy',
       'barado ang ilong', 'masakit ang batok', 'paglabo ng paningin',
       'sakit ng tiyan', 'mabilis na tibok ng puso', 'masakit na pag-nguya',
       'pamamaga nggilagid', 'hirap sa pag-utot',
       'pananakit sa kanang tagiliran', 'pananakit ng kalamnan',
       'pananakit ng kasu-kasuan', 'matamlay',
       'matagal na pag-galing ng sugat', 'madalas umihi',
       'madalas ang pagdumi'],
      dtype='object')

In [15]:
# Gaussian Naive Bayes classifier trained on the scaled training split.
from sklearn.naive_bayes import GaussianNB


# Create the estimator with its default settings.
naiveBayes = GaussianNB()


# Fit on the training features and labels; the fitted estimator is the
# cell's displayed value.
naiveBayes.fit(X=x_train, y=y_train)


In [16]:
# Predict a disease label for each row of the held-out test split.
y_pred = naiveBayes.predict(x_test)

# Display the predicted labels.
y_pred


array(['diarrhea', 'covid 19', 'diarrhea', 'covid 19'], dtype='<U11')

In [17]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score: {0:0.4f}'.format(test_accuracy))


Model accuracy score: 0.5000


In [18]:
# Predict on the training split itself, to compare against test performance.
y_pred_train = naiveBayes.predict(x_train)

# Display the predicted training labels.
y_pred_train

array(['diarrhea', 'diarrhea', 'altapresyon', 'migraine', 'asthma',
       'toothache', 'flu', 'dengue', 'covid 19'], dtype='<U11')

In [19]:
# Accuracy on the rows the model was fitted on; a large gap to the test
# accuracy indicates overfitting.
train_accuracy = accuracy_score(y_train, y_pred_train)
print('Training-set accuracy score: {0:0.4f}'.format(train_accuracy))


Training-set accuracy score: 1.0000


In [20]:
# Count of each true label in the test split, to put the test accuracy in context.
y_test.value_counts()

covid 19        2
diabetes        1
appendicitis    1
Name: result, dtype: int64

In [21]:
import pickle

# Serialise the fitted classifier and write it to disk.
with open('naiveBayes.pkl', 'wb') as model_out:
    model_out.write(pickle.dumps(naiveBayes))

# Read the file back and rebuild the classifier from it.
# Only unpickle files you produced yourself: unpickling can execute arbitrary code.
with open('naiveBayes.pkl', 'rb') as model_in:
    diseasePredictionModel = pickle.loads(model_in.read())

In [22]:
import pickle

# Write the fitted classifier to 'naiveBayes.pkl'.
# NOTE(review): this rewrites the same file that the save step in the
# preceding cell already produced; this cell looks redundant.
with open('naiveBayes.pkl','wb') as model_out:
    pickle.Pickler(model_out).dump(naiveBayes)