In [545]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization purposes
import seaborn as sns # for statistical data visualization
%matplotlib inline

# List every file found under the data directory (recursively)

import os

data_files = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('./data/')
    for name in names
)
for data_file in data_files:
    print(data_file)


./data/disease.csv
./data/ner.csv
./data/sentences_by_respondent.csv
./data/sentences_by_respondent_raw.csv
./data/old\disease.csv
./data/old\ner.csv
./data/old\sentences_by_respondent.csv


In [546]:
# Location of the disease/symptom table, relative to the notebook's working directory
data = './data/disease.csv'

# Read the CSV into a DataFrame; every later cell works from `dataset`
dataset = pd.read_csv(data)

In [547]:
# (rows, columns) of the table as loaded, before any rows are dropped
dataset.shape

(23, 29)

In [548]:
# Discard every row that has at least one missing value
dataset = dataset.dropna(axis='index', how='any')

In [549]:
# Display the table after rows with missing values were removed
dataset

Unnamed: 0,result,lagnat,sakit ulo,suka,hilo,ubo,hina,hirap hinga,sikip dibdib,kapos hinga,...,nagtatae,dugo plema,pagod,pawis gabi,pantal,dugo ilong,dugo gilagid,sore throat,wala lasa,wala amoy
0,asthma,0,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,flu,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,pneumonia,1,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,pneumonia,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,diarrhea,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,asthma,0,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
6,dysmenorrhea,0,1,1,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,tubercolosis,0,0,0,0,1,0,0,1,0,...,0,1,0,1,0,0,0,0,0,0
8,covid 19,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
9,diarrhea,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [550]:
# Column labels: the 'result' label column followed by the feature columns
dataset.columns

Index(['result', 'lagnat', 'sakit ulo', 'suka', 'hilo', 'ubo', 'hina',
       'hirap hinga', 'sikip dibdib', 'kapos hinga', 'lamig', 'sakit katawan',
       'plema', 'bilis tibok puso', 'nanginig', 'sakit tiyan', 'dumi basa',
       'dalas dumi', 'himatay', 'nagtatae', 'dugo plema', 'pagod',
       'pawis gabi', 'pantal', 'dugo ilong', 'dugo gilagid', 'sore throat',
       'wala lasa', 'wala amoy'],
      dtype='object')

In [551]:
# Collect the names of all columns whose dtype is not object (i.e. non-string columns)

numerical = [name for name, dtype in dataset.dtypes.items() if dtype != 'O']

# Missing-value count per selected column
dataset[numerical].isna().sum()


lagnat              0
sakit ulo           0
suka                0
hilo                0
ubo                 0
hina                0
hirap hinga         0
sikip dibdib        0
kapos hinga         0
lamig               0
sakit katawan       0
plema               0
bilis tibok puso    0
nanginig            0
sakit tiyan         0
dumi basa           0
dalas dumi          0
himatay             0
nagtatae            0
dugo plema          0
pagod               0
pawis gabi          0
pantal              0
dugo ilong          0
dugo gilagid        0
sore throat         0
wala lasa           0
wala amoy           0
dtype: int64

In [552]:
# Separate the label column ('result') from the feature columns
y = dataset['result']
x = dataset.drop(columns=['result'])

In [553]:
# Training and testing split: 30% of the rows are held out for evaluation.
# random_state is fixed so the same rows land in each split on every run.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state = 3)

In [554]:
# Distinct disease labels, kept in order of first appearance in `y`
disease_list = y.drop_duplicates()

# Map each disease name to its position in that list.
# NOTE(review): these indices follow first-appearance order, which need not match
# the ordering a fitted classifier uses for its classes - confirm how consumers use them.
disease2idx = dict(zip(disease_list, range(len(disease_list))))
disease2idx

{'asthma': 0,
 'flu': 1,
 'pneumonia': 2,
 'diarrhea': 3,
 'dysmenorrhea': 4,
 'tubercolosis': 5,
 'covid 19': 6,
 'dengue': 7}

In [555]:
import json
import os

folder_name = 'cfg'

# open(..., "w") does not create missing directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(folder_name, exist_ok=True)

# Export the two lookup tables side by side:
#   symptom_list.json - the names of the non-object columns, in dataset order
#   disease_list.json - the disease name -> integer index mapping
for stem, payload in (("symptom_list", numerical), ("disease_list", disease2idx)):
    with open("{}/{}.json".format(folder_name, stem), "w") as json_file:
        json.dump(payload, json_file)


In [556]:
from sklearn.preprocessing import RobustScaler

# Keep the feature names: the scaler hands back bare NumPy arrays.
cols = x_train.columns

scaler = RobustScaler()

# Fit the scaling statistics on the training rows only, then reuse them
# unchanged for the held-out rows.
# NOTE(review): only the classifier is pickled later in this notebook, not this
# scaler - confirm that inference code feeds the model features on the same scale.
x_train = scaler.fit_transform(x_train)

x_test = scaler.transform(x_test)

# Pass the Index itself as `columns`. Wrapping it in a list (columns=[cols])
# makes pandas build a one-level MultiIndex, so x_train['lagnat'] would return
# a DataFrame instead of a Series and the labels would be tuples, not strings.
x_train = pd.DataFrame(x_train, columns=cols)

x_test = pd.DataFrame(x_test, columns=cols)


In [557]:
# Preview the first rows of the scaled training features
x_train.head()

Unnamed: 0,lagnat,sakit ulo,suka,hilo,ubo,hina,hirap hinga,sikip dibdib,kapos hinga,lamig,...,nagtatae,dugo plema,pagod,pawis gabi,pantal,dugo ilong,dugo gilagid,sore throat,wala lasa,wala amoy
0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [558]:
# Feature column labels captured from x_train before it was scaled
cols

Index(['lagnat', 'sakit ulo', 'suka', 'hilo', 'ubo', 'hina', 'hirap hinga',
       'sikip dibdib', 'kapos hinga', 'lamig', 'sakit katawan', 'plema',
       'bilis tibok puso', 'nanginig', 'sakit tiyan', 'dumi basa',
       'dalas dumi', 'himatay', 'nagtatae', 'dugo plema', 'pagod',
       'pawis gabi', 'pantal', 'dugo ilong', 'dugo gilagid', 'sore throat',
       'wala lasa', 'wala amoy'],
      dtype='object')

In [559]:
# train a Bernoulli Naive Bayes classifier on the training set
from sklearn.naive_bayes import BernoulliNB


# instantiate the model with its default settings
naiveBayes = BernoulliNB()


# fit the model on the scaled training features and their disease labels
naiveBayes.fit(x_train, y_train)


In [560]:
# Predict a label for every held-out row
y_pred = naiveBayes.predict(x_test)

# Show the true held-out labels (not the predictions) for reference
y_test


21        covid 19
12        diarrhea
17       pneumonia
15    dysmenorrhea
14        covid 19
16          dengue
1              flu
Name: result, dtype: object

In [561]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label equals the true label
test_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score: {0:0.4f}'.format(test_accuracy))


Model accuracy score: 0.8571


In [562]:
# Predict on the training rows as well, so the score can be compared with the held-out one
y_pred_train = naiveBayes.predict(x_train)

y_pred_train

array(['pneumonia', 'tubercolosis', 'flu', 'diarrhea', 'diarrhea',
       'dysmenorrhea', 'tubercolosis', 'asthma', 'tubercolosis',
       'diarrhea', 'flu', 'dengue', 'asthma', 'covid 19', 'pneumonia',
       'dengue'], dtype='<U12')

In [563]:
# Accuracy measured on the same rows the model was fitted on
train_accuracy = accuracy_score(y_train, y_pred_train)
print('Training-set accuracy score: {0:0.4f}'.format(train_accuracy))


Training-set accuracy score: 1.0000


In [564]:
# Number of held-out rows per disease label
y_test.value_counts()

covid 19        2
diarrhea        1
pneumonia       1
dysmenorrhea    1
dengue          1
flu             1
Name: result, dtype: int64

In [565]:
import pickle

model_path = 'naiveBayes.pkl'

# Write the fitted classifier to disk
with open(model_path, 'wb') as model_out:
    pickle.dump(naiveBayes, model_out)

# Read it straight back to confirm the file can be loaded
with open(model_path, 'rb') as model_in:
    diseasePredictionModel = pickle.load(model_in)

In [566]:
import pickle

# save
# NOTE(review): this writes the same `naiveBayes` object to the same file as the
# save step in the previous cell - it appears to be a redundant repeat.
with open('naiveBayes.pkl','wb') as f:
    pickle.dump(naiveBayes,f)