In [1]:
import numpy as np
import pandas as pd
from sklearn import naive_bayes
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

#### Load Symptom Dataset

In [2]:
data = pd.read_csv('./Data/final.csv', sep = ',')

####  Load Symptom Severity 

In [3]:
severity = pd.read_csv('./Data/Symptom-severity.csv')

####  Load Exam Mapping

In [4]:
mapping = pd.read_csv('./Data/map.csv', sep = ';')

# Split data for train and test

In [5]:
symptomCount = 131
testCount = 102

[train,test] = train_test_split(data,random_state=420)
trainA = np.array(train)
trainAX = trainA[:,testCount:]
trainAY = trainA[:,:testCount]

trainX=train.iloc[:,testCount:]
trainY=train.iloc[:,:testCount]
#print(trainY[0:5])

testX=test.iloc[:,testCount:]
testY=test.iloc[:,:testCount]

In [6]:
def actualAccuracies( predictions, labels):
    thresh =(predictions.max()+ predictions.min())/2
    predictions[predictions>thresh]=1
    predictions[predictions<=thresh]=0
    
    correctByLine=[]
    for i in range(len(testY)):
        matches = predictions[i]==labels[i]
        correctByLine.append(len(matches[matches==True])/len(matches))
    return([len(np.nonzero([x for x in correctByLine if x==1])[0]),
            len(np.nonzero([x for x in correctByLine if x>0.99])[0]),
            len(np.nonzero([x for x in correctByLine if x>0.95])[0]),
            len(np.nonzero([x for x in correctByLine if x>0.90])[0]),
            len(np.nonzero([x for x in correctByLine if x!=1])[0])
           ])

# Naive Bayes

In [7]:
from sklearn import multioutput

clf = multioutput.MultiOutputClassifier(naive_bayes.GaussianNB()).fit(trainX, trainY)

In [8]:
preds = clf.predict(testX)
accs =  actualAccuracies(preds, testY.to_numpy())

print(accs[3]," classified 90% correctly")
print(accs[2]," classified 95% correctly")
print(accs[1]," classified 99% correctly")
print(accs[0]," classified 100% correctly")
print(accs[4]," classified incorrectly (Not 100%)")

1170  classified 90% correctly
1142  classified 95% correctly
1076  classified 99% correctly
1011  classified 100% correctly
159  classified incorrectly (Not 100%)


In [10]:
def gridSearch(model, param_grid, modelName, X_train, y_train, X_test, y_test):        
    clf = GridSearchCV(model, param_grid, refit=True, verbose=3)
    clf.fit(X_train,y_train)
    print(clf.best_params_)
    predicted = clf.predict(X_test)
    evaluateModel(modelName, y_test, predicted)
    return ;


def evaluateModel(name, y_test, predicted):
    print("Accuracy:",accuracy_score(y_test,predicted)," : ", name)
    return;

nSteps = 10

grid_params = {
    'estimator__alpha' : np.array(range(nSteps))/nSteps,
    'estimator__fit_prior':[True,False]
}

baseClass = multioutput.MultiOutputClassifier(naive_bayes.BernoulliNB())
print(baseClass.get_params().keys())
gridSearch(baseClass,grid_params,"NBayes",trainX,trainY,testX,testY)

dict_keys(['estimator__alpha', 'estimator__binarize', 'estimator__class_prior', 'estimator__fit_prior', 'estimator', 'n_jobs'])
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] estimator__alpha=0.0, estimator__fit_prior=True .................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  estimator__alpha=0.0, estimator__fit_prior=True, score=0.909, total=   0.6s
[CV] estimator__alpha=0.0, estimator__fit_prior=True .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV]  estimator__alpha=0.0, estimator__fit_prior=True, score=0.944, total=   0.6s
[CV] estimator__alpha=0.0, estimator__fit_prior=True .................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s


[CV]  estimator__alpha=0.0, estimator__fit_prior=True, score=0.910, total=   0.6s
[CV] estimator__alpha=0.0, estimator__fit_prior=True .................
[CV]  estimator__alpha=0.0, estimator__fit_prior=True, score=0.932, total=   0.6s
[CV] estimator__alpha=0.0, estimator__fit_prior=True .................
[CV]  estimator__alpha=0.0, estimator__fit_prior=True, score=0.925, total=   0.8s
[CV] estimator__alpha=0.0, estimator__fit_prior=False ................
[CV]  estimator__alpha=0.0, estimator__fit_prior=False, score=0.907, total=   0.8s
[CV] estimator__alpha=0.0, estimator__fit_prior=False ................
[CV]  estimator__alpha=0.0, estimator__fit_prior=False, score=0.939, total=   0.7s
[CV] estimator__alpha=0.0, estimator__fit_prior=False ................
[CV]  estimator__alpha=0.0, estimator__fit_prior=False, score=0.913, total=   0.8s
[CV] estimator__alpha=0.0, estimator__fit_prior=False ................
[CV]  estimator__alpha=0.0, estimator__fit_prior=False, score=0.929, total=   0

[CV]  estimator__alpha=0.5, estimator__fit_prior=False, score=0.801, total=   0.8s
[CV] estimator__alpha=0.5, estimator__fit_prior=False ................
[CV]  estimator__alpha=0.5, estimator__fit_prior=False, score=0.805, total=   0.6s
[CV] estimator__alpha=0.5, estimator__fit_prior=False ................
[CV]  estimator__alpha=0.5, estimator__fit_prior=False, score=0.809, total=   0.9s
[CV] estimator__alpha=0.5, estimator__fit_prior=False ................
[CV]  estimator__alpha=0.5, estimator__fit_prior=False, score=0.826, total=   0.7s
[CV] estimator__alpha=0.6, estimator__fit_prior=True .................
[CV]  estimator__alpha=0.6, estimator__fit_prior=True, score=0.876, total=   0.9s
[CV] estimator__alpha=0.6, estimator__fit_prior=True .................
[CV]  estimator__alpha=0.6, estimator__fit_prior=True, score=0.944, total=   0.8s
[CV] estimator__alpha=0.6, estimator__fit_prior=True .................
[CV]  estimator__alpha=0.6, estimator__fit_prior=True, score=0.897, total=   1

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.2min finished


{'estimator__alpha': 0.0, 'estimator__fit_prior': True}
Accuracy: 0.9264957264957265  :  NBayes


In [19]:
clf.score(testX,testY)

1.0

In [58]:
z = np.zeros(len(testX.iloc(0)[0]))
#z[114]=1
z[97]=1
oQueAMariaTem = clf.predict([z])[0]
getTests(oQueAMariaTem)

array(['Blood Tests', 'Physical Examination'], dtype=object)

In [53]:
def getTests(oneHotResults):
    dic = np.array(testY.columns)
    return dic[oneHotResults==1]

array(['Blood Tests', 'Skin biopsy', 'Patch test'], dtype=object)

In [54]:
import sys
np.set_printoptions(threshold=sys.maxsize)

for p in preds:
    print(getTests(p))

['Blood Tests' 'MRI Scan' 'CT scan' 'Sputum test' 'Biopsy'
 'Mantoux tuberculin skin test']
['Blood Tests' 'CT scan' 'Chest X-ray' 'Sputum test'
 'Pleural fluid culture' 'Pulse oximetry']
['Blood Tests' 'CT Scan' 'Echocardiogram' 'Brain MRI Scan'
 'Carotid Ultrasound' 'Cerebral angiogram']
['Physical Examination']
['Blood Tests' 'Physical Examination' 'MRI Scan' 'X-ray']
['Blood Tests' 'Skin Prick Test (SPT)' 'Intradermal Skin Test'
 'Physical Examination']
['MRI Scan' 'CT Scan' 'Neck X-ray' 'Myelophagy' 'Electromyography (EMG)']
['Blood Tests' 'Chest X-ray' 'Spirometry' 'Exhaled nitric oxide test'
 'Peak flow meter test']
['Physical Examination' 'Anoscopy' 'Rigid proctosigmoidoscopy'
 'Colonoscopy' 'Flexible sigmoidoscopy' 'Barium X-ray']
['Blood Tests' 'Physical Examination' 'Liver Ultrasound' 'Liver Biopsy'
 'MRI Scan' 'CT Scan']
['Blood Tests' 'Skin biopsy' 'Patch test']
['Blood Tests' 'CT scan' 'Chest X-ray' 'Sputum test'
 'Pleural fluid culture' 'Pulse oximetry']
['Blood Tests' '

 'Pleural fluid culture' 'Pulse oximetry']
['TSH test' 'T4 test' 'Thyroid scan' 'Thyroid ultrasound']
['Dengue NS1 antigen test' 'Serological tests' 'IgG antibody testing'
 'Nucleic acid amplification tests (NAATs)']
['Blood Tests' 'Chest X-ray' 'Spirometry' 'Exhaled nitric oxide test'
 'Peak flow meter test']
['MRI Scan' 'Electronystagmography (ENG)' 'Videonystagmography (VNG)']
['Blood Tests' 'Urine analysis' 'Stool tests' 'Bone marrow test'
 'Tissue culture']
['Blood Tests' 'Urine analysis' 'Urine Culture' 'Susceptibility testing']
['Physical Examination' 'ELISA Test' 'Saliva Test' 'Viral Load Test'
 'Western Blot']
['Blood Tests' 'Serum bilirubin test' 'Biopsy' 'Ultrasound']
['TSH test' 'T4 test' 'Thyroid scan' 'Thyroid ultrasound']
['Blood Tests' 'Physical Examination']
['Blood Tests' 'Urine analysis' 'Urine Culture' 'Susceptibility testing']
['Blood Tests' 'Serum bilirubin test' 'Biopsy' 'Ultrasound']
['MRI Scan' 'Electronystagmography (ENG)' 'Videonystagmography (VNG)']
['Blood 