# Medical Exam Recommendation Based on Symptoms

Grid Search with Decision Tree Classifier and Random Forest Classifier

In [58]:
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#### Load Symptom Dataset

In [59]:
data = pd.read_csv('./Data/final.csv', sep = ',')

####  Load Symptom Severity 

In [60]:
severity = pd.read_csv('./Data/Symptom-severity.csv')

####  Load Exam Mapping

In [61]:
mapping = pd.read_csv('./Data/map.csv', sep = ';')

# Split data for train and test

In [62]:
symptomCount = 131
testCount = 102

[train,test] = train_test_split(data,random_state=420)
trainA = np.array(train)
trainAX = trainA[:,testCount:]
trainAY = trainA[:,:testCount]

trainX=train.iloc[:,testCount:]
trainY=train.iloc[:,:testCount]
#print(trainY[0:5])

testX=test.iloc[:,testCount:]
testY=test.iloc[:,:testCount]

# Grid Search

In [63]:
def realAccuracies(predictions, labels):
    thresh =(predictions.max()+ predictions.min())/2
    predictions[predictions>thresh]=1
    predictions[predictions<=thresh]=0
    
    correctByLine=[]
    for i in range(len(testY)):
        matches = predictions[i]==labels[i]
        correctByLine.append(len(matches[matches==True])/len(matches))
        
    return([len(np.nonzero([x for x in correctByLine if x==1])[0]),
            len(np.nonzero([x for x in correctByLine if x>0.99])[0]),
            len(np.nonzero([x for x in correctByLine if x>0.95])[0]),
            len(np.nonzero([x for x in correctByLine if x>0.90])[0]),
            len(np.nonzero([x for x in correctByLine if x!=1])[0])])


def getRealAcc(clf):
    predicted = clf.predict(testX)
    accs =  realAccuracies(predicted, testY.to_numpy())
    print('')
    print(accs[3]," classified 90% correctly")
    print(accs[2]," classified 95% correctly")
    print(accs[1]," classified 99% correctly")
    print(accs[0]," classified 100% correctly")
    print(accs[4]," classified incorrectly (Not 100%)")

In [64]:
def gridSearch(model, param_grid, X_train, y_train, X_test, y_test):        
    clf = GridSearchCV(model, param_grid, refit=True, verbose=3)
    clf.fit(X_train,y_train)
    print(clf.best_params_)
    predictions = clf.predict(testX)
    getRealAcc(clf)
    return ;

# Random Forest Classifier

In [None]:
grid_params_randomForest = {
    'n_estimators' : [10,20,30,50,100,200,1000],
    'max_depth' : [1, 10, 20, None],
    'bootstrap': [True, False],
}

gridSearch(RandomForestClassifier(), grid_params_randomForest, trainX, trainY, testX, testY)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
[CV] bootstrap=True, max_depth=1, n_estimators=10 ....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  bootstrap=True, max_depth=1, n_estimators=10, score=0.027, total=   0.3s
[CV] bootstrap=True, max_depth=1, n_estimators=10 ....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  bootstrap=True, max_depth=1, n_estimators=10, score=0.021, total=   0.2s
[CV] bootstrap=True, max_depth=1, n_estimators=10 ....................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV]  bootstrap=True, max_depth=1, n_estimators=10, score=0.017, total=   0.3s
[CV] bootstrap=True, max_depth=1, n_estimators=10 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=10, score=0.028, total=   0.3s
[CV] bootstrap=True, max_depth=1, n_estimators=10 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=10, score=0.037, total=   0.3s
[CV] bootstrap=True, max_depth=1, n_estimators=20 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=20, score=0.027, total=   0.5s
[CV] bootstrap=True, max_depth=1, n_estimators=20 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=20, score=0.021, total=   0.6s
[CV] bootstrap=True, max_depth=1, n_estimators=20 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=20, score=0.017, total=   0.6s
[CV] bootstrap=True, max_depth=1, n_estimators=20 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=20, score=0.028, total=   0.4s
[CV] bootstrap=True, 

[CV]  bootstrap=True, max_depth=10, n_estimators=100, score=0.630, total=   4.1s
[CV] bootstrap=True, max_depth=10, n_estimators=100 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=100, score=0.546, total=   3.1s
[CV] bootstrap=True, max_depth=10, n_estimators=100 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=100, score=0.588, total=   3.0s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=200, score=0.554, total=   5.9s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=200, score=0.607, total=   6.1s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=200, score=0.625, total=   6.0s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=200, score=0.507, total=   5.9s
[CV] bo

[CV]  bootstrap=True, max_depth=None, n_estimators=20, score=1.000, total=   0.7s
[CV] bootstrap=True, max_depth=None, n_estimators=20 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=20, score=1.000, total=   0.7s
[CV] bootstrap=True, max_depth=None, n_estimators=20 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=20, score=1.000, total=   0.7s
[CV] bootstrap=True, max_depth=None, n_estimators=20 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=20, score=1.000, total=   0.7s
[CV] bootstrap=True, max_depth=None, n_estimators=30 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=30, score=1.000, total=   1.0s
[CV] bootstrap=True, max_depth=None, n_estimators=30 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=30, score=1.000, total=   1.0s
[CV] bootstrap=True, max_depth=None, n_estimators=30 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=30, score=1.000, total=   1.0s


[CV]  bootstrap=False, max_depth=1, n_estimators=200, score=0.027, total=  10.7s
[CV] bootstrap=False, max_depth=1, n_estimators=200 ..................
[CV]  bootstrap=False, max_depth=1, n_estimators=200, score=0.021, total=   7.8s
[CV] bootstrap=False, max_depth=1, n_estimators=200 ..................
[CV]  bootstrap=False, max_depth=1, n_estimators=200, score=0.017, total=   5.0s
[CV] bootstrap=False, max_depth=1, n_estimators=200 ..................
[CV]  bootstrap=False, max_depth=1, n_estimators=200, score=0.028, total=   5.8s
[CV] bootstrap=False, max_depth=1, n_estimators=200 ..................
[CV]  bootstrap=False, max_depth=1, n_estimators=200, score=0.037, total=   5.7s
[CV] bootstrap=False, max_depth=1, n_estimators=1000 .................
[CV]  bootstrap=False, max_depth=1, n_estimators=1000, score=0.027, total=  30.1s
[CV] bootstrap=False, max_depth=1, n_estimators=1000 .................
[CV]  bootstrap=False, max_depth=1, n_estimators=1000, score=0.021, total=  22.8s
[CV] 

[CV]  bootstrap=False, max_depth=20, n_estimators=20, score=0.979, total=   0.8s
[CV] bootstrap=False, max_depth=20, n_estimators=30 ..................
[CV]  bootstrap=False, max_depth=20, n_estimators=30, score=0.972, total=   1.1s
[CV] bootstrap=False, max_depth=20, n_estimators=30 ..................
[CV]  bootstrap=False, max_depth=20, n_estimators=30, score=0.960, total=   1.1s
[CV] bootstrap=False, max_depth=20, n_estimators=30 ..................
[CV]  bootstrap=False, max_depth=20, n_estimators=30, score=0.993, total=   1.2s
[CV] bootstrap=False, max_depth=20, n_estimators=30 ..................
[CV]  bootstrap=False, max_depth=20, n_estimators=30, score=0.953, total=   1.1s
[CV] bootstrap=False, max_depth=20, n_estimators=30 ..................
[CV]  bootstrap=False, max_depth=20, n_estimators=30, score=0.974, total=   1.1s
[CV] bootstrap=False, max_depth=20, n_estimators=50 ..................
[CV]  bootstrap=False, max_depth=20, n_estimators=50, score=0.994, total=   1.8s
[CV] bo

[CV]  bootstrap=False, max_depth=None, n_estimators=200, score=1.000, total=   7.4s
[CV] bootstrap=False, max_depth=None, n_estimators=200 ...............
[CV]  bootstrap=False, max_depth=None, n_estimators=200, score=1.000, total=   7.4s
[CV] bootstrap=False, max_depth=None, n_estimators=1000 ..............
[CV]  bootstrap=False, max_depth=None, n_estimators=1000, score=1.000, total=  36.3s
[CV] bootstrap=False, max_depth=None, n_estimators=1000 ..............
[CV]  bootstrap=False, max_depth=None, n_estimators=1000, score=1.000, total=  37.1s
[CV] bootstrap=False, max_depth=None, n_estimators=1000 ..............
[CV]  bootstrap=False, max_depth=None, n_estimators=1000, score=1.000, total=  37.2s
[CV] bootstrap=False, max_depth=None, n_estimators=1000 ..............
[CV]  bootstrap=False, max_depth=None, n_estimators=1000, score=1.000, total=  42.4s
[CV] bootstrap=False, max_depth=None, n_estimators=1000 ..............


# Decision Tree Classifier

In [None]:
grid_params_decisionTree = {
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth' : [1, 10, 20, None],
}

gridSearch(DecisionTreeClassifier(), grid_params_decisionTree, trainX, trainY, testX, testY)

# Estas coisas aqui em baixo são para apagar?

In [62]:
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print ("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print ("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print ("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print ("{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)
tree_to_code(clf,testX.columns)

AttributeError: 'RandomForestClassifier' object has no attribute 'tree_'

In [58]:
z = np.zeros(len(testX.iloc(0)[0]))
#z[114]=1
z[97]=1
oQueAMariaTem = clf.predict([z])[0]
getTests(oQueAMariaTem)

array(['Blood Tests', 'Physical Examination'], dtype=object)

In [53]:
def getTests(oneHotResults):
    dic = np.array(testY.columns)
    return dic[oneHotResults==1]

array(['Blood Tests', 'Skin biopsy', 'Patch test'], dtype=object)

In [54]:
import sys
np.set_printoptions(threshold=sys.maxsize)

for p in preds:
    print(getTests(p))

['Blood Tests' 'MRI Scan' 'CT scan' 'Sputum test' 'Biopsy'
 'Mantoux tuberculin skin test']
['Blood Tests' 'CT scan' 'Chest X-ray' 'Sputum test'
 'Pleural fluid culture' 'Pulse oximetry']
['Blood Tests' 'CT Scan' 'Echocardiogram' 'Brain MRI Scan'
 'Carotid Ultrasound' 'Cerebral angiogram']
['Physical Examination']
['Blood Tests' 'Physical Examination' 'MRI Scan' 'X-ray']
['Blood Tests' 'Skin Prick Test (SPT)' 'Intradermal Skin Test'
 'Physical Examination']
['MRI Scan' 'CT Scan' 'Neck X-ray' 'Myelophagy' 'Electromyography (EMG)']
['Blood Tests' 'Chest X-ray' 'Spirometry' 'Exhaled nitric oxide test'
 'Peak flow meter test']
['Physical Examination' 'Anoscopy' 'Rigid proctosigmoidoscopy'
 'Colonoscopy' 'Flexible sigmoidoscopy' 'Barium X-ray']
['Blood Tests' 'Physical Examination' 'Liver Ultrasound' 'Liver Biopsy'
 'MRI Scan' 'CT Scan']
['Blood Tests' 'Skin biopsy' 'Patch test']
['Blood Tests' 'CT scan' 'Chest X-ray' 'Sputum test'
 'Pleural fluid culture' 'Pulse oximetry']
['Blood Tests' '

 'Pleural fluid culture' 'Pulse oximetry']
['TSH test' 'T4 test' 'Thyroid scan' 'Thyroid ultrasound']
['Dengue NS1 antigen test' 'Serological tests' 'IgG antibody testing'
 'Nucleic acid amplification tests (NAATs)']
['Blood Tests' 'Chest X-ray' 'Spirometry' 'Exhaled nitric oxide test'
 'Peak flow meter test']
['MRI Scan' 'Electronystagmography (ENG)' 'Videonystagmography (VNG)']
['Blood Tests' 'Urine analysis' 'Stool tests' 'Bone marrow test'
 'Tissue culture']
['Blood Tests' 'Urine analysis' 'Urine Culture' 'Susceptibility testing']
['Physical Examination' 'ELISA Test' 'Saliva Test' 'Viral Load Test'
 'Western Blot']
['Blood Tests' 'Serum bilirubin test' 'Biopsy' 'Ultrasound']
['TSH test' 'T4 test' 'Thyroid scan' 'Thyroid ultrasound']
['Blood Tests' 'Physical Examination']
['Blood Tests' 'Urine analysis' 'Urine Culture' 'Susceptibility testing']
['Blood Tests' 'Serum bilirubin test' 'Biopsy' 'Ultrasound']
['MRI Scan' 'Electronystagmography (ENG)' 'Videonystagmography (VNG)']
['Blood 