# Medical Exam Recommendation Based on Symptoms

Grid Search with:
- Decision Tree Classifier
- Random Forest Classifier
- KNN
- Naive Bayes
- SVM

In [1]:
import numpy as np
import pandas as pd

from sklearn import ensemble
from sklearn import multioutput
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import naive_bayes
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

#### Load Symptom Dataset

In [2]:
# Symptom dataset. A comma is already pandas' default delimiter, so it is not
# spelled out. The split cell treats the first `testCount` columns of this
# frame as targets and the remaining columns as features.
data = pd.read_csv('./Data/final.csv')

####  Load Symptom Severity 

In [3]:
# Symptom severity table, read with pandas' default comma delimiter.
# NOTE(review): `severity` is not read by any other cell visible in this
# notebook — confirm it is still needed.
severity = pd.read_csv('./Data/Symptom-severity.csv')

####  Load Exam Mapping

In [4]:
# Exam mapping table; unlike the other inputs this file is ';'-delimited.
# NOTE(review): `mapping` is not read by any other cell visible in this
# notebook — confirm it is still needed.
mapping = pd.read_csv('./Data/map.csv', sep = ';')

# Split data for train and test

In [5]:
# Column layout of `data`, as used by the slices below: the first `testCount`
# columns are the targets (Y) and every remaining column is a feature (X).
testCount = 102
# NOTE(review): `symptomCount` is not used by the slicing below; presumably it
# is the number of feature columns — confirm.
symptomCount = 131

# A fixed random_state keeps the train/test partition identical across runs.
train, test = train_test_split(data, random_state=420)

# Plain ndarray views of the training targets / features.
trainA = np.array(train)
trainAY, trainAX = trainA[:, :testCount], trainA[:, testCount:]

# DataFrame versions used by the grid-search cells.
trainY, trainX = train.iloc[:, :testCount], train.iloc[:, testCount:]
testY, testX = test.iloc[:, :testCount], test.iloc[:, testCount:]

# Grid Search

In [6]:
def realAccuracies(predictions, labels):
    """Count how many samples reach each per-sample label-accuracy level.

    Predictions are binarised at the midpoint between their global minimum
    and maximum, then compared label-by-label with ``labels``. For every
    sample (row) the fraction of matching labels is computed.

    Parameters
    ----------
    predictions : array-like, shape (n_samples, n_labels)
        Raw or already-binary model outputs. The input is not modified.
    labels : array-like, shape (n_samples, n_labels)
        Ground-truth 0/1 indicator matrix.

    Returns
    -------
    list of int
        ``[exact, above_99, above_95, above_90, not_exact]``: the number of
        rows whose fraction of correct labels is ``== 1``, ``> 0.99``,
        ``> 0.95``, ``> 0.90`` and ``!= 1`` respectively.

    Raises
    ------
    ValueError
        If the inputs are not two-dimensional arrays of identical shape.
    """
    scores = np.asarray(predictions)
    truth = np.asarray(labels)
    if scores.ndim != 2 or scores.shape != truth.shape:
        raise ValueError(
            "predictions and labels must be 2-D arrays of the same shape, got "
            f"{scores.shape} and {truth.shape}"
        )

    lo, hi = scores.min(), scores.max()
    if hi > lo:
        # Build the mask once on the untouched scores: overwriting in two
        # passes would wipe the freshly written 1s whenever thresh >= 1.
        thresh = (hi + lo) / 2
        binary = (scores > thresh).astype(int)
    else:
        # A constant output cannot be split by a midpoint threshold; compare
        # the values as they are (e.g. an all-ones prediction stays all ones).
        binary = scores

    # Fraction of correctly predicted labels for each row of the inputs
    # (sized from the arguments, not from any notebook-level variable).
    correctByLine = (binary == truth).sum(axis=1) / truth.shape[1]

    # Count rows directly so that rows with 0.0 accuracy are still included
    # in the "not exact" bucket.
    return [int(np.count_nonzero(correctByLine == 1)),
            int(np.count_nonzero(correctByLine > 0.99)),
            int(np.count_nonzero(correctByLine > 0.95)),
            int(np.count_nonzero(correctByLine > 0.90)),
            int(np.count_nonzero(correctByLine != 1))]


def getRealAcc(clf, X=None, y=None):
    """Print per-sample accuracy buckets for a fitted classifier.

    Parameters
    ----------
    clf : fitted estimator
        Any object exposing ``predict``.
    X : array-like, optional
        Features to predict on. Defaults to the notebook-level ``testX`` so
        existing ``getRealAcc(clf)`` calls behave as before.
    y : array-like or DataFrame, optional
        Ground-truth labels for ``X``. Defaults to the notebook-level
        ``testY``.

    Returns
    -------
    None
        The bucket counts produced by ``realAccuracies`` are printed.
    """
    # Fall back to the notebook's hold-out split only when nothing is passed.
    if X is None:
        X = testX
    if y is None:
        y = testY

    truth = y.to_numpy() if hasattr(y, 'to_numpy') else np.asarray(y)
    predicted = clf.predict(X)
    accs = realAccuracies(predicted, truth)
    print('')
    print(accs[3]," classified 90% correctly")
    print(accs[2]," classified 95% correctly")
    print(accs[1]," classified 99% correctly")
    print(accs[0]," classified 100% correctly")
    print(accs[4]," classified incorrectly (Not 100%)")

In [7]:
def gridSearch(model, param_grid, X_train, y_train, X_test, y_test):
    """Grid-search ``model`` and report its accuracy on the given test split.

    Fits a ``GridSearchCV`` (default cross-validation, refit on the best
    parameters, verbose logging) on the training data, prints the best
    parameter combination, then prints the per-sample accuracy buckets
    computed by ``realAccuracies`` on ``X_test`` / ``y_test``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict
        Parameter grid forwarded to ``GridSearchCV``.
    X_train, y_train : array-like
        Data used for the cross-validated search and the final refit.
    X_test, y_test : array-like
        Hold-out data used for the printed report.

    Returns
    -------
    None
    """
    clf = GridSearchCV(model, param_grid, refit=True, verbose=3)
    clf.fit(X_train, y_train)
    print(clf.best_params_)

    # Evaluate on the split that was passed in, predicting only once.
    predicted = clf.predict(X_test)
    truth = y_test.to_numpy() if hasattr(y_test, 'to_numpy') else np.asarray(y_test)
    accs = realAccuracies(predicted, truth)
    print('')
    print(accs[3]," classified 90% correctly")
    print(accs[2]," classified 95% correctly")
    print(accs[1]," classified 99% correctly")
    print(accs[0]," classified 100% correctly")
    print(accs[4]," classified incorrectly (Not 100%)")
    return

# Random Forest Classifier

In [None]:
# Search space for the random forest: 7 x 4 x 2 = 56 candidate settings.
grid_params_randomForest = {
    'n_estimators' : [10,20,30,50,100,200,1000],
    'max_depth' : [1, 10, 20, None],
    'bootstrap': [True, False],
}

# Seed the forest (same seed as the train/test split) so that candidate
# scores and the selected parameters are reproducible on a fresh kernel.
gridSearch(RandomForestClassifier(random_state=420), grid_params_randomForest, trainX, trainY, testX, testY)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
[CV] bootstrap=True, max_depth=1, n_estimators=10 ....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  bootstrap=True, max_depth=1, n_estimators=10, score=0.027, total=   0.5s
[CV] bootstrap=True, max_depth=1, n_estimators=10 ....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  bootstrap=True, max_depth=1, n_estimators=10, score=0.021, total=   0.3s
[CV] bootstrap=True, max_depth=1, n_estimators=10 ....................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV]  bootstrap=True, max_depth=1, n_estimators=10, score=0.017, total=   0.3s
[CV] bootstrap=True, max_depth=1, n_estimators=10 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=10, score=0.028, total=   0.3s
[CV] bootstrap=True, max_depth=1, n_estimators=10 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=10, score=0.037, total=   0.7s
[CV] bootstrap=True, max_depth=1, n_estimators=20 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=20, score=0.027, total=   0.7s
[CV] bootstrap=True, max_depth=1, n_estimators=20 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=20, score=0.021, total=   0.5s
[CV] bootstrap=True, max_depth=1, n_estimators=20 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=20, score=0.017, total=   0.5s
[CV] bootstrap=True, max_depth=1, n_estimators=20 ....................
[CV]  bootstrap=True, max_depth=1, n_estimators=20, score=0.028, total=   0.5s
[CV] bootstrap=True, 

[CV]  bootstrap=True, max_depth=10, n_estimators=100, score=0.608, total=   3.0s
[CV] bootstrap=True, max_depth=10, n_estimators=100 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=100, score=0.526, total=   3.0s
[CV] bootstrap=True, max_depth=10, n_estimators=100 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=100, score=0.600, total=   3.0s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=200, score=0.567, total=   6.2s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=200, score=0.578, total=   6.0s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=200, score=0.587, total=   6.0s
[CV] bootstrap=True, max_depth=10, n_estimators=200 ..................
[CV]  bootstrap=True, max_depth=10, n_estimators=200, score=0.544, total=   6.0s
[CV] bo

[CV]  bootstrap=True, max_depth=None, n_estimators=20, score=1.000, total=   0.7s
[CV] bootstrap=True, max_depth=None, n_estimators=20 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=20, score=1.000, total=   0.7s
[CV] bootstrap=True, max_depth=None, n_estimators=20 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=20, score=1.000, total=   0.7s
[CV] bootstrap=True, max_depth=None, n_estimators=20 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=20, score=1.000, total=   0.7s
[CV] bootstrap=True, max_depth=None, n_estimators=30 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=30, score=1.000, total=   1.1s
[CV] bootstrap=True, max_depth=None, n_estimators=30 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=30, score=1.000, total=   1.1s
[CV] bootstrap=True, max_depth=None, n_estimators=30 .................
[CV]  bootstrap=True, max_depth=None, n_estimators=30, score=1.000, total=   1.1s


# Decision Tree Classifier

In [None]:
# Search space for the decision tree: 2 x 2 x 4 = 16 candidate settings.
grid_params_decisionTree = {
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth' : [1, 10, 20, None],
}

# Seed the tree (same seed as the train/test split): the 'random' splitter
# and the tree's feature shuffling otherwise change results between runs.
gridSearch(DecisionTreeClassifier(random_state=420), grid_params_decisionTree, trainX, trainY, testX, testY)

# KNN

In [None]:
# Search space for k-nearest neighbours: neighbourhood size, neighbour-search
# algorithm, vote weighting and distance metric.
grid_params_knn = dict(
    n_neighbors=[3, 5, 7, 11, 13, 15, 17, 25],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

gridSearch(
    model=KNeighborsClassifier(),
    param_grid=grid_params_knn,
    X_train=trainX,
    y_train=trainY,
    X_test=testX,
    y_test=testY,
)

# Naive Bayes

In [None]:
# Each Naive Bayes variant is wrapped in MultiOutputClassifier, which is why
# every grid key carries the `estimator__` prefix.
# Tip: <wrapper>.get_params().keys() lists the tunable parameter names.

# Number of steps for the smoothing grids: alpha = 0/nSteps ... (nSteps-1)/nSteps.
# NOTE(review): the alpha grids therefore start at 0 (no smoothing) — confirm
# that is intended.
nSteps = 10

# Gaussian NB
gaussianNB = multioutput.MultiOutputClassifier(naive_bayes.GaussianNB())
grid_params_gaussian = dict(
    estimator__var_smoothing=[1e-11, 1e-10, 1e-9],
)

# Bernoulli NB
bernoulli = multioutput.MultiOutputClassifier(naive_bayes.BernoulliNB())
grid_params_bernoulli = dict(
    estimator__alpha=np.arange(nSteps) / nSteps,
    estimator__fit_prior=[True, False],
)

# Multinomial NB
multinominal = multioutput.MultiOutputClassifier(naive_bayes.MultinomialNB())
grid_params_multinominal = dict(
    estimator__alpha=np.arange(nSteps) / nSteps,
)

In [None]:
# Tune and evaluate the Gaussian Naive Bayes wrapper.
gridSearch(
    model=gaussianNB,
    param_grid=grid_params_gaussian,
    X_train=trainX,
    y_train=trainY,
    X_test=testX,
    y_test=testY,
)

In [None]:
# Tune and evaluate the Bernoulli Naive Bayes wrapper.
gridSearch(
    model=bernoulli,
    param_grid=grid_params_bernoulli,
    X_train=trainX,
    y_train=trainY,
    X_test=testX,
    y_test=testY,
)

In [None]:
# Tune and evaluate the Multinomial Naive Bayes wrapper.
gridSearch(
    model=multinominal,
    param_grid=grid_params_multinominal,
    X_train=trainX,
    y_train=trainY,
    X_test=testX,
    y_test=testY,
)

# SVM

In [None]:
# Search space for the SVC inside the one-vs-rest wrapper; the `estimator__`
# prefix routes each parameter to the wrapped SVC.
grid_params_svc = dict(
    estimator__kernel=['poly', 'rbf', 'linear'],
    estimator__gamma=[1, 0.1, 0.01, 0.001],
    estimator__C=[0.1, 1, 10, 100],
    estimator__class_weight=['balanced', None],
)

In [None]:
# One-vs-rest SVC: one binary SVC per label column.
# The estimator is handed to the grid search unfitted. GridSearchCV clones it
# for every candidate and for the final refit, so fitting it beforehand would
# only add a full, discarded training pass.
# NOTE(review): `svc` itself therefore stays unfitted after this cell; use the
# search's refit model if a trained SVC is needed later.
svc = OneVsRestClassifier(SVC())
gridSearch(svc, grid_params_svc, trainX, trainY.to_numpy(), testX, testY)