# Medical Exam Recommendation Based on Symptoms

Grid Search for KNN parameters

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

#### Load Symptom Dataset

In [2]:
# Main dataset; a comma is read_csv's default delimiter, so `sep` is omitted.
data = pd.read_csv('./Data/final.csv')

####  Load Symptom Severity 

In [3]:
# Symptom severity table, parsed with the (default) comma delimiter.
# Not referenced by any of the cells visible in this part of the notebook.
severity = pd.read_csv('./Data/Symptom-severity.csv', sep=',')

####  Load Exam Mapping

In [4]:
# Exam mapping table; unlike the other inputs this file is parsed with ';' as delimiter.
mapping = pd.read_csv('./Data/map.csv', delimiter=';')

# Split data into train and test sets

In [5]:
# Column layout used by the slicing below: the first `testCount` columns are
# treated as the targets (Y) and every column after them as the features (X).
# NOTE(review): `symptomCount` is not used in the visible cells; presumably it
# is the number of feature columns - confirm against the CSV.
symptomCount = 131
testCount = 102

targetColumns = slice(None, testCount)
featureColumns = slice(testCount, None)

# Fixed random_state keeps the partition reproducible across runs.
train, test = train_test_split(data, random_state=420)

# Plain NumPy version of the training partition.
trainA = np.array(train)
trainAX = trainA[:, featureColumns]
trainAY = trainA[:, targetColumns]

# DataFrame slices of both partitions.
trainX, trainY = train.iloc[:, featureColumns], train.iloc[:, targetColumns]
testX, testY = test.iloc[:, featureColumns], test.iloc[:, targetColumns]

# Grid Search

In [6]:
def realAccuracies(predictions, labels):
    """Count samples by the fraction of their outputs that were predicted correctly.

    ``predictions`` is binarised at the midpoint between its global minimum and
    maximum (values above the midpoint become 1, everything else 0) and then
    compared element-wise with ``labels``. For every sample (row) the fraction
    of matching outputs is computed, and the rows are counted per accuracy band.

    Parameters
    ----------
    predictions : array-like, shape (n_samples, n_outputs) or (n_samples,)
        Model outputs. The caller's array is left untouched.
    labels : array-like, same shape as ``predictions``
        Ground truth; expected to hold 0/1 values, since that is what the
        binarised predictions are compared against.

    Returns
    -------
    list of int
        ``[exact, above_99, above_95, above_90, not_exact]``: number of rows
        whose match fraction is ``== 1``, ``> 0.99``, ``> 0.95``, ``> 0.90``
        and ``!= 1`` respectively. All zeros for empty input.

    Raises
    ------
    ValueError
        If ``predictions`` and ``labels`` do not have the same shape.
    """
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    if predictions.shape != labels.shape:
        raise ValueError(
            "predictions and labels must have the same shape, got "
            f"{predictions.shape} and {labels.shape}"
        )
    if predictions.size == 0:
        # .max()/.min() would raise on an empty array; there is nothing to count.
        return [0, 0, 0, 0, 0]

    # Midpoint threshold. If every prediction has the same value the midpoint
    # equals that value, so nothing is strictly above it and all become 0.
    thresh = (predictions.max() + predictions.min()) / 2
    # np.where builds a new array, so the caller's predictions are not mutated.
    binarised = np.where(predictions > thresh, 1, 0)

    # One row per sample, sized from the arguments rather than a notebook global.
    matches = (binarised == labels).reshape(len(labels), -1)
    correctByLine = matches.sum(axis=1) / matches.shape[1]

    # Count with boolean masks so that rows with 0% accuracy are still counted
    # in the "not 100%" bucket.
    return [int(np.count_nonzero(correctByLine == 1)),
            int(np.count_nonzero(correctByLine > 0.99)),
            int(np.count_nonzero(correctByLine > 0.95)),
            int(np.count_nonzero(correctByLine > 0.90)),
            int(np.count_nonzero(correctByLine != 1))]


def getRealAcc(clf, X=None, y=None):
    """Print how many evaluation samples a fitted model classifies correctly.

    Predicts with ``clf``, scores the predictions with ``realAccuracies`` and
    prints one line per accuracy band (90%, 95%, 99%, 100%, and not 100%).

    Parameters
    ----------
    clf : object with a ``predict`` method
        The fitted model to evaluate.
    X : optional
        Features to predict on. Defaults to the notebook-level ``testX``.
    y : optional
        Ground-truth targets for ``X`` (DataFrame or array-like). Defaults to
        the notebook-level ``testY``.

    Returns
    -------
    None
    """
    # Fall back to the notebook's held-out split so existing getRealAcc(clf)
    # calls keep behaving exactly as before.
    if X is None:
        X = testX
    if y is None:
        y = testY

    predicted = clf.predict(X)
    truth = y.to_numpy() if hasattr(y, "to_numpy") else np.asarray(y)
    accs = realAccuracies(predicted, truth)

    print('')
    print(accs[3]," classified 90% correctly")
    print(accs[2]," classified 95% correctly")
    print(accs[1]," classified 99% correctly")
    print(accs[0]," classified 100% correctly")
    print(accs[4]," classified incorrectly (Not 100%)")

In [16]:
def gridSearch(model, param_grid, X_train, y_train, X_test, y_test):
    """Run an exhaustive grid search and print the best parameters and a report.

    Fits ``GridSearchCV`` (default cross-validation, ``refit=True``) on the
    training data, prints ``best_params_`` and then calls ``getRealAcc`` on the
    fitted search object to print the per-sample accuracy bands.

    Parameters
    ----------
    model : estimator
        Unfitted estimator to tune.
    param_grid : dict
        Parameter names mapped to the lists of values to try.
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test, y_test
        Accepted for interface compatibility. NOTE: they are not forwarded;
        the report comes from ``getRealAcc(clf)``, which evaluates on the
        notebook-level ``testX``/``testY``. Pass those same objects here to
        avoid confusion.

    Returns
    -------
    None
    """
    clf = GridSearchCV(model, param_grid, refit=True, verbose=1)
    clf.fit(X_train, y_train)
    print('')
    print('Best Parameters are: ', clf.best_params_)
    # getRealAcc runs the prediction itself; the extra clf.predict(testX) that
    # used to sit here was never used and only repeated the (slow) inference.
    getRealAcc(clf)
    return

# KNN

In [20]:
# Hyper-parameter grid for the KNN search: 8 x 4 x 2 x 2 = 128 combinations.
# NOTE(review): `algorithm` presumably only selects the neighbour-search
# backend; if so, pinning it would shrink the grid 4x - confirm before changing.
grid_params_knn = dict(
    n_neighbors=[3, 5, 7, 11, 13, 15, 17, 25],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

In [21]:
# Tune KNN on the training split and report on the held-out split.
gridSearch(
    model=KNeighborsClassifier(),
    param_grid=grid_params_knn,
    X_train=trainX,
    y_train=trainY,
    X_test=testX,
    y_test=testY,
)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 640 out of 640 | elapsed: 12.3min finished



Best Parameters are:  {'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}

1170  classified 90% correctly
1170  classified 95% correctly
1170  classified 99% correctly
1170  classified 100% correctly
0  classified incorrectly (Not 100%)
