In [133]:
import numpy as np
import matplotlib.pyplot as plt
from operator import itemgetter
from collections import Counter
import pandas as pd
from sklearn import neighbors

In [190]:
# Read the ILPD csv and substitute the sentinel -99999 for every missing value.
df=pd.read_csv("Indian Liver Patient Dataset (ILPD).csv").fillna(-99999)

In [100]:
#K nearest neighbor algorithm
def knn(data,predict,k=3):
    """Classify ``predict`` by majority vote among its k nearest training samples.

    Parameters
    ----------
    data : dict
        Maps each class label to an iterable of feature vectors of that class.
    predict : array-like
        Feature vector to classify; it is subtracted element-wise from every
        training vector.
    k : int, default 3
        Number of nearest neighbours that take part in the vote.

    Returns
    -------
    tuple
        ``(label, confidence)``: the winning label and the fraction of the
        votes actually cast that went to it.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``data`` contains no samples.
    """
    if k<1:
        raise ValueError("k must be at least 1")
    if len(data)>=k:
        print("WARNING: Number of classification is larger than k")

    target=np.array(predict) #converted once instead of once per training sample
    k_nearest_groups=[]
    for i in data:
        for feature in data[i]:
            #squared distance: skipping the sqrt does not change the ordering
            ecludian_distance=np.sum((np.array(feature)-target)**2)
            if len(k_nearest_groups)<k:
                k_nearest_groups.append((i,ecludian_distance))
            else:
                maxm=max(k_nearest_groups,key=itemgetter(1))
                if maxm[1]>ecludian_distance: #closer than the farthest neighbour kept so far
                    k_nearest_groups[k_nearest_groups.index(maxm)]=(i,ecludian_distance)

    if not k_nearest_groups:
        raise ValueError("data contains no samples to compare against")

    get_k_nearest_groups=[i[0] for i in k_nearest_groups]
    majority=Counter(get_k_nearest_groups).most_common(1)[0]

    #divide by the votes actually cast: fewer than k samples may exist in data
    return majority[0], majority[1]/len(k_nearest_groups)


In [183]:
#split the data set
def fit(x,y,test_size=0.2):
    """Split samples into train and test sets, each grouped by class label.

    The last ``int(test_size*len(x))`` rows form the test set and the rows
    before them form the training set; no shuffling is performed.

    Parameters
    ----------
    x : sequence
        Feature vectors, one per sample.
    y : sequence
        Class label of each sample; every label must be 1 or 2.
    test_size : float, default 0.2
        Fraction of the samples (between 0 and 1) held out for testing.

    Returns
    -------
    tuple
        ``(x_train, x_test)``, two dicts mapping the labels 1 and 2 to the
        list of feature vectors of that class.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    if len(x)!=len(y):
        raise ValueError("x and y must contain the same number of samples")
    if not 0<=test_size<=1:
        raise ValueError("test_size must be between 0 and 1")

    #Slice by an explicit split index: x[:-n] / x[-n:] swap the two sets when n is 0
    split=len(x)-int(test_size*len(x))
    x_tr,x_te=x[:split],x[split:]
    y_tr,y_te=y[:split],y[split:]

    x_train={1:[],2:[]}
    x_test={1:[],2:[]}

    for features,label in zip(x_tr,y_tr):
        x_train[label].append(features)

    for features,label in zip(x_te,y_te):
        x_test[label].append(features)

    return x_train,x_test



In [192]:
def predict(x_train,data,k=5):
    """Classify a single sample with ``knn`` against the grouped training set.

    Returns whatever ``knn(x_train, data, k)`` returns.
    """
    result=knn(x_train,data,k)
    return result

In [184]:
#predict and give total accuracy
def score(x_train,x_test,k=5):
    """Return the fraction of test samples that ``knn`` labels correctly.

    Parameters
    ----------
    x_train : dict
        Training samples grouped by class label, passed to ``knn`` unchanged.
    x_test : dict
        Test samples grouped by their true class label.
    k : int, default 5
        Number of neighbours handed to ``knn``.

    Returns
    -------
    float
        Correct predictions divided by the number of test samples.

    Raises
    ------
    ValueError
        If ``x_test`` contains no samples.
    """
    correct=0
    total=0
    for grp in x_test:
        for data in x_test[grp]:
            grp_pred,_=knn(x_train,data,k) #the confidence value is not needed here
            if grp_pred==grp:
                correct+=1
            total+=1
    if total==0:
        raise ValueError("x_test contains no samples to score")
    return correct/total

In [194]:
#Separate the features from the 'class' target column, then evaluate the hand-written KNN.
x=np.array(df.drop(['class'],axis=1)) #axis as keyword: pandas 2.0 removed the positional form
y=np.array(df['class'])
x_train,x_test=fit(x,y)
acc=score(x_train,x_test)

In [215]:
#Classify the second row of the held-out 20%; slice x directly so this cell does not
#depend on variables that are only created by a later cell.
predict(x_train,x[-int(0.2*len(x)):][1])

(1, 0.6)

In [188]:
#Hold out the last 20% of the rows and train scikit-learn's KNN on the remainder for comparison.
x_tr,x_te=x[:-int(0.2*len(x))],x[-int(0.2*len(x)):]
y_tr,y_te=y[:-int(0.2*len(x))],y[-int(0.2*len(x)):]
model=neighbors.KNeighborsClassifier()
model.fit(x_tr,y_tr)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [189]:
#Show the scikit-learn accuracy next to the accuracy of the hand-written implementation.
print(
    "Sklearn Model's Accuracy: ",
    model.score(x_te,y_te),
    "\nManually Calculated Accuracy: ",
    acc,
)

Sklearn Model's Accuracy:  0.7413793103448276 
Manually Calculated Accuracy:  0.7413793103448276
