In [None]:
import pandas as pd
import numpy as np
import random
import operator, math

# random.seed(2)

# Read the raw table and discard the 'id' index column in one step
df = pd.read_csv('kidney_disease.csv').drop(columns=['id'])


# Group the columns by the kind of value they hold
real = ['sc', 'pot', 'hemo', 'rc']
integer = ['age', 'bp', 'bgr', 'bu', 'sod', 'pcv', 'wc']
label = ['classification']

# Every remaining column is treated as categorical. The list is built in the
# DataFrame's own column order: a plain set difference iterates in a
# hash-dependent order, which made the one-hot column layout change from one
# kernel start to the next.
_non_categorical = set(real) | set(integer) | set(label)
cat = [name for name in df.columns if name not in _non_categorical]

# Removing parsing errors.
# Some text cells carry stray tabs/spaces (e.g. '\tyes', ' yes', 'ckd\t', '\t?').
# Trim every string cell instead of enumerating each dirty spelling, so a
# variant that was not anticipated is cleaned as well.
for _col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[_col]):
        df[_col] = df[_col].map(lambda v: v.strip() if isinstance(v, str) else v)

# After trimming, a lone '?' is the missing-value marker
df = df.replace('?', np.nan)

# Encode the target explicitly (ckd -> 1, notckd -> 0) and pin an integer dtype
# rather than relying on `replace` to downcast the object column on its own.
# An unexpected label value becomes NaN and makes the int cast fail right here.
for _target in label:
    df[_target] = df[_target].map({'ckd': 1, 'notckd': 0}).astype(int)





# Replace missing numeric entries with the column mean
# (the median or the mode would be reasonable alternatives)
for col in real:
    observed = df[col].dropna().to_numpy().astype('float')
    df[col] = df[col].fillna(observed.mean())
for col in integer:
    observed = df[col].dropna().to_numpy().astype('int')
    # the mean is truncated so the filled value is a whole number
    df[col] = df[col].fillna(int(observed.mean()))


# Separate the target column from the feature table
Y = df[label]
X = df.drop(columns=label)

# Force numeric dtypes on these three columns
# (presumably they were read as text because of the stray characters cleaned earlier)
X = X.astype({'pcv': int, 'wc': int, 'rc': float})

# Categorical variables must become binary indicator columns; pd.get_dummies does that
X = pd.get_dummies(X, columns=cat)

# Column names after one-hot encoding; a name's position here is its column index in X
ind = X.columns.values.tolist()
print(ind)

# Column positions of the continuous and of the integer-valued features
index_real = [ind.index(name) for name in real]
index_int = [ind.index(name) for name in integer]

# All remaining columns are 0/1 indicator columns. Built in ascending order
# explicitly instead of depending on the iteration order of a set.
_numeric_positions = set(index_real) | set(index_int)
index_bin = [pos for pos in range(len(ind)) if pos not in _numeric_positions]
print(index_bin)

# Ask for a float array explicitly: with pandas >= 2.0 the indicator columns are
# bool, and a bool/numeric mix would otherwise come back as an object array,
# which the distance functions and the density estimator cannot work with.
X = X.to_numpy(dtype=float)
Y = Y.to_numpy()

['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'pe_no', 'pe_yes', 'dm_no', 'dm_yes', 'appet_good', 'appet_poor', 'pcc_notpresent', 'pcc_present', 'al_0.0', 'al_1.0', 'al_2.0', 'al_3.0', 'al_4.0', 'al_5.0', 'sg_1.005', 'sg_1.01', 'sg_1.015', 'sg_1.02', 'sg_1.025', 'ane_no', 'ane_yes', 'cad_no', 'cad_yes', 'pc_abnormal', 'pc_normal', 'ba_notpresent', 'ba_present', 'rbc_abnormal', 'rbc_normal', 'htn_no', 'htn_yes', 'su_0.0', 'su_1.0', 'su_2.0', 'su_3.0', 'su_4.0', 'su_5.0']
[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]


In [None]:
# Shuffle the row indices and cut them into train / validation / test thirds.
SPLIT_SEED = 2  # fixed seed so the split (and every score below) is reproducible

n_rows = len(X)
# Cut points are derived from the data size instead of being hard-coded;
# for 400 rows they evaluate to 134 and 267, i.e. parts of 134 / 133 / 133.
train_end = (n_rows + 2) // 3        # ceil(n_rows / 3)
val_end = (2 * n_rows + 2) // 3      # ceil(2 * n_rows / 3)

id_list = list(range(n_rows))
# A private generator keeps the global `random` state untouched
random.Random(SPLIT_SEED).shuffle(id_list)

id_train = id_list[:train_end]
print(len(id_train))
id_val = id_list[train_end:val_end]
print(len(id_val))
id_test = id_list[val_end:]
print(len(id_test))

134
133
133


In [None]:
# Materialise the three subsets from the shuffled index lists
X_train, Y_train = X[id_train], Y[id_train]
X_val, Y_val = X[id_val], Y[id_val]
X_test, Y_test = X[id_test], Y[id_test]

# Report each array's shape: features first, then labels, for every subset
for part in (X_train, Y_train, X_val, Y_val, X_test, Y_test):
    print(part.shape)


(134, 48)
(134, 1)
(133, 48)
(133, 1)
(133, 48)
(133, 1)


In [None]:
# from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import DistanceMetric, accuracy_score
from scipy.spatial import distance

# sklearn metric objects for the three distance families
# (the custom distance defined in this cell calls the scipy functions directly instead)
d1, d2, d3 = (
    DistanceMetric.get_metric(metric_name)
    for metric_name in ('minkowski', 'canberra', 'russellrao')
)

def mydist(x1, x2):
    """Mixed-type distance between two feature rows.

    The result is the sum of three terms, each computed on its own group of
    columns: Minkowski (scipy's default order) on the `index_real` columns,
    Canberra on the `index_int` columns and Russell-Rao on the `index_bin`
    columns.
    """
    real_term = distance.minkowski(x1[index_real], x2[index_real])
    int_term = distance.canberra(x1[index_int], x2[index_int])
    bin_term = distance.russellrao(x1[index_bin], x2[index_bin])
    return real_term + int_term + bin_term

# Candidate neighbourhood sizes; each one is scored on the validation split
k_list = [1, 3, 5, 7, 9]
scores = []
for k in k_list:
    knn = KNeighborsClassifier(k, metric=mydist)
    knn.fit(X_train, Y_train.ravel())
    scores.append(knn.score(X_val, Y_val.ravel()))
print(scores)

# Refit with the k that scored best on validation (the smallest k wins a tie)
# instead of a hard-coded k, so the validation pass actually drives the choice.
best_k = k_list[scores.index(max(scores))]
knn = KNeighborsClassifier(best_k, metric=mydist)
knn.fit(X_train, Y_train.ravel())
# Held-out accuracy of the selected model
print(knn.score(X_test, Y_test.ravel()))

[0.9398496240601504, 0.9624060150375939, 0.9699248120300752, 0.9699248120300752, 0.9774436090225563]
0.924812030075188


In [None]:
from sklearn.neighbors import KernelDensity
# Show the column positions of the real and integer features, then the test labels
for shown in (index_real, index_int, Y_test):
    print(shown)

[4, 6, 7, 10]
[0, 1, 2, 3, 5, 8, 9]
[[0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]]


In [None]:
import statsmodels.api as sm
# Append the label as a final column so the estimator models the joint
# density of (features, class). reshape(-1, 1) adapts to any training size.
Y_train = Y_train.reshape(-1, 1)
t = np.append(X_train, Y_train, axis=1)

# One var_type code per column of t, derived from the index lists instead of
# a hand-typed string: 'c' continuous for index_real, 'o' ordered for
# index_int, 'u' unordered for the indicator columns and for the label itself.
_kind_by_position = {pos: 'c' for pos in index_real}
_kind_by_position.update({pos: 'o' for pos in index_int})
s = ''.join(_kind_by_position.get(pos, 'u') for pos in range(X_train.shape[1])) + 'u'

density = sm.nonparametric.KDEMultivariate(data=t, var_type=s, bw='normal_reference')


In [None]:
# Fitted bandwidth vector of the density estimator, followed by its length
bandwidths = density.bw
print(bandwidths)
print(len(bandwidths))

[1.43169317e+01 1.55473421e+01 6.76367488e+01 5.10125186e+01
 8.32994720e+00 1.30199808e+01 2.95253200e+00 2.64940680e+00
 8.22108402e+00 2.53143439e+03 8.78052420e-01 3.64411792e-01
 3.64411792e-01 4.48317377e-01 4.45348934e-01 3.92915320e-01
 3.92915320e-01 2.65286966e-01 2.53967332e-01 4.81275065e-01
 3.04709591e-01 2.65286966e-01 3.13377642e-01 1.83166848e-01
 0.00000000e+00 1.17184178e-01 4.16375024e-01 3.58003577e-01
 4.12042572e-01 3.97979464e-01 3.29558306e-01 3.29558306e-01
 2.28979857e-01 2.15039328e-01 3.64411792e-01 4.53852369e-01
 1.99870005e-01 1.83166848e-01 3.13377642e-01 4.82354643e-01
 4.67403176e-01 4.65451589e-01 4.12042572e-01 1.42976048e-01
 1.83166848e-01 1.64463181e-01 1.83166848e-01 8.31750048e-02
 4.70950879e-01]
49


In [None]:
# Classify every test row by comparing the joint density of (row, label=0)
# with that of (row, label=1). Sizes come from X_test itself rather than
# from hard-coded constants, and all rows are evaluated in one call per label.
n_test = X_test.shape[0]
rows_with_0 = np.append(X_test, np.zeros((n_test, 1)), axis=1)
rows_with_1 = np.append(X_test, np.ones((n_test, 1)), axis=1)
pdf_0 = np.atleast_1d(density.pdf(rows_with_0))
pdf_1 = np.atleast_1d(density.pdf(rows_with_1))

# Predict 0 only when it is strictly more likely; a tie goes to 1
Y_kde = np.where(pdf_0 > pdf_1, 0, 1)

tot = n_test
count = int((Y_kde == Y_test.ravel()).sum())

# Fraction of test rows classified correctly
print(count/tot)

0.6390977443609023
