In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("kidney_disease.csv")
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
columns_to_retain = ['sg','al','hemo','classification']
df = df.drop([col for col in df.columns if not col in columns_to_retain], axis=1)


In [4]:
df.dropna(axis=0)

Unnamed: 0,sg,al,hemo,classification
0,1.020,1.0,15.4,ckd
1,1.020,4.0,11.3,ckd
2,1.010,2.0,9.6,ckd
3,1.005,4.0,11.2,ckd
4,1.010,2.0,11.6,ckd
...,...,...,...,...
395,1.020,0.0,15.7,notckd
396,1.025,0.0,16.5,notckd
397,1.020,0.0,15.8,notckd
398,1.025,0.0,14.2,notckd


In [5]:
df['sg'].fillna((df['sg'].mean()), inplace=True)
df['al'].fillna((df['al'].mean()), inplace=True)
df['hemo'].fillna((df['hemo'].mean()), inplace=True)

In [6]:
df.isnull().any()

sg                False
al                False
hemo              False
classification    False
dtype: bool

In [7]:
for column in df.columns:
    if df[column].dtype == np.number:
        continue
    df[column] = LabelEncoder().fit_transform(df[column].astype(str))

In [8]:
df.dropna(axis=0)

Unnamed: 0,sg,al,hemo,classification
0,1.020,1.0,15.4,0
1,1.020,4.0,11.3,0
2,1.010,2.0,9.6,0
3,1.005,4.0,11.2,0
4,1.010,2.0,11.6,0
...,...,...,...,...
395,1.020,0.0,15.7,2
396,1.025,0.0,16.5,2
397,1.020,0.0,15.8,2
398,1.025,0.0,14.2,2


In [9]:
df.isnull().any()

sg                False
al                False
hemo              False
classification    False
dtype: bool

In [10]:
df.head()

Unnamed: 0,sg,al,hemo,classification
0,1.02,1.0,15.4,0
1,1.02,4.0,11.3,0
2,1.01,2.0,9.6,0
3,1.005,4.0,11.2,0
4,1.01,2.0,11.6,0


In [11]:
x= df.drop(['classification'], axis=1)
y=  df['classification']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    shuffle=True)

In [13]:
x_test

Unnamed: 0,sg,al,hemo
132,1.017408,1.016949,8.6
309,1.020000,0.000000,17.2
341,1.025000,0.000000,13.4
196,1.010000,3.000000,8.1
246,1.015000,3.000000,8.6
...,...,...,...
14,1.010000,3.000000,5.6
363,1.025000,0.000000,17.8
304,1.025000,0.000000,13.1
361,1.020000,0.000000,13.7


In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler #feature scaling
from sklearn.pipeline import Pipeline
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test= sc.transform(x_test)

In [15]:
from joblib import dump
dump(sc,"knn.save")

['knn.save']

In [16]:
pipe = Pipeline([
    ("rescale", StandardScaler()),
    ("Classifier", KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=1))
])

In [17]:
pipe.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('rescale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('Classifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=1,
                                      weights='uniform'))],
         verbose=False)

In [18]:
import pickle
pickle.dump(pipe,open('random.pkl','wb'))

In [19]:
y_pred = pipe.predict(x_test)
y_pred

array([0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 2,
       0, 2, 0, 2, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       2, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2,
       0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 2, 2, 2])

In [20]:
y_test

132    0
309    2
341    2
196    0
246    0
      ..
14     0
363    2
304    2
361    2
329    2
Name: classification, Length: 80, dtype: int32

In [21]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9625