In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import matplotlib.pyplot as plt

In [26]:
df = pd.read_csv("kidney_disease.csv")
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [48]:
#create a list of column names to keep with independent and dependent vars
columns_to_retain = ['age','bp','sg','al','su','hemo','pcv','wc','rc','htn','classification']

#Drop the columns that are not in columns_to_retain
df = df.drop([col for col in df.columns if not col in columns_to_retain], axis=1)


In [49]:
#Drop the rows with na or missing values
df.dropna(axis=0)

Unnamed: 0,sg,al,hemo,pcv,wc,rc,htn,classification
0,1.020,1.0,15.4,32,72,34,2,0
1,1.020,4.0,11.3,26,56,49,1,0
2,1.010,2.0,9.6,19,70,49,1,0
3,1.005,4.0,11.2,20,62,19,2,0
4,1.010,2.0,11.6,23,68,27,1,0
...,...,...,...,...,...,...,...,...
395,1.020,0.0,15.7,35,62,30,1,2
396,1.025,0.0,16.5,42,72,44,1,2
397,1.020,0.0,15.8,37,61,36,1,2
398,1.025,0.0,14.2,39,67,41,1,2


In [50]:
#apply mean on the rest of Nan values
df['sg'].fillna((df['sg'].mean()), inplace=True)
df['al'].fillna((df['al'].mean()), inplace=True)
df['hemo'].fillna((df['hemo'].mean()), inplace=True)

In [51]:
df.isnull().any()

sg                False
al                False
hemo              False
pcv               False
wc                False
rc                False
htn               False
classification    False
dtype: bool

In [52]:
#Transform the non-numeric data in the column
for column in df.columns:
    if df[column].dtype == np.number:
        continue
    df[column] = LabelEncoder().fit_transform(df[column].astype(str))

In [53]:
#removing the rest Nan values 
df.dropna(axis=0)

Unnamed: 0,sg,al,hemo,pcv,wc,rc,htn,classification
0,1.020,1.0,15.4,26,70,28,2,0
1,1.020,4.0,11.3,19,52,44,1,0
2,1.010,2.0,9.6,11,68,44,1,0
3,1.005,4.0,11.2,13,59,11,2,0
4,1.010,2.0,11.6,16,65,20,1,0
...,...,...,...,...,...,...,...,...
395,1.020,0.0,15.7,29,59,24,1,2
396,1.025,0.0,16.5,37,70,39,1,2
397,1.020,0.0,15.8,31,58,30,1,2
398,1.025,0.0,14.2,33,64,36,1,2


In [54]:
df.isnull().any()

sg                False
al                False
hemo              False
pcv               False
wc                False
rc                False
htn               False
classification    False
dtype: bool

In [55]:
#Print the the first five rows of clean dataset
df.head()

Unnamed: 0,sg,al,hemo,pcv,wc,rc,htn,classification
0,1.02,1.0,15.4,26,70,28,2,0
1,1.02,4.0,11.3,19,52,44,1,0
2,1.01,2.0,9.6,11,68,44,1,0
3,1.005,4.0,11.2,13,59,11,2,0
4,1.01,2.0,11.6,16,65,20,1,0
