In [51]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

from Univariate import Univariate

In [155]:
# Load the chronic kidney disease data set from 'kidney_disease.csv'
# (relative path, resolved against the current working directory).
dataSet = pd.read_csv('kidney_disease.csv')

In [156]:
# Discard the 'id' column from the data set.
dataSet = dataSet.drop(columns='id')

In [157]:
# Display the data set as loaded, before any preprocessing
# (missing values are still present at this point).
dataSet

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [158]:
# Size of the data set as a (rows, columns) tuple.
dataSet.shape

(400, 25)

In [159]:
# Per-column summary: non-null count and dtype of every column.
dataSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age               391 non-null float64
bp                388 non-null float64
sg                353 non-null float64
al                354 non-null float64
su                351 non-null float64
rbc               248 non-null object
pc                335 non-null object
pcc               396 non-null object
ba                396 non-null object
bgr               356 non-null float64
bu                381 non-null float64
sc                383 non-null float64
sod               313 non-null float64
pot               312 non-null float64
hemo              348 non-null float64
pcv               330 non-null object
wc                295 non-null object
rc                270 non-null object
htn               398 non-null object
dm                398 non-null object
cad               398 non-null object
appet             399 non-null object
pe                399 non-null object
ane       

In [160]:
# Count the missing values in each column.
dataSet.isna().sum()

age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [161]:
# Split the columns into two groups with the project helper. The two results
# are used further down as column selectors: `quan` for mean imputation and
# descriptive statistics, `qual` for most-frequent imputation.
# NOTE(review): info() reports pcv, wc and rc as object dtype although the
# values displayed for them look numeric; presumably they land in `qual` and
# are then treated as categorical - confirm, and consider converting them to
# numeric before this split.
quan, qual = Univariate.SplitDataSet(dataSet)

In [185]:
# Fill missing values separately for the two column groups returned by
# Univariate.SplitDataSet: `quan` columns receive the column mean, `qual`
# columns receive the most frequent value of the column.
#
# SimpleImputer returns a plain NumPy array, so each result is wrapped back
# into a DataFrame with the original column names. Passing dataSet.index keeps
# the original row labels instead of silently resetting them to 0..n-1.

#Numerical DataSet
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
numericalDataSet = pd.DataFrame(
    imp_mean.fit_transform(dataSet[quan]),
    columns=quan,
    index=dataSet.index,
)

#Categorical DataSet
imp_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
categoricalDataSet = pd.DataFrame(
    imp_most_frequent.fit_transform(dataSet[qual]),
    columns=qual,
    index=dataSet.index,
)

In [163]:
# Join the imputed numerical and categorical frames side by side so that
# dataSet again holds every column, now without missing values.
dataSet = pd.concat(
    objs=[numericalDataSet, categoricalDataSet],
    axis='columns',
    join='inner',
)

In [186]:
# Verify the imputation: every column should now report zero missing values.
dataSet.isna().sum()

age               0
bp                0
sg                0
al                0
su                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
rbc               0
pc                0
pcc               0
ba                0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

In [189]:
# Build the descriptive-statistics table for the quantitative columns.
# The displayed result has one column per entry of `quan` and rows such as
# Mean, Median, Mode, percentiles (0%..100%) and IQR.
# NOTE(review): skewness and kurtosis are not visible in the output shown for
# this table - confirm whether Univariate.Descriptive actually computes them.
descriptive = Univariate.Descriptive(quan, dataSet)

In [190]:
# Display the descriptive statistics of the imputed data set
# (computed before any outlier handling).
descriptive

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
Mean,51.4834,76.4691,1.01741,1.01695,0.450142,148.037,57.4257,3.07245,137.529,4.62724,12.5264
Median,54.0,78.2345,1.01741,1.0,0.0,126.0,44.0,1.4,137.529,4.62724,12.5264
Mode,60.0,80.0,1.02,0.0,0.0,148.037,57.4257,1.2,137.529,4.62724,12.5264
0%,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,42.0,70.0,1.015,0.0,0.0,101.0,27.0,0.9,135.0,4.0,10.875
50%,54.0,78.2345,1.01741,1.0,0.0,126.0,44.0,1.4,137.529,4.62724,12.5264
75%,64.0,80.0,1.02,2.0,0.450142,150.0,61.75,3.07245,141.0,4.8,14.625
99%,80.01,110.0,1.025,4.0,4.0,425.22,235.06,18.159,150.0,6.501,17.601
100%,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8
IQR,22.0,10.0,0.005,2.0,0.450142,49.0,34.75,2.17245,6.0,0.8,3.75


In [192]:
# Ask the project helper which quantitative columns contain outliers.
# Presumably `lesser` lists columns with values below the lower bound and
# `greater` those with values above the upper bound - verify in Univariate.
lesser, greater = Univariate.OutLier(quan, descriptive)
print(lesser, greater, sep='\n')

['age', 'bp', 'sg', 'bgr', 'sod', 'pot', 'hemo']
['bp', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot']


In [193]:
# Cap the outliers in the quantitative columns and refresh the statistics
# table and the two outlier column lists from the helper's return value.
# Per the warnings this call emits, values below descriptive[col]['Lesser']
# are set to that bound and values above descriptive[col]['Greater'] are set
# to that bound, assigned on dataSet itself.
# NOTE(review): those assignments use chained indexing
# (dataSet[col][mask] = ...), which raises SettingWithCopyWarning and is not
# guaranteed to write through to dataSet. Univariate.ReplaceOutlier should
# use dataSet.loc[mask, col] = ... instead.
descriptive,lesser,greater = Univariate.ReplaceOutlier(lesser, greater, dataSet, descriptive, quan)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataSet[columns][dataSet[columns] < descriptive[columns]['Lesser']] = descriptive[columns]['Lesser']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataSet[columns][dataSet[columns] > descriptive[columns]['Greater']] = descriptive[columns]['Greater']


In [197]:
# Display the descriptive statistics returned by ReplaceOutlier, i.e. after
# the lesser and greater outliers have been capped.
descriptive

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
Mean,51.5684,75.7191,1.01745,1.01695,0.222685,136.985,51.0104,2.25413,138.115,4.43274,12.5329
Median,54.0,78.2345,1.01741,1.0,0.0,126.0,44.0,1.4,137.529,4.62724,12.5264
Mode,60.0,80.0,1.02,0.0,0.0,223.5,113.875,6.33114,137.529,4.62724,12.5264
0%,9.0,55.0,1.0075,0.0,0.0,27.5,1.5,0.4,126.0,2.8,5.25
25%,42.0,70.0,1.015,0.0,0.0,101.0,27.0,0.9,135.0,4.0,10.875
50%,54.0,78.2345,1.01741,1.0,0.0,126.0,44.0,1.4,137.529,4.62724,12.5264
75%,64.0,80.0,1.02,2.0,0.450142,150.0,61.75,3.07245,141.0,4.8,14.625
99%,80.01,95.0,1.025,4.0,1.12536,223.5,113.875,6.33114,150.0,6.0,17.601
100%,90.0,95.0,1.025,5.0,1.12536,223.5,113.875,6.33114,150.0,6.0,17.8
IQR,22.0,10.0,0.005,2.0,0.450142,49.0,34.75,2.17245,6.0,0.8,3.75


In [198]:
# Display the data set after preprocessing (missing values imputed,
# outliers capped).
dataSet

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.0200,1.0,0.000000,121.000000,36.0,1.2,137.528754,4.627244,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,9.0,55.0,1.0200,4.0,0.000000,148.036517,18.0,0.8,137.528754,4.627244,...,38,6000,5.2,no,no,no,good,no,no,ckd
2,62.0,80.0,1.0100,2.0,1.125356,223.500000,53.0,1.8,137.528754,4.627244,...,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.0075,4.0,0.000000,117.000000,56.0,3.8,126.000000,2.800000,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.0100,2.0,0.000000,106.000000,26.0,1.4,137.528754,4.627244,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.0200,0.0,0.000000,140.000000,49.0,0.5,150.000000,4.900000,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.0250,0.0,0.000000,75.000000,31.0,1.2,141.000000,3.500000,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.0200,0.0,0.000000,100.000000,26.0,0.6,137.000000,4.400000,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.0250,0.0,0.000000,114.000000,50.0,1.0,135.000000,4.900000,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [206]:
# Write the preprocessed data set to 'CKD.csv' with a header row and without
# the row index column.
dataSet.to_csv(path_or_buf='CKD.csv', header=True, index=False)