In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation / FutureWarnings
# raised by later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline
# Display up to 26 columns of a DataFrame (the table as loaded has 26, including `id`).
pd.set_option('display.max_columns', 26)

In [2]:
# Read the kidney-disease CSV from the Kaggle input directory into a DataFrame.
csv_path = '/kaggle/input/ckdisease/kidney_disease.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows (head()'s default count).
df.head(5)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
# Remove the `id` column. Rebinding `df` (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError, because `id` is already gone.
df = df.drop(columns='id', errors='ignore')

In [4]:
# Preview the first rows again to confirm the `id` column is no longer present.
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [5]:
# Descriptive snake_case names replacing the dataset's abbreviated headers.
# The assignment below is positional, so this list must stay in exactly the same
# order (and length) as the frame's current columns.
# NOTE(review): 'peda_edema' and 'aanemia' look like typos; they are kept unchanged
# because other cells may reference these exact names — confirm before renaming.
readable_column_names = [
    # demographics / urine tests
    'age',
    'blood_pressure',
    'specific_gravity',
    'albumin',
    'sugar',
    # microscopy findings
    'red_blood_cells',
    'pus_cell',
    'pus_cell_clumps',
    'bacteria',
    # blood chemistry
    'blood_glucose_random',
    'blood_urea',
    'serum_creatinine',
    'sodium',
    'potassium',
    # blood counts
    'haemoglobin',
    'packed_cell_volume',
    'white_blood_cell_count',
    'red_blood_cell_count',
    # medical history / symptoms
    'hypertension',
    'diabetes_mellitus',
    'coronary_artery_disease',
    'appetite',
    'peda_edema',
    'aanemia',
    # target
    'class',
]
df.columns = readable_column_names

In [6]:
# Inspect 10 randomly chosen rows. A fixed random_state makes the sample (and the
# output rendered below) reproducible across Restart & Run All.
df.sample(10, random_state=42)

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
34,70.0,70.0,1.01,1.0,0.0,normal,,present,present,171.0,153.0,5.2,,,,,,,no,yes,no,poor,no,no,ckd
160,81.0,60.0,,,,,,notpresent,notpresent,148.0,39.0,2.1,147.0,4.2,10.9,35.0,9400.0,2.4,yes,yes,yes,poor,yes,no,ckd
202,78.0,60.0,,,,,,notpresent,notpresent,114.0,74.0,2.9,135.0,5.9,8.0,24.0,,,no,yes,no,good,no,yes,ckd
174,54.0,70.0,,,,,,notpresent,notpresent,111.0,146.0,7.5,141.0,4.7,11.0,35.0,8600.0,4.6,no,no,no,good,no,no,ckd
231,60.0,90.0,,,,,,notpresent,notpresent,269.0,51.0,2.8,138.0,3.7,11.5,35.0,,,yes,yes,yes,good,yes,no,ckd
103,76.0,70.0,1.015,2.0,0.0,normal,abnormal,present,notpresent,226.0,217.0,10.2,,,10.2,36.0,12700.0,4.2,yes,no,no,poor,yes,yes,ckd
179,72.0,90.0,1.01,2.0,0.0,,abnormal,present,notpresent,124.0,53.0,2.3,,,11.9,39.0,,,no,no,no,good,no,no,ckd
251,23.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,70.0,36.0,1.0,150.0,4.6,17.0,52.0,9800.0,5.0,no,no,no,good,no,no,notckd
359,74.0,60.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,88.0,50.0,0.6,147.0,3.7,17.2,53.0,6000.0,4.5,no,no,no,good,no,no,notckd
121,54.0,60.0,,3.0,,,,notpresent,notpresent,125.0,21.0,1.3,137.0,3.4,15.0,46.0,,,yes,yes,no,good,yes,no,ckd


In [7]:
# Print each column's non-null count and dtype to spot missing values and
# columns that were not parsed as numbers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      391 non-null    float64
 1   blood_pressure           388 non-null    float64
 2   specific_gravity         353 non-null    float64
 3   albumin                  354 non-null    float64
 4   sugar                    351 non-null    float64
 5   red_blood_cells          248 non-null    object 
 6   pus_cell                 335 non-null    object 
 7   pus_cell_clumps          396 non-null    object 
 8   bacteria                 396 non-null    object 
 9   blood_glucose_random     356 non-null    float64
 10  blood_urea               381 non-null    float64
 11  serum_creatinine         383 non-null    float64
 12  sodium                   313 non-null    float64
 13  potassium                312 non-null    float64
 14  haemoglobin              3

In [8]:
# Summary statistics of the numeric columns, transposed so each feature is one row.
summary_stats = df.describe()
summary_stats.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,391.0,51.483376,17.169714,2.0,42.0,55.0,64.5,90.0
blood_pressure,388.0,76.469072,13.683637,50.0,70.0,80.0,80.0,180.0
specific_gravity,353.0,1.017408,0.005717,1.005,1.01,1.02,1.02,1.025
albumin,354.0,1.016949,1.352679,0.0,0.0,0.0,2.0,5.0
sugar,351.0,0.450142,1.099191,0.0,0.0,0.0,0.0,5.0
blood_glucose_random,356.0,148.036517,79.281714,22.0,99.0,121.0,163.0,490.0
blood_urea,381.0,57.425722,50.503006,1.5,27.0,42.0,66.0,391.0
serum_creatinine,383.0,3.072454,5.741126,0.4,0.9,1.3,2.8,76.0
sodium,313.0,137.528754,10.408752,4.5,135.0,138.0,142.0,163.0
potassium,312.0,4.627244,3.193904,2.5,3.8,4.4,4.9,47.0


In [9]:
# Convert the three blood-count columns to numeric dtype.
# errors='coerce' replaces any entry that cannot be parsed as a number with NaN.
for col in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [10]:
# Names of the non-numeric (categorical) columns, in frame order.
# select_dtypes is used instead of comparing dtype to 'object' so that text columns
# are still found when pandas stores them with a dedicated string dtype.
cat_cols = df.select_dtypes(exclude='number').columns.tolist()

In [11]:
# Display the list of categorical column names.
cat_cols

['red_blood_cells',
 'pus_cell',
 'pus_cell_clumps',
 'bacteria',
 'hypertension',
 'diabetes_mellitus',
 'coronary_artery_disease',
 'appetite',
 'peda_edema',
 'aanemia',
 'class']

In [12]:
# Names of the numeric columns, in frame order.
# Selecting by the 'number' dtype family (rather than "anything that is not object")
# prevents string-dtype, boolean or datetime columns from being treated as numeric.
num_cols = df.select_dtypes(include='number').columns.tolist()

In [13]:
# Display the list of numeric column names.
num_cols

['age',
 'blood_pressure',
 'specific_gravity',
 'albumin',
 'sugar',
 'blood_glucose_random',
 'blood_urea',
 'serum_creatinine',
 'sodium',
 'potassium',
 'haemoglobin',
 'packed_cell_volume',
 'white_blood_cell_count',
 'red_blood_cell_count']

In [14]:
# List the distinct values of every categorical column to reveal inconsistent labels.
for col in cat_cols:
    distinct_values = df[col].unique()
    print(col, distinct_values)

red_blood_cells [nan 'normal' 'abnormal']
pus_cell ['normal' 'abnormal' nan]
pus_cell_clumps ['notpresent' 'present' nan]
bacteria ['notpresent' 'present' nan]
hypertension ['yes' 'no' nan]
diabetes_mellitus ['yes' 'no' ' yes' '\tno' '\tyes' nan]
coronary_artery_disease ['no' 'yes' '\tno' nan]
appetite ['good' 'poor' nan]
peda_edema ['no' 'yes' nan]
aanemia ['no' 'yes' nan]
class ['ckd' 'ckd\t' 'notckd']


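- diabetes_mellitus has ['yes' 'no' ' yes' '\tno' '\tyes' nan] values
- coronary_artery_disease has ['no' 'yes' '\tno' nan] values
- class has ['ckd' 'ckd\t' 'notckd'] values

Some labels in these columns contain stray whitespace (leading spaces or tabs, trailing tabs); these values must be corrected.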

In [15]:
# Normalise labels polluted by stray whitespace.
# Each result is assigned back to its column: calling .replace(..., inplace=True)
# on df[col] operates on a temporary Series, which leaves `df` unchanged when
# pandas Copy-on-Write is active.
df['diabetes_mellitus'] = df['diabetes_mellitus'].replace(
    {'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'}
)
# regex=False: the pattern is a literal tab followed by 'no', not a regular expression.
df['coronary_artery_disease'] = df['coronary_artery_disease'].str.replace('\tno', 'no', regex=False)
# Strip the trailing tab from 'ckd\t' and spell 'notckd' as 'not ckd'.
df['class'] = df['class'].replace({'ckd\t': 'ckd', 'notckd': 'not ckd'})

In [16]:
# Encode the target as integers: 'ckd' -> 0, 'not ckd' -> 1.
class_codes = {'ckd': 0, 'not ckd': 1}

# Values that are not keys of the mapping (e.g. an already-encoded 0/1) pass through
# unchanged, so re-running this cell does not turn the whole column into NaN the way
# a plain .map(dict) would on its second execution.
df['class'] = df['class'].map(lambda label: class_codes.get(label, label))
# Any remaining non-numeric label is coerced to NaN here.
df['class'] = pd.to_numeric(df['class'], errors='coerce')

# Fail loudly if a label fell outside the mapping instead of carrying NaN targets forward.
assert df['class'].isin([0, 1]).all(), 'unexpected values in class column after encoding'

In [17]:
# Re-check the three columns that were cleaned to confirm only the expected labels remain.
cols = [
    'diabetes_mellitus',
    'coronary_artery_disease',
    'class',
]

for col in cols:
    remaining_values = df[col].unique()
    print(col, remaining_values)

diabetes_mellitus ['yes' 'no' nan]
coronary_artery_disease ['no' 'yes' nan]
class [0 1]


### EDA ###