In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas FutureWarning/deprecation
# messages raised by later cells; consider narrowing the filter.
warnings.filterwarnings('ignore')
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Display up to 26 columns when a DataFrame is shown.
pd.set_option('display.max_columns', 26)

In [2]:
# Location of the kidney-disease CSV inside the Kaggle input mount.
DATA_PATH = '/kaggle/input/ckdisease/kidney_disease.csv'

# Read the raw table into memory and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
# Remove the 'id' column (in the preview above it simply repeats the row index).
# Reassigning the result with errors='ignore' -- instead of inplace=True --
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-missing column.
df = df.drop(columns='id', errors='ignore')

In [4]:
# Preview the first five rows to confirm the 'id' column is gone.
df.head(n=5)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35,7300,4.6,no,no,no,good,no,no,ckd


In [5]:
# Give the abbreviated dataset columns descriptive names.
# Renaming by original name (rather than assigning a positional list to
# df.columns) cannot silently mislabel data if the column order differs, and
# makes the cell safe to re-run: already-renamed columns are left untouched.
# NOTE(review): 'peda_edema' and 'aanemia' look like misspellings, but they are
# kept unchanged because later cells list the columns under these names.
df = df.rename(columns={
    'age': 'age',
    'bp': 'blood_pressure',
    'sg': 'specific_gravity',
    'al': 'albumin',
    'su': 'sugar',
    'rbc': 'red_blood_cells',
    'pc': 'pus_cell',
    'pcc': 'pus_cell_clumps',
    'ba': 'bacteria',
    'bgr': 'blood_glucose_random',
    'bu': 'blood_urea',
    'sc': 'serum_creatinine',
    'sod': 'sodium',
    'pot': 'potassium',
    'hemo': 'haemoglobin',
    'pcv': 'packed_cell_volume',
    'wc': 'white_blood_cell_count',
    'rc': 'red_blood_cell_count',
    'htn': 'hypertension',
    'dm': 'diabetes_mellitus',
    'cad': 'coronary_artery_disease',
    'appet': 'appetite',
    'pe': 'peda_edema',
    'ane': 'aanemia',
    'classification': 'class',
})

In [6]:
# Show ten randomly chosen rows. A fixed random_state makes the displayed
# sample identical on every Restart & Run All instead of changing per run.
df.sample(10, random_state=42)

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,haemoglobin,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,aanemia,class
382,48.0,80.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.0,22.0,0.8,137.0,5.0,16.8,51.0,6000.0,6.5,no,no,no,good,no,no,notckd
84,59.0,70.0,1.01,3.0,0.0,normal,abnormal,notpresent,notpresent,76.0,186.0,15.0,135.0,7.6,7.1,22.0,3800.0,2.1,yes,no,no,poor,yes,yes,ckd
194,80.0,70.0,1.01,2.0,,,abnormal,notpresent,notpresent,,49.0,1.2,,,,,,,yes,\tyes,no,good,no,no,ckd
204,65.0,90.0,1.01,4.0,2.0,normal,normal,notpresent,notpresent,172.0,82.0,13.5,145.0,6.3,8.8,31.0,,,yes,yes,no,good,yes,yes,ckd
11,63.0,70.0,1.01,3.0,0.0,abnormal,abnormal,present,notpresent,380.0,60.0,2.7,131.0,4.2,10.8,32.0,4500.0,3.8,yes,yes,no,poor,yes,no,ckd
25,61.0,60.0,1.025,0.0,0.0,,normal,notpresent,notpresent,108.0,75.0,1.9,141.0,5.2,9.9,29.0,8400.0,3.7,yes,yes,no,good,no,yes,ckd
85,70.0,70.0,1.015,2.0,,,,notpresent,notpresent,,46.0,1.5,,,9.9,,,,no,yes,no,poor,yes,no,ckd
13,68.0,70.0,,,,,,notpresent,notpresent,98.0,86.0,4.6,135.0,3.4,9.8,,,,yes,yes,yes,poor,yes,no,ckd
279,24.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,140.0,23.0,0.6,140.0,4.7,16.3,48.0,5800.0,5.6,no,no,no,good,no,no,notckd
175,60.0,50.0,1.01,0.0,0.0,,normal,notpresent,notpresent,261.0,58.0,2.2,113.0,3.0,,,4200.0,3.4,yes,no,no,good,no,no,ckd


In [7]:
# Print the per-column non-null counts and dtypes of the renamed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      391 non-null    float64
 1   blood_pressure           388 non-null    float64
 2   specific_gravity         353 non-null    float64
 3   albumin                  354 non-null    float64
 4   sugar                    351 non-null    float64
 5   red_blood_cells          248 non-null    object 
 6   pus_cell                 335 non-null    object 
 7   pus_cell_clumps          396 non-null    object 
 8   bacteria                 396 non-null    object 
 9   blood_glucose_random     356 non-null    float64
 10  blood_urea               381 non-null    float64
 11  serum_creatinine         383 non-null    float64
 12  sodium                   313 non-null    float64
 13  potassium                312 non-null    float64
 14  haemoglobin              3

In [8]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,391.0,51.483376,17.169714,2.0,42.0,55.0,64.5,90.0
blood_pressure,388.0,76.469072,13.683637,50.0,70.0,80.0,80.0,180.0
specific_gravity,353.0,1.017408,0.005717,1.005,1.01,1.02,1.02,1.025
albumin,354.0,1.016949,1.352679,0.0,0.0,0.0,2.0,5.0
sugar,351.0,0.450142,1.099191,0.0,0.0,0.0,0.0,5.0
blood_glucose_random,356.0,148.036517,79.281714,22.0,99.0,121.0,163.0,490.0
blood_urea,381.0,57.425722,50.503006,1.5,27.0,42.0,66.0,391.0
serum_creatinine,383.0,3.072454,5.741126,0.4,0.9,1.3,2.8,76.0
sodium,313.0,137.528754,10.408752,4.5,135.0,138.0,142.0,163.0
potassium,312.0,4.627244,3.193904,2.5,3.8,4.4,4.9,47.0


In [9]:
# Convert the three blood-count columns to numeric dtype; any entry that
# cannot be parsed as a number becomes NaN (errors='coerce').
for count_col in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    df[count_col] = pd.to_numeric(df[count_col], errors='coerce')

In [10]:
# Replace missing values in the three blood-count columns with the column median.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# df[col] operates on an intermediate Series, which pandas deprecates and which
# leaves df unchanged once Copy-on-Write is active.
for count_col in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    df[count_col] = df[count_col].fillna(df[count_col].median())

In [11]:
# Names of all columns stored with object dtype, in frame order.
cat_cols = [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [12]:
# Display the list of object-dtype column names.
cat_cols

['red_blood_cells',
 'pus_cell',
 'pus_cell_clumps',
 'bacteria',
 'hypertension',
 'diabetes_mellitus',
 'coronary_artery_disease',
 'appetite',
 'peda_edema',
 'aanemia',
 'class']

In [13]:
# Names of all columns whose dtype is not object, in frame order.
num_cols = [name for name, dtype in df.dtypes.items() if dtype != 'object']

In [14]:
# Display the list of non-object column names.
num_cols

['age',
 'blood_pressure',
 'specific_gravity',
 'albumin',
 'sugar',
 'blood_glucose_random',
 'blood_urea',
 'serum_creatinine',
 'sodium',
 'potassium',
 'haemoglobin',
 'packed_cell_volume',
 'white_blood_cell_count',
 'red_blood_cell_count']

In [15]:
# Print the distinct values (including NaN) of every object-dtype column so
# that inconsistent labels can be spotted by eye.
for col in cat_cols:
    print(f'{col} has {df[col].unique()}')

red_blood_cells has [nan 'normal' 'abnormal']
pus_cell has ['normal' 'abnormal' nan]
pus_cell_clumps has ['notpresent' 'present' nan]
bacteria has ['notpresent' 'present' nan]
hypertension has ['yes' 'no' nan]
diabetes_mellitus has ['yes' 'no' ' yes' '\tno' '\tyes' nan]
coronary_artery_disease has ['no' 'yes' '\tno' nan]
appetite has ['good' 'poor' nan]
peda_edema has ['no' 'yes' nan]
aanemia has ['no' 'yes' nan]
class has ['ckd' 'ckd\t' 'notckd']


- `diabetes_mellitus` has the values ['yes' 'no' ' yes' '\tno' '\tyes' nan]
- `coronary_artery_disease` has the values ['no' 'yes' '\tno' nan]
- `class` has the values ['ckd' 'ckd\t' 'notckd']

These values contain stray tab and space characters and must be corrected.

In [16]:
# Normalise labels that carry stray tab/space characters.
# Each result is assigned back to the frame: calling replace(inplace=True) on
# df[col] operates on an intermediate Series, which pandas deprecates and which
# leaves df unchanged once Copy-on-Write is active.
df['diabetes_mellitus'] = df['diabetes_mellitus'].replace({'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'})
# regex=False: the pattern is a literal string, and the default of the `regex`
# argument differs between pandas versions.
df['coronary_artery_disease'] = df['coronary_artery_disease'].str.replace('\tno', 'no', regex=False)
# 'notckd' is rewritten to 'not ckd', the spelling the following encoding cell maps from.
df['class'] = df['class'].replace({'ckd\t': 'ckd', 'notckd': 'not ckd'})

In [17]:
# Encode the target as integers: 'ckd' becomes 0 and 'not ckd' becomes 1.
class_codes = {'ckd': 0, 'not ckd': 1}
encoded_class = df['class'].map(class_codes)
df['class'] = encoded_class.astype(int)

In [18]:
# Columns whose labels were cleaned or encoded in the preceding cells.
cols = ['diabetes_mellitus','coronary_artery_disease','class']

# Print the distinct values of each one to confirm the clean-up took effect.
for col in cols:
    print(f'{col} has {df[col].unique()}')

diabetes_mellitus has ['yes' 'no' nan]
coronary_artery_disease has ['no' 'yes' nan]
class has [0 1]


In [19]:
# Remove the 'red_blood_cells' column from the frame.
df = df.drop(columns=['red_blood_cells'])