In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [3]:

# Colab-only step: attach Google Drive to the runtime's filesystem so the
# dataset can be read through an ordinary path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
# Read the raw dataset from the mounted Drive folder into `df`.
RAW_CSV_PATH = '/content/drive/MyDrive/kidney_disease.csv'
df = pd.read_csv(RAW_CSV_PATH)

In [7]:



# Preprocessing pipeline: drop the id column, clean text values, coerce
# text-encoded numeric columns, impute missing values, and integer-encode the
# categorical columns. Produces the cleaned `df` plus `label_encoders`
# (column name -> fitted LabelEncoder) for decoding later.
#
# NOTE: this cell rebinds `df` and is not re-runnable on its own output
# (the "id" drop raises KeyError the second time); reload the CSV first.

# Step 1: Drop irrelevant columns
df = df.drop(columns="id")

# Step 2: Strip surrounding whitespace (spaces, tabs) from string values.
# Series.map is used per non-numeric column instead of DataFrame.applymap,
# which is deprecated since pandas 2.1 and emits a FutureWarning.
# Stripping already normalises values such as '\tno', '\tyes' and 'ckd\t'
# to 'no', 'yes' and 'ckd', so no separate replacement pass is required.
for col in df.select_dtypes(exclude='number').columns:
    df[col] = df[col].map(lambda x: x.strip() if isinstance(x, str) else x)

# Step 3: Convert text-encoded numeric columns. Entries that cannot be parsed
# become NaN (errors='coerce') and are filled by the median imputer below.
for col in ['pcv', 'wc', 'rc']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Step 4: Separate features by type. The two selections partition the
# columns, so no column can silently skip imputation.
cat_cols = df.select_dtypes(exclude='number').columns
num_cols = df.select_dtypes(include='number').columns

# Step 5: Impute missing values
# NOTE(review): imputers and encoders are fit on the full frame; if a
# train/test split follows, fit them on the training portion only to avoid
# leaking test-set statistics.
# Numeric: median
num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Categorical: mode
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Step 6: Encode categorical columns
# LabelEncoder assigns integer codes in sorted order of the labels; use
# label_encoders[col].classes_ to recover the code -> label mapping.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Invariant: imputation must leave no missing values behind.
assert not df.isna().any().any(), "missing values remain after imputation"

# Preprocessed DataFrame ready for ML
print(df.head())

    age    bp     sg   al   su  rbc  pc  pcc  ba    bgr  ...   pcv      wc  \
0  48.0  80.0  1.020  1.0  0.0    1   1    0   0  121.0  ...  44.0  7800.0   
1   7.0  50.0  1.020  4.0  0.0    1   1    0   0  121.0  ...  38.0  6000.0   
2  62.0  80.0  1.010  2.0  3.0    1   1    0   0  423.0  ...  31.0  7500.0   
3  48.0  70.0  1.005  4.0  0.0    1   0    1   0  117.0  ...  32.0  6700.0   
4  51.0  80.0  1.010  2.0  0.0    1   1    0   0  106.0  ...  35.0  7300.0   

    rc  htn  dm  cad  appet  pe  ane  classification  
0  5.2    1   1    0      0   0    0               0  
1  4.8    0   0    0      0   0    0               0  
2  4.8    0   1    0      1   0    1               0  
3  3.9    1   0    0      1   1    1               0  
4  4.6    0   0    0      0   0    0               0  

[5 rows x 25 columns]


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [8]:
# Print column dtypes and non-null counts for `df` after preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             400 non-null    float64
 1   bp              400 non-null    float64
 2   sg              400 non-null    float64
 3   al              400 non-null    float64
 4   su              400 non-null    float64
 5   rbc             400 non-null    int64  
 6   pc              400 non-null    int64  
 7   pcc             400 non-null    int64  
 8   ba              400 non-null    int64  
 9   bgr             400 non-null    float64
 10  bu              400 non-null    float64
 11  sc              400 non-null    float64
 12  sod             400 non-null    float64
 13  pot             400 non-null    float64
 14  hemo            400 non-null    float64
 15  pcv             400 non-null    float64
 16  wc              400 non-null    float64
 17  rc              400 non-null    float64

In [9]:
# Display the first 10 rows of `df` as the cell's output.
df.head(10)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,1,1,0,0,121.0,...,44.0,7800.0,5.2,1,1,0,0,0,0,0
1,7.0,50.0,1.02,4.0,0.0,1,1,0,0,121.0,...,38.0,6000.0,4.8,0,0,0,0,0,0,0
2,62.0,80.0,1.01,2.0,3.0,1,1,0,0,423.0,...,31.0,7500.0,4.8,0,1,0,1,0,1,0
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,1,1,1,0
4,51.0,80.0,1.01,2.0,0.0,1,1,0,0,106.0,...,35.0,7300.0,4.6,0,0,0,0,0,0,0
5,60.0,90.0,1.015,3.0,0.0,1,1,0,0,74.0,...,39.0,7800.0,4.4,1,1,0,0,1,0,0
6,68.0,70.0,1.01,0.0,0.0,1,1,0,0,100.0,...,36.0,8000.0,4.8,0,0,0,0,0,0,0
7,24.0,80.0,1.015,2.0,4.0,1,0,0,0,410.0,...,44.0,6900.0,5.0,0,1,0,0,1,0,0
8,52.0,100.0,1.015,3.0,0.0,1,0,1,0,138.0,...,33.0,9600.0,4.0,1,1,0,0,0,1,0
9,53.0,90.0,1.02,2.0,0.0,0,0,1,0,70.0,...,29.0,12100.0,3.7,1,1,0,1,0,1,0


In [10]:
# Columns to store as integers (e.g., age, bp, al, su, bu, etc.).
safe_to_convert = ['age', 'bp', 'al', 'su', 'bgr', 'bu', 'pcv', 'wc']

# Round to the nearest integer before casting: a bare astype(int) truncates
# toward zero, so a fractional value (for instance a median ending in .5)
# would silently lose its fractional part. Series.round uses round-half-to-even.
# The explicit 'int64' keeps the dtype identical on every platform.
# Columns missing from `df` are skipped rather than raising.
for col in safe_to_convert:
    if col in df.columns:
        df[col] = df[col].round().astype('int64')


In [11]:
# Print dtypes again to check the result of the integer casts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             400 non-null    int64  
 1   bp              400 non-null    int64  
 2   sg              400 non-null    float64
 3   al              400 non-null    int64  
 4   su              400 non-null    int64  
 5   rbc             400 non-null    int64  
 6   pc              400 non-null    int64  
 7   pcc             400 non-null    int64  
 8   ba              400 non-null    int64  
 9   bgr             400 non-null    int64  
 10  bu              400 non-null    int64  
 11  sc              400 non-null    float64
 12  sod             400 non-null    float64
 13  pot             400 non-null    float64
 14  hemo            400 non-null    float64
 15  pcv             400 non-null    int64  
 16  wc              400 non-null    int64  
 17  rc              400 non-null    float64

In [12]:
# Write the cleaned frame to the working directory, omitting the index column.
CLEANED_CSV_NAME = "kidney_cleaned_dadaset.csv"
df.to_csv(CLEANED_CSV_NAME, index=False)

In [None]:
# Colab-only: trigger a browser download of the cleaned CSV.
from google.colab import files

# The name must match the file written by df.to_csv in the preceding cell.
# The earlier name ("ckd_preprocessed.csv") is not written by any visible
# cell, so the download would fail with a missing-file error.
files.download("kidney_cleaned_dadaset.csv")