In [2]:
# Common imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


In [3]:
import pandas as pd
df=pd.read_csv('../data/raw/ckd.csv')
df

Unnamed: 0,Bp,Sg,Al,Su,Rbc,Bu,Sc,Sod,Pot,Hemo,Wbcc,Rbcc,Htn,Class
0,80.0,1.020,1.0,0.0,1.0,36.0,1.2,137.53,4.63,15.4,7800.0,5.20,1.0,1
1,50.0,1.020,4.0,0.0,1.0,18.0,0.8,137.53,4.63,11.3,6000.0,4.71,0.0,1
2,80.0,1.010,2.0,3.0,1.0,53.0,1.8,137.53,4.63,9.6,7500.0,4.71,0.0,1
3,70.0,1.005,4.0,0.0,1.0,56.0,3.8,111.00,2.50,11.2,6700.0,3.90,1.0,1
4,80.0,1.010,2.0,0.0,1.0,26.0,1.4,137.53,4.63,11.6,7300.0,4.60,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,80.0,1.020,0.0,0.0,1.0,49.0,0.5,150.00,4.90,15.7,6700.0,4.90,0.0,0
396,70.0,1.025,0.0,0.0,1.0,31.0,1.2,141.00,3.50,16.5,7800.0,6.20,0.0,0
397,80.0,1.020,0.0,0.0,1.0,26.0,0.6,137.00,4.40,15.8,6600.0,5.40,0.0,0
398,60.0,1.025,0.0,0.0,1.0,50.0,1.0,135.00,4.90,14.2,7200.0,5.90,0.0,0


In [4]:
df.columns
df.isnull().sum()


Bp       0
Sg       0
Al       0
Su       0
Rbc      0
Bu       0
Sc       0
Sod      0
Pot      0
Hemo     0
Wbcc     0
Rbcc     0
Htn      0
Class    0
dtype: int64

In [5]:
df.shape
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Bp      400 non-null    float64
 1   Sg      400 non-null    float64
 2   Al      400 non-null    float64
 3   Su      400 non-null    float64
 4   Rbc     400 non-null    float64
 5   Bu      400 non-null    float64
 6   Sc      400 non-null    float64
 7   Sod     400 non-null    float64
 8   Pot     400 non-null    float64
 9   Hemo    400 non-null    float64
 10  Wbcc    400 non-null    float64
 11  Rbcc    400 non-null    float64
 12  Htn     400 non-null    float64
 13  Class   400 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 43.9 KB


In [6]:
df.describe()


Unnamed: 0,Bp,Sg,Al,Su,Rbc,Bu,Sc,Sod,Pot,Hemo,Wbcc,Rbcc,Htn,Class
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,76.455,1.017712,1.015,0.395,0.8825,57.4055,3.07235,137.529025,4.62785,12.5269,8406.09,4.708275,0.36935,0.625
std,13.476536,0.005434,1.272329,1.040038,0.322418,49.28597,5.61749,9.204273,2.819783,2.716171,2523.219976,0.840315,0.482023,0.484729
min,50.0,1.005,0.0,0.0,0.0,1.5,0.4,4.5,2.5,3.1,2200.0,2.1,0.0,0.0
25%,70.0,1.015,0.0,0.0,1.0,27.0,0.9,135.0,4.0,10.875,6975.0,4.5,0.0,0.0
50%,78.0,1.02,1.0,0.0,1.0,44.0,1.4,137.53,4.63,12.53,8406.0,4.71,0.0,1.0
75%,80.0,1.02,2.0,0.0,1.0,61.75,3.07,141.0,4.8,14.625,9400.0,5.1,1.0,1.0
max,180.0,1.025,5.0,5.0,1.0,391.0,76.0,163.0,47.0,17.8,26400.0,8.0,1.0,1.0


In [7]:
df['Class'].value_counts()
df['Class'].value_counts(normalize=True)


Class
1    0.625
0    0.375
Name: proportion, dtype: float64

In [8]:
## Preprocessing & Baseline Model
X = df.drop(columns=['Class'])
y = df['Class']


In [9]:
binary_features = [col for col in X.columns if set(X[col].unique()).issubset({0, 1})]
binary_features


['Rbc']

In [10]:
continuous_features = [col for col in X.columns if col not in binary_features]
continuous_features


['Bp',
 'Sg',
 'Al',
 'Su',
 'Bu',
 'Sc',
 'Sod',
 'Pot',
 'Hemo',
 'Wbcc',
 'Rbcc',
 'Htn']

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), continuous_features),
        ('bin', 'passthrough', binary_features)
    ]
)


In [None]:
## Conclusion
The dataset shows clear class imbalance and clinically meaningful numeric features, forming the basis for predictive modeling.


In [13]:
# Save processed dataset for modeling
df_processed = df.copy()

df_processed.to_csv("../data/processed/ckd_processed.csv", index=False)
