In [7]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import os
import joblib

# Optional: XGBoost if installed
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except ImportError:
    HAS_XGB = False

## Load dataset

In [8]:
# Path assumes this notebook is in `notebooks/` and data in `../dataset/`
data_path = os.path.join('..', 'dataset', 'kidney_disease.csv')
df = pd.read_csv(data_path)

print('Shape:', df.shape)
df.head()

Shape: (400, 26)


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


## Basic cleaning and type conversion

In [9]:
# Drop id column if present
if 'id' in df.columns:
    df = df.drop(columns=['id'])

# Strip whitespace and tab characters from string columns
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype(str).str.replace('\t', '', regex=False).str.strip()

# Original CKD dataset column names
numeric_features = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
categorical_features = ['sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
target_col = 'classification'

# Convert numeric-like columns to numeric (coerce errors to NaN)
for col in numeric_features:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Ensure categorical columns are treated as object
for col in categorical_features:
    if col in df.columns:
        df[col] = df[col].astype('object')

df[target_col].value_counts()

classification
ckd       250
notckd    150
Name: count, dtype: int64

## Encode target and train-test split

In [10]:
# Map classification labels to binary: ckd -> 1, notckd -> 0
y_raw = df[target_col].astype(str).str.replace('\t', '', regex=False).str.strip().str.lower()
y = y_raw.replace({'ckd': 1, 'notckd': 0})

# Remove rows with unknown target (if any)
mask = y.isin([0, 1])
df = df[mask].copy()
y = y[mask].astype(int)

# Features and target
X = df.drop(columns=[target_col])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

X_train.shape, X_test.shape

  y = y_raw.replace({'ckd': 1, 'notckd': 0})


((280, 24), (120, 24))

## Preprocessing and RandomForest model

In [11]:
# Numeric preprocessing: median imputation + MinMax scaling
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# Categorical preprocessing: most frequent imputation + ordinal encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, [c for c in numeric_features if c in X.columns]),
        ('cat', categorical_transformer, [c for c in categorical_features if c in X.columns])
    ]
)

rf_clf = RandomForestClassifier(
    n_estimators=300,
    random_state=42
)

rf_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', rf_clf)
])

# Cross-validation on training set
cv_scores = cross_val_score(rf_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print('CV accuracy (RF):', cv_scores.mean(), '+/-', cv_scores.std())

# Fit and evaluate on test set
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)

print('Test accuracy (RF):', accuracy_score(y_test, y_pred))
print('\nConfusion matrix (RF):')
print(confusion_matrix(y_test, y_pred))
print('\nClassification report (RF):')
print(classification_report(y_test, y_pred))

CV accuracy (RF): 0.9928571428571429 +/- 0.008748177652797088
Test accuracy (RF): 1.0

Confusion matrix (RF):
[[45  0]
 [ 0 75]]

Classification report (RF):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        45
           1       1.00      1.00      1.00        75

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120



## Logistic Regression and AdaBoost models

In [12]:
# Logistic Regression model using the same preprocessor
logreg_clf = LogisticRegression(max_iter=2000, n_jobs=-1) if hasattr(LogisticRegression(), 'n_jobs') else LogisticRegression(max_iter=2000)

logreg_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', logreg_clf)
])

cv_scores_lr = cross_val_score(logreg_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print('CV accuracy (LogReg):', cv_scores_lr.mean(), '+/-', cv_scores_lr.std())

logreg_pipeline.fit(X_train, y_train)
y_pred_lr = logreg_pipeline.predict(X_test)

print('Test accuracy (LogReg):', accuracy_score(y_test, y_pred_lr))
print('\nConfusion matrix (LogReg):')
print(confusion_matrix(y_test, y_pred_lr))
print('\nClassification report (LogReg):')
print(classification_report(y_test, y_pred_lr))

# AdaBoost model using the same preprocessor
ada_clf = AdaBoostClassifier(random_state=42)

ada_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', ada_clf)
])

cv_scores_ada = cross_val_score(ada_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print('\nCV accuracy (AdaBoost):', cv_scores_ada.mean(), '+/-', cv_scores_ada.std())

ada_pipeline.fit(X_train, y_train)
y_pred_ada = ada_pipeline.predict(X_test)

print('Test accuracy (AdaBoost):', accuracy_score(y_test, y_pred_ada))
print('\nConfusion matrix (AdaBoost):')
print(confusion_matrix(y_test, y_pred_ada))
print('\nClassification report (AdaBoost):')
print(classification_report(y_test, y_pred_ada))

CV accuracy (LogReg): 0.9892857142857142 +/- 0.008748177652797088
Test accuracy (LogReg): 1.0

Confusion matrix (LogReg):
[[45  0]
 [ 0 75]]

Classification report (LogReg):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        45
           1       1.00      1.00      1.00        75

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120


CV accuracy (AdaBoost): 0.9964285714285716 +/- 0.007142857142857162
Test accuracy (AdaBoost): 1.0

Confusion matrix (AdaBoost):
[[45  0]
 [ 0 75]]

Classification report (AdaBoost):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        45
           1       1.00      1.00      1.00        75

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00 

## XGBoost model (if xgboost is installed)

In [13]:
if HAS_XGB:
    xgb_clf = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=4,
        subsample=0.9,
        colsample_bytree=0.75,
        random_state=42,
        eval_metric='logloss'
)
    
    xgb_pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', xgb_clf)
])
    
    cv_scores_xgb = cross_val_score(xgb_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    print('CV accuracy (XGB):', cv_scores_xgb.mean(), '+/-', cv_scores_xgb.std())
    
    xgb_pipeline.fit(X_train, y_train)
    y_pred_xgb = xgb_pipeline.predict(X_test)
    print('Test accuracy (XGB):', accuracy_score(y_test, y_pred_xgb))
else:
    print('xgboost is not installed; skipping XGB model.')

CV accuracy (XGB): 0.9892857142857142 +/- 0.014285714285714285
Test accuracy (XGB): 0.9833333333333333


## Save trained models to disk

In [14]:
# Directory to save models (relative to notebook location)
models_dir = os.path.join('..', 'models')
os.makedirs(models_dir, exist_ok=True)

# Save RandomForest pipeline
rf_path = os.path.join(models_dir, 'rf_pipeline.joblib')
joblib.dump(rf_pipeline, rf_path)
print('Saved RandomForest model to:', rf_path)

# Save Logistic Regression pipeline
logreg_path = os.path.join(models_dir, 'logreg_pipeline.joblib')
joblib.dump(logreg_pipeline, logreg_path)
print('Saved Logistic Regression model to:', logreg_path)

# Save AdaBoost pipeline
ada_path = os.path.join(models_dir, 'ada_pipeline.joblib')
joblib.dump(ada_pipeline, ada_path)
print('Saved AdaBoost model to:', ada_path)

# Save XGBoost pipeline if available
if HAS_XGB:
    xgb_path = os.path.join(models_dir, 'xgb_pipeline.joblib')
    joblib.dump(xgb_pipeline, xgb_path)
    print('Saved XGBoost model to:', xgb_path)
else:
    print('XGBoost not available; no XGB model saved.')

Saved RandomForest model to: ..\models\rf_pipeline.joblib
Saved Logistic Regression model to: ..\models\logreg_pipeline.joblib
Saved AdaBoost model to: ..\models\ada_pipeline.joblib
Saved XGBoost model to: ..\models\xgb_pipeline.joblib
