In [2]:
# Import libraries
import os
import joblib

import numpy as np
import pandas as pd

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Optional: XGBoost if installed
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except ImportError:
    HAS_XGB = False

## Load dataset

In [3]:
# The notebook is expected to live in `notebooks/`, with the CSV under `../dataset/`
data_path = os.path.join('..', 'dataset', 'kidney_disease.csv')
df = pd.read_csv(data_path)

print('Shape:', df.shape)

# Preview the target distribution with tabs removed, whitespace trimmed and case folded.
# This is display-only: `df` itself is left exactly as it was read from disk.
target_preview = (
    df['classification']
    .astype(str)
    .str.replace('\t', '', regex=False)
    .str.strip()
    .str.lower()
)
print('\nTarget distribution (cleaned):')
print(target_preview.value_counts(dropna=False))

df.head()

Shape: (400, 26)

Target distribution (cleaned):
classification
ckd       250
notckd    150
Name: count, dtype: int64


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


## Preprocessing + Model Training (KNNImputer + MinMaxScaler)

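This section applies the dataset-specific cleaning and binary encoding you provided, then trains and evaluates each model inside an `sklearn` pipeline: `KNNImputer → MinMaxScaler → Model`. The fitted pipelines are saved with `joblib`, and a fully preprocessed copy of the dataset is exported to CSV.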

In [4]:
# --- Helpers: cleaning + dataset-specific encoding ---
def clean_text(value):
    """Trim surrounding whitespace from a string and remove every tab character.

    Anything that is not a ``str`` (numbers, NaN, None, ...) is handed back
    unchanged, so the function is safe to apply to mixed-type columns.
    """
    if not isinstance(value, str):
        return value
    trimmed = value.strip()
    return trimmed.replace('\t', '')

def prepare_xy(raw_df: pd.DataFrame):
    """Clean the raw kidney-disease frame and split it into features and target.

    Processing order:
      1. Work on a copy, so the caller's frame is never modified.
      2. Apply ``clean_text`` to every object-dtype column.
      3. Replace literal ``'?'`` cells with NaN.
      4. Encode the target (``ckd`` -> 1, ``notckd`` -> 0) and drop every row
         whose label is anything else, including missing labels.
      5. Drop ``id`` and ``classification`` from the features, coerce
         ``pcv``/``wc``/``rc`` to numeric (unparseable cells become NaN) and
         map the known two-level categorical columns to 0/1 (values outside
         the mapping become NaN).

    Parameters
    ----------
    raw_df : pd.DataFrame
        Raw dataset; must contain a ``classification`` column.

    Returns
    -------
    X : pd.DataFrame
        Feature columns of the rows that carry a recognised label.
    y : pd.Series
        Integer target (1 = ckd, 0 = notckd), sharing ``X``'s index.

    Raises
    ------
    ValueError
        If ``raw_df`` has no ``classification`` column.
    """
    # Fail fast, before doing any cleaning work on a frame we cannot use.
    if 'classification' not in raw_df.columns:
        raise ValueError("Expected 'classification' column in dataset")

    df_local = raw_df.copy()

    # Clean string columns (fix hidden tabs/spaces)
    for col_name in df_local.select_dtypes(include=['object']).columns:
        df_local[col_name] = df_local[col_name].apply(clean_text)

    # Convert '?' markers into NaN (this dataset contains '?' in numeric-like cols)
    df_local = df_local.replace('?', np.nan)

    # Target: ckd -> 1, notckd -> 0.
    # `Series.map` is used instead of `Series.replace`: replacing strings with
    # ints on an object column relies on pandas' deprecated silent downcasting
    # (it emits a FutureWarning), while `map` produces a numeric result directly
    # and turns every unrecognised label into NaN.
    y_raw = df_local['classification'].astype(str).apply(clean_text).str.lower()
    y = y_raw.map({'ckd': 1, 'notckd': 0})

    # Keep only rows with a recognised label; NaN never matches 0 or 1.
    mask = y.isin([0, 1])
    df_local = df_local.loc[mask].copy()
    y = y.loc[mask].astype(int)

    # Features (drop identifiers + target)
    drop_cols = [c for c in ['id', 'classification'] if c in df_local.columns]
    X = df_local.drop(columns=drop_cols)

    # Force known numeric-like columns into numeric
    for col_name in ['pcv', 'wc', 'rc']:
        if col_name in X.columns:
            X[col_name] = pd.to_numeric(X[col_name], errors='coerce')

    # Binary/categorical mappings to 0/1
    mapping = {
        'rbc': {'normal': 1, 'abnormal': 0},
        'pc': {'normal': 1, 'abnormal': 0},
        'pcc': {'present': 1, 'notpresent': 0},
        'ba': {'present': 1, 'notpresent': 0},
        'htn': {'yes': 1, 'no': 0},
        'dm': {'yes': 1, 'no': 0},
        'cad': {'yes': 1, 'no': 0},
        'appet': {'good': 1, 'poor': 0},
        'pe': {'yes': 1, 'no': 0},
        'ane': {'yes': 1, 'no': 0},
    }
    for col_name, map_dict in mapping.items():
        if col_name in X.columns:
            X[col_name] = X[col_name].map(map_dict)

    return X, y

# Derive the feature matrix and the 0/1 target from the raw frame
X, y = prepare_xy(df)
print('X shape:', X.shape)
print('y distribution:\n', y.value_counts())

# Hold out 30% of the rows for testing, stratified on the target; fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Candidate classifiers, keyed by the short name that is also used in the
# printed report and in the saved file names. Each one is wrapped in the same
# KNNImputer -> MinMaxScaler preprocessing pipeline before training.
models = dict(
    rf=RandomForestClassifier(n_estimators=300, random_state=42),
    logreg=LogisticRegression(max_iter=2000),
    ada=AdaBoostClassifier(random_state=42),
)

# XGBoost is optional: register it only when the guarded import succeeded.
if HAS_XGB:
    models['xgb'] = XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.75,
        eval_metric='logloss',
        random_state=42,
    )

# Fitted pipelines, keyed by model name, in the order they were trained
trained_pipelines = {}
for name, clf in models.items():
    # A fresh imputer and scaler per model, so no fitted preprocessing state is shared
    pipe = Pipeline([
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', MinMaxScaler()),
        ('model', clf),
    ])

    # Cross-validate on the training split only; the test split is used solely
    # for the final hold-out evaluation below.
    cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')

    # Refit on the whole training split, then score the held-out rows
    y_pred = pipe.fit(X_train, y_train).predict(X_test)

    print('\n' + '=' * 60)
    print(f'Model: {name}')
    print('CV accuracy:', cv_scores.mean(), '+/-', cv_scores.std())
    print('Test accuracy:', accuracy_score(y_test, y_pred))
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print('Classification report:\n', classification_report(y_test, y_pred))

    trained_pipelines[name] = pipe

# Persist every fitted pipeline under ../models (the directory is created if missing)
models_dir = os.path.join('..', 'models')
os.makedirs(models_dir, exist_ok=True)

# One target file per known model key; all four follow the same naming scheme
save_paths = {
    key: os.path.join(models_dir, f'{key}_knnminmax_pipeline.joblib')
    for key in ('rf', 'logreg', 'ada', 'xgb')
}

# Only the pipelines that were actually trained get written ('xgb' may be absent)
for name, pipe in trained_pipelines.items():
    path = save_paths[name]
    joblib.dump(pipe, path)
    print('Saved', name, 'to:', path)

# Optional: export a fully numeric copy of the dataset (imputed + scaled features, 0/1 target).
# NOTE: the imputer and scaler below are fit on ALL rows of X (train and test together),
# so avoid reusing this file for a held-out evaluation.
preprocess_only = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', MinMaxScaler()),
])
X_all_scaled = preprocess_only.fit_transform(X)

# The rebuilt frame gets a fresh default index, so the target is attached positionally
df_export = pd.DataFrame(X_all_scaled, columns=X.columns).assign(classification=y.values)

out_path = os.path.join('..', 'dataset', 'kidney_disease_preprocessed_knnminmax.csv')
df_export.to_csv(out_path, index=False)
print('Exported preprocessed dataset to:', out_path)

  y = y_raw.replace({'ckd': 1, 'notckd': 0})


X shape: (400, 24)
y distribution:
 classification
1    250
0    150
Name: count, dtype: int64

Model: rf
CV accuracy: 0.9928571428571429 +/- 0.01428571428571428
Test accuracy: 0.9833333333333333
Confusion matrix:
 [[44  1]
 [ 1 74]]
Classification report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98        45
           1       0.99      0.99      0.99        75

    accuracy                           0.98       120
   macro avg       0.98      0.98      0.98       120
weighted avg       0.98      0.98      0.98       120


Model: logreg
CV accuracy: 0.9678571428571427 +/- 0.03072259023943795
Test accuracy: 0.9833333333333333
Confusion matrix:
 [[45  0]
 [ 2 73]]
Classification report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        45
           1       1.00      0.97      0.99        75

    accuracy                           0.98       120
   macro avg       0.98      0