# Chronic Kidney Disease (CKD) Analysis - Two-Part ML Pipeline

This notebook implements a comprehensive machine learning pipeline for CKD analysis:

## Part 1: Regression
Predict `creatinine` levels using:
- Features: `age`, `albumin`, `red_blood_cells`, `pus_cell`, `bacteria`, `urine_ph` (the code below substitutes urine specific gravity, `sg`, for urine pH)
- Algorithms: Perceptron-style regressor (a single-hidden-layer `MLPRegressor`), XGBoost, and CatBoost

## Part 2: Classification
Predict CKD level using:
- Features: predicted `creatinine` and `age`
- CKD staging via eGFR and KDIGO guidelines
- Algorithms: MLP (Multi-Layer Perceptron) & SVM (Support Vector Machine)

Dataset: [UCI CKD Dataset](https://archive.ics.uci.edu/dataset/336/chronic+kidney+disease)

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Regression models (Part 1)
from sklearn.linear_model import Perceptron as PerceptronClassifier
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Classification models (Part 2)
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Download dataset
import urllib.request
import os

print("Libraries imported successfully!")

## 2. Data Loading

In [None]:
# Fetch the CKD ARFF file from the UCI repository once; later runs reuse the local copy
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00336/chronic_kidney_disease.arff"
filename = "chronic_kidney_disease.arff"

if os.path.exists(filename):
    print("Dataset already exists.")
else:
    print(f"Downloading dataset from {url}...")
    urllib.request.urlretrieve(url, filename)
    print("Download complete!")

# Parse the ARFF file into a DataFrame
from scipy.io import arff
import io

data, meta = arff.loadarff(filename)
df = pd.DataFrame(data)

# Object-dtype columns hold bytes after loading; turn them into regular strings
str_columns = df.select_dtypes([object]).columns
for col in str_columns:
    decoded = df[col].str.decode('utf-8')
    df[col] = decoded

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Display basic information
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\n" + "="*50 + "\n")

# Statistical summary (rendered as the cell's rich output)
print("Statistical Summary:")
df.describe()

In [None]:
# Tabulate, per column, how many values are missing and what share of rows that is;
# only columns with at least one missing value are shown, worst first
print("Missing Values:")
missing_data = df.isnull().sum()
missing_percent = missing_data / len(df) * 100
missing_df = (
    pd.DataFrame({'Missing Count': missing_data, 'Percentage': missing_percent})
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)
print(missing_df)

In [None]:
# Visualize missing data (left panel) and the class balance (right panel)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
missing_counts = df.isnull().sum()
missing_counts = missing_counts[missing_counts > 0].sort_values(ascending=False)
if len(missing_counts) > 0:
    missing_counts.plot(kind='bar')
    plt.title('Missing Values by Column')
    plt.xlabel('Columns')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
else:
    plt.text(0.5, 0.5, 'No Missing Values', ha='center', va='center')
    plt.title('Missing Values by Column')

plt.subplot(1, 2, 2)
# Look the label column up instead of hard-coding it, so a differently named
# label column does not abort the cell with a KeyError.
# NOTE(review): the UCI ARFF appears to name the label attribute `class`, while
# CSV exports of the same data use `classification` - confirm against the file.
label_col = next((name for name in ('classification', 'class') if name in df.columns), None)
if label_col is not None:
    class_dist = df[label_col].value_counts()
    plt.pie(class_dist.values, labels=class_dist.index, autopct='%1.1f%%')
else:
    plt.text(0.5, 0.5, 'No class column found', ha='center', va='center')
plt.title('Distribution of CKD vs Non-CKD')

plt.tight_layout()
plt.show()

In [None]:
# List every column with a 1-based position
print("Column names:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position}. {column_name}")

## 4. Data Preprocessing

In [None]:
# Work on a copy so the raw frame `df` stays untouched; '?' placeholders become NaN
df_processed = df.copy().replace('?', np.nan)

# Columns expected to be numeric but possibly stored as strings
numeric_columns = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']

for col in numeric_columns:
    # Names that are not present in this frame are skipped silently
    if col not in df_processed.columns:
        continue
    # Anything that cannot be parsed as a number becomes NaN
    df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

print("Numeric conversion complete.")
print(f"Shape after conversion: {df_processed.shape}")

In [None]:
# Integer-encode the categorical columns, leaving missing entries as NaN
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'classification']

# One fitted encoder per column, kept so codes can be mapped back to labels
label_encoders = {}
for col in categorical_cols:
    if col not in df_processed.columns:
        continue
    le = LabelEncoder()
    # Only the non-missing entries are encoded
    mask = df_processed[col].notna()
    if mask.sum() > 0:
        encoded_values = le.fit_transform(df_processed.loc[mask, col])
        df_processed.loc[mask, col] = encoded_values
        label_encoders[col] = le
    # The column is still object-typed after the in-place write; make it numeric
    df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

print("Categorical encoding complete.")

In [None]:
# Display processed dataframe
print("Processed DataFrame Info:")
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" line that print(df_processed.info()) would add.
df_processed.info()
print("\nFirst few rows:")
df_processed.head()

## 5. PART 1: Regression - Predict Creatinine Levels

### Features: age, albumin (al), red_blood_cells (rbc), pus_cell (pc), bacteria (ba), specific gravity (sg, used here in place of urine_ph)
### Target: serum creatinine (sc)

In [None]:
# Regression inputs and target.
# Note: 'sg' is urine specific gravity; it stands in for the urine pH feature
# named in the notebook's plan.
feature_cols_regression = ['age', 'al', 'rbc', 'pc', 'ba', 'sg']
target_col_regression = 'sc'  # serum creatinine

# Keep only the modelling columns and drop rows whose target is missing;
# missing feature values are kept here
regression_columns = feature_cols_regression + [target_col_regression]
df_regression = (
    df_processed[regression_columns]
    .copy()
    .dropna(subset=[target_col_regression])
)

feature_missing_counts = df_regression[feature_cols_regression].isnull().sum()
print(f"Regression dataset shape: {df_regression.shape}")
print(f"Missing values in features:\n{feature_missing_counts}")
print("\nTarget variable (creatinine) statistics:")
print(df_regression[target_col_regression].describe())

In [None]:
# Three side-by-side views of the regression target:
# its histogram, its relation to age, and the feature/target correlations
fig, (ax_hist, ax_age, ax_corr) = plt.subplots(1, 3, figsize=(15, 5))

creatinine_values = df_regression[target_col_regression]

ax_hist.hist(creatinine_values.dropna(), bins=30, edgecolor='black')
ax_hist.set_xlabel('Serum Creatinine')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Serum Creatinine')

ax_age.scatter(df_regression['age'], creatinine_values, alpha=0.5)
ax_age.set_xlabel('Age')
ax_age.set_ylabel('Serum Creatinine')
ax_age.set_title('Age vs Creatinine')

correlation_matrix = df_regression[feature_cols_regression + [target_col_regression]].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=ax_corr)
ax_corr.set_title('Correlation Matrix')

plt.tight_layout()
plt.show()

In [None]:
# Prepare data for regression
X_regression = df_regression[feature_cols_regression]
y_regression = df_regression[target_col_regression]

# Split first, so the imputer and scaler below learn their statistics from the
# training rows only. Fitting the imputer on all rows before splitting would
# leak test-set medians into training.
X_train_raw, X_test_raw, y_train_reg, y_test_reg = train_test_split(
    X_regression, y_regression, test_size=0.2, random_state=42
)

# Impute missing feature values with medians learned on the training split
imputer = SimpleImputer(strategy='median')
X_train_reg = pd.DataFrame(
    imputer.fit_transform(X_train_raw),
    columns=feature_cols_regression,
    index=X_train_raw.index,
)
X_test_reg = pd.DataFrame(
    imputer.transform(X_test_raw),
    columns=feature_cols_regression,
    index=X_test_raw.index,
)

# Fully imputed feature table, kept under its original name for any later use
X_regression_imputed = pd.DataFrame(
    imputer.transform(X_regression),
    columns=feature_cols_regression,
    index=X_regression.index,
)

# Scale the features (statistics from the training split only)
scaler_reg = StandardScaler()
X_train_reg_scaled = scaler_reg.fit_transform(X_train_reg)
X_test_reg_scaled = scaler_reg.transform(X_test_reg)

print(f"Training set size: {X_train_reg.shape}")
print(f"Test set size: {X_test_reg.shape}")

### Model 1: Perceptron-based Regressor (MLPRegressor as Perceptron alternative)

In [None]:
# sklearn's Perceptron is a classifier, so an MLPRegressor with one hidden
# layer is used as the "perceptron-based" regressor instead
print("Training Perceptron-based Regressor (MLPRegressor with single layer)...\n")

perceptron_params = dict(
    hidden_layer_sizes=(10,),  # one hidden layer, 10 neurons
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
)
perceptron_reg = MLPRegressor(**perceptron_params)
perceptron_reg.fit(X_train_reg_scaled, y_train_reg)

# Predictions on both splits
y_pred_perceptron_train = perceptron_reg.predict(X_train_reg_scaled)
y_pred_perceptron_test = perceptron_reg.predict(X_test_reg_scaled)

# Report RMSE / MAE / R² for the training split, then the test split
print("Perceptron-based Regressor Performance:")
for split_name, y_true, y_hat in (
    ("Training Set", y_train_reg, y_pred_perceptron_train),
    ("Test Set", y_test_reg, y_pred_perceptron_test),
):
    print(f"\n{split_name}:")
    print(f"  RMSE: {np.sqrt(mean_squared_error(y_true, y_hat)):.4f}")
    print(f"  MAE: {mean_absolute_error(y_true, y_hat):.4f}")
    print(f"  R² Score: {r2_score(y_true, y_hat):.4f}")

### Model 2: XGBoost Regressor

In [None]:
print("Training XGBoost Regressor...\n")

# Gradient-boosted trees, trained on the same scaled features as the other models
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    verbosity=0,
)
xgb_reg = XGBRegressor(**xgb_params)
xgb_reg.fit(X_train_reg_scaled, y_train_reg)

# Predictions on both splits
y_pred_xgb_train = xgb_reg.predict(X_train_reg_scaled)
y_pred_xgb_test = xgb_reg.predict(X_test_reg_scaled)

# Report RMSE / MAE / R² for the training split, then the test split
print("XGBoost Regressor Performance:")
for split_name, y_true, y_hat in (
    ("Training Set", y_train_reg, y_pred_xgb_train),
    ("Test Set", y_test_reg, y_pred_xgb_test),
):
    print(f"\n{split_name}:")
    print(f"  RMSE: {np.sqrt(mean_squared_error(y_true, y_hat)):.4f}")
    print(f"  MAE: {mean_absolute_error(y_true, y_hat):.4f}")
    print(f"  R² Score: {r2_score(y_true, y_hat):.4f}")

### Model 3: CatBoost Regressor

In [None]:
print("Training CatBoost Regressor...\n")

# CatBoost with the same number of rounds, learning rate and depth as XGBoost above
catboost_params = dict(
    iterations=100,
    learning_rate=0.1,
    depth=5,
    random_state=42,
    verbose=False,
)
catboost_reg = CatBoostRegressor(**catboost_params)
catboost_reg.fit(X_train_reg_scaled, y_train_reg)

# Predictions on both splits
y_pred_catboost_train = catboost_reg.predict(X_train_reg_scaled)
y_pred_catboost_test = catboost_reg.predict(X_test_reg_scaled)

# Report RMSE / MAE / R² for the training split, then the test split
print("CatBoost Regressor Performance:")
for split_name, y_true, y_hat in (
    ("Training Set", y_train_reg, y_pred_catboost_train),
    ("Test Set", y_test_reg, y_pred_catboost_test),
):
    print(f"\n{split_name}:")
    print(f"  RMSE: {np.sqrt(mean_squared_error(y_true, y_hat)):.4f}")
    print(f"  MAE: {mean_absolute_error(y_true, y_hat):.4f}")
    print(f"  R² Score: {r2_score(y_true, y_hat):.4f}")

In [None]:
# Actual-vs-predicted scatter for each regressor on the test split
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

models_preds = [
    ('Perceptron-based', y_pred_perceptron_test),
    ('XGBoost', y_pred_xgb_test),
    ('CatBoost', y_pred_catboost_test)
]

# End points of the dashed y = x reference line (perfect prediction)
identity_line = [y_test_reg.min(), y_test_reg.max()]

for ax, (name, y_pred) in zip(axes, models_preds):
    ax.scatter(y_test_reg, y_pred, alpha=0.5)
    ax.plot(identity_line, identity_line, 'r--', lw=2)
    ax.set_xlabel('Actual Creatinine')
    ax.set_ylabel('Predicted Creatinine')
    ax.set_title(f'{name} - Actual vs Predicted')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side test-set metrics for the three regressors
test_predictions = {
    'Perceptron-based': y_pred_perceptron_test,
    'XGBoost': y_pred_xgb_test,
    'CatBoost': y_pred_catboost_test,
}

performance_comparison = pd.DataFrame({
    'Model': list(test_predictions),
    'Test RMSE': [np.sqrt(mean_squared_error(y_test_reg, pred)) for pred in test_predictions.values()],
    'Test MAE': [mean_absolute_error(y_test_reg, pred) for pred in test_predictions.values()],
    'Test R²': [r2_score(y_test_reg, pred) for pred in test_predictions.values()],
})

rule = "="*70
print("\n" + rule)
print("PART 1 - REGRESSION MODEL COMPARISON")
print(rule)
print(performance_comparison.to_string(index=False))
print(rule)

## 6. PART 2: Classification - Predict CKD Level

### Use predicted creatinine and age to calculate eGFR and classify CKD stage using KDIGO guidelines

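#### eGFR Calculation (simplified MDRD-style equation):
For demonstration, we'll use a simplified formula. Its coefficients match the 4-variable MDRD Study equation rather than CKD-EPI, and the sex and race multipliers are omitted:
- eGFR ≈ 175 × (Serum Creatinine)^(-1.154) × (Age)^(-0.203)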

#### KDIGO CKD Stages (based on eGFR):
- Stage 1 (Normal/High): eGFR ≥ 90
- Stage 2 (Mild): eGFR 60-89
- Stage 3a (Mild to Moderate): eGFR 45-59
- Stage 3b (Moderate to Severe): eGFR 30-44
- Stage 4 (Severe): eGFR 15-29
- Stage 5 (Kidney Failure): eGFR < 15

In [None]:
# Predict creatinine for every row of df_processed with the XGBoost regressor.
# XGBoost is hard-coded here; it is not selected from the Part 1 comparison table.
# The rows scored here include the ones the regressor was trained on.
X_all_regression = df_processed[feature_cols_regression].copy()
X_all_regression_imputed = imputer.transform(X_all_regression)
X_all_regression_scaled = scaler_reg.transform(X_all_regression_imputed)

predicted_creatinine = xgb_reg.predict(X_all_regression_scaled)

# Classification inputs: observed age plus predicted creatinine.
# Rows with a missing age are dropped.
df_classification = pd.DataFrame(
    {
        'age': df_processed['age'],
        'predicted_creatinine': predicted_creatinine,
    }
).dropna()

print(f"Classification dataset shape: {df_classification.shape}")
df_classification.head()

In [None]:
# Estimate eGFR from creatinine and age with a simplified power-law formula
def calculate_egfr(creatinine, age):
    """
    Simplified eGFR estimate.

    eGFR = 175 × (Serum Creatinine)^(-1.154) × (Age)^(-0.203)

    NOTE(review): these coefficients are those of the 4-variable MDRD Study
    equation rather than CKD-EPI, and no sex or race multiplier is applied.

    Parameters
    ----------
    creatinine : scalar or array-like
        Serum creatinine; values below 0.1 are raised to 0.1.
    age : scalar or array-like
        Age; values below 1 are raised to 1.

    Returns
    -------
    Same shape as the broadcast inputs: the estimated eGFR.
    """
    # Floor both inputs so the negative exponents never see zero or negatives
    safe_creatinine = np.maximum(creatinine, 0.1)
    safe_age = np.maximum(age, 1)

    return 175 * (safe_creatinine ** (-1.154)) * (safe_age ** (-0.203))

# Add an eGFR column computed from predicted creatinine and age
df_classification['egfr'] = calculate_egfr(
    creatinine=df_classification['predicted_creatinine'],
    age=df_classification['age'],
)

egfr_values = df_classification['egfr']
print("eGFR calculated successfully!")
print(f"eGFR range: {egfr_values.min():.2f} - {egfr_values.max():.2f}")
print("\neGFR statistics:")
print(egfr_values.describe())

In [None]:
# Classify CKD stage based on KDIGO guidelines
def classify_ckd_stage(egfr):
    """
    Classify CKD stage based on eGFR using KDIGO guidelines.

    Parameters
    ----------
    egfr : float
        A single eGFR value (scalar, not an array).

    Returns
    -------
    str
        One of the six stage labels, from 'Stage 1 (Normal/High)' for
        eGFR >= 90 down to 'Stage 5 (Kidney Failure)' for eGFR < 15.

    Raises
    ------
    ValueError
        If `egfr` is NaN. NaN fails every `>=` comparison, so without this
        guard a missing value would be reported as 'Stage 5 (Kidney Failure)'.
    """
    if np.isnan(egfr):
        raise ValueError("eGFR is NaN; cannot assign a CKD stage")

    # (lower bound, label) pairs, checked from the highest cutoff down
    stage_floors = (
        (90, 'Stage 1 (Normal/High)'),
        (60, 'Stage 2 (Mild)'),
        (45, 'Stage 3a (Mild-Moderate)'),
        (30, 'Stage 3b (Moderate-Severe)'),
        (15, 'Stage 4 (Severe)'),
    )
    for floor, label in stage_floors:
        if egfr >= floor:
            return label
    return 'Stage 5 (Kidney Failure)'

# Label every row with its stage, one eGFR value at a time
df_classification['ckd_stage'] = df_classification['egfr'].map(classify_ckd_stage)

print("CKD stages classified successfully!")
print("\nCKD Stage Distribution:")
stage_distribution = df_classification['ckd_stage'].value_counts().sort_index()
print(stage_distribution)

In [None]:
# Visualize CKD stage distribution, eGFR, and their relation to age and creatinine
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Distribution of CKD stages. Sorting by label puts the bars in stage order
# (Stage 1 ... Stage 5) instead of ordering them by frequency.
stage_counts = df_classification['ckd_stage'].value_counts().sort_index()
axes[0, 0].bar(range(len(stage_counts)), stage_counts.values)
axes[0, 0].set_xticks(range(len(stage_counts)))
axes[0, 0].set_xticklabels(stage_counts.index, rotation=45, ha='right')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_title('Distribution of CKD Stages')
axes[0, 0].grid(True, alpha=0.3)

# eGFR distribution, marked with all five cutoffs that separate the six stages
# (previously the 45 and 15 cutoffs were not drawn)
egfr_boundaries = [
    (90, 'r', 'Stage 1/2 boundary'),
    (60, 'orange', 'Stage 2/3a boundary'),
    (45, 'gold', 'Stage 3a/3b boundary'),
    (30, 'purple', 'Stage 3b/4 boundary'),
    (15, 'black', 'Stage 4/5 boundary'),
]
axes[0, 1].hist(df_classification['egfr'], bins=30, edgecolor='black')
axes[0, 1].set_xlabel('eGFR')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Distribution of eGFR')
for cutoff, line_color, line_label in egfr_boundaries:
    axes[0, 1].axvline(x=cutoff, color=line_color, linestyle='--', label=line_label)
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Age vs eGFR, colored by stage; the colorbar ticks carry the stage names
# so the integer category codes are readable
stage_categories = df_classification['ckd_stage'].astype('category')
scatter = axes[1, 0].scatter(df_classification['age'], df_classification['egfr'],
                            c=stage_categories.cat.codes,
                            cmap='viridis', alpha=0.6)
axes[1, 0].set_xlabel('Age')
axes[1, 0].set_ylabel('eGFR')
axes[1, 0].set_title('Age vs eGFR (colored by CKD Stage)')
axes[1, 0].grid(True, alpha=0.3)
stage_colorbar = plt.colorbar(scatter, ax=axes[1, 0])
stage_colorbar.set_ticks(list(range(len(stage_categories.cat.categories))))
stage_colorbar.set_ticklabels(list(stage_categories.cat.categories))

# Creatinine vs eGFR
axes[1, 1].scatter(df_classification['predicted_creatinine'], df_classification['egfr'], alpha=0.6)
axes[1, 1].set_xlabel('Predicted Creatinine')
axes[1, 1].set_ylabel('eGFR')
axes[1, 1].set_title('Predicted Creatinine vs eGFR')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Prepare data for classification
X_classification = df_classification[['age', 'predicted_creatinine']].values
y_classification = df_classification['ckd_stage'].values

# Encode target labels as integers 0..n_classes-1
le_ckd = LabelEncoder()
y_classification_encoded = le_ckd.fit_transform(y_classification)

# A stratified split raises an error when any class has fewer than two
# samples, which can happen for rare stages. In that case fall back to a plain
# random split instead of aborting the notebook.
class_counts = np.bincount(y_classification_encoded)
if class_counts.min() >= 2:
    stratify_labels = y_classification_encoded
else:
    stratify_labels = None
    print("Warning: at least one CKD stage has fewer than 2 samples; "
          "splitting without stratification.")

# Split the data
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_classification, y_classification_encoded, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Scale the features (statistics learned on the training split only)
scaler_clf = StandardScaler()
X_train_clf_scaled = scaler_clf.fit_transform(X_train_clf)
X_test_clf_scaled = scaler_clf.transform(X_test_clf)

print(f"Training set size: {X_train_clf.shape}")
print(f"Test set size: {X_test_clf.shape}")
print(f"Number of classes: {len(np.unique(y_classification_encoded))}")
print(f"Classes: {le_ckd.classes_}")

### Model 1: Multi-Layer Perceptron (MLP) Classifier

In [None]:
print("Training MLP Classifier...\n")

mlp_clf = MLPClassifier(
    hidden_layer_sizes=(100, 50),  # Two hidden layers
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1
)

mlp_clf.fit(X_train_clf_scaled, y_train_clf)

# Predictions
y_pred_mlp_train = mlp_clf.predict(X_train_clf_scaled)
y_pred_mlp_test = mlp_clf.predict(X_test_clf_scaled)

# Evaluate
print("MLP Classifier Performance:")
print("\nTraining Set Accuracy:", accuracy_score(y_train_clf, y_pred_mlp_train))
print("Test Set Accuracy:", accuracy_score(y_test_clf, y_pred_mlp_test))

print("\nTest Set Classification Report:")
# Pass the full list of encoded labels explicitly. Without `labels`, the report
# infers classes from the test labels and predictions; if a rare stage is
# absent from both, that count no longer matches `target_names` and
# classification_report raises.
print(classification_report(
    y_test_clf,
    y_pred_mlp_test,
    labels=np.arange(len(le_ckd.classes_)),
    target_names=le_ckd.classes_,
    zero_division=0,
))

### Model 2: Support Vector Machine (SVM) Classifier

In [None]:
print("Training SVM Classifier...\n")

svm_clf = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    random_state=42
)

svm_clf.fit(X_train_clf_scaled, y_train_clf)

# Predictions
y_pred_svm_train = svm_clf.predict(X_train_clf_scaled)
y_pred_svm_test = svm_clf.predict(X_test_clf_scaled)

# Evaluate
print("SVM Classifier Performance:")
print("\nTraining Set Accuracy:", accuracy_score(y_train_clf, y_pred_svm_train))
print("Test Set Accuracy:", accuracy_score(y_test_clf, y_pred_svm_test))

print("\nTest Set Classification Report:")
# Pass the full list of encoded labels explicitly. Without `labels`, the report
# infers classes from the test labels and predictions; if a rare stage is
# absent from both, that count no longer matches `target_names` and
# classification_report raises.
print(classification_report(
    y_test_clf,
    y_pred_svm_test,
    labels=np.arange(len(le_ckd.classes_)),
    target_names=le_ckd.classes_,
    zero_division=0,
))

In [None]:
# Visualize confusion matrices
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# MLP Confusion Matrix
cm_mlp = confusion_matrix(y_test_clf, y_pred_mlp_test)
sns.heatmap(cm_mlp, annot=True, fmt='d', cmap='Blues', 
            xticklabels=le_ckd.classes_, yticklabels=le_ckd.classes_, ax=axes[0])
axes[0].set_title('MLP Classifier - Confusion Matrix')
axes[0].set_ylabel('Actual')
axes[0].set_xlabel('Predicted')
plt.setp(axes[0].get_xticklabels(), rotation=45, ha='right')
plt.setp(axes[0].get_yticklabels(), rotation=0)

# SVM Confusion Matrix
cm_svm = confusion_matrix(y_test_clf, y_pred_svm_test)
sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Greens', 
            xticklabels=le_ckd.classes_, yticklabels=le_ckd.classes_, ax=axes[1])
axes[1].set_title('SVM Classifier - Confusion Matrix')
axes[1].set_ylabel('Actual')
axes[1].set_xlabel('Predicted')
plt.setp(axes[1].get_xticklabels(), rotation=45, ha='right')
plt.setp(axes[1].get_yticklabels(), rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Train/test accuracy for both classifiers in one table
clf_predictions = {
    'MLP Classifier': (y_pred_mlp_train, y_pred_mlp_test),
    'SVM Classifier': (y_pred_svm_train, y_pred_svm_test),
}

classification_comparison = pd.DataFrame({
    'Model': list(clf_predictions),
    'Training Accuracy': [
        accuracy_score(y_train_clf, train_pred) for train_pred, _ in clf_predictions.values()
    ],
    'Test Accuracy': [
        accuracy_score(y_test_clf, test_pred) for _, test_pred in clf_predictions.values()
    ],
})

rule = "="*70
print("\n" + rule)
print("PART 2 - CLASSIFICATION MODEL COMPARISON")
print(rule)
print(classification_comparison.to_string(index=False))
print(rule)

## 7. Summary and Conclusions

In [None]:
# Print a text summary of both parts, naming the best model of each by test score
print("="*80)
print("COMPLETE ML PIPELINE SUMMARY")
print("="*80)
print("\nPART 1: REGRESSION - Predicting Serum Creatinine")
print("-" * 80)
print("Features used: age, albumin, red_blood_cells, pus_cell, bacteria, specific_gravity")
print("\nModel Performance Comparison:")
print(performance_comparison.to_string(index=False))

# idxmax() returns an index *label*, so the row is fetched with .loc;
# .iloc would only work while the frame keeps its default 0..n-1 index.
best_reg_model_idx = performance_comparison['Test R²'].idxmax()
best_reg_row = performance_comparison.loc[best_reg_model_idx]
best_reg_model = best_reg_row['Model']
print(f"\n✓ Best Regression Model: {best_reg_model}")
print(f"  - Test R² Score: {best_reg_row['Test R²']:.4f}")
print(f"  - Test RMSE: {best_reg_row['Test RMSE']:.4f}")

print("\n" + "="*80)
print("\nPART 2: CLASSIFICATION - Predicting CKD Stage (KDIGO)")
print("-" * 80)
print("Features used: predicted_creatinine, age")
print("Target: CKD Stage (calculated via eGFR)")
print("\nModel Performance Comparison:")
print(classification_comparison.to_string(index=False))

# Same label-based lookup for the classifiers
best_clf_model_idx = classification_comparison['Test Accuracy'].idxmax()
best_clf_row = classification_comparison.loc[best_clf_model_idx]
best_clf_model = best_clf_row['Model']
print(f"\n✓ Best Classification Model: {best_clf_model}")
print(f"  - Test Accuracy: {best_clf_row['Test Accuracy']:.4f}")

print("\n" + "="*80)
print("\nKEY INSIGHTS:")
print("-" * 80)
print("1. Successfully implemented a two-part ML pipeline for CKD analysis")
print("2. Part 1 predicts creatinine levels using patient features")
print("3. Part 2 uses predicted creatinine to calculate eGFR and classify CKD stage")
print("4. Multiple algorithms tested for both regression and classification tasks")
print("5. KDIGO guidelines implemented for CKD stage classification")
print("="*80)

## 8. Pipeline Demonstration: End-to-End Prediction

In [None]:
# Walk one hand-written patient record through the whole pipeline
rule = "="*80
print("COMPLETE PIPELINE DEMONSTRATION")
print(rule)

# Sample patient data (categorical fields are given as their encoded integers)
sample_patient = {
    'age': 55,
    'al': 2.0,  # albumin
    'rbc': 1,    # red blood cells (encoded)
    'pc': 1,     # pus cell (encoded)
    'ba': 0,     # bacteria (encoded)
    'sg': 1.015  # specific gravity
}

print("\nSample Patient Data:")
for key, value in sample_patient.items():
    print(f"  {key}: {value}")

# Arrange the features in the order the regressor was trained on, then scale.
# The sample has no missing values, so the imputer is not applied.
sample_row = [sample_patient[feature] for feature in feature_cols_regression]
X_sample = np.array([sample_row])
X_sample_scaled = scaler_reg.transform(X_sample)

# Printed "Step 1": creatinine from the XGBoost regressor (hard-coded choice)
predicted_creatinine_sample = xgb_reg.predict(X_sample_scaled)[0]
print(f"\nStep 1 - Predicted Serum Creatinine: {predicted_creatinine_sample:.4f} mg/dL")

# Printed "Step 2": eGFR from predicted creatinine and age
egfr_sample = calculate_egfr(predicted_creatinine_sample, sample_patient['age'])
print(f"Step 2 - Calculated eGFR: {egfr_sample:.2f} mL/min/1.73m²")

# Printed "Step 3": rule-based stage from the eGFR cutoffs
ckd_stage_sample = classify_ckd_stage(egfr_sample)
print(f"Step 3 - CKD Stage: {ckd_stage_sample}")

# Printed "Step 4": the MLP classifier's stage for the same patient, for comparison
X_clf_sample = np.array([[sample_patient['age'], predicted_creatinine_sample]])
X_clf_sample_scaled = scaler_clf.transform(X_clf_sample)
predicted_stage_encoded = mlp_clf.predict(X_clf_sample_scaled)[0]
predicted_stage = le_ckd.inverse_transform([predicted_stage_encoded])[0]
print(f"Step 4 - MLP Predicted Stage: {predicted_stage}")

print("\n" + rule)
print("Pipeline execution completed successfully!")
print(rule)