# Chronic Kidney Disease Prediction using Machine Learning

This notebook demonstrates how to build a machine learning model that predicts chronic kidney disease (CKD) from the UCI Chronic Kidney Disease Dataset. It covers data loading, exploratory data analysis, preprocessing, training a gradient boosting classifier, and evaluating and saving the model.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import data_preprocessing utility
from data_preprocessing import preprocess_tabular_data

# --- 1. Data Loading ---
data_path = '../data/raw/kidney_disease.csv'

# Column names for the chronic kidney disease dataset
column_names = ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'classification']


def load_ckd_csv(path, names):
    """Read the CKD CSV at *path* into a DataFrame whose columns are *names*.

    The first line of the file is inspected. If it already contains the last
    expected column name (the target), the file is read with its own header
    and only the expected columns are kept, so extra columns such as a row id
    are dropped. Otherwise the file is treated as header-less and *names* is
    applied positionally.

    Passing ``names=`` alone to ``read_csv`` would turn a header row into a
    data record, which is why the header is detected explicitly here.

    Cells equal to '?' are parsed as missing values.
    NOTE(review): '?' is presumed to be the missing-value marker of this
    dataset's distribution -- confirm against your copy of the file.

    Args:
        path: Location of the CSV file.
        names: Expected column names, target column last.

    Returns:
        A DataFrame with exactly the columns in *names*, in that order.

    Raises:
        FileNotFoundError: If *path* does not exist.
        KeyError: If the file has a header that lacks one of *names*.
    """
    # 'utf-8-sig' transparently drops a byte-order mark if one is present.
    with open(path, encoding='utf-8-sig') as handle:
        first_line_fields = [
            field.strip().strip('"') for field in handle.readline().split(',')
        ]

    if names[-1] in first_line_fields:
        frame = pd.read_csv(path, header=0, na_values=['?'])
        frame.columns = [str(col).strip() for col in frame.columns]
        return frame[list(names)]
    return pd.read_csv(path, header=None, names=list(names), na_values=['?'])


def make_dummy_ckd_data(n_rows=100, seed=42):
    """Build a random stand-in dataset with the same columns as the CKD data.

    The values carry no signal; the frame only lets the rest of the notebook
    run when the real CSV is unavailable. A fixed *seed* makes the fallback
    reproducible between runs.

    Args:
        n_rows: Number of rows to generate.
        seed: Seed for the random generator.

    Returns:
        A DataFrame with *n_rows* rows and the CKD column layout.
    """
    rng = np.random.default_rng(seed)
    normal_abnormal = ['normal', 'abnormal']
    present_absent = ['present', 'notpresent']
    yes_no = ['yes', 'no']
    return pd.DataFrame({
        'age': rng.integers(20, 80, n_rows),
        'bp': rng.integers(50, 180, n_rows),
        'sg': rng.choice([1.005, 1.010, 1.015, 1.020, 1.025], n_rows),
        'al': rng.integers(0, 5, n_rows),
        'su': rng.integers(0, 5, n_rows),
        'rbc': rng.choice(normal_abnormal, n_rows),
        'pc': rng.choice(normal_abnormal, n_rows),
        'pcc': rng.choice(present_absent, n_rows),
        'ba': rng.choice(present_absent, n_rows),
        'bgr': rng.integers(70, 300, n_rows),
        'bu': rng.integers(10, 150, n_rows),
        'sc': rng.uniform(0.5, 15, n_rows),
        'sod': rng.integers(120, 160, n_rows),
        'pot': rng.uniform(2.5, 6.0, n_rows),
        'hemo': rng.uniform(8, 18, n_rows),
        'pcv': rng.integers(25, 55, n_rows),
        'wc': rng.integers(3000, 15000, n_rows),
        'rc': rng.uniform(3.0, 6.0, n_rows),
        'htn': rng.choice(yes_no, n_rows),
        'dm': rng.choice(yes_no, n_rows),
        'cad': rng.choice(yes_no, n_rows),
        'appet': rng.choice(['good', 'poor'], n_rows),
        'pe': rng.choice(yes_no, n_rows),
        'ane': rng.choice(yes_no, n_rows),
        'classification': rng.choice(['ckd', 'notckd'], n_rows),
    })


# Only the load itself sits in the try body, so an unrelated error in the
# success-path prints cannot be mistaken for a missing file.
try:
    data = load_ckd_csv(data_path, column_names)
except FileNotFoundError:
    print(f"Error: {data_path} not found. Creating dummy data for demonstration.")
    data = make_dummy_ckd_data()
else:
    print("Dataset loaded successfully.")
    print(data.head())

# --- 2. Exploratory Data Analysis (EDA) ---
print("\nDataset Info:")
# info() writes its report to stdout itself, so it needs no print().
data.info()

print("\nDataset Description:")
# describe() only *returns* a summary frame. As a bare expression in the
# middle of a cell (or in a script) the result is discarded, so it has to be
# printed explicitly to appear under the heading above.
print(data.describe())

print("\nMissing values:")
print(data.isnull().sum())

# Target distribution
plt.figure(figsize=(6, 4))
data['classification'].value_counts().plot(kind='bar')
plt.title("Target Distribution")
plt.xlabel("Classification (ckd: Chronic Kidney Disease, notckd: Not CKD)")
plt.ylabel("Count")
plt.show()

# Correlation matrix for numerical features
# Only columns pandas parsed as numeric are included; text columns are
# excluded by the dtype filter.
numerical_features = data.select_dtypes(include=[np.number]).columns
plt.figure(figsize=(12, 10))
sns.heatmap(data[numerical_features].corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix (Numerical Features)")
plt.show()

# --- 3. Data Preprocessing ---
# Define columns for preprocessing
numerical_cols = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
# sg, al and su hold numeric codes but are deliberately listed as categorical.
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'sg', 'al', 'su']
target_column = 'classification'

# Convert target to binary (0: notckd, 1: ckd)
# A plain .map() silently turns every label it does not know into NaN. That
# includes labels with stray whitespace and the 0/1 values left behind when
# this cell is re-run. The encoding below is therefore idempotent and fails
# loudly on anything unexpected.
label_codes = {'notckd': 0, 'ckd': 1}
target_values = data[target_column]
already_encoded = (
    target_values.notna().all()
    and set(target_values.unique()) <= set(label_codes.values())
)
if not already_encoded:
    stripped_labels = target_values.astype(str).str.strip()
    encoded_labels = stripped_labels.map(label_codes)
    unmapped = sorted(stripped_labels[encoded_labels.isna()].unique())
    if unmapped:
        raise ValueError(
            f"Unexpected values in '{target_column}': {unmapped}. "
            f"Expected one of {sorted(label_codes)}."
        )
    data[target_column] = encoded_labels.astype('int64')

# NOTE(review): preprocess_tabular_data is a project helper; X_processed is
# assumed to be a DataFrame because .head() is called on it below -- confirm
# against data_preprocessing.
X_processed, y, preprocessor = preprocess_tabular_data(data, numerical_cols, categorical_cols, target_column)

print("\nShape of preprocessed features:", X_processed.shape)
print("First 5 rows of preprocessed features:")
print(X_processed.head())

# Split data into training and testing sets
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42, stratify=y)

print("Data split into training and testing sets.")
print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")

# --- 4. Model Building (Gradient Boosting) ---
# Hyperparameters are gathered in one mapping; random_state fixes the seed so
# repeated runs on the same split train the same model.
gradient_boosting_params = {'n_estimators': 100, 'random_state': 42}
model = GradientBoostingClassifier(**gradient_boosting_params)

print("Training Gradient Boosting model...")
model.fit(X_train, y_train)
print("Model training complete.")

# --- 5. Model Evaluation ---
from sklearn.metrics import classification_report

# Hard class predictions feed the threshold-based scores. Column 1 of
# predict_proba feeds the AUC.
y_pred = model.predict(X_test)
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# The four label-based scorers share the same (y_true, y_pred) call shape.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("\nModel Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

# Classification Report
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)

from sklearn.metrics import confusion_matrix, roc_curve

# Confusion Matrix
# Counts are rendered as integers (fmt='d') on a heatmap.
cm = confusion_matrix(y_test, y_pred)
heatmap_options = {'annot': True, 'fmt': 'd', 'cmap': 'Blues'}
plt.figure(figsize=(6, 4))
sns.heatmap(cm, **heatmap_options)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ROC Curve
# The thresholds returned by roc_curve are not needed for the plot.
roc_points = roc_curve(y_test, y_pred_proba)
fpr, tpr, _ = roc_points
roc_label = 'ROC curve (area = %0.2f)' % roc_auc
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
# Dashed diagonal drawn for reference alongside the model's curve.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc='lower right')
plt.show()

# Feature Importance
# Pair every preprocessed column with the model's importance score, then
# order the table from most to least important.
importance_table = pd.DataFrame({
    'feature': X_processed.columns,
    'importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

top_features = feature_importance.head(10)
plt.figure(figsize=(10, 6))
sns.barplot(data=top_features, x='importance', y='feature')
plt.title("Top 10 Feature Importances")
plt.xlabel("Importance")
plt.show()

# --- 6. Save Model (Optional) ---
import joblib

# Persist the model and the preprocessor as separate files in the current
# working directory.
artifacts_to_save = (
    (model, 'kidney_disease_gradient_boosting_model.pkl'),
    (preprocessor, 'kidney_disease_preprocessor.pkl'),
)
for fitted_object, output_file in artifacts_to_save:
    joblib.dump(fitted_object, output_file)
print("Model and preprocessor saved.")
