<a href="https://colab.research.google.com/github/QwuophyRain/Kidney-Disease-Early-Detection-System-KDEDS-/blob/main/KDEDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kidney Disease Early Detection System (KDEDS)




In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Bidirectional, Attention, Dropout, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import shap
import matplotlib.pyplot as plt
import joblib
import datetime

class KidneyDiseaseEarlyDetectionSystem:
    """
    A comprehensive system for early detection of kidney diseases using
    temporal deep learning with attention mechanisms.
    """

    def _init_(self, config=None):
        """
        Initialize the kidney disease early detection system.

        Args:
            config: Dictionary containing configuration parameters
        """
        self.config = {
            'sequence_length': 10,  # Number of time steps to consider
            'feature_dim': 42,      # Number of features after processing
            'lstm_units': 128,      # Number of LSTM units
            'attention_heads': 8,   # Number of attention heads
            'dropout_rate': 0.3,    # Dropout rate for regularization
            'learning_rate': 0.001, # Learning rate for optimizer
            'batch_size': 64,       # Batch size for training
            'epochs': 100,          # Maximum epochs for training
            'early_stopping': 10,   # Patience for early stopping
            'threshold': 0.7,       # Default prediction threshold
        }

        if config:
            self.config.update(config)

        self.model = None
        self.data_preprocessor = None
        self.feature_names = None
        self.explainer = None

    def preprocess_data(self, data, is_training=True):
        """
        Preprocess the input data for the model.

        Args:
            data: DataFrame containing patient data
            is_training: Whether this is for training or inference

        Returns:
            Preprocessed data ready for the model
        """
        # Handle missing values using Multiple Imputation by Chained Equations
        if is_training:
            self.data_preprocessor = IterativeImputer(max_iter=10, random_state=42)
            imputed_data = self.data_preprocessor.fit_transform(data.select_dtypes(include=[np.number]))
            joblib.dump(self.data_preprocessor, 'kidney_imputer.pkl')
        else:
            if self.data_preprocessor is None:
                self.data_preprocessor = joblib.load('kidney_imputer.pkl')
            imputed_data = self.data_preprocessor.transform(data.select_dtypes(include=[np.number]))

        # Replace the numeric columns with imputed values
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        data_imputed = data.copy()
        data_imputed[numeric_cols] = imputed_data

        # Create temporal sequences for each patient
        patient_ids = data_imputed['patient_id'].unique()
        sequences = []
        labels = []

        for patient_id in patient_ids:
            patient_data = data_imputed[data_imputed['patient_id'] == patient_id].sort_values('timestamp')

            # Extract features and normalize
            features = patient_data.drop(['patient_id', 'timestamp', 'kidney_disease'], axis=1)
            if is_training:
                self.feature_names = features.columns.tolist()
                self.scaler = StandardScaler()
                features_scaled = self.scaler.fit_transform(features)
                joblib.dump(self.scaler, 'kidney_scaler.pkl')
            else:
                if self.scaler is None:
                    self.scaler = joblib.load('kidney_scaler.pkl')
                features_scaled = self.scaler.transform(features)

            # Create sequences of specified length
            for i in range(len(patient_data) - self.config['sequence_length'] + 1):
                seq = features_scaled[i:i + self.config['sequence_length']]
                sequences.append(seq)

                # Label is positive if kidney disease is diagnosed within next 6 months
                future_diagnosis = patient_data.iloc[i + self.config['sequence_length'] - 1:]['kidney_disease'].max()
                labels.append(future_diagnosis)

        return np.array(sequences), np.array(labels)

    def build_model(self):
        """
        Build the multi-stage temporal deep learning model with attention mechanisms.

        Returns:
            Compiled Keras model
        """
        # Input layer
        input_layer = Input(shape=(self.config['sequence_length'], self.config['feature_dim']))

        # Bidirectional LSTM to capture temporal patterns
        lstm_layer = Bidirectional(LSTM(self.config['lstm_units'], return_sequences=True))(input_layer)
        lstm_layer = Dropout(self.config['dropout_rate'])(lstm_layer)

        # Multi-head attention mechanism
        attention_layer = MultiHeadAttention(
            num_heads=self.config['attention_heads'],
            key_dim=self.config['lstm_units'] // self.config['attention_heads']
        )(lstm_layer, lstm_layer)

        # Skip connection
        combined = tf.keras.layers.Add()([lstm_layer, attention_layer])
        combined = tf.keras.layers.LayerNormalization()(combined)

        # Flatten and dense layers for classification
        flatten = tf.keras.layers.Flatten()(combined)
        dense1 = Dense(64, activation='relu')(flatten)
        dense1 = Dropout(self.config['dropout_rate'])(dense1)
        output = Dense(1, activation='sigmoid')(dense1)

        # Create and compile model
        model = Model(inputs=input_layer, outputs=output)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.config['learning_rate']),
            loss='binary_crossentropy',
            metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
        )

        self.model = model
        return model

    def train(self, train_data, validation_data=None, validation_split=0.2):
        """
        Train the model on the provided data.

        Args:
            train_data: DataFrame containing training data
            validation_data: Optional DataFrame for validation
            validation_split: Validation split if validation_data not provided

        Returns:
            Training history
        """
        # Preprocess the data
        X_train, y_train = self.preprocess_data(train_data, is_training=True)

        # Split for validation if validation_data not provided
        if validation_data is not None:
            X_val, y_val = self.preprocess_data(validation_data, is_training=False)
        else:
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=validation_split, random_state=42
            )

        # Build the model if not already built
        if self.model is None:
            self.build_model()

        # Set up callbacks
        early_stopping = EarlyStopping(
            monitor='val_auc',
            patience=self.config['early_stopping'],
            mode='max',
            restore_best_weights=True
        )

        # Train the model
        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=self.config['epochs'],
            batch_size=self.config['batch_size'],
            callbacks=[early_stopping],
            verbose=1
        )

        # Build explainer for model interpretability
        self._build_explainer(X_train)

        return history

    def _build_explainer(self, train_data_sample):
        """
        Build a SHAP explainer for model interpretability.

        Args:
            train_data_sample: Sample of training data for explainer
        """
        # Create a simplified model to explain predictions
        def model_predict(x):
            return self.model.predict(x)

        # Create the explainer
        self.explainer = shap.KernelExplainer(
            model_predict,
            shap.sample(train_data_sample, 100)  # Sample 100 instances for background distribution
        )

    def predict(self, patient_data):
        """
        Generate predictions for patient data.

        Args:
            patient_data: DataFrame containing patient data

        Returns:
            Dictionary containing predictions and explanations
        """
        # Preprocess the data
        X_test, _ = self.preprocess_data(patient_data, is_training=False)

        # Generate predictions
        risk_scores = self.model.predict(X_test).flatten()
        predictions = risk_scores > self.config['threshold']

        # Generate explanations for predictions
        if self.explainer is not None:
            try:
                shap_values = self.explainer.shap_values(X_test[:10])  # Limit to first 10 for efficiency
                feature_importance = {}
                for i, feature in enumerate(self.feature_names):
                    feature_importance[feature] = np.abs(shap_values[0][:, i]).mean()
            except:
                feature_importance = None
        else:
            feature_importance = None

        # Return results
        results = {
            'patient_ids': patient_data['patient_id'].unique().tolist(),
            'risk_scores': risk_scores.tolist(),
            'predictions': predictions.tolist(),
            'feature_importance': feature_importance,
            'timestamp': datetime.datetime.now().isoformat()
        }

        return results

    def save_model(self, path='kidney_model'):
        """
        Save the trained model and associated components.

        Args:
            path: Base path for saving
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet")

        # Save Keras model
        self.model.save(f'{path}.h5')

        # Save configuration and preprocessing components
        np.save(f'{path}_config.npy', self.config)
        if self.feature_names:
            with open(f'{path}_features.txt', 'w') as f:
                f.write('\n'.join(self.feature_names))

    def load_model(self, path='kidney_model'):
        """
        Load a trained model and associated components.

        Args:
            path: Base path for loading
        """
        # Load Keras model
        self.model = tf.keras.models.load_model(f'{path}.h5')

        # Load configuration and preprocessing components
        self.config = np.load(f'{path}_config.npy', allow_pickle=True).item()
        with open(f'{path}_features.txt', 'r') as f:
            self.feature_names = [line.strip() for line in f.readlines()]

        self.scaler = joblib.load('kidney_scaler.pkl')
        self.data_preprocessor = joblib.load('kidney_imputer.pkl')

    def evaluate(self, test_data, threshold=None):
        """
        Evaluate the model performance on test data.

        Args:
            test_data: DataFrame containing test data
            threshold: Optional custom threshold for positive prediction

        Returns:
            Dictionary containing evaluation metrics
        """
        # Use default threshold if not specified
        if threshold is None:
            threshold = self.config['threshold']

        # Preprocess the data
        X_test, y_test = self.preprocess_data(test_data, is_training=False)

        # Generate predictions
        y_pred_proba = self.model.predict(X_test).flatten()
        y_pred = y_pred_proba > threshold

        # Calculate metrics
        accuracy = np.mean(y_pred == y_test)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        # Calculate precision-recall curve and AUC
        precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
        pr_auc = auc(recall, precision)

        # Find the threshold that maximizes F1 score
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
        best_threshold_idx = np.argmax(f1_scores)
        best_threshold = pr_thresholds[best_threshold_idx]

        # Calculate confusion matrix
        TP = np.sum((y_pred == 1) & (y_test == 1))
        TN = np.sum((y_pred == 0) & (y_test == 0))
        FP = np.sum((y_pred == 1) & (y_test == 0))
        FN = np.sum((y_pred == 0) & (y_test == 1))

        # Compile evaluation results
        results = {
            'accuracy': accuracy,
            'roc_auc': roc_auc,
            'pr_auc': pr_auc,
            'sensitivity': TP / (TP + FN) if (TP + FN) > 0 else 0,
            'specificity': TN / (TN + FP) if (TN + FP) > 0 else 0,
            'precision': TP / (TP + FP) if (TP + FP) > 0 else 0,
            'recall': TP / (TP + FN) if (TP + FN) > 0 else 0,
            'f1_score': 2 * TP / (2 * TP + FP + FN) if (2 * TP + FP + FN) > 0 else 0,
            'best_threshold': best_threshold,
            'confusion_matrix': {
                'TP': int(TP),
                'TN': int(TN),
                'FP': int(FP),
                'FN': int(FN)
            }
        }

        return results

    def visualize_predictions(self, patient_data, save_path=None):
        """
        Visualize predictions and feature importance for a patient.

        Args:
            patient_data: DataFrame containing data for a single patient
            save_path: Optional path to save visualization

        Returns:
            Matplotlib figure
        """
        # Ensure we're working with a single patient
        patient_id = patient_data['patient_id'].iloc[0]
        patient_data_sorted = patient_data.sort_values('timestamp')

        # Preprocess and predict
        X, _ = self.preprocess_data(patient_data, is_training=False)
        risk_scores = self.model.predict(X).flatten()

        # Calculate SHAP values
        if self.explainer is not None:
            shap_values = self.explainer.shap_values(X[0:1])[0]
            feature_importance = {}
            for i, feature in enumerate(self.feature_names):
                feature_importance[feature] = np.abs(shap_values[:, i]).mean()

            # Sort features by importance
            sorted_features = sorted(
                feature_importance.items(),
                key=lambda x: x[1],
                reverse=True
            )[:10]  # Top 10 features
        else:
            sorted_features = []

        # Create visualization
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), gridspec_kw={'height_ratios': [1, 2]})

        # Plot risk trajectory
        timestamps = patient_data_sorted['timestamp'].iloc[self.config['sequence_length']-1:].values
        ax1.plot(timestamps, risk_scores, 'o-', color='blue')
        ax1.axhline(y=self.config['threshold'], color='red', linestyle='--', label='Risk Threshold')
        ax1.set_title(f'Kidney Disease Risk Trajectory for Patient {patient_id}')
        ax1.set_ylabel('Risk Score')
        ax1.set_ylim(0, 1)
        ax1.legend()
        ax1.grid(True)

        # Plot feature importance
        if sorted_features:
            features, importance = zip(*sorted_features)
            y_pos = np.arange(len(features))
            ax2.barh(y_pos, importance, align='center')
            ax2.set_yticks(y_pos)
            ax2.set_yticklabels(features)
            ax2.invert_yaxis()  # Labels read top-to-bottom
            ax2.set_title('Top 10 Features Influencing Prediction')
            ax2.set_xlabel('Feature Importance (SHAP value)')
        else:
            ax2.text(0.5, 0.5, 'Feature importance not available',
                    horizontalalignment='center', verticalalignment='center',
                    transform=ax2.transAxes)

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path)

        return fig


# Example usage
if _name_ == "_main_":
    # Load sample data (this would be real patient data in practice)
    # This is just a placeholder - in reality, you would load your own dataset
    from sklearn.datasets import make_classification
    import random

    # Create synthetic patient data for demonstration
    def create_synthetic_patient_data(n_patients=100, n_visits_per_patient=15):
        """Create synthetic patient data for demonstration purposes."""
        data_rows = []

        # Features that would be relevant for kidney disease prediction
        features = [
            'age', 'gender', 'weight', 'height', 'blood_pressure_systolic',
            'blood_pressure_diastolic', 'heart_rate', 'temperature',
            'serum_creatinine', 'blood_urea_nitrogen', 'glomerular_filtration_rate',
            'urine_albumin', 'urine_creatinine', 'albumin_creatinine_ratio',
            'serum_sodium', 'serum_potassium', 'serum_chloride', 'serum_bicarbonate',
            'hemoglobin', 'diabetes', 'hypertension', 'cardiovascular_disease',
            'smoking_status', 'alcohol_consumption', 'medication_ace_inhibitors',
            'medication_arbs', 'medication_diuretics', 'medication_beta_blockers',
            'medication_nsaids', 'family_history_kidney_disease'
        ]

        # Generate base values for each patient
        patient_base_values = {}
        for i in range(n_patients):
            patient_id = f"P{i:04d}"

            # Generate base values that will stay relatively consistent for each patient
            patient_base_values[patient_id] = {
                'age': random.randint(18, 85),
                'gender': random.choice([0, 1]),  # 0 for female, 1 for male
                'weight': random.uniform(50, 120),
                'height': random.uniform(150, 190),
                'diabetes': random.choice([0, 0, 0, 1]),  # 25% chance of diabetes
                'hypertension': random.choice([0, 0, 1]),  # 33% chance of hypertension
                'cardiovascular_disease': random.choice([0, 0, 0, 1]),  # 25% chance of CVD
                'smoking_status': random.choice([0, 0, 1]),  # 33% chance of smoking
                'alcohol_consumption': random.choice([0, 1, 2]),  # 0-none, 1-moderate, 2-heavy
                'family_history_kidney_disease': random.choice([0, 0, 0, 1])  # 25% chance
            }

            # Determine if this patient will develop kidney disease
            # Higher risk for patients with diabetes, hypertension, and older age
            risk_factors = (
                patient_base_values[patient_id]['diabetes'] +
                patient_base_values[patient_id]['hypertension'] +
                (1 if patient_base_values[patient_id]['age'] > 60 else 0) +
                patient_base_values[patient_id]['cardiovascular_disease'] +
                patient_base_values[patient_id]['family_history_kidney_disease']
            )

            will_develop_kidney_disease = random.random() < (0.1 + 0.15 * risk_factors)

            # When the disease will be diagnosed (if at all)
            diagnosis_visit = random.randint(n_visits_per_patient // 2, n_visits_per_patient) if will_develop_kidney_disease else None

            # Generate visits for this patient
            for visit in range(n_visits_per_patient):
                timestamp = f"2023-{random.randint(1, 12):02d}-{random.randint(1, 28):02d}"

                # Create a row for this visit
                row = {'patient_id': patient_id, 'timestamp': timestamp}

                # Add base values with small random variations
                for feature, base_value in patient_base_values[patient_id].items():
                    if isinstance(base_value, int) and base_value in [0, 1]:  # Binary features
                        row[feature] = base_value
                    elif feature == 'age':
                        # Age increases slightly over time
                        row[feature] = base_value + visit / 12  # Assuming monthly visits
                    else:
                        # Add small random variations to continuous features
                        row[feature] = base_value * random.uniform(0.95, 1.05)

                # Add visit-specific measurements
                # Normal ranges for kidney-related biomarkers
                row['serum_creatinine'] = random.uniform(0.6, 1.2)  # mg/dL
                row['blood_urea_nitrogen'] = random.uniform(7, 20)  # mg/dL
                row['glomerular_filtration_rate'] = random.uniform(90, 120)  # mL/min/1.73m²
                row['urine_albumin'] = random.uniform(0, 30)  # mg/24h
                row['urine_creatinine'] = random.uniform(800, 2000)  # mg/24h
                row['albumin_creatinine_ratio'] = row['urine_albumin'] / row['urine_creatinine'] * 1000  # mg/g

                # Other biomarkers
                row['serum_sodium'] = random.uniform(135, 145)  # mEq/L
                row['serum_potassium'] = random.uniform(3.5, 5.0)  # mEq/L
                row['serum_chloride'] = random.uniform(96, 106)  # mEq/L
                row['serum_bicarbonate'] = random.uniform(22, 29)  # mEq/L
                row['hemoglobin'] = random.uniform(12, 17)  # g/dL

                # Vital signs
                row['blood_pressure_systolic'] = random.uniform(110, 140)
                row['blood_pressure_diastolic'] = random.uniform(70, 90)
                row['heart_rate'] = random.uniform(60, 100)
                row['temperature'] = random.uniform(36.1, 37.2)

                # Medications
                row['medication_ace_inhibitors'] = random.choice([0, 0, 1]) if row['hypertension'] else 0
                row['medication_arbs'] = random.choice([0, 0, 1]) if row['hypertension'] else 0
                row['medication_diuretics'] = random.choice([0, 0, 1]) if row['hypertension'] else 0
                row['medication_beta_blockers'] = random.choice([0, 0, 1]) if row['cardiovascular_disease'] else 0
                row['medication_nsaids'] = random.choice([0, 0, 0, 1])  # 25% chance regardless

                # For patients who will develop kidney disease, gradually worsen biomarkers
                if will_develop_kidney_disease and visit >= diagnosis_visit // 2:
                    progress_factor = (visit - diagnosis_visit // 2) / (diagnosis_visit - diagnosis_visit // 2) if visit < diagnosis_visit else 1

                    # Worsen kidney function markers
                    row['serum_creatinine'] += progress_factor * random.uniform(0.2, 1.5)
                    row['blood_urea_nitrogen'] += progress_factor * random.uniform(5, 40)
                    row['glomerular_filtration_rate'] -= progress_factor * random.uniform(10, 60)
                    row['urine_albumin'] += progress_factor * random.uniform(30, 300)
                    row['albumin_creatinine_ratio'] += progress_factor * random.uniform(30, 300)

                    # Also affect other markers
                    row['serum_potassium'] += progress_factor * random.uniform(0, 1.5)
                    row['hemoglobin'] -= progress_factor * random.uniform(0, 3)

                # Set the kidney_disease flag
                row['kidney_disease'] = 1 if (will_develop_kidney_disease and visit >= diagnosis_visit) else 0

                # Add the row to our dataset
                data_rows.append(row)

        # Convert to DataFrame
        df = pd.DataFrame(data_rows)

        # Ensure all needed columns exist
        for feature in features:
            if feature not in df.columns:
                df[feature] = 0

        return df

    # Create synthetic dataset
    synthetic_data = create_synthetic_patient_data(n_patients=200, n_visits_per_patient=20)

    # Split into train/test sets by patient
    patient_ids = synthetic_data['patient_id'].unique()
    train_patients, test_patients = train_test_split(patient_ids, test_size=0.2, random_state=42)

    train_data = synthetic_data[synthetic_data['patient_id'].isin(train_patients)]
    test_data = synthetic_data[synthetic_data['patient_id'].isin(test_patients)]

    # Initialize and train the model
    kdeds = KidneyDiseaseEarlyDetectionSystem()
    history = kdeds.train(train_data)

    # Evaluate on test data
    evaluation = kdeds.evaluate(test_data)
    print("Model Evaluation:")
    for metric, value in evaluation.items():
        if metric != 'confusion_matrix':
            print(f"  {metric}: {value:.4f}")

    # Get predictions for a sample patient
    sample_patient_id = test_patients[0]
    sample_patient_data = test_data[test_data['patient_id'] == sample_patient_id]
    predictions = kdeds.predict(sample_patient_data)

    # Visualize predictions
    kdeds.visualize_predictions(sample_patient_data, save_path="kidney_prediction_visualization.png")

    # Save the model
    kdeds.save_model()

    print("Example completed. Model trained, evaluated, and saved.")

NameError: name '_name_' is not defined