<div style="background-color: #3D3D3A; padding: 20px; border-radius: 10px; margin-bottom: 20px;">
    <h1 style="color: white; text-align: center; margin: 0;">🔍 Diabetes Classification Model Evaluation</h1>
</div>

<div style="background-color: #3D3D3A; padding: 15px; border-radius: 8px; margin-bottom: 20px;">
    <p style="color: white; margin: 0;">This notebook provides a comprehensive evaluation of our diabetes classification models. We'll:</p>
    <ul style="color: white; margin: 10px 0 0 0;">
        <li>Load the preprocessed data</li>
        <li>Train multiple classification models</li>
        <li>Calculate and compare performance metrics</li>
        <li>Visualize results with professional plots</li>
    </ul>
</div>

<div style="background-color: #3D3D3A; padding: 15px; border-radius: 8px; margin: 20px 0;">
    <h2 style="color: white; margin: 0;">📚 Import Required Libraries</h2>
</div>

In [1]:
# Core data libraries plus the stdlib helpers needed for path and warning setup.
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import warnings 
# Silence every warning category for the rest of the kernel session. This also
# hides warnings raised inside libraries, so comment it out when debugging.
warnings.filterwarnings('ignore')

# Put the parent of the current working directory on the import path so that
# the `src` package imported below can be resolved.
# NOTE(review): assumes the kernel's working directory is a direct child of the
# project root (e.g. a notebooks/ folder) — confirm.
sys.path.append(str(Path.cwd().parent))

# Import our custom modules (these must stay below the sys.path change above)
from src.models.model_factory import ModelFactory
from src.evaluation.performance_metrics import PerformanceMetrics
from src.evaluation.performance_visualization import PerformanceVisualizer
from src.data.data_versioning import DataVersioner
from src.training.Training import DiabetesModelTrainer

# Set random seed for reproducibility.
# This seeds NumPy's global RNG only. NOTE(review): whether the project's
# trainer/models draw from that RNG is not visible here — confirm.
np.random.seed(42)

<div style="background-color: #3D3D3A; padding: 15px; border-radius: 8px; margin: 20px 0;">
    <h2 style="color: white; margin: 0;">📥 Load and Prepare Data</h2>
</div>

In [None]:
# Identifier of the preprocessed data version this evaluation is pinned to.
preprocessed_version = "2025_02_23_16_22_58"

# Open the versioned data store rooted at ../data (relative to the kernel's
# working directory) and fetch that version.
data_versioner = DataVersioner(base_dir='../data')
data = data_versioner.get_version(preprocessed_version)

In [3]:
# Remove every record whose 'Diabetes_012' label equals 1, keeping the rest.
# NOTE(review): the meaning of label 1 is not defined in this notebook — confirm
# against the dataset documentation.
is_class_one = data['Diabetes_012'] == 1
class_one_index = data[is_class_one].index
data = data.drop(class_one_index)

In [4]:
# data.value_counts(['Diabetes_012'])

<div style="background-color: #3D3D3A; padding: 15px; border-radius: 8px; margin: 20px 0;">
    <h2 style="color: white; margin: 0;">🤖 Train Models</h2>
</div>

In [5]:
# --- Disabled code, kept for reference only; nothing in this cell executes ---
# It loads the extracted CSV directly and lets ImbalanceHandler split and
# resample it, using 'Diabetes' as the target column.
# NOTE(review): delete this cell once it is no longer needed.
# from src.training.imbalance_handler import ImbalanceHandler
# data = pd.read_csv('../data/extracted/diabetes.csv')

# imbalance_handler = ImbalanceHandler()

# X = data.drop('Diabetes', axis=1)
# y = data['Diabetes']

# X_train, X_test, y_train, y_test = imbalance_handler.split_and_resample(X, y)


In [6]:
# y_train.value_counts()

In [None]:
# Build the trainer with its default configuration.
trainer = DiabetesModelTrainer()

# Ask the trainer for the train/test feature and target partitions.
# NOTE(review): the filtering cell above refers to the label column as
# 'Diabetes_012' while this call names 'Diabetes' — confirm which column the
# loaded data version actually contains.
# NOTE(review): `already_split=True` is passed together with a single frame;
# verify this matches what DiabetesModelTrainer.prepare_data expects.
prepared_splits = trainer.prepare_data(
    data=data,
    target_column='Diabetes',
    already_split=True,
)
X_train, X_test, y_train, y_test = prepared_splits

In [None]:
# Fit the trainer's models on the training partition; the result is reused by
# the evaluation cells below.
trained_models = trainer.train_models(
    X_train,
    y_train,
)

In [None]:
# Score the trained models on the held-out test partition.
# The call yields two values: a summary and the detailed results.
# Note: `summary_df` is reassigned later by the performance-metrics cell.
evaluation_output = trainer.evaluate_models(trained_models, X_test, y_test)
summary_df, detailed_results = evaluation_output

<div style="background-color: #3D3D3A; padding: 15px; border-radius: 8px; margin: 20px 0;">
    <h2 style="color: white; margin: 0;">📊 Calculate Performance Metrics</h2>
</div>

In [None]:
# Initialize metrics calculator
metrics = PerformanceMetrics()

# Calculate metrics for all models.
# `trained_models` is the variable produced by the training cell; no variable
# named `models` is defined anywhere in this notebook, so using it fails with
# NameError on a fresh kernel (Restart & Run All).
# Metrics are computed on the held-out test split (X_test / y_test), the same
# split trainer.evaluate_models uses, so the numbers reflect generalization
# rather than fit to the training data.
# NOTE(review): assumes compare_models only predicts and scores (does not refit)
# — confirm against PerformanceMetrics.
results = metrics.compare_models(trained_models, X_test, y_test)

# Get summary DataFrames (this reassigns `summary_df`, which the visualization
# cell below consumes).
summary_df = metrics.get_summary_df(results)
per_class_dfs = metrics.get_per_class_df(results)

# Display overall performance summary with three-decimal formatting and a
# red-to-green gradient.
print("Overall Performance Summary:")
display(summary_df.style.format({
    'Accuracy': '{:.3f}',
    'Precision': '{:.3f}',
    'Recall': '{:.3f}',
    'F1 Score': '{:.3f}'
}).background_gradient(cmap='RdYlGn'))

<div style="background-color: #3D3D3A; padding: 15px; border-radius: 8px; margin: 20px 0;">
    <h2 style="color: white; margin: 0;">📈 Visualize Results</h2>
</div>

In [None]:
# Create the plotting helper.
# NOTE(review): `save_dir` is a relative path, so it resolves against the
# kernel's working directory (unlike the '../data' path used for loading) —
# confirm this is the intended output location.
visualizer = PerformanceVisualizer(save_dir='reports/metrics_visualizations')

# 1) Side-by-side comparison of the overall metrics for every model.
visualizer.plot_overall_metrics(
    summary_df,
    title="Model Performance Comparison",
    figsize=(12, 6),
)

# 2) Confusion matrices built from the comparison results.
visualizer.plot_confusion_matrices(
    results,
    figsize=(20, 5),
)

# 3) Metrics broken down per class.
visualizer.plot_per_class_metrics(
    per_class_dfs,
    figsize=(20, 6),
)

<div style="background-color: #3D3D3A; padding: 20px; border-radius: 10px; margin: 20px 0;">
    <h2 style="color: white; margin: 0 0 10px 0;">🔍 Analysis Summary</h2>
    <p style="color: white; margin: 0;">Use the evaluation results above to:</p>
    <ul style="color: white; margin: 10px 0 0 0;">
        <li>Compare the performance of different models</li>
        <li>Analyze per-class performance to identify strengths and weaknesses</li>
        <li>Look for patterns in the confusion matrices</li>
        <li>Consider the trade-offs between different metrics</li>
    </ul>
</div>