In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import custom modules
import sys
sys.path.append('../src') # Add src directory to Python path
from data_loader import DataLoader
from preprocessor import DataPreprocessor
from models import ModelTrainer
from evaluator import ModelEvaluator

# Notebook-wide plotting defaults: seaborn "whitegrid" theme, 10x6 inch figures
# rendered at 100 dpi. Individual figures can still pass their own figsize.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 100,
})

In [2]:
print("--- 1. Data Loading and Initial Exploration ---")
loader = DataLoader()
df = loader.load_sample_data('diabetes')
loader.get_info()


def _resolve_column(frame, name):
    """Return the label of the column in ``frame`` that matches ``name``.

    An exact match wins. Otherwise the comparison ignores case, so asking for
    'glucose' also finds a column labelled 'Glucose'.

    Raises:
        KeyError: if no column matches, with the available columns listed.
    """
    if name in frame.columns:
        return name
    by_lowercase = {str(label).lower(): label for label in frame.columns}
    if name.lower() in by_lowercase:
        return by_lowercase[name.lower()]
    raise KeyError(f"Column {name!r} not found. Available columns: {list(frame.columns)}")


# Visualize target distribution
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Outcome', data=df, ax=ax)
ax.set_title('Distribution of Target Variable (0: No Diabetes, 1: Diabetes)')
plt.show()

# Visualize a few feature distributions (example).
# TODO: add the remaining plots called for by the project requirements.
#
# The target is referenced as 'Outcome' (capitalised) while the features were
# referenced in lower case; the columns are therefore looked up ignoring case
# so that a casing mismatch does not abort the cell with a KeyError.
# NOTE(review): confirm the exact column labels produced by DataLoader.
feature_plots = [
    ('glucose', 'Distribution of Glucose'),
    ('bmi', 'Distribution of BMI'),
]
fig, axes = plt.subplots(1, len(feature_plots), figsize=(12, 5))
for ax, (feature, title) in zip(axes, feature_plots):
    sns.histplot(df[_resolve_column(df, feature)], kde=True, bins=30, ax=ax)
    ax.set_title(title)
fig.tight_layout()
plt.show()

In [None]:
print("\n--- 2. Data Preprocessing ---")
preprocessor = DataPreprocessor()

# Split the loaded frame into train / validation / test features and labels.
X_train, X_val, X_test, y_train, y_val, y_test = preprocessor.split_data(df, target_column='Outcome')

# Keep the original feature names; the processed matrices below are only
# inspected through .shape, so column labels are captured here beforehand.
original_feature_names = X_train.columns.tolist()

# Create and apply the preprocessing pipeline.
# NOTE(review): the returned pipeline object is not used again in this cell;
# presumably the call also stores it on `preprocessor` for preprocess() -- confirm.
preprocessor_pipeline = preprocessor.create_preprocessing_pipeline(X_train)
X_train_processed, X_val_processed, X_test_processed = preprocessor.preprocess(X_train, X_val, X_test)

# The processed train/test matrices are later paired with y_train / y_test,
# so a row-count mismatch is reported here rather than deep inside a model call.
assert X_train_processed.shape[0] == len(y_train), "X_train_processed and y_train have different row counts"
assert X_test_processed.shape[0] == len(y_test), "X_test_processed and y_test have different row counts"

for split_name, processed in (
    ("train", X_train_processed),
    ("val", X_val_processed),
    ("test", X_test_processed),
):
    print(f"Processed X_{split_name} shape: {processed.shape}")

# Correlation analysis on the original (unscaled) training features, for
# interpretability. Only numeric columns are correlated: DataFrame.corr()
# raises on non-numeric columns in recent pandas versions.
numeric_features = X_train.select_dtypes(include='number')
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(numeric_features.corr(), annot=False, cmap='coolwarm', linewidths=.5, ax=ax)
ax.set_title('Correlation Matrix of Features (Training Data)')
plt.show()

In [None]:
print("\n--- 3. Model Training ---")
trainer = ModelTrainer()

# Both models are fitted on the same preprocessed training split.
training_inputs = (X_train_processed, y_train)
lr_model = trainer.train_logistic_regression(*training_inputs)
dt_model = trainer.train_decision_tree(*training_inputs)

# Show whatever the trainer reports as its collection of trained models.
trained_models = trainer.get_models()
print("\nAll trained models:", trained_models)

In [None]:
print("\n--- 4. Model Evaluation and Interpretability ---")

# Unprocessed test features with the labels attached as a 'target' column,
# passed to the SHAP plot so original values are available for context.
original_X_test_with_target = X_test.assign(target=y_test)

evaluator = ModelEvaluator(feature_names=original_feature_names)

# (display name, fitted model, whether to draw SHAP values)
# SHAP is only drawn for the Decision Tree: SHAP for models *after* a
# ColumnTransformer can be tricky and may need adjusting or careful
# explanation, so plot_shap_values is deliberately not called for
# Logistic Regression here.
evaluation_plan = [
    ("Logistic Regression", lr_model, False),
    ("Decision Tree", dt_model, True),
]

for model_name, model, with_shap in evaluation_plan:
    evaluator.evaluate_model(model, X_test_processed, y_test, model_name=model_name)
    evaluator.plot_feature_importance(model, model_name=model_name)
    if with_shap:
        evaluator.plot_shap_values(model, X_test_processed, original_X_test_with_target, model_name=model_name)

In [None]:
print("\n--- 5. Critical Discussion and Conclusion ---")

# Points the discussion is meant to cover:
#   - model performance, comparing LR and DT
#   - which metric matters most and why (e.g., Recall for medical diagnosis)
#   - robustness and limitations of the models
#   - whether and how the models could be used in practice
#   - AI as a supporting tool, not a replacement for human experts
#
# NOTE(review): the text below is a fixed string; it is not generated from the
# metrics computed earlier, so re-check it against the actual results.
discussion_summary = """
The Logistic Regression model provided a good baseline performance, demonstrating its effectiveness for this binary classification task.
The Decision Tree Classifier, while potentially overfitting without proper tuning, offered clear feature importance, which can be valuable
for medical professionals to understand the factors driving a diagnosis.

For a medical diagnosis task like diabetes detection, 'Recall' is a paramount metric. Minimizing False Negatives (missing a diabetes case)
is crucial, even if it comes at the cost of a slightly higher False Positive rate (healthy patients misclassified as diabetic), as the latter can
be followed up with further diagnostic tests, while a missed case could have severe consequences.

Our models show promising results, but their deployment in a real-world clinical setting would require rigorous validation,
external testing, and integration with existing medical workflows. Furthermore, the model should only serve as an assistive tool,
providing a preliminary assessment or highlighting areas of concern, with the final diagnostic decision always resting with a qualified medical professional.
The interpretability provided by feature importance and SHAP values is critical for building trust and understanding among clinicians.
"""
print(discussion_summary)