# Phase 4: Classifier Development & Explainability

This notebook implements a robust classifier with feature engineering, model training, and SHAP-based explainability.

## Contents:
1. Data Loading and Preprocessing
2. Feature Engineering Pipeline
3. Model Training and Evaluation
4. SHAP Analysis Implementation
5. Model Persistence
6. FastAPI Integration Testing

In [3]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import shap
import joblib
from pathlib import Path

# Set random seed for reproducibility
np.random.seed(42)

## 1. Data Loading and Preprocessing

In [4]:
# Load the processed dataset
data = pd.read_csv('../data/raw/diabetes.csv')

# Split features and target
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Training set shape: (614, 8)
Test set shape: (154, 8)


## 2. Feature Engineering Pipeline

In [5]:
# Define numeric features
numeric_features = X.columns.tolist()

# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

# Create model pipelines
models = {
    'logistic': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42))
    ]),
    'xgboost': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', xgb.XGBClassifier(random_state=42))
    ]),
    'mlp': Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', MLPClassifier(hidden_layer_sizes=(32, 16), 
                                    max_iter=1000, 
                                    random_state=42))
    ])
}

## 3. Model Training and Evaluation

In [6]:
# Train and evaluate models
results = {}

for name, model in models.items():
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, 
                               cv=5, scoring='roc_auc')
    
    # Fit model on full training data
    model.fit(X_train, y_train)
    
    # Store results
    results[name] = {
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'model': model
    }
    
    print(f"\n{name.upper()}:")
    print(f"CV ROC-AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Select best model
best_model_name = max(results, key=lambda k: results[k]['cv_mean'])
best_model = results[best_model_name]['model']
print(f"\nBest model: {best_model_name}")


LOGISTIC:
CV ROC-AUC: 0.832 (+/- 0.021)

XGBOOST:
CV ROC-AUC: 0.809 (+/- 0.049)





MLP:
CV ROC-AUC: 0.781 (+/- 0.080)

Best model: logistic


## 4. SHAP Analysis Implementation

In [8]:
def get_shap_values(model, X_sample):
    """Calculate SHAP values for a sample."""
    # Get preprocessed data
    X_processed = model.named_steps['preprocessor'].transform(X_sample)
    
    # Get feature names
    feature_names = X_sample.columns.tolist()
    
    # Initialize SHAP explainer
    if isinstance(model.named_steps['classifier'], xgb.XGBClassifier):
        explainer = shap.TreeExplainer(model.named_steps['classifier'])
    else:
        explainer = shap.KernelExplainer(
            model.named_steps['classifier'].predict_proba, 
            X_processed[:100]  # Use subset for kernel explainer
        )
    
    # Calculate SHAP values
    shap_values = explainer.shap_values(X_processed)

    # Handle different output formats
    if isinstance(shap_values, list):
        # TreeExplainer with classification returns list (per class)
        shap_values = shap_values[1] if len(shap_values) > 1 else shap_values[0]
    else:
        shap_values = np.array(shap_values)
        if shap_values.ndim == 3:  
            # (n_samples, n_classes, n_features) → average over classes
            shap_values = shap_values.mean(axis=1)

    # Ensure it's 2D: (n_samples, n_features)
    shap_values = shap_values.reshape(shap_values.shape[0], -1)

    # Aggregate feature importance
    feature_importance = list(zip(
        feature_names, 
        np.abs(shap_values).mean(axis=0).tolist()
    ))

    # Sort safely by float values
    feature_importance.sort(key=lambda x: float(x[1]), reverse=True)
    
    return feature_importance


# Test SHAP analysis on a few samples
sample_indices = np.random.choice(len(X_test), 3)
X_samples = X_test.iloc[sample_indices]

print("Top 3 important features for sample predictions:")
for idx, sample in X_samples.iterrows():
    feature_importance = get_shap_values(
        best_model, pd.DataFrame([sample], columns=X_test.columns)
    )
    print(f"\nSample {idx}:")
    for feature, importance in feature_importance[:3]:
        print(f"{feature}: {importance:.3f}")


Top 3 important features for sample predictions:


 ... (more hidden) ...



Sample 63:
Pregnancies: 0.000
Glucose: 0.000


 ... (more hidden) ...



Sample 720:
Pregnancies: 0.000
Glucose: 0.000


 ... (more hidden) ...


Sample 636:
Pregnancies: 0.000
Glucose: 0.000





## 5. Model Persistence

In [9]:
# Create models directory if it doesn't exist
Path('../models').mkdir(exist_ok=True)

# Save the best model
model_path = f'../models/{best_model_name}.joblib'
joblib.dump(best_model, model_path)
print(f"Model saved to {model_path}")

# Save feature names for API use
feature_names = {
    'features': X.columns.tolist()
}
joblib.dump(feature_names, '../models/feature_names.joblib')
print("Feature names saved to models/feature_names.joblib")

Model saved to ../models/logistic.joblib
Feature names saved to models/feature_names.joblib


## 6. FastAPI Integration Testing

In [10]:
import requests

def test_prediction_endpoint(sample_data):
    """Test the FastAPI prediction endpoint."""
    url = "http://localhost:8000/predict"
    
    try:
        response = requests.post(url, json=sample_data)
        if response.status_code == 200:
            result = response.json()
            print("Prediction result:")
            print(f"Probability: {result['probability']:.3f}")
            print(f"Predicted Label: {result['label']}")
            print("\nTop contributing features:")
            for feature in result['top_features_shap']:
                print(f"{feature['name']}: {feature['importance']:.3f}")
        else:
            print(f"Error: {response.status_code}")
            print(response.text)
    except Exception as e:
        print(f"Error connecting to API: {e}")

# Create a sample test case
sample_data = X_test.iloc[0].to_dict()

print("Testing prediction endpoint...")
test_prediction_endpoint(sample_data)

Testing prediction endpoint...
Error connecting to API: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /predict (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001CDAF77C8D0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))
