# Install Packages

In [1]:
!pip install shap
!pip install explainerdashboard

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (569 kB)
[K     |████████████████████████████████| 569 kB 5.1 MB/s 
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting explainerdashboard
  Downloading explainerdashboard-0.3.8.2-py3-none-any.whl (305 kB)
[K     |████████████████████████████████| 305 kB 4.6 MB/s 
Collecting jupyter-dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl (23 kB)
Collecting dtreeviz>=1.3
  Downloading dtreeviz-1.3.7.tar.gz (62 kB)
[K     |████████████████████████████████| 62 kB 1.1 MB/s 
Collecting dash-bootstrap-components<1
  Downloading dash_bootstrap_components-0.13.1-py3-none-an

# Import Modules

In [2]:
import os
from pathlib import Path
from google.colab import drive
import random
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import joblib
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import shap
from explainerdashboard import ClassifierExplainer, ExplainerDashboard



The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`

The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`


# Set Seed, Features and Target

In [3]:
# Fixed random seed, reused wherever a random_state is needed
seed = 123

# Names of the input columns and of the label column
features = [
  'Pregnancies',
  'PlasmaGlucose',
  'DiastolicBloodPressure',
  'TricepsThickness',
  'SerumInsulin',
  'BMI',
  'DiabetesPedigree',
  'Age',
]
target = 'Diabetic'

# Mount Google Drive

In [4]:
# Mount Google Drive into the runtime at /content/gdrive
# (`drive` comes from google.colab, so this cell only works on Colab).
# NOTE(review): the data/model paths set up below are derived from os.getcwd(),
# not from this mount point — confirm whether the files are expected on Drive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Set Path Variables

In [5]:
# All folders live below the current working directory
ROOTDIR = os.getcwd()
DATAPATH = f'{ROOTDIR}/data'
MODELPATH = f'{ROOTDIR}/model'

# Make sure both folders exist; already existing folders are left as they are
os.makedirs(DATAPATH, exist_ok=True)
os.makedirs(MODELPATH, exist_ok=True)


In [6]:
# Display the working directory that DATAPATH and MODELPATH are built from
ROOTDIR

'/content'

# Helper Functions

In [6]:
def generate_random_indices(n:int, range_min:int, range_max:int, seed:int)->list:
  """ Draw n distinct random integers from [range_min, range_max) for a given seed.

  A private random.Random(seed) generator is used, so the same seed always yields
  the same indices while the state of the module-level `random` generator is
  left untouched (calling this helper no longer reseeds unrelated code).

  Args:
    n: number of indices to draw.
    range_min: lower bound of the range (inclusive).
    range_max: upper bound of the range (exclusive).
    seed: seed of the private generator.

  Returns:
    list of n distinct integers, in the order they were sampled.

  Raises:
    ValueError: if n is negative or larger than the size of the range
      (raised by the underlying sample call).
  """
  rng = random.Random(seed)
  return rng.sample(range(range_min, range_max), n)

def add_NAs_to_features(data:pd.DataFrame, feature_list:list, number_na:int, seed:int):
  """ Set number_na randomly chosen rows to NaN in every column of feature_list.

  The rows are chosen by position, so the function works for any index of
  `data` (not only the default 0..n-1 RangeIndex). Each column gets its own
  seed (seed + 1 for the first column, seed + 2 for the second, ...), so
  different columns get different rows while the result stays reproducible.

  Note: `data` is modified in place; the same object is returned for convenience.

  Args:
    data: dataframe to modify.
    feature_list: names of the columns that receive NaNs.
    number_na: number of rows set to NaN per column.
    seed: base seed for reproducibility.

  Returns:
    the modified dataframe (same object as `data`).
  """
  n_rows = data.shape[0]
  for offset, column in enumerate(feature_list, start=1):
    print('add NAs to feature ', column)
    # draw number_na distinct row positions for this column
    row_positions = generate_random_indices(n=number_na,
                                            range_min=0, range_max=n_rows,
                                            seed=seed+offset)
    # the drawn values are positions, not index labels -> use iloc, not loc
    data.iloc[row_positions, data.columns.get_loc(column)] = np.nan
  return data

def visualize_numfeatures_stratifiedbytarget(data:pd.DataFrame, numeric_features:list, target:str, positive_class:str, negative_class:str):
  """ Plot a histogram and a boxplot per numeric feature, split by a binary target.

  For every column in numeric_features one figure with two panels is shown:
  left a percent histogram with KDE overlay coloured by `target`, right a
  boxplot of the feature per target value. Plotting is best effort: a feature
  that cannot be plotted is reported with a message and skipped, the remaining
  features are still plotted.

  Args:
    data: dataframe holding the features and the target column.
    numeric_features: names of the columns to plot.
    target: name of the target column.
    positive_class: display label for the positive class.
    negative_class: display label for the negative class.
  """
  for col in numeric_features:
    fig = None
    try:
      fig, ax = plt.subplots(1,2, figsize=(12,5))

      sns.histplot(data=data,
                  x=col,
                  hue=target,
                  stat='percent',
                  kde=True,
                  element='step',
                  ax=ax[0])
      ax[0].set_title(col)
      # NOTE(review): legend labels are listed positive-first while the boxplot
      # ticks below are negative-first — confirm both match the plotted order
      ax[0].legend([positive_class, negative_class])

      sns.boxplot(data=data,
                  y=col,
                  x=target,
                  ax=ax[1])
      ax[1].set_title(col)
      ax[1].set_xticklabels([negative_class, positive_class])

      plt.show()
    except np.linalg.LinAlgError:
      # presumably raised while fitting the KDE overlay on a constant feature
      print('Singular matrix error due to constant feature value')
    except Exception as err:
      # keep going with the next feature, but report what actually went wrong
      # (KeyboardInterrupt / SystemExit are deliberately not caught)
      print('Could not plot feature {}: {!r}'.format(col, err))
    finally:
      # release the figure, also when plotting failed half-way
      if fig is not None:
        plt.close(fig)

# Data Import

In [7]:
# load the diabetes dataset containing the artificially generated NAs
# NOTE(review): the recorded run of this cell ended in a FileNotFoundError, i.e.
# diabetes_nas.csv was not present in DATAPATH — the file has to be placed there
# (or DATAPATH pointed at it) before the rest of the notebook can run.
print("Loading Data...")
df = pd.read_csv(Path(DATAPATH) / 'diabetes_nas.csv')
# preview the first rows as the cell output
df.head()

Loading Data...


FileNotFoundError: ignored

# Train Test Split

In [None]:
# Separate the model inputs from the label column
X, y = df[features], df[target]

# Hold out 30% of the rows as test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.30,
  random_state=seed,
)

In [None]:
# Show the shapes of the training and test parts of the split
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Import Model Pipeline

In [None]:
# Load the model from the file
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
# filename carries its own leading '/' because MODELPATH has no trailing separator
filename = '/diabetes_model.pkl'
model = joblib.load(MODELPATH + filename)
# show the loaded model as the cell output
model

In [None]:
# predict on a new sample
# The model accepts an array of feature arrays (so you can predict the classes of multiple patients in a single call)
# We'll create an array with a single array of features, representing one patient
# NOTE(review): the eight values are presumably in the same order as `features` — confirm.
# NOTE(review): X_new is a plain ndarray without column names; if the loaded model
# selects columns by name it will need a DataFrame instead — verify against the model.
X_new = np.array([[2,180,74,24,21,23,1.4,22]])
print ('New sample: {}'.format(list(X_new[0])))

# Get a prediction
pred = model.predict(X_new)

In [None]:
# Display the prediction returned by model.predict for the new sample
pred

# Model Explainability

## Global Model Explainability

In [None]:
# init explainer
# Explain the predicted probability instead of the hard class labels returned
# by model.predict: label outputs only change when a prediction flips, which
# makes the SHAP values very coarse.
# The former `model_output='probability'` keyword was dropped because it is not a
# PartitionExplainer option — NOTE(review): confirm against the installed shap version.
def predict_positive_proba(X):
  """ Return the probability from column 1 of model.predict_proba.

  Assumes column 1 is the positive (diabetic) class — TODO confirm via model.classes_.
  """
  return model.predict_proba(X)[:, 1]

# X_test serves as background data for masking features
explainer = shap.PartitionExplainer(predict_positive_proba, X_test)

In [None]:
# Compute SHAP values for the first 100 rows of the test set only
shap_values = explainer(X_test[0:100])

In [None]:
# explain test data
# bar plot of the computed SHAP values, showing at most 15 features
shap.plots.bar(shap_values, max_display=15)

In [None]:
# beeswarm plot of the computed SHAP values, showing at most 15 features
shap.plots.beeswarm(shap_values, max_display=15)

In [None]:
# show shap dependence plot for numeric features
# one figure per feature column; the feature slice X_test[0:100] has to match
# the rows the SHAP values were computed for
for nf in X_test.columns:
  fig = plt.figure(figsize=(8, 6))
  ax = fig.gca()
  shap.dependence_plot(nf,
                      shap_values = shap_values.values,
                      features = X_test[0:100],
                      x_jitter = 0.5,
                      # clip the x axis to the 5th-95th percentile of the feature
                      xmin="percentile(5.0)",
                      xmax="percentile(95.0)",
                      # no colouring by a second (interaction) feature
                      interaction_index=None,
                      title = 'SHAP Dependence Plot: SHAP Value vs {}'.format(nf),
                      ax=ax,
                      # keep the figure open so the grid can still be added below
                      show=False)
  # use a real boolean: a string such as 'on' (or 'off') is merely truthy
  ax.grid(True)

## Local Model Explanation

In [None]:
# explain selected prediction
# ind is the position within shap_values (computed on the first 100 test rows)
ind=10
# waterfall plot for that single row, showing at most 20 features
shap.plots.waterfall(shap_values[ind], max_display=20) 

## Explainer Dashboard

In [None]:
# Build the explainerdashboard explainer on the first 50 test rows; the
# background sample is drawn from the remaining test rows (from row 50 on).
# Note: this rebinds `explainer`, which held the SHAP explainer until here.
explainer = ClassifierExplainer(
  model,
  X_test[0:50],
  y_test[0:50],
  X_background=shap.sample(X_test[50:], 100),
)

In [None]:
# Build the dashboard from the ClassifierExplainer and start serving it
ExplainerDashboard(explainer).run()

In [None]:
# Stop the dashboard server on port 8050
# (presumably the port the dashboard above was started on — TODO confirm)
ExplainerDashboard.terminate(8050)