In [1]:
from mlinspect.monkeypatching._mlinspect_ndarray import MlinspectNdarray
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from mlinspect.utils import get_project_root
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# prepare data for explainer
COUNTIES_OF_INTEREST = ['county2', 'county3']

patients = pd.read_csv(os.path.join(str(get_project_root()), "example_pipelines", "healthcare",
                                    "patients.csv"), na_values='?')
histories = pd.read_csv(os.path.join(str(get_project_root()), "example_pipelines", "healthcare",
                                     "histories.csv"), na_values='?')

data = patients.merge(histories, on=['ssn'])
complications = data.groupby('age_group') \
    .agg(mean_complications=('complications', 'mean'))
data = data.merge(complications, on=['age_group'])
data['label'] = data['complications'] > 1.2 * data['mean_complications']
data = data[['smoker', 'last_name', 'county', 'num_children', 'race', 'income', 'label']]
data = data[data['county'].isin(COUNTIES_OF_INTEREST)]
train_data, test_data = train_test_split(data)
y_train = train_data["label"]
y_test = test_data["label"]
X_train = train_data.drop("label", axis=1)
X_test = test_data.drop("label", axis=1)

impute_and_one_hot_encode = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])
featurisation = ColumnTransformer(transformers=[
    ("impute_and_one_hot_encode", impute_and_one_hot_encode, ['smoker', 'county', 'race']),
    ('numeric', StandardScaler(), ['num_children', 'income']),
], remainder='drop')

X_t_train: MlinspectNdarray = featurisation.fit_transform(train_data, y_train)
X_t_test: MlinspectNdarray = featurisation.fit_transform(X_test, y_test)



In [2]:
import numpy as np

X_t_train.view(np.ndarray)

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.85896456, -0.67878089],
       [ 0.        ,  1.        ,  1.        , ...,  0.        ,
        -0.85896456,  1.58939849],
       [ 0.        ,  1.        ,  0.        , ...,  1.        ,
        -0.85896456, -0.79217875],
       ...,
       [ 1.        ,  0.        ,  1.        , ...,  0.        ,
        -1.4506957 ,  0.17507158],
       [ 1.        ,  0.        ,  1.        , ...,  0.        ,
        -1.4506957 ,  0.41983706],
       [ 1.        ,  0.        ,  1.        , ...,  0.        ,
         1.50796   ,  0.57241166]])

In [3]:
from features.explainability.inspections.explainability_methods_enum import ExplainabilityMethodsEnum
from features.explainability.inspections.explainer import Explainer
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Some tensorflow warnings in the pipeline we inspect
from mlinspect.utils import get_project_root

from mlinspect import PipelineInspector

EXAMPLE_PIPELINE = os.path.join(str(get_project_root()), "features", "explainability", "pipeline_with_inspection.py")

inspector_result = PipelineInspector\
    .on_pipeline_from_py_file(EXAMPLE_PIPELINE) \
    .add_required_inspection(Explainer([ExplainabilityMethodsEnum.SHAP], X_t_train.view(np.ndarray), X_t_test.view(np.ndarray)[:1])) \
    .execute()

extracted_dag = inspector_result.dag
dag_node_to_inspection_results = inspector_result.dag_node_to_inspection_results
check_results = inspector_result.check_to_check_results

Using 217 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


  0%|          | 0/1 [00:00<?, ?it/s]

TypeError: 'NoneType' object does not support item assignment

In [5]:
from mlinspect import OperatorType

relevant_node = [node for node in extracted_dag.nodes if node.operator_info.operator in {OperatorType.ESTIMATOR,}][0]
print("\n\033[1m{} ({})\033[0m\n{}\n{}".format(
    relevant_node.operator_info.operator,
    relevant_node.details.description,
    relevant_node.optional_code_info.source_code,
    relevant_node.optional_code_info.code_reference))
inspection_result = dag_node_to_inspection_results[relevant_node][Explainer([ExplainabilityMethodsEnum.SHAP], X_t_train.view(np.ndarray), X_t_test.view(np.ndarray))]
print(inspection_result)


[1mOperatorType.ESTIMATOR (Neural Network)[0m
KerasClassifier(model=create_model_predict, epochs=10, batch_size=1, verbose=0, loss='binary_crossentropy',)
CodeReference(lineno=49, col_offset=13, end_lineno=49, end_col_offset=121)
[<shap.explainers._kernel.KernelExplainer object at 0xffff2259d970>, array([[-0.25534585, -0.08086055,  0.01702019, -0.01314442,  0.01240813,
         0.01164159, -0.01186372, -0.00545947, -0.13522539]])]
