In [1]:
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from lohrasb.best_estimator import BaseModel
from sklearn.pipeline import Pipeline
from feature_engine.imputation import (
    CategoricalImputer,
    MeanMedianImputer
    )
from category_encoders import OrdinalEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score)
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from interpret import show
from interpret.blackbox import ShapKernel
import shap


from interpret import set_visualize_provider
from interpret.provider import InlineProvider
# Register interpret's InlineProvider as the visualization provider
# (presumably so that show() renders inside the notebook output - confirm against interpret docs).
set_visualize_provider(InlineProvider())


ModuleNotFoundError: No module named 'pandas'

#### Example: Use Adult Data Set (a classification problem)
  
https://archive.ics.uci.edu/ml/datasets/Adult

#### Read data


In [None]:
# Location of the raw Adult training file (comma-separated, no header row).
urldata = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# column names, in file order; the last column is the income label
col_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "label",
]
# read data
data = pd.read_csv(
    urldata,
    header=None,
    names=col_names,
    sep=',',
    # Fields are separated by ", ", so without this every string value
    # (including the label) keeps a leading space.
    skipinitialspace=True,
    # NOTE(review): "?" is the dataset's placeholder for unknown values - confirm
    # against the dataset description. Reading it as NaN lets the imputers act on it.
    na_values='?',
)
data.head()

#### Define labels


In [None]:
# Encode the income label as a binary integer target: 0 for "<=50K", 1 for ">50K".
# The mapping is applied to a whitespace-stripped text view of the column instead of
# writing integers into the string column in place; in-place writes create a temporary
# mixed-type column and are rejected by pandas' dedicated string dtype.
# The '0'/'1' entries keep the cell re-runnable after the column is already encoded.
label_codes = {'<=50K': 0, '>50K': 1, '0': 0, '1': 1}
label_text = data['label'].astype(str).str.strip()
encoded_label = label_text.map(label_codes)

# Fail loudly (same exception type as a failed int cast) on any unexpected label.
unknown_mask = encoded_label.isna()
if unknown_mask.any():
    raise ValueError(
        f"Unexpected label values: {label_text[unknown_mask].unique().tolist()}"
    )

data['label'] = encoded_label.astype(int)

#### Train test split

In [None]:
# Features are every column except "label"; the target stays a one-column frame.
is_label_column = data.columns == "label"
X = data.loc[:, ~is_label_column]
y = data.loc[:, is_label_column]

# Stratified 67/33 split with a fixed seed.
split_parts = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y['label'],
    random_state=42,
)
X_train, X_test = split_parts[0], split_parts[1]

# Flatten the one-column target frames into 1-D arrays.
y_train, y_test = (target_part.values.ravel() for target_part in split_parts[2:])


#### Find feature types for later use

In [None]:
def _columns_with_dtype(dtype_name):
    """Return the names of the X_train columns whose dtype matches ``dtype_name``."""
    return X_train.select_dtypes(include=[dtype_name]).columns.tolist()


# Column name lists grouped by dtype, used when configuring the preprocessing steps.
int_cols = _columns_with_dtype('int')
float_cols = _columns_with_dtype('float')
cat_cols = _columns_with_dtype('object')


#### Define estimator and set its arguments 


In [None]:
# Seed the forest so that a fresh "Restart & Run All" reproduces the same model and
# scores; the seed matches the one used for the train/test split.
estimator = RandomForestClassifier(random_state=42)

# Hyper-parameter grid explored by the grid search (2 x 2 x 2 = 8 candidates).
estimator_params = {
    "max_depth": [4, 5],
    "n_estimators": [100, 200],
    "max_features": ["sqrt", "log2"],
}
    

In [None]:
# Configuration handed to the model builder, split into two groups.
kwargs = dict(
    # arguments forwarded to the fit method (fit_params)
    fit_grid_kwargs=dict(
        sample_weight=None,
    ),
    # arguments for GridSearchCV
    grid_search_kwargs=dict(
        estimator=estimator,
        param_grid=estimator_params,
        scoring='f1',
        verbose=3,
        n_jobs=-1,
        cv=KFold(2),
    ),
)



In [None]:
# Build the model object from the grid-search configuration; it is fitted later
# on the transformed training features.
model_factory = BaseModel()
blackbox_model = model_factory.optimize_by_gridsearchcv(kwargs=kwargs)

In [None]:


# Preprocessing only: the classifier is not a pipeline step, it is fitted
# separately on the transformed features.
preprocessing_steps = [
    # fill missing values in the integer columns with the column median
    (
        'intimputer',
        MeanMedianImputer(imputation_method='median', variables=int_cols),
    ),
    # fill missing values in the categorical columns
    ('catimputer', CategoricalImputer(variables=cat_cols)),
    # encode the categorical values as ordinal codes
    ('catencoder', OrdinalEncoder()),
]

pipeline = Pipeline(preprocessing_steps)


#### Run Pipeline

In [None]:
# Fit the preprocessing steps on the training features only, then apply the fitted
# steps to the test features.
# NOTE: X_train / X_test are rebound to their transformed versions here, and the SHAP
# cell below relies on that. Re-running this cell without re-running the split feeds
# already-transformed data back into the pipeline.
X_train = pipeline.fit_transform(X_train,y_train)
X_test = pipeline.transform(X_test)

# Fit the grid-search-backed model on the transformed training data and predict the test set.
blackbox_model.fit(X_train, y_train)
y_pred = blackbox_model.predict(X_test)


#### Check performance of the pipeline

In [None]:
# Each report is a heading followed by the metric computed on the test predictions.
performance_reports = (
    ('F1 score : ', f1_score),
    ('Classification report : ', classification_report),
    ('Confusion matrix : ', confusion_matrix),
)
for heading, metric_fn in performance_reports:
    print(heading)
    print(metric_fn(y_test, y_pred))


#### Interpret the pipeline and its decisions with SHAP. The visualizations provided are local explanations.



In [None]:
# The training set is large, so only a 500-row sample is used as the background data.
background_rows = shap.sample(X_train, 500)
shap_kernel = ShapKernel(
    predict_fn=blackbox_model.predict_proba,
    data=background_rows,
)

# Explain the first 20 test rows, passing their true labels alongside.
rows_to_explain, labels_to_explain = X_test[:20], y_test[:20]
shap_local = shap_kernel.explain_local(rows_to_explain, labels_to_explain)
show(shap_local)
