In [1]:
import pandas as pd 
import shap
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score


In [None]:
!conda install -c conda-forge shap

In [None]:
# Load the red-wine quality data; the path is relative to the notebook's working directory.
wine_df = pd.read_csv("../data/winequality-red.csv")

# Binary target: 1 when quality >= 7, otherwise 0. The vectorized comparison replaces the
# row-by-row `.apply(lambda ...)` and produces the same 0/1 integers.
wine_df['quality_classif'] = (wine_df['quality'] >= 7).astype(int)

# Features: every column from 'fixed acidity' through 'alcohol' (label slice, both ends inclusive).
X = wine_df.loc[:, 'fixed acidity': 'alcohol']
y = wine_df['quality_classif']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Candidate neighbourhood sizes for the KNN classifier: k = 5, 6, ..., 19.
param_grid = dict(n_neighbors=range(5, 20))

# Grid search over k. refit=True retrains the best setting on the full training set;
# verbose=3 prints progress while the search runs.
knn_clf_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
)
knn_clf_grid.fit(X_train, y_train)

In [None]:
# Predict on the held-out set with the refitted best KNN model.
best_knn = knn_clf_grid.best_estimator_
y_pred = best_knn.predict(X_test)

print(knn_clf_grid.best_params_)
print(f"accuracy {best_knn.score(X_test, y_test)}")
# precision / recall / f1 all take (y_true, y_pred), so they are reported in one loop,
# in the same order as before.
for metric_name, metric_fn in (
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1", f1_score),
):
    print(f"{metric_name} {metric_fn(y_test, y_pred)}")

In [None]:
# Display the refitted best KNN estimator selected by the grid search.
knn_clf_grid.best_estimator_

## Look into Local Explainability for a few data points

### Understand why an instance was classified as low quality

In [None]:
shap.initjs()  # must run first so that the plots below do not cause an error

# Kernel SHAP explainer over the KNN class probabilities, with a sample of the
# training rows as background data.
explainer = shap.KernelExplainer(
    knn_clf_grid.best_estimator_.predict_proba,
    shap.sample(X_train),
)

# Explain the first observation of the test set (positional index 0).
index = 0
shap_values = explainer.shap_values(X_test.iloc[index, :])

# Force plot of the feature contributions towards class 1 for that observation.
target_class = 1
shap.force_plot(
    explainer.expected_value[target_class],
    shap_values[target_class],
    X_test.iloc[index],
)

The `expected_value` attribute is the average predicted probability for each target class, computed over the background sample passed to the explainer. In this case, the average predicted probability of a wine being low quality is about 88%, while that of it being high quality is about 11%.

In [None]:
# Mean predicted probability per class over the test set, to compare against explainer.expected_value.
knn_clf_grid.best_estimator_.predict_proba(X_test).mean(axis=0)

In [None]:
# Base values of the explainer; the force plots index this by target class.
explainer.expected_value

### Understand why an instance was classified as high quality

In [None]:
# we look for an instance wherein the instance was correctly classified as good quality
# Put the true labels and the model's predictions side by side. reset_index() gives the rows
# positional labels 0..n-1, which line up with X_test.iloc[...] used for the local explanations.
temp = y_test.reset_index().assign(pred=y_pred)
# Show the wines that are truly high quality, so that a correctly predicted one (pred == 1) can be picked.
temp.loc[temp['quality_classif'] > 0]

In [None]:
shap.initjs() # we need to run this so that our plots will not cause an error

explainer = shap.KernelExplainer(knn_clf_grid.best_estimator_.predict_proba, shap.sample(X_train)) 

# Positional index (into X_test) of the observation to explain. This is hard-coded: it is meant
# to be one of the truly high-quality test wines that the model also predicted as high quality,
# so re-check it against the lookup table if the data split or the model changes.
index = 188
instance = X_test.iloc[index]

# SHAP values for that single test observation.
shap_values = explainer.shap_values(instance)

target_class = 1
# Force plot of the feature contributions towards class 1 for the selected observation.
shap.force_plot(explainer.expected_value[target_class], shap_values[target_class], instance)

## Look into Global Explainability using a sample of data points


In [None]:
# Kernel SHAP on the hard class predictions (`predict`, not `predict_proba`), with a
# sample of the training rows as background data.
background = shap.sample(X_train)
explainer = shap.KernelExplainer(knn_clf_grid.best_estimator_.predict, background)

# Only the first 100 test rows are explained.
shap_values = explainer.shap_values(X_test.iloc[:100])

In [None]:
# Summary plot of the SHAP values, paired with the same first 100 test rows they were computed on.
shap.summary_plot(shap_values, X_test.iloc[:100, :])

## Attempt to use a Tree-Based Algorithm

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Wrap the train/test splits in xgboost DMatrix objects.
# NOTE(review): dtrain/dtest are not referenced by any later cell visible here — the
# XGBClassifier below is fit on X_train/y_train directly — so this cell looks unused.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
# Search space sampled by the randomized hyper-parameter search.
params = {
    'n_estimators': range(100, 500, 50),
    'max_depth': range(5,50,5),
    'learning_rate': [0.0001, 0.001, 0.1, 1],
    'gamma': np.arange(0.5, 2, .2),
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [1, 1.5, 2, 3, 4.5],
}

# Both the model and the search are seeded (same seed as the train/test split) so that the
# selected hyper-parameters, and therefore the predictions and the hard-coded row indices
# used in the explanation cells further down, are reproducible on Restart & Run All.
clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric="logloss",
                        use_label_encoder=False, random_state=42)
# 250 random draws from `params`, each scored by cross-validation.
clf_grid = RandomizedSearchCV(clf, params, n_iter=250, random_state=42)
clf_grid.fit(X_train, y_train)

In [None]:
# Predict on the held-out set with the refitted best XGBoost model.
best_xgb = clf_grid.best_estimator_
y_pred = best_xgb.predict(X_test)

print(clf_grid.best_params_)
print(f"accuracy {best_xgb.score(X_test, y_test)}")
# precision / recall / f1 all take (y_true, y_pred), so they are reported in one loop,
# in the same order as before.
for metric_name, metric_fn in (
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1", f1_score),
):
    print(f"{metric_name} {metric_fn(y_test, y_pred)}")

In [None]:
# Tree SHAP explainer for the tuned XGBoost model, explaining its predict_proba output
# against a background sample of the training rows (interventional perturbation).
background = shap.sample(X_train)
explainer = shap.TreeExplainer(
    clf_grid.best_estimator_,
    data=background,
    model_output='predict_proba',
    feature_perturbation='interventional',
)

# SHAP values for every row of the test set.
shap_values = explainer.shap_values(X_test)

In [None]:
# Mean predicted probability per class over the test set, to compare against explainer.expected_value.
clf_grid.best_estimator_.predict_proba(X_test).mean(axis=0)

In [None]:
# Base values of the tree explainer; the force plots below index into this.
explainer.expected_value

In [None]:
# Explain the first test observation from the point of view of class 0.
index = 0
target_class = 0
# Index by class first, then by row, and take the base value of the same class. The previous
# `shap_values[index][target_class]` only gave the right row because both values are 0.
# NOTE(review): assumes shap_values is a per-class list of (n_rows, n_features) arrays, as
# returned by the SHAP version this notebook was written for — confirm when upgrading SHAP.
shap.force_plot(explainer.expected_value[target_class], shap_values[target_class][index], X_test.iloc[index,:])

In [None]:
# we look for an instance wherein the instance was correctly classified as good quality
# Put the true labels and the current predictions side by side. reset_index() gives the rows
# positional labels 0..n-1, which line up with X_test.iloc[...] used for the local explanations.
temp = y_test.reset_index().assign(pred=y_pred)
# Show the wines that are truly high quality, so that a correctly predicted one (pred == 1) can be picked.
temp.loc[temp['quality_classif'] > 0]

In [None]:
# Positional index (into X_test) of the observation to explain. This is hard-coded, so
# re-check it against the lookup table of high-quality test wines if the model changes.
index = 10
target_class = 1
# The base value must belong to the same class as the SHAP values being plotted; the previous
# `expected_value[0]` paired the class-0 base value with the class-1 contributions.
# NOTE(review): assumes shap_values is a per-class list of (n_rows, n_features) arrays, as
# returned by the SHAP version this notebook was written for — confirm when upgrading SHAP.
shap.force_plot(explainer.expected_value[target_class], shap_values[target_class][index], X_test.iloc[index,:])

In [None]:
# Inspect the first entry of the SHAP output.
# NOTE(review): presumably the class-0 array, given how the force plots index shap_values — confirm.
shap_values[0]

In [None]:
# Rebuild the tree explainer on the model's probability output, again against a
# background sample of the training rows (interventional perturbation).
explainer = shap.TreeExplainer(
    clf_grid.best_estimator_,
    data=shap.sample(X_train),
    model_output='probability',
    feature_perturbation='interventional',
)

# Global view: SHAP values for the whole test set, shown as a summary plot.
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)

## References

Examples of using other Explainers - https://shap-lrjball.readthedocs.io/en/docs_update/examples.html#tree-explainer-examples

## Group Activity

We will use this miro board link https://miro.com/app/board/uXjVOcXrpC8=/?invite_link_id=123408165247 

Using the final project groupings, work with your team to develop global and local explainability analyses using the telco customer churn data. Expected output:

1. Use 1 machine learning algorithm and perform hyperparameter optimization
2. Show the accuracy, precision, recall and f1-score for the test data
3. Plot the global explainability of the model using SHAP
4. Write down the team's analysis on the global explainability
5. Show 2 local explainability examples, one for churn and another for not churn, and write down your insights

Afterwards, I will ask the teams to present their findings on the different items.