Contact the author via [LinkedIn](http://piomazur.pl/linkedin) or [GitHub](http://piomazur.pl/github)

In [None]:
# Load the lab_black extension (formats code cells with Black).
%load_ext lab_black
# Load autoreload and set it to mode 2, so imported modules are reloaded
# before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import json

import datasets
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import transformers
import torch
import torchvision
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Initialise SHAP's JavaScript support in the notebook (used by its interactive plots).
shap.initjs()

In [None]:
# Let text columns show up to 300 characters before pandas truncates them.
pd.set_option("display.max_colwidth", 300)

# Data & models setup

## Data

### Read & show dataset

In [None]:
# Read the zipped CSV into a DataFrame and display it as the cell output.
taxi_df = pd.read_csv(
    filepath_or_buffer="data/taxi_out_2020.csv.zip",
    compression="zip",
)
taxi_df

### Convert categorical features

In [None]:
# Columns that are converted to the pandas "category" dtype.
categorical_features = [
    "OriginIcao",
    "DestIcao",
    "AirlineIcao",
    "DepHourLocal",
    "DepWeekdayLocal",
    "DepMonthLocal",
]
# Frame-level astype converts each selected column separately, so every
# column ends up with its own set of categories.
taxi_df[categorical_features] = taxi_df[categorical_features].astype("category")

### Prepare features & target

In [None]:
# Target: the TaxiOut column.
y = taxi_df["TaxiOut"]
# Features: every other column except DepDateTimeUTC.
X = taxi_df.drop(columns=["DepDateTimeUTC", "TaxiOut"])

# Keep the original row order (no shuffling); the trailing 10% of rows
# become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=False,
)

### Sample dataset

In [None]:
# Fraction of the validation set to draw into the test sample.
sample_size = 0.1
# Fixed seed: without it every kernel restart draws different rows, so the
# numbers and plots computed from X_test / y_test would not be reproducible.
sample_seed = 42
X_test = X_val.sample(frac=sample_size, random_state=sample_seed)
# Pick the targets of the sampled rows by index label, then renumber both
# objects from 0 so that they stay positionally aligned.
y_test = y_val.loc[X_test.index].reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

## Model

### Taxi prediction model

In [None]:
# Load the LightGBM booster from its saved model file.
model = lgb.Booster(model_file="taxi_model.txt")
# Set the objective entry in the booster's params explicitly.
# NOTE(review): presumably required by the SHAP explainer used later -- confirm.
model.params.update(objective="regression")

## Image model

In [None]:
# Build MobileNetV3-Small with pretrained weights ...
image_model = torchvision.models.mobilenet_v3_small(pretrained=True)
# ... and switch it to evaluation mode (eval() returns the module itself).
image_model = image_model.eval()

## Text model

In [None]:
# Checkpoint shared by the classifier and its tokenizer.
text_model_name = "nateraw/bert-base-uncased-emotion"
raw_text_model = transformers.AutoModelForSequenceClassification.from_pretrained(
    text_model_name
)
text_tokenizer = transformers.AutoTokenizer.from_pretrained(
    text_model_name, use_fast=True
)
# Ask torch whether CUDA is usable instead of calling .cuda() and catching
# AssertionError: that exception is only raised by CPU-only torch builds,
# while a CUDA build on a machine without a usable GPU raises RuntimeError.
if torch.cuda.is_available():
    raw_text_model = raw_text_model.cuda()
    text_device = 0  # pipeline convention: index of the CUDA device
else:
    print("CUDA is not available. Using CPU.")
    raw_text_model = raw_text_model.cpu()
    text_device = -1  # pipeline convention: -1 means CPU
text_model = transformers.pipeline(
    "text-classification",
    model=raw_text_model,
    tokenizer=text_tokenizer,
    # Tell the pipeline where the model lives so that it places the
    # tokenized inputs on the same device.
    device=text_device,
    return_all_scores=True,
)

# Feature Importance

## Permutation importance

### Calculate original error

In [None]:
# Code here

### Calculate permutation importance for feature OriginIcao

In [None]:
# Code here

# Choose your feature


# Make a copy of your feature


# Shuffle feature values


# Predict & calculate new error


# Restore feature to original form


# Print error change


### Implement a function that calculates permutation importance for a feature passed as a parameter

In [None]:
# Code here


### Calculate importances for all features

In [None]:
# Code here


### Plot calculated permutation importances

In [None]:
# Code here


## Built-in feature importance

### Plot feature importance using [lgb.plot_importance](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.plot_importance.html) method

In [None]:
# Code here


### What importance type is being shown? Check others

In [None]:
# Code here


# Partial Dependence Plots

## Calculate partial dependence values for DepHourLocal

In [None]:
# Code here

# Choose your feature


# Make a copy of your feature


# Prepare structure for holding results


# Iterate over all possible values of column


# Assign the new value to whole column


# Predict


# Calculate mean of predictions

# END LOOP

# Restore feature to original form


## Plot partial dependence

In [None]:
# Code here


# SHAP Values

[Read more about SHAP values](https://towardsdatascience.com/one-feature-attribution-method-to-supposedly-rule-them-all-shapley-values-f3e04534983d)

## Single value from Taxi dataset

### Get an example of a flight from Los Angeles (KLAX) to Seattle (KSEA) that had a 20-minute Taxi Out

In [None]:
# Validation rows for KLAX -> KSEA flights ...
is_lax_to_sea = (X_val["OriginIcao"] == "KLAX") & (X_val["DestIcao"] == "KSEA")
# ... whose target value equals 20.
had_20_min_taxi = y_val == 20
# Keep only the first matching row (still a one-row DataFrame) and display it.
example_to_check = X_val[is_lax_to_sea & had_20_min_taxi].head(1)
example_to_check

### Load model into SHAP Explainer

In [None]:
# Code here

### Generate & show SHAP values

In [None]:
# Code here

### Visualize SHAP values with an additive force plot

In [None]:
# Code here

## Explaining Images with SHAP

### Helper methods for PyTorch & ImageNet

In [None]:
# Per-channel mean and standard deviation subtracted / divided out by
# `normalize` (three values, one per colour channel on the last axis).
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]


def normalize(image):
    """Standardise a batch of images and return it as a float torch tensor.

    Parameters
    ----------
    image : numpy.ndarray
        Batch of images. The last axis must broadcast against the
        3-element ``mean`` / ``std``; the axis swaps below assume a 4-D
        channels-last batch such as (N, H, W, 3). Values above 1 are taken
        as a sign of a 0-255 scale and the batch is divided by 255 first.

    Returns
    -------
    torch.Tensor
        Float tensor with the channel axis moved to position 1,
        i.e. (N, H, W, C) becomes (N, C, H, W).

    Notes
    -----
    The input array is never modified. The division is done out of place,
    which also makes integer (e.g. uint8) inputs work -- an in-place
    ``/=`` cannot store the float result back into an integer array.
    """
    if image.max() > 1:
        image = image / 255
    image = (image - mean) / std
    # (N, H, W, C) -> swap last and second axis -> (N, C, W, H)
    #              -> swap the two trailing axes -> (N, C, H, W)
    return torch.tensor(image.swapaxes(-1, 1).swapaxes(2, 3)).float()

# Obtain the ImageNet class-index JSON through shap's dataset cache helper
# and parse it into `class_names`.
url = "https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json"
fname = shap.datasets.cache(url)
with open(fname) as class_index_file:
    class_names = json.loads(class_index_file.read())
    
def get_image_names(indexes):
    """Map class indexes to class names, element by element.

    Each index is converted to a string, looked up in the module-level
    ``class_names`` mapping, and replaced by the second item of the entry
    found there. The result has the same shape as ``indexes``.
    """

    def _name_for(class_index):
        entry = class_names[str(class_index)]
        return entry[1]

    lookup = np.vectorize(_name_for)
    return lookup(indexes)

def adjust_shap_values(shap_values):
    """Return a list with the axes of every array in ``shap_values`` rearranged.

    For each array, axes 2 and 3 are swapped first and then axis 1 is
    swapped with the last axis, so a 4-D (N, C, H, W) array comes out as
    (N, H, W, C).
    """
    adjusted = []
    for values in shap_values:
        trailing_swapped = np.swapaxes(values, 2, 3)
        adjusted.append(np.swapaxes(trailing_swapped, 1, -1))
    return adjusted

### Load samples from ImageNet50 dataset

In [None]:
# Positions of the four sample images to pull from the ImageNet50 array.
sample_indexes = [49, 31, 19, 2]
# imagenet50() returns a tuple; its first item is indexed as the image array.
imagenet_images = shap.datasets.imagenet50()[0]
image_X = imagenet_images[sample_indexes] / 255
# Model-ready version of the same images.
normalized_X = normalize(image_X)

### Show image samples

In [None]:
# Code here


### Check model

In [None]:
# Code here


### Load last convolutional layer into GradientExplainer (use `normalized_X` for data parameter)

In [None]:
# Code here


### Generate SHAP values

In [None]:
# Code here


### Plot SHAP values

In [None]:
# Code here


### Check what happens if the first or a middle layer is used instead

In [None]:
# Code here


## Explaining Text with SHAP

### Load emotion dataset

In [None]:
# Class names, positioned so that a numeric label can be used as an index.
emotion_classes = ["sadness", "joy", "love", "anger", "fear", "surprise"]
text_dataset = datasets.load_dataset("emotion", split="train")
# Translate every numeric label into its class name.
emotion_names = [emotion_classes[label_id] for label_id in text_dataset["label"]]
text_data = pd.DataFrame({"text": text_dataset["text"], "emotion": emotion_names})

### Show emotion dataset

In [None]:
# Code here


### Check model

In [None]:
# Code here


### Load model into Explainer

In [None]:
# Code here


### Generate SHAP values for examples from each emotion

In [None]:
# Code here


### Plot SHAP values

In [None]:
# Code here
