# Regression: synthetic dataset

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor  # for regression
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
import sklearn
import copy

import shap

from pred_diff import preddiff
from pred_diff.imputers import imputer_base, tabular_imputers, general_imputers
import pred_diff.tools.plot as c_plt
from pred_diff.datasets.datasets_regression import SyntheticDataset
from pred_diff.tools import init_plt

Select plotting style

In [None]:
# paper style
# Configure matplotlib through pred_diff's init_plt helper, passing a figure
# width of 2 * 234.88 pt.
# NOTE(review): 234.88 is presumably a single-column text width in points -- confirm.
init_plt.update_rcParams(fig_width_pt=234.88*2)

# default
# Alternative: uncomment the next line to use matplotlib's 'default' style instead.
# plt.style.use('default')


### Set-up experiment
1. Specify mean and covariance for dataset


In [None]:
# Parameters for the synthetic dataset: a zero mean vector for four features
# and a 4x4 identity covariance (unit variances, no cross-correlations).
# Edit the off-diagonal entries to introduce correlated features.
mean = np.array([0, 0, 0, 0])
cov = [
    [1, 0., 0., 0.],
    [0., 1., 0., 0.],
    [0., 0., 1., 0.],
    [0., 0., 0., 1.],
]

2. Create custom target functions

In [None]:
def f(x: np.ndarray) -> np.ndarray:
    """Evaluate the synthetic regression target for every row of ``x``.

    The target is the sum of a purely additive part
    (``x0**2 + 3*x1 + sin(pi*x2) - 0.5*x3**3``) and a pairwise interaction
    between the first two features (``2 * sign(x0) * |x1|``).

    Parameters
    ----------
    x : np.ndarray
        Two-dimensional array whose first four columns are the features
        ``x0 .. x3`` (indexed as ``x[:, i]``).

    Returns
    -------
    np.ndarray
        One target value per row of ``x``.
    """
    x0, x1, x2, x3 = (x[:, col] for col in range(4))

    additive_part = x0 ** 2 + 3 * x1 + np.sin(np.pi * x2) - 0.5 * x3 ** 3
    interaction_part = 2 * np.sign(x0) * np.abs(x1)

    return additive_part + interaction_part

# Build the synthetic dataset from the target function `f` and the `mean` / `cov`
# parameters specified in the set-up cell.
dataset = SyntheticDataset(function=f, mean=mean, cov=cov)

3. Select model

In [None]:
# Name of the regressor to fit in the training cell. Keep exactly one of the
# three assignments active; any other value is rejected by the training cell.
model_selection = 'RandomForest'
# model_selection = 'FullyConnectedNetwork'
# model_selection = 'GaussianProcess'

4. Train model

In [None]:
# Import the splitter explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.model_selection` submodule has been loaded.
from sklearn.model_selection import train_test_split

# Load features and targets, then hold out 10% of the samples for testing.
# NOTE: no random_state is passed, so the split differs from run to run.
x_df, y_df = dataset.load_pd()
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.1)

# Instantiate the regressor requested via `model_selection`.
if model_selection == 'RandomForest':
    reg = RandomForestRegressor(n_estimators=100)
elif model_selection == 'FullyConnectedNetwork':
    # one hidden layer with as many units as x_df has rows
    reg = MLPRegressor([int(x_df.shape[0])])
elif model_selection == 'GaussianProcess':
    reg = GaussianProcessRegressor(1.0 * RBF() + WhiteKernel())
else:
    # Raise instead of `assert False`: assertions are removed under `python -O`,
    # which would let an invalid selection fall through to `reg.fit` below.
    raise ValueError(f'please enter a valid model_selection = {model_selection}')

# Fit on the training split and report the model's score on both splits.
reg.fit(x_train, y_train)
print(f"train data:     "
      f"score = {reg.score(x_train, y_train):.3f}\n"
      f"test data:      "
      f"score = {reg.score(x_test, y_test):.3f}")

## *PredDiff*
### Relevances and interactions
Figure 1: *PredDiff* contributions

In [None]:
# Settings shared by the PredDiff explainer and the plotting helpers below.
n_imputations = 200
error_bars = True

# Imputer built from the training features.
imputer = tabular_imputers.GaussianProcessImputer(train_data=x_train)

# The explainer receives its own copy of the training data as a numpy array.
# fast_evaluation is switched on only when error bars are not requested.
explainer = preddiff.PredDiff(
    model=reg,
    train_data=copy.deepcopy(x_train.to_numpy()),
    n_imputations=n_imputations,
    imputer=imputer,
    fast_evaluation=not error_bars,
    n_group=100,
    unified_integral=True,
)

# Shielded effects plotted over features '0' and '1' of the test set.
c_plt.shielded_effects(
    explainer=explainer,
    data_test=x_test.to_numpy(),
    x=x_test['0'],
    y=x_test['1'],
    title='PredDiff - 0 vs. 1',
    axis_symmetric=True,
    error_bars=error_bars,
)

# Per-feature scatter plots over the test set.
c_plt.scatter_m_plots(
    explainer=explainer,
    df_test=x_test,
    n_imputations=n_imputations,
    error_bars=error_bars,
)


# c_plt.scatter_m_plots(reg, df_train=copy.deepcopy(x_train), df_test=copy.deepcopy(x_test), imputer=imputer,
#                       n_imputations=n_imputations)


# # use interaction matrix
# relevance_matrix = explainer.interaction_matrix(df_test=x_test, n_imputations=n_imputations)
# c = pd.DataFrame(np.diagonal(relevance_matrix, axis1=1, axis2=2), columns=x_test.columns)
# c_plt._scatter(c=c, x_df=x_test, method=f'Residual PredDiff, n={n_imputations}')

# c_plt.scatter_2d_heatmap(x=x_test['0'], y=x_test['1'], relevance=relevance_matrix[:, 1, 0],
#                          title='PredDiff - 0 vs. 1', axis_symmetric=True)

Figure 2: Computational dependence of *PredDiff*

In [None]:
# Rebuild the explainer for several imputation counts and plot how the result
# depends on that count. The same imputer instance is reused for every run.
imputer = tabular_imputers.GaussianProcessImputer(train_data=x_train)
for n_imputations in (5, 50, 200):
    # a fresh copy of the training data is handed to each explainer
    explainer = preddiff.PredDiff(
        model=reg,
        train_data=copy.deepcopy(x_train.to_numpy()),
        n_imputations=n_imputations,
        imputer=imputer,
        n_group=100,
        unified_integral=True,
    )
    c_plt.plot_n_dependence(
        explainer=explainer,
        x_test=x_test,
        n_imputations=n_imputations,
    )


### Comparison to SHAP
Appendix

In [None]:
# SHAP baseline for comparison with PredDiff.
# NOTE: shap.TreeExplainer targets tree-based models, so this cell presumably
# requires model_selection == 'RandomForest' -- confirm before running it with
# the other model choices.
# NOTE: this rebinds `explainer`, replacing the PredDiff explainer from the
# cells above.
explainer = shap.TreeExplainer(reg)
shap_interaction_values = explainer.shap_interaction_values(x_test)
shap_values = explainer.shap_values(x_test)

# Per-feature SHAP values, plotted with the same scatter helper as PredDiff.
c = pd.DataFrame(shap_values, columns=x_test.columns)
c_plt._scatter(c=c, x_df=x_test, method='SHAPTree')

# Interaction between features '0' and '1'. The title previously read
# 'PredDiff - 0 vs. 1', which mislabelled this SHAP plot.
c_plt.shap_interaction(shap_interaction_values=shap_interaction_values, x=x_test['0'], y=x_test['1'],
                       title='SHAPTree - 0 vs. 1', axis_symmetric=True)