# PredDiff

In [None]:
import matplotlib.pylab as plt
import shap #for plotting only
from matplotlib import cm
import numpy as np

from pred_diff.datasets.uci_dataset import UCI_DataFrame, UCI_Bike_DataFrame, UCI_Adult_DataFrame
from pred_diff.datasets.sikonja_synthetic import Sikonja_Synthetic_DataFrame


from pred_diff.preddiff import *
from pred_diff.tools.preddiff_plotting import *

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from pred_diff.imputers.impute import *

Select dataset here:

In [None]:
# Dataset selector: leave exactly one `dataDirectory` assignment uncommented.
# The name chosen here decides which dataset wrapper is instantiated and whether
# the task is handled as regression or classification in the cells below.

# Datasets from Gal's MC-dropout paper (loaded via the generic UCI_DataFrame, regression)
# dataDirectory = "bostonHousing" 
# dataDirectory = "concrete"
# dataDirectory = "energy"
# dataDirectory = "kin8nm"
# dataDirectory = "naval-propulsion-plant"
# dataDirectory = "power-plant"
# dataDirectory = "protein-tertiary-structure"
# dataDirectory = "wine-quality-red"
# dataDirectory = "yacht"

# Other UCI datasets
dataDirectory = "bike-sharing"
# dataDirectory = "adult"  # a.k.a. census income (classification)

# Sikonja synthetic datasets; the suffix after "_" is parsed as an integer dataset id
# dataDirectory = "sikonja_0"

In [None]:
# Instantiate the dataset wrapper matching `dataDirectory` and record the task
# type: `regression` is True for regression datasets, False for classification.
if dataDirectory.startswith("sikonja"):
    # Names look like "sikonja_<k>"; the part after the first "_" is the dataset id.
    dataset = int(dataDirectory.split("_")[1])
    uci_df, regression = Sikonja_Synthetic_DataFrame(dataset=dataset), False
elif dataDirectory == "bike-sharing":
    uci_df, regression = UCI_Bike_DataFrame(daily=True), True
elif dataDirectory == "adult":
    uci_df, regression = UCI_Adult_DataFrame(), False
else:
    # Any other name is handed to the generic UCI loader and treated as regression.
    uci_df, regression = UCI_DataFrame(dataDirectory), True

In [None]:
# Display the feature column names; these are the columns selected as model inputs below.
uci_df.columns_features

In [None]:
# Random forest matching the task type: a 1000-tree regressor for regression
# datasets, a default-configured classifier otherwise.
reg = RandomForestRegressor(n_estimators=1000) if regression else RandomForestClassifier()

# Training split: feature columns as inputs, target column(s) as labels.
x_df = uci_df.get_train_df().loc[:, uci_df.columns_features]
y_df = uci_df.get_train_df().loc[:, uci_df.columns_target]
reg.fit(x_df, y_df)

# Held-out test split, sliced the same way; used for evaluation and for the
# PredDiff relevances/interactions computed later.
x_df_test = uci_df.get_test_df().loc[:, uci_df.columns_features]
y_df_test = uci_df.get_test_df().loc[:, uci_df.columns_target]

In [None]:
# Mean of the test targets (presumably shown as a reference scale for the error below).
np.mean(y_df_test)

In [None]:
# Root-mean-squared error of the fitted model on the test split.
# Both sides are flattened before subtracting: if the targets are held as an
# (n, 1) column while `predict` returns shape (n,), a plain subtraction would
# silently broadcast to an (n, n) matrix and report a wrong error.
np.sqrt(np.mean(np.square(np.ravel(reg.predict(x_df_test)) - np.ravel(y_df_test))))

## Relevances

Select the desired imputer:

In [None]:
# Name of the imputer to use; leave exactly one assignment uncommented.
# The string is matched against the supported names when PredDiff is constructed.
imputer_selection="TrainSetImputer"
# imputer_selection="TrainSetMahalanobisImputer"
#imputer_selection="IterativeImputerEnhanced"
# Passed to PredDiff as `n_group`.
# NOTE(review): the exact meaning of n_group is defined inside pred_diff — confirm there.
n_group = 5

In [None]:
# Build the PredDiff explainer around the fitted model and the training inputs,
# using the imputer class named by `imputer_selection`.
if imputer_selection == "TrainSetImputer":
    mvi = PredDiff(reg, x_df, imputer_cls=impute.TrainSetImputer, regression=regression,
                   n_group=n_group)
elif imputer_selection == "TrainSetMahalanobisImputer":
    # This variant is given extra keyword arguments (batch size, sigma, gpu id).
    mvi = PredDiff(reg, x_df, imputer_cls=impute.TrainSetMahalanobisImputer, regression=regression,
                   batch_size_test=512, sigma=10, gpu=1, n_group=n_group)
elif imputer_selection == "IterativeImputerEnhanced":
    mvi = PredDiff(reg, x_df, imputer_cls=impute.IterativeImputerEnhanced, regression=regression,
                   n_group=n_group)
else:
    # Fail here with a clear message instead of a NameError on `mvi` below.
    raise ValueError(f"Unknown imputer_selection: {imputer_selection!r}")

# Relevances for the test samples, using 100 imputations.
# The result is indexed by feature position in the plotting cells that follow.
m_list = mvi.relevances(x_df_test, n_imputations=100)

In [None]:
# This cell is intentionally disabled: the assert below always fails, so none of
# the SHAP plotting code after it runs until the conversion is fixed.
# (Note: assert statements are skipped when Python runs with -O.)
assert False, 'SHAP format has changed, needs fixing'
# Convert the PredDiff relevances into a SHAP explanation object for shap's plotting API.
e=preddiff_list_to_shap_explanation(m_list,x_df_test) 
# Alternative SHAP plots, kept for reference:
#shap.plots.bar(e)
#shap.plots.bar(e[0])
#shap.plots.beeswarm(e)
#shap.plots.scatter(e[:,"temp"],color=e[:,"atemp"])
    
# Beeswarm plot drawn into our own figure: plot_size=None and show=False are
# passed so the figure size set here is used and the x-axis can be relabelled.
fig, ax = plt.subplots(figsize=(16, 8))
shap.plots.beeswarm(e,plot_size=None,show=False)
ax.set_xlabel('m-value')


In [None]:

# One relevance plot per feature (per feature and class for classification):
# test-set feature value on the x-axis, PredDiff relevance on the y-axis.
for i, col in enumerate(x_df.columns):
    if regression:
        plt.title(col)
        # Error bar length is the difference between the 'high' and 'low' entries.
        plt.errorbar(x_df_test[col], m_list[i]['mean'], m_list[i]['high'] - m_list[i]['low'],
                     marker='.', linestyle='None')
        plt.show()
    else:
        # Only test samples whose label equals 1 are plotted ("just class 1 for now").
        # The same test-set mask must index x, mean and std; the original took the
        # std rows from the *training* labels, which misaligned the error bars.
        target_rows = np.where(y_df_test == 1)[0]
        # Each 'mean' entry holds one value per class; its length gives the class count.
        n_classes = len(m_list[0]["mean"].iloc[0])
        for c in range(n_classes):
            plt.title(col + " class " + str(c))
            # Extract the component for class c from every per-sample entry.
            m = m_list[i]['mean'].apply(lambda x: x[c])
            s = m_list[i]['std'].apply(lambda x: x[c])
            plt.errorbar(x_df_test.iloc[target_rows][col], m.iloc[target_rows], s.iloc[target_rows],
                         marker='.', linestyle='None')
            plt.axhline(0)
            plt.show()

In [None]:
# Aggregate the per-sample relevances into global per-feature statistics and plot them.
if not regression:
    # NOTE(review): m_list was computed on x_df_test but the training labels y_df
    # are passed here — confirm whether y_df_test is what the helper expects.
    m_stats = calculate_global_preddiff_stats_clas(m_list, y_df, uci_df.columns_features)
    # NOTE(review): the class count is taken from len(y_df[0]) — verify this
    # matches the label layout of the classification datasets.
    n_classes = len(y_df[0])
    for c in range(n_classes):
        print("Class", c)
        plot_global_preddiff_stats(m_stats[c], min_value=0)
else:
    m_stats = calculate_global_preddiff_stats(m_list, uci_df.columns_features)
    plot_global_preddiff_stats(m_stats)

## Interactions

In [None]:
#pick 5 most important features
interaction_vars = np.array(m_stats.iloc[:5].col)
interaction_cols =[]

for i in range(len(interaction_vars)):
    for j in range(i+1,len(interaction_vars)):
        interaction_cols.append([[interaction_vars[i]],[interaction_vars[j]]])

interaction_cols_txt = ["&".join(i1)+" AND \n"+"&".join(i2) for [i1,i2] in interaction_cols]

In [None]:
# PredDiff interaction terms on the test set for the pairs in interaction_cols,
# using 200 imputations.
m_int = mvi.interactions(x_df_test, interaction_cols, n_imputations=200)

In [None]:
# Aggregate the interaction terms into global statistics, labelled by the
# pair descriptions in interaction_cols_txt, and plot them.
m_int_stats = calculate_global_preddiff_stats(m_int,interaction_cols_txt)
plot_global_preddiff_stats(m_int_stats)