# INTERPRETABILITE AVEC UN MODELE XGBOOST TARGET SIMPLE

In [None]:
import pandas as pd
import numpy as np
import panel as pn

# Load the query results into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv('result_requetes.csv')
# Uncomment to preview the first rows: data.head()

In [None]:
# Give the qualitative variables an (unordered) categorical dtype.
categorical_columns = (
    "city", "language", "group", "brand",
    "mobile", "parking", "pool", "children_policy",
    "hotel_id", "avatar_id",
)
for column in categorical_columns:
    # astype("category") builds an unordered categorical, which is what
    # pd.Categorical(..., ordered=False) produced.
    data[column] = data[column].astype("category")

# Cast the target to float directly on the Series; wrapping the column in a
# one-column DataFrame just to change its dtype is unnecessary and fragile.
data["price"] = data["price"].astype(float)

# data.dtypes and data.head() can be evaluated here to check the result.

## APPRENTISSAGE DE MODELE

Avec TargetEncoder / MEstimateEncoder

In [None]:
#!pip install category_encoders > /dev/null 2>&1   EN LOCAL ?!

In [None]:
from category_encoders import TargetEncoder
#from category_encoders import MEstimateEncoder

In [None]:
# Regression target; the double brackets keep it as a one-column DataFrame
# rather than a Series.
Y = data[["price"]]

In [None]:
# Target-encode the categorical predictors: each category is replaced by a
# statistic of the target computed by the encoder.
# NOTE(review): the encoder is fitted on the whole dataset, before the
# train/test split done later in the notebook, so prices of test rows leak
# into the encoded features. Consider fitting it on the training rows only.
X = data[[
    "city", "language", "mobile", "hotel_id", "group",
    "brand", "parking", "pool", "children_policy",
]]
enc = TargetEncoder()
# enc = MEstimateEncoder()  # alternative encoder
Xenc = enc.fit_transform(X, Y)
# With the default return_df=True, fit_transform already returns a DataFrame
# carrying the encoded column names, so there is no need to call
# enc.get_feature_names(), which recent category_encoders releases replace
# with get_feature_names_out().
dataDum = pd.DataFrame(Xenc)

In [None]:
# Assemble the design matrix: the columns used as-is, followed by the
# target-encoded categorical columns, aligned on the row index.
quantitative_columns = ["nb_requete", "date", "stock"]
dataQuant = data[quantitative_columns]
dfC = pd.concat(objs=[dataQuant, dataDum], axis="columns")
dfC

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    dfC,
    Y,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Flatten the training target to a 1-D array before it is passed to fit.
y_train = np.asarray(y_train).ravel()

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# NOTE: despite its name, `xgb` is scikit-learn's GradientBoostingRegressor
# behind a StandardScaler, not a model from the XGBoost library. The name is
# kept because the following cells refer to it.
xgb = make_pipeline(
    StandardScaler(),
    # Seeded like the train/test split (random_state=0) so that the fitted
    # model, and therefore every explanation computed from it, is
    # reproducible from one run to the next.
    GradientBoostingRegressor(random_state=0),
)

In [None]:
# Fit the whole pipeline (scaler, then gradient boosting) on the training split.
xgb.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out split. For scikit-learn regressors `score`
# is the coefficient of determination (R^2).
print(f"XGB score: {xgb.score(X_test, y_test):.2f}")

## INTERPRETABILITE

In [None]:
# Feature names come from the assembled design matrix, so they cover every
# column fed to the model (the alternative, enc.get_feature_names_out(),
# would only list the encoder's columns).
feature_names = dfC.columns

## Feature importance

In [None]:
#!pip install eli5 > /dev/null 2>&1   EN LOCAL !

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
import matplotlib.pyplot as plt
import seaborn as sns

# Permutation importance of every feature, measured on the held-out split.
# No figure is opened here: this cell draws nothing, and a bare plt.figure()
# only produced an empty figure in the notebook output.
features_importance_dict = {}
permumtation_impor = PermutationImportance(xgb, random_state=0).fit(X_test, y_test)

# One row per feature, most important first.
features_importance = pd.DataFrame(
    {
        'Feature_name': feature_names,
        'Importance': permumtation_impor.feature_importances_,
    }
).sort_values('Importance', ascending=False)

# Stored per model name so later cells can look the ranking up by key.
features_importance_dict['xgboost'] = features_importance

In [None]:
# Horizontal bar chart of the importance ranking. The first two rows of the
# sorted table are left out of the chart.
# NOTE(review): presumably so the dominant features do not flatten the
# remaining bars; confirm this is intended.
ax = sns.barplot(data=features_importance.iloc[2:], x="Importance", y="Feature_name")
ax.set_title('xgboost')

## PDP et ICE plots

In [None]:
#!pip install pdpbox > /dev/null 2>&1   EN LOCAL !

In [None]:
from pdpbox import pdp, get_dataset, info_plots
model = xgb
model_name = 'xgboost'

# Take the (up to) ten best-ranked features from the importance table, which
# is already sorted in decreasing order of importance.
top_10_features = features_importance_dict[model_name]["Feature_name"].head(10).to_numpy()
# To plot every feature instead, drop the .head(10) call above.

# One PDP per feature; plot_lines=True overlays individual (ICE) curves for a
# 10% sample of the test rows.
for feature in top_10_features:
    pdp_feat = pdp.pdp_isolate(model=model, dataset=X_test, model_features=feature_names, feature=feature)
    pdp.pdp_plot(pdp_feat, feature, plot_lines=True, frac_to_plot=0.1, figsize=(10,5))

In [None]:
# Two-way partial dependence for a pair of features, drawn as a contour plot.
# NOTE(review): the original author marked this choice of pair as uncertain.
features_to_plot = ['stock', 'date']
inter1 = pdp.pdp_interact(
    model=model,
    dataset=X_test,
    model_features=feature_names,
    features=features_to_plot,
)

pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=features_to_plot,
    plot_type='contour',
)
plt.show()

## SHAP

In [None]:
#!pip install shap > /dev/null 2>&1   EN LOCAL !

In [None]:
import shap
shap.initjs()  # needed to render the interactive plots inside the notebook

idx = 1  # index of the instance we want to explain

# KernelExplainer evaluates the model on the background data for every
# sampled coalition, so its cost grows with the background size. The whole
# test split is therefore summarised by a fixed random sample of at most 100
# rows (shap.sample returns the data unchanged when it is already smaller).
background = shap.sample(X_test, 100, random_state=0)
explainer = shap.KernelExplainer(xgb.predict, background)

# Single-instance explanation, for reference:
#   shap_values = explainer.shap_values(X_test.iloc[0,:])
#   shap.force_plot(explainer.expected_value, shap_values, X_test.iloc[0,:])

In [None]:
# To keep the run time reasonable, SHAP values are computed for the first
# 10 rows of the test split only.
explained_rows = X_test.iloc[:10]
shap_values = explainer.shap_values(explained_rows)
shap.summary_plot(shap_values, explained_rows)

## LIME

In [None]:
#!pip install lime > /dev/null 2>&1   EN LOCAL !

In [None]:
import lime
import lime.lime_tabular

index = 0  # position (in the test split) of the row to explain

# NOTE: this rebinds `explainer`, which until now held the SHAP explainer;
# re-run the SHAP cell that creates it before re-running SHAP cells.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_test.values,
    feature_names=feature_names,
    mode="regression",
)
# explain_instance expects the row as a 1-D numpy array. Passing the pandas
# Series made LIME index it by integer position on a string-labelled index,
# which pandas deprecates. `top_labels` is a classification-only option and
# is not passed in regression mode.
exp = explainer.explain_instance(
    X_test.iloc[index].values,
    xgb.predict,
    num_features=5,
)
exp.show_in_notebook(show_table=True, show_all=True)