# imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
# Models & evaluation metrics
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import shap
import os
import joblib

# Folder that receives the figures saved later in this notebook;
# exist_ok=True makes re-running this cell harmless.
os.makedirs("images/", exist_ok=True)

# NOTE(review): joblib.load unpickles the file, so only load bundles that
# come from a trusted source.
loaded_joblib = joblib.load("best_models.joblib")
# List the entries stored in the bundle (shown as the cell output)
loaded_joblib.keys()

In [None]:
#Model evaluation function
def evaluate_regression(model, X_train, y_train, X_test, y_test):
    """Print R-squared and RMSE of a fitted regressor on train and test data.

    RMSE is derived from ``metrics.mean_squared_error`` with NumPy instead of
    the ``squared=False`` flag, which newer scikit-learn releases deprecated
    and later removed.

    Args:
        model: Fitted estimator exposing a ``predict`` method.
        X_train: Training features passed to ``model.predict``.
        y_train: True target values for ``X_train``.
        X_test: Test features passed to ``model.predict``.
        y_test: True target values for ``X_test``.

    Returns:
        None. One line per split is written with ``print``.
    """
    splits = (("Training", X_train, y_train), ("Test", X_test, y_test))
    for label, features, target in splits:
        predictions = model.predict(features)
        r2 = metrics.r2_score(target, predictions)
        # Per-output MSE -> square root -> mean reproduces the value that
        # squared=False used to return, including for multi-output targets.
        mse_per_output = metrics.mean_squared_error(
            target, predictions, multioutput="raw_values"
        )
        rmse = np.mean(np.sqrt(mse_per_output))
        print(f"{label} Data:\tR^2= {r2:.2f}\tRMSE= {rmse:.2f}")

In [None]:
# Pull the train/test split out of the loaded bundle, one pair per line
X_train, y_train = loaded_joblib['X_train'], loaded_joblib['y_train']
X_test, y_test = loaded_joblib['X_test'], loaded_joblib['y_test']

# Models and preprocessor saved from POPS-Revisited
rf_reg, lin_reg = (
    loaded_joblib['RandomForestRegressor'],
    loaded_joblib['LinearRegression'],
)
preprocessor = loaded_joblib['preprocessor']

In [None]:
# Report train/test R^2 and RMSE for the loaded random forest model
evaluate_regression(
    model=rf_reg,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Initialize SHAP's JavaScript support, which its interactive notebook
# visualizations rely on
shap.initjs()

In [None]:
# Create X_shap variable: a sample of 400 observations drawn from X_train
# (random_state is fixed so the same sample is drawn on every run)
X_shap = shap.sample(X_train, nsamples = 400, random_state = 321)
# Display the sample as the cell output
X_shap

In [None]:
# Create y_shap variable: the y_train entries whose index labels match X_shap
# NOTE(review): astype(int) casts the target to integers; if the target is
# continuous this drops the fractional part - confirm that is intended.
y_shap = y_train.loc[X_shap.index].astype(int)
# Display the selected targets as the cell output
y_shap

In [None]:
# Reset index values: give both objects a fresh default index
# (drop=True discards the old labels rather than keeping them as data)
X_shap = X_shap.reset_index(drop=True)
y_shap = y_shap.reset_index(drop=True)
# Display the re-indexed sample as the cell output
X_shap

In [None]:
# Create the SHAP explainer for the loaded random forest model
explainer = shap.Explainer(rf_reg)
# Display the explainer object as the cell output
explainer

In [None]:
# Get the SHAP values by calling the explainer on the sampled features
# and their matching targets
shap_values = explainer(X_shap, y_shap)
# Check what kind of object the explainer returned
type(shap_values)

In [None]:
# Confirm that the SHAP values are 2-dimensional by inspecting their shape
shap_values.shape

In [None]:
# Create a summary plot - with plot_type='bar':
fig, ax = plt.subplots()
# show=False stops SHAP from calling plt.show() itself, so the figure is
# still open and fully drawn when it is written to disk below.
shap.summary_plot(shap_values, features=X_shap, plot_type='bar', show=False)
# Save the figure as a .png file inside the repository
fig.savefig('images/summary_plot_1.png')

In [None]:
# Create a second summary plot - no plot_type is passed, so SHAP's default
# style is used (the 'dot' plot this cell is meant to produce)
fig, ax = plt.subplots()
# show=False stops SHAP from calling plt.show() itself, so the figure is
# still open and fully drawn when it is written to disk below.
shap.summary_plot(shap_values, features=X_shap, show=False)
# Save the figure as a .png file inside the repository
fig.savefig('images/summary_plot_2.png')