In [1]:
# Importing libraries for data manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Importing libraries for machine learning
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from joblib import dump,load
import shap

# Display setting for exploration
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Check out if the environment is the correct Anaconda one
import sys
print('environment: ',sys.executable)

# Set up directory to be the github repository
# requires git
import os
import subprocess
os.getcwd()
output = subprocess.check_output(['git', 'rev-parse', '--show-toplevel'])
path = output.decode('utf-8').strip()
print('working directory: ',path)
os.chdir(path)


environment:  c:\Users\ricca\anaconda3\envs\Thesis\python.exe
working directory:  C:/Users/ricca/Documents/GitHub/Thesis-SEM-ML


In [3]:
from Source.DataPreparation.DataProcessor import DataProcessor
from Source.Regression.latent_variable_regressors import GBoostRegression, RFRegression, LinearRegressionModel

In [4]:
# Assuming you have data in X and y variables
Results_RF = []
Results_XGB = []
Results_LM=[]
shap_values_list_RF = None
shap_values_list_xgb = None
shap_values_list_LM = None

montecarloiter=3

for i in range(montecarloiter):
    print(f"Starting the iteration n.{i+1}")

    processorclass=DataProcessor()
    processorclass.read_df()
    processorclass.split_data(test_size=0.3)
    processorclass.save_data()
    processorclass.process_CFA()

    X_train,y_train,X_test,y_test = processorclass.train_test_data_for_WEtarget(target_variable='WorkEngagement')

    # XGBRegressor
    Xgboost=GBoostRegression(x_train=X_train,y_train=y_train,x_test=X_test,y_test=y_test)
    Xgboost.train(verbosity=0,n_iter=90)
    mxgbresults = Xgboost.get_results()
    mse_xgb, r2_xgb, mae_xgb, train_mse_xgb, train_r2_xgb = (mxgbresults['mse'],mxgbresults['r2'],mxgbresults['mae'],mxgbresults['train_mse'],mxgbresults['train_r2'])
    Results_XGB.append((mse_xgb, r2_xgb, mae_xgb,train_mse_xgb, train_r2_xgb))

    if shap_values_list_xgb is None:
        shap_values_list_xgb = Xgboost.get_shap_values()
    else:
        shap_values_list_xgb += Xgboost.get_shap_values()
    print(f"Iteration {i+1} XGBoost: MSE: {mse_xgb}, R^2: {r2_xgb}, MAE: {mae_xgb}, TRAIN_MSE:{train_mse_xgb}, TRAIN_R^2:{train_r2_xgb}" )


    # RandomForest
    RFRegre=RFRegression(x_train=X_train,y_train=y_train,x_test=X_test,y_test=y_test)
    RFRegre.median_imputation()
    RFRegre.train(verbosity=0,n_iter=30,computeshap=False)

    mRFresults = RFRegre.get_results()
    mse_RF, r2_RF, mae_RF, train_mse_RF, train_r2_RF  = (mRFresults['mse'],mRFresults['r2'],mRFresults['mae'],mRFresults['train_mse'],mRFresults['train_r2'])
    Results_RF.append((mse_RF, r2_RF, mae_RF, train_mse_RF, train_r2_RF))


    #if shap_values_list_RF is None:
    #    shap_values_list_RF = RFRegre.get_shap_values()
    #else:
    #    shap_values_list_RF += RFRegre.get_shap_values()
    print(f"Iteration {i+1} RandomForest: MSE: {mse_RF}, R^2: {r2_RF}, MAE: {mae_RF}, TRAIN_MSE:{train_mse_RF}, TRAIN_R^2:{train_r2_RF}" )


    # Linear Regression
    LMreg= LinearRegressionModel(x_train=X_train,y_train=y_train,x_test=X_test,y_test=y_test)
    LMreg.median_imputation()
    LMreg.train(computeshap=True)
    LMresults = LMreg.get_results()
    mse_LM, r2_LM, mae_LM, train_mse_LM, train_r2_LM  = (LMresults['mse'],LMresults['r2'],LMresults['mae'],LMresults['train_mse'],LMresults['train_r2'])
    Results_LM.append((mse_LM, r2_LM, mae_LM,train_mse_LM, train_r2_LM))

    if shap_values_list_LM is None:
        shap_values_list_LM = LMreg.get_shap_values()
    else:
        shap_values_list_LM += LMreg.get_shap_values()
    print(f"Iteration {i+1} Linear Regression: MSE: {mse_LM}, R^2: {r2_LM}, MAE: {mae_LM}, TRAIN_MSE:{train_mse_LM}, TRAIN_R^2:{train_r2_LM}")

    print(("-------------------------------------------------"))



#shap_values_list_RF /= montecarloiter
shap_values_list_xgb /= montecarloiter
shap_values_list_LM /= montecarloiter

Starting the iteration n.1
The dataframe was loaded
A Train-Test split was performed with a test size of 0.3
Datasets were saved
Starting the CFA
Starting the XGBRegressor training
{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': 0.8832793527591418, 'colsample_bynode': None, 'colsample_bytree': 0.9230194998959509, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 0.0, 'gpu_id': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.01584253029449818, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 4, 'max_leaves': None, 'min_child_weight': 9, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 400, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.01, 'sampling_method': None, 'sc

The objective has been evaluated at this point before.
The objective has been evaluated at this point before.


Iteration 1 RandomForest: MSE: 0.15413414389679275, R^2: 0.6948772414258415, MAE: 0.30565079948242496, TRAIN_MSE:0.013463117894662778, TRAIN_R^2:0.9673312540525095
Starting the Linear Regression training
Iteration 1 Linear Regression: MSE: 0.14961842340465822, R^2: 0.7038165267696037, MAE: 0.2971899605318235, TRAIN_MSE:0.05598495302616047, TRAIN_R^2:0.8641504723048674
-------------------------------------------------
Starting the iteration n.2
The dataframe was loaded
A Train-Test split was performed with a test size of 0.3
Datasets were saved
Starting the CFA
Starting the XGBRegressor training
{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': 0.5, 'colsample_bynode': None, 'colsample_bytree': 1.0, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 0.023803239123148703, 'gpu_id': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, '

The objective has been evaluated at this point before.
The objective has been evaluated at this point before.


Iteration 2 RandomForest: MSE: 0.06581157535882141, R^2: 0.8642881056269232, MAE: 0.20114015484692913, TRAIN_MSE:0.020445061654916902, TRAIN_R^2:0.9524856632939843
Starting the Linear Regression training
Iteration 2 Linear Regression: MSE: 0.05648890882261486, R^2: 0.8835126376235956, MAE: 0.18290838129223633, TRAIN_MSE:0.08819067830939749, TRAIN_R^2:0.7950448057212446
-------------------------------------------------
Starting the iteration n.3
The dataframe was loaded
A Train-Test split was performed with a test size of 0.3
Datasets were saved
Starting the CFA
Starting the XGBRegressor training
{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': 1.0, 'colsample_bynode': None, 'colsample_bytree': 0.8090128411875246, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 0.03365781658932243, 'gpu_id': None, 'grow_policy': None, 'importance_type': None, 'interaction_constr

The objective has been evaluated at this point before.


Iteration 3 RandomForest: MSE: 0.13656704328485217, R^2: 0.7130467689250013, MAE: 0.28536846158296, TRAIN_MSE:0.03401428176111478, TRAIN_R^2:0.9198289591210773
Starting the Linear Regression training
Iteration 3 Linear Regression: MSE: 0.12451818657036673, R^2: 0.738363699582786, MAE: 0.2735603171758556, TRAIN_MSE:0.06269639920148184, TRAIN_R^2:0.8522257321602631
-------------------------------------------------


TypeError: unsupported operand type(s) for /=: 'NoneType' and 'int'

In [None]:
import numpy as np

def print_avg_metrics(Results, model_name):
    mse_avg = np.mean([res[0] for res in Results])
    r2_avg = np.mean([res[1] for res in Results])
    mae_avg = np.mean([res[2] for res in Results])
    train_mse_avg = np.mean([res[3] for res in Results])
    train_r2_avg = np.mean([res[4] for res in Results])
    
    print(f"Model: {model_name}")
    print(f"Average MSE over {montecarloiter} iterations: {mse_avg}")
    print(f"Average MAE over {montecarloiter} iterations: {mae_avg}")
    print(f"Average R^2 over {montecarloiter} iterations: {r2_avg}")
    print(f"Average TRAIN MSE over {montecarloiter} iterations: {train_mse_avg}")
    print(f"Average TRAIN R^2 over {montecarloiter} iterations: {train_r2_avg}")
    print("-------------------------------------------------")

# Print the metrics for each model
print_avg_metrics(Results_XGB, "XGBoost")
print_avg_metrics(Results_RF, "Random Forest")
print_avg_metrics(Results_LM, "Linear Model")

In [None]:

shap.plots.beeswarm(shap_values_list_xgb)


In [None]:
#shap.waterfall_plot(shap_values_list_xgb[2])


In [None]:
shap.plots.heatmap(shap_values_list_xgb)

In [None]:
shap.plots.scatter(shap_values_list_xgb[:, "ProfessionalSupport"],color=shap_values_list_xgb)
#color=plt.get_cmap("cool")
#color=shap_values[:,"Workclass"])
