In [5]:
'''
Investigating various feature importance techniques to analyse prediction results of different ML methods
Feature importance techniques:
1) Impurity-based feature importance
2) Permutation feature importance
3) SHAP
'''

import numpy as np
import pandas as pd
import os
from sklearn.inspection import permutation_importance
import shap
import joblib
import nbimporter
from Plots import *
from Prediction import *
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [8]:
# Set the working directory.
# The project share below stays the default; set the GRID_PREDICTION_OUTPUT_DIR environment
# variable to run the notebook on a machine where that drive is not mounted.
os.chdir(os.environ.get("GRID_PREDICTION_OUTPUT_DIR",
                        "N:/WG_ENRI/20_projects/Noise2NAKO/04_data/grid_prediction/3_output"))

# Impurity-based Feature Importance
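Impurity-based importances are reported for the algorithms that expose them (e.g., Random Forest); for linear models, the shifted and normalised coefficients are reported instead.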

In [11]:
def Impurity_feature_importance(method, X, y, file_path, model = None):
    
    ''' This function finds and plots the feature importances that a fitted model reports itself.
    
        Tree ensembles ('RF', 'XGBoost', 'AdaB') report the model's feature_importances_.
        Linear models ('LR', 'LR_Ridge', 'LR_Lasso', 'LR_Elastic') report their coefficients, shifted so 
        that the smallest coefficient is zero and scaled so that they sum to one.
        
        Input:
        --------------------------
        
        method (str): name of the method
        
        X (pandas dataframe): data for the predictors to which you try to find importance of features when performing prediction
        
        y (pandas dataframe, pandas serie): output to which you try to find importance of features when 
        performing prediction
        
        file_path (str): the path for saving results. The bar plot is given 
        file_path + '/' + method + '_MDI_feature_imp_plot' and the values are written to file_path + '_values.csv'
        
        model (e.g. SKlearn models): a trained model. When None, a model is built and fitted on (X, y)
        with Build_fit_model
        
        Output:
        ---------------------------
        variable_importances (pandas series): feature importance for all variables
        
        Raises:
        ---------------------------
        ValueError: if method is not one of the supported names listed above
    '''
    
    tree_methods = ('RF' , 'XGBoost', 'AdaB')
    linear_methods = ('LR', 'LR_Ridge', 'LR_Lasso', 'LR_Elastic')
    
    # Fail early with a clear message instead of a NameError on `importances` further down
    if method not in tree_methods + linear_methods:
        raise ValueError("Impurity feature importance is not available for method '" + str(method) 
                         + "'; supported methods: " + ', '.join(tree_methods + linear_methods))
    
    # `is None` rather than `not model`: some estimators define __len__, so their truth value
    # is not a reliable "was a model passed?" check
    if model is None:
        model = Build_fit_model(method, X, y)
    
    if method in tree_methods:
        importances = model.feature_importances_
    else:
        # coef_ is 2-D when the model was fitted on a 2-D target; flatten it so that the shift and
        # the normalisation act on the individual coefficients
        coef = np.ravel(model.coef_)
        coef = coef - coef.min()
        importances = coef / coef.sum()

    variable_importances = pd.Series(importances, index= X.columns)
    single_feature_imp_plot_bar(variable_importances, method, file_path + '/' + method + '_MDI_feature_imp_plot', 'MDI')
    # NOTE(review): index=False drops the feature names from the CSV; kept as is so that existing
    # readers of this file keep working
    variable_importances.to_csv(file_path + '_values.csv', index= False, sep = ',')
    
    return variable_importances

# Permutation Feature Importance

In [12]:
def Permutation_feature_importance(method, X_train, y_train, X, y, file_path, model = None,
                                   n_repeats = 10, random_state = 42, n_jobs = 2):
    
    ''' This function finds the corresponding feature importance for a method employing permutation feature importance.
        
        Input:
        --------------------------
        
        method (str): name of the method
        
        X_train (pandas dataframe): data for training the method (only used when no model is provided)
        
        y_train (pandas dataframe, pandas serie): output for training the method (only used when no model 
        is provided)
        
        X (pandas dataframe): data for the predictors to which you try to find importance of features when performing prediction
        
        y (pandas dataframe, pandas serie): output to which you try to find importance of features when 
        performing prediction
        
        file_path (str): the path for saving results. The bar plot is given file_path and the values are 
        written to file_path + '_values.csv'
        
        model (e.g. SKlearn models): if provided, it is a trained model. When None, a model is built and 
        fitted on (X_train, y_train) with Build_fit_model
        
        n_repeats (int): forwarded to permutation_importance (default 10)
        
        random_state (int): forwarded to permutation_importance (default 42)
        
        n_jobs (int): forwarded to permutation_importance (default 2)
        
        Output:
        ---------------------------
        variable_importances (pandas series): mean permutation importance for all variables
    '''
        
    # `is None` rather than `not model`: some estimators define __len__, so their truth value
    # is not a reliable "was a model passed?" check
    if model is None:
        # Build and fit the model with the proper data
        model = Build_fit_model(method, X_train, y_train)
    
    result = permutation_importance(model, X, y, n_repeats=n_repeats, random_state=random_state, n_jobs=n_jobs)

    variable_importances = pd.Series(result.importances_mean, index=X.columns)
    single_feature_imp_plot_bar(variable_importances, method, file_path, 'Permutation')
    # NOTE(review): index=False drops the feature names from the CSV; kept as is so that existing
    # readers of this file keep working
    variable_importances.to_csv(file_path + '_values.csv', index= False, sep = ',')
    
    return variable_importances

In [13]:
def Apply_permutation_feature_importance(methods, X_train, y_train, X, y, output_path, file_name, models = None):
    
    ''' This function finds the corresponding feature importance for a bunch of methods employing 
    permutation feature importance.
        
        Input:
        --------------------------
        methods (list(str)): A list containing the name of the methods
        
        X_train (pandas dataframe): data for training methods 
        
        y_train (pandas dataframe, pandas serie): output for training methods 

        X (pandas dataframe): data for the predictors to which we try to find importance of features when performing
        prediction
        
        y (pandas dataframe, pandas serie): output to which we try to find importance of features when performing prediction
        
        output_path (str): the folder for saving results. It is concatenated as is, so it has to end 
        with a path separator. Per-method results use 
        output_path + method + '/' + method + '_permutation_feature_importance_' + file_name
        
        file_name (str): suffix identifying this run in the names of the saved files
                      
        models ( list of models e.g. SKlearn models): a bunch of trained models, in the same order as methods.
        When None or empty, every method is built and fitted on (X_train, y_train)
        
        
        Output:
        ---------------------------
        feature_imp (pandas dataframe): one row per method with the feature importance for all variables, 
        plus a 'Method' column
        
        Raises:
        ---------------------------
        ValueError: if models is provided but has fewer entries than methods
    '''
    
    # No models means "train every method": passing None lets Permutation_feature_importance fit it
    if models:
        if len(models) < len(methods):
            raise ValueError('models has ' + str(len(models)) + ' entries but methods has ' + str(len(methods)))
        fitted_models = list(models)
    else:
        fitted_models = [None] * len(methods)
    
    # Collect the rows first and build the frame once (DataFrame.append no longer exists in pandas 2)
    rows = []
    for name, fitted_model in zip(methods, fitted_models):
        method_path = output_path + name + '/' + name + '_permutation_feature_importance_' + file_name
        rows.append(Permutation_feature_importance(name, X_train, y_train, X, y, method_path, fitted_model))
    
    # Training columns come first; any other column returned by a method is kept after them
    train_columns = list(X_train.columns)
    feature_imp = pd.DataFrame(rows).reset_index(drop=True)
    other_columns = [col for col in feature_imp.columns if col not in train_columns]
    feature_imp = feature_imp.reindex(columns=train_columns + other_columns)
    
    feature_imp['Method'] = methods
    feature_imp.to_csv(output_path + 'permutation_feature_importance_' + file_name + '.csv', sep=',', index=False)
        
    return feature_imp

# SHAP

In [14]:
def Compute_shap_values(method, X_train, y_train , X, y, file_path, model = None, background_size = 100):
    
    ''' This function computes shapely values for all features and observation employing SHAP algorithm
        
        Input:
        --------------------------
        method (str): name of the method. It selects the explainer: 'GAM' uses the Additive explainer, 
        the 'LR*' methods use the Linear explainer, 'XGBoost' and 'RF' use shap.Explainer, and every other 
        method uses KernelExplainer
        
        X_train (pandas dataframe): data for training methods. It is also the data from which the 
        background sample of the explainers is drawn
        
        y_train (pandas dataframe, pandas serie): output for training methods
        
        X (pandas dataframe): data for the predictors to which we try to find importance of features when performing prediction
        
        y (pandas dataframe, pandas serie): output to which we try to find importance of features when 
        performing prediction (not used by this function)
        
        file_path (str): the path for saving results. The values are written to file_path + '_shap_values.txt' 
        and file_path + '_shap_values' (numpy binary format), the summary plot is given file_path + '_summary_plot'
        
        model (e.g. SKlearn models): if provided, it is a trained model. When None, a model is built and 
        fitted on (X_train, y_train) with Build_fit_model
        
        background_size (int): number of rows sampled from X_train as background data (default 100)
        
        Output:
        ---------------------------
        shap_values (numpy ndarray): shapely values for all features and observation
    '''
    
    # `is None` rather than `not model`: some estimators define __len__, so their truth value
    # is not a reliable "was a model passed?" check
    if model is None:
        # Build and fit the model with the proper data
        model = Build_fit_model(method, X_train, y_train)
    
    # make a sample out of the data for faster computations
    background = shap.sample(X_train, background_size)
    
    # There are different explainers w.r.t. every method. For further explanations please see SHAP documentation
    if method == 'GAM':
        explainer = shap.explainers.Additive(model.predict, background)
        shap_values = explainer(X).values
    elif method in ['LR',  'LR_Ridge', 'LR_Lasso', 'LR_Elastic']:
        explainer = shap.explainers.Linear(model, background)
        shap_values = explainer(X).values
    elif method in ['XGBoost', 'RF']:
        explainer = shap.Explainer(model)
        shap_values = explainer(X).values
    else:
        explainer = shap.KernelExplainer(model.predict, background)
        shap_values = explainer.shap_values(X)
    
    # save shap values in two formats
    np.savetxt(file_path + '_shap_values.txt', shap_values)
    np.save(file_path + '_shap_values', shap_values)
    
    # plot the summary of shapley values for all features and observations
    shap_summary_plot(method, shap_values, X, X.columns, file_path + '_summary_plot')
    
    return shap_values

In [15]:
def shap_global_feature_importance(method, X_train, y_train, X, y, file_path, model = None, metric = None):
    
    ''' This function summarises the shapely values of a method into one global importance per feature
        
        Input:
        --------------------------
        method (str): name of the method
        
        X_train (pandas dataframe): data for training methods
        
        y_train (pandas dataframe, pandas serie): output for training methods
        
        X (pandas dataframe): data for the predictors whose feature importance is wanted
        
        y (pandas dataframe, pandas serie): output whose feature importance is wanted
        
        file_path (str): the path for saving results
        
        model (e.g. SKlearn models): if provided, it is a trained model
        
        metric (str): how the absolute shapely values of a feature are summarised over the observations. 
        'mean' takes their mean; any other value (including the default None) takes their maximum
        
        Output:
        ---------------------------
        variable_importances (pandas series): feature importance for all variables
    '''

    # absolute shapely values, one row per observation and one column per feature
    abs_shap = np.abs(Compute_shap_values(method, X_train, y_train, X, y, file_path, model))
    
    # collapse the observation axis with the requested summary
    summarise = abs_shap.mean if metric == 'mean' else abs_shap.max
    variable_importances = pd.Series(summarise(0), index=X.columns)

    single_feature_imp_plot_bar(variable_importances, method, file_path, 'SHAP')
    plt.show()
    return variable_importances

In [16]:
def Apply_shap_global_feature_importance(methods, X_train, y_train, X, y, output_path, file_name,
                                         models = None, metric = None):
    
    ''' This function finds the feature importance employing shap global feature importance for a bunch of methods
        
        Input:
        --------------------------        
        methods (list(str)): A list containing the name of the methods
        
        X_train (pandas dataframe): data for training methods
        
        y_train (pandas dataframe, pandas serie): output for training methods
        
        X (pandas dataframe): data for the predictors to which we try to find importance of features when 
        performing prediction
        
        y (pandas dataframe, pandas serie): output to which we try to find importance of features when 
        performing prediction
        
        output_path (str): the folder for saving results. It is concatenated as is, so it has to end 
        with a path separator. Per-method results use 
        output_path + method + '/' + method + '_shap_feature_importance_' + file_name
        
        file_name (str): suffix identifying this run in the names of the saved files
        
        models (list of models e.g. SKlearn models): a bunch of trained models, in the same order as methods.
        When None or empty, every method is built and fitted on (X_train, y_train)

        metric (str): the strategy to summarize shapely values and find global feature importance 
        (in the current implementation only 'mean' and 'max absolute value (mav)' are considered, default is mav)
        
        Output:
        ---------------------------
        feature_imp (pandas dataframe): one row per method with the feature importance for all variables, 
        plus a 'Method' column
        
        Raises:
        ---------------------------
        ValueError: if models is provided but has fewer entries than methods
    '''
    
    # No models means "train every method": passing None lets shap_global_feature_importance fit it
    if models:
        if len(models) < len(methods):
            raise ValueError('models has ' + str(len(models)) + ' entries but methods has ' + str(len(methods)))
        fitted_models = list(models)
    else:
        fitted_models = [None] * len(methods)
    
    # Collect the rows first and build the frame once (DataFrame.append no longer exists in pandas 2)
    rows = []
    for name, fitted_model in zip(methods, fitted_models):
        method_path = output_path + name + '/' + name + '_shap_feature_importance_' + file_name
        rows.append(shap_global_feature_importance(name, X_train, y_train, X, y, method_path, 
                                                   fitted_model, metric))
    
    # Training columns come first; any other column returned by a method is kept after them
    train_columns = list(X_train.columns)
    feature_imp = pd.DataFrame(rows).reset_index(drop=True)
    other_columns = [col for col in feature_imp.columns if col not in train_columns]
    feature_imp = feature_imp.reindex(columns=train_columns + other_columns)
    
    feature_imp['Method'] = methods
    feature_imp.to_csv(output_path + 'shap_feature_importance_' + file_name + '.csv', sep=',', index=False)
        
    return feature_imp