# Compare feature importances for all models

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.cm import ScalarMappable
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
import sklearn
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import seaborn as sns
import plotly.express as px
from skopt.plots import plot_convergence
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import xgboost.sklearn as xgb
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPClassifier
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize
from itertools import product
from sklearn.utils import resample
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import make_scorer
from tabulate import tabulate
import warnings
import os
from pathlib import Path
from mlxai4cat.utils.data import prepare_dataset, stratified_sampling, resampling 
from mlxai4cat.utils.visualization import get_formatted_results, plot_feature_importance, plot_feature_importance_distribution, custom_palette
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=UserWarning, module="skopt")

In [None]:
# IPython magics: enable the autoreload extension in mode 2, which re-imports
# modules before each cell is executed so that edits to imported code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Directories used throughout the notebook, given relative to the working
# directory: importance tables are read from `storing_path`, and figures are
# written to `figure_path`.
storing_path, figure_path = Path('../results'), Path('../figures')

## Load feature importances of all models and combine in single DataFrame

### With resampling

In [None]:
# CSV file holding each model's importance table (runs with resampling),
# keyed by a short model tag. Insertion order is the order the files are read.
importance_files = {
    'dt': 'DT_feature_imp_with_sklearn_results.csv',
    'rf': 'RF_feature_imp_with_sklearn_results.csv',
    'lr': 'LR_feature_imp_with_sklearn_results.csv',
    'nn': 'sorted_mean_lrp_NN.csv',
    'svm': 'sorted_mean_lrp_SVM.csv',
    'nn_abs': 'sorted_mean_abs_lrp_NN.csv',
    'svm_abs': 'sorted_mean_abs_lrp_SVM.csv',
}

# Read every table and index it by feature name so that the tables can be
# aligned on 'Feature' when they are combined below.
importance_tables = {
    tag: pd.read_csv(os.path.join(storing_path, filename)).set_index('Feature')
    for tag, filename in importance_files.items()
}

df_dt_feature_importance = importance_tables['dt']
df_rf_feature_importance = importance_tables['rf']
df_lr_feature_importance = importance_tables['lr']
df_nn_feature_importance = importance_tables['nn']
df_svm_feature_importance = importance_tables['svm']
df_nn_abs_feature_importance = importance_tables['nn_abs']
df_svm_abs_feature_importance = importance_tables['svm_abs']

# The DT / RF / LR tables are joined side by side, keeping all their columns.
df_feature_importance = pd.concat(
    [df_dt_feature_importance, df_rf_feature_importance, df_lr_feature_importance],
    axis=1,
)

# From the NN / SVM tables only the 'Importance Score' column is taken and
# stored under a model-specific column name (aligned on the feature index).
lrp_columns = {
    'Neural Network': df_nn_feature_importance,
    'Neural Network abs': df_nn_abs_feature_importance,
    'SVM': df_svm_feature_importance,
    'SVM abs': df_svm_abs_feature_importance,
}
for column_label, lrp_table in lrp_columns.items():
    df_feature_importance[column_label] = lrp_table['Importance Score']

# Turn the feature index back into a regular 'Feature' column.
df_feature_importance = df_feature_importance.reset_index()

### Without resampling

In [None]:
# CSV file holding each model's importance table (runs WITHOUT resampling),
# keyed by a short model tag. Insertion order is the order the files are read.
importance_files_nr = {
    'dt': 'DT_feature_imp_with_sklearn_NO_Resampling_results.csv',
    'rf': 'RF_feature_imp_with_sklearn_NO_Resampling_results.csv',
    'lr': 'LR_feature_imp_with_sklearn_NO_Resampling_results.csv',
    'nn': 'sorted_mean_lrp_NN_NO_Resampling.csv',
    'svm': 'sorted_mean_lrp_SVM_NO_Resampling.csv',
    'nn_abs': 'sorted_mean_abs_lrp_NN_NO_Resampling.csv',
    'svm_abs': 'sorted_mean_abs_lrp_SVM_NO_Resampling.csv',
}

# Read every table and index it by feature name so that the tables can be
# aligned on 'Feature' when they are combined below.
importance_tables_nr = {
    tag: pd.read_csv(os.path.join(storing_path, filename)).set_index('Feature')
    for tag, filename in importance_files_nr.items()
}

df_dt_feature_importance_nr = importance_tables_nr['dt']
df_rf_feature_importance_nr = importance_tables_nr['rf']
df_lr_feature_importance_nr = importance_tables_nr['lr']
df_nn_feature_importance_nr = importance_tables_nr['nn']
df_svm_feature_importance_nr = importance_tables_nr['svm']
df_nn_abs_feature_importance_nr = importance_tables_nr['nn_abs']
df_svm_abs_feature_importance_nr = importance_tables_nr['svm_abs']

# The DT / RF / LR tables are joined side by side, keeping all their columns.
df_feature_importance_nr = pd.concat(
    [df_dt_feature_importance_nr, df_rf_feature_importance_nr, df_lr_feature_importance_nr],
    axis=1,
)

# From the NN / SVM tables only the 'Importance Score' column is taken and
# stored under a model-specific column name (aligned on the feature index).
lrp_columns_nr = {
    'Neural Network': df_nn_feature_importance_nr,
    'Neural Network abs': df_nn_abs_feature_importance_nr,
    'SVM': df_svm_feature_importance_nr,
    'SVM abs': df_svm_abs_feature_importance_nr,
}
for column_label, lrp_table in lrp_columns_nr.items():
    df_feature_importance_nr[column_label] = lrp_table['Importance Score']

# Turn the feature index back into a regular 'Feature' column.
df_feature_importance_nr = df_feature_importance_nr.reset_index()

## Comparison of feature importances between the various tree based models

In [None]:
# Keep only the tree-based model columns plus the feature name.
selected_models = ['Decision tree', 'DT prepruned', 'DT postpruned', 'Random forest', 'XGBoost', 'Feature']
df_feature_importance_selected = df_feature_importance.loc[:, selected_models]

# Reshape to long format (one row per feature/model pair), which is the
# layout seaborn needs to draw one bar per model within each feature group.
df_melted = df_feature_importance_selected.melt(
    id_vars='Feature', var_name='Model', value_name='Importance'
)

# Grouped bar plot: features on the x axis, one coloured bar per model.
plt.figure(figsize=(20, 8))
ax = sns.barplot(data=df_melted, x='Feature', y='Importance', hue='Model', palette='Set2')

# Titles, axis labels, legend and horizontal guide lines.
ax.set_title('Feature Importance Comparison')
ax.set_xlabel('Feature')
ax.set_ylabel('Feature Importance')
ax.legend(title='Model')
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

In [None]:
def normalize_column(df, column_name, range_min=0, range_max=1):
    """
    Min-max scale one column of a DataFrame to the range [range_min, range_max].

    The column minimum is mapped to `range_min` and the column maximum to
    `range_max`. The column is overwritten in place, so the DataFrame passed
    in is modified and the same object is returned.

    A constant column (maximum equal to minimum) has no spread to rescale;
    every value in it is mapped to `range_min` instead of becoming NaN through
    a 0/0 division. Missing values stay missing.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the column to be normalized.
    - column_name (str): The name of the column to be normalized.
    - range_min (float): The minimum value of the normalized range.
    - range_max (float): The maximum value of the normalized range.

    Returns:
    - pd.DataFrame: The same DataFrame object, with the column normalized.
    """
    column = df[column_name]
    min_val = column.min()
    span = column.max() - min_val
    if span == 0:
        # Constant column: divide by 1 so that (value - min) / span is 0 and
        # every entry lands on range_min rather than on NaN.
        span = 1
    df[column_name] = (column - min_val) / span * (range_max - range_min) + range_min
    return df


## Normalizing SVM and Neural Network values

In [None]:
# Target range for each column that gets min-max scaled: the signed scores go
# to [-1, 1], the absolute scores to [0, 1].
# NOTE: min-max scaling pins the column minimum and maximum to the range ends,
# so for the signed columns an original score of 0 does not generally stay 0.
normalization_ranges = {
    'SVM': (-1, 1),
    'Neural Network': (-1, 1),
    'SVM abs': (0, 1),
    'Neural Network abs': (0, 1),
}
for column_label, (lower, upper) in normalization_ranges.items():
    df_feature_importance = normalize_column(
        df_feature_importance, column_label, range_min=lower, range_max=upper
    )

## Comparison of feature importances between all models on a barplot

In [None]:
# Reshape to long format (one row per feature/model pair), which is the
# layout seaborn needs to draw one bar per model within each feature group.
df_melted = df_feature_importance.melt(
    id_vars='Feature', var_name='Model', value_name='Importance'
)

# Grouped bar plot: features on the x axis, one coloured bar per model column.
plt.figure(figsize=(14, 6))
ax = sns.barplot(data=df_melted, x='Feature', y='Importance', hue='Model', palette='Dark2')

# Title, axis labels and tick formatting.
ax.set_title('Feature Importance Comparison of all ML models', fontsize=14)
ax.set_xlabel('Feature', fontsize=16)
ax.set_ylabel('Feature Importance', fontsize=16)
plt.xticks(rotation=45, fontsize=14)
plt.yticks(fontsize=14)

# The legend is anchored to the right of the axes (x anchor of 1.25 is past the axes edge).
ax.legend(title='Model', fontsize=12, title_fontsize=12, loc='upper right', bbox_to_anchor=(1.25, 0.85))
ax.grid(axis='y', linestyle='--', alpha=0.9)

# The figure is only displayed here; it is not written to disk.
plt.show()

## Average feature importances across all tree-based models

In [None]:
selected_tree_models = ['Decision tree', 'DT prepruned', 'DT postpruned', 'Random forest', 'XGBoost']
tree_scores = df_feature_importance[selected_tree_models]

# Work on a copy so the combined importance table itself is left untouched.
df_feature_analysis = df_feature_importance.copy()

# Row-wise (per-feature) mean and standard deviation over the tree-based models.
df_feature_analysis['AverageImportance_trees'] = tree_scores.mean(axis=1)
df_feature_analysis['StdImportance_trees'] = tree_scores.std(axis=1)

# Order the features by their mean importance, largest first.
df_sorted_selected = df_feature_analysis.sort_values(by='AverageImportance_trees', ascending=False)

# Bar colours are derived from the mean importances by the project helper.
colors = custom_palette(df_sorted_selected['AverageImportance_trees'])

# Horizontal bars of the mean importance, with the standard deviation drawn as error bars.
fig, ax = plt.subplots(figsize=(8, 16))
ax.barh(
    y=df_sorted_selected['Feature'],
    width=df_sorted_selected['AverageImportance_trees'],
    xerr=df_sorted_selected['StdImportance_trees'],
    capsize=5,
    color=colors,
)

# Axis labels and tick label size.
ax.set_xlabel('Importance Score', fontsize=25)
ax.set_ylabel('Features', fontsize=25)
ax.tick_params(axis='both', which='major', labelsize=20)

# Write the figure to disk, then display it.
fig.savefig(os.path.join(figure_path, "Average_Feature_Importance_trees_Models.png"), dpi=300)
plt.show()


## Average feature importances across random forest, logistic regression, neural networks and SVM

In [None]:
# Work on a copy so the combined importance table itself is left untouched.
df_feature_analysis = df_feature_importance.copy()

selected_models = ['Random forest', 'Logistic regression', 'SVM abs', 'Neural Network abs']

# Row-wise (per-feature) mean and standard deviation over the selected models.
df_feature_analysis['AverageImportance_selected'] = df_feature_importance[selected_models].mean(axis=1)
df_feature_analysis['StdImportance_selected'] = df_feature_importance[selected_models].std(axis=1)

# Order the features by 'AverageImportance_selected', largest first.
# (This sort used to be executed twice in a row; once is sufficient.)
df_sorted_selected = df_feature_analysis.sort_values(by='AverageImportance_selected', ascending=False)

# Bar colours are derived from the mean importances by the project helper.
colors = custom_palette(df_sorted_selected['AverageImportance_selected'])

# Horizontal bars of the mean importance, with the standard deviation drawn as error bars.
fig, ax = plt.subplots(figsize=(8, 16))
ax.barh(df_sorted_selected['Feature'], df_sorted_selected['AverageImportance_selected'], xerr=df_sorted_selected['StdImportance_selected'], capsize=5, color=colors)

# Axis labels and tick label size.
ax.set_xlabel('Importance Score', fontsize=25)
ax.set_ylabel('Features', fontsize=25)
ax.tick_params(axis='both', which='major', labelsize=20)

# Write the figure (transparent background, tight bounding box) to disk, then display it.
plt.savefig(os.path.join(figure_path, "Mean_Feature_Importance_with_Error_Bars_selected_horizontal.png"), dpi=300, facecolor=(1,1,1,0), bbox_inches='tight')
plt.show()


## Fisher Z correlation transformation

In [None]:
# Apply Fisher Z transformation
def fisher_z_transform(r):
    """
    Apply the Fisher z-transformation, z = 0.5 * ln((1 + r) / (1 - r)) = arctanh(r).

    Parameters:
    - r (float or np.ndarray): Correlation coefficient(s). Values of exactly
      1 and -1 map to +inf and -inf; values outside [-1, 1] yield NaN.

    Returns:
    - float or np.ndarray: The transformed value(s), same shape as the input.
    """
    # arctanh is the closed form of the log expression and works element-wise,
    # so arrays are accepted as well as scalars. The divide-by-zero warning
    # NumPy emits at r = +/-1 is silenced so those inputs quietly give +/-inf.
    with np.errstate(divide='ignore'):
        return np.arctanh(r)
        
def inverse_fisher_z_transform(z):
    """
    Invert the Fisher z-transformation, r = (e^(2z) - 1) / (e^(2z) + 1) = tanh(z).

    Parameters:
    - z (float or np.ndarray): Fisher z value(s); +inf and -inf are allowed
      and map to 1.0 and -1.0.

    Returns:
    - float or np.ndarray: The correlation coefficient(s) in [-1, 1], same
      shape as the input.
    """
    # tanh is used instead of the explicit exponential quotient because
    # exp(2*z) overflows to inf for large finite z, turning the quotient into
    # inf / inf = NaN; tanh saturates cleanly at +/-1 and works element-wise.
    return np.tanh(z)

Correlation analysis between absolute feature importances

In [None]:
# Columns compared in the correlation analysis: the tree-based models,
# logistic regression, and the absolute-value SVM / neural-network scores.
model_columns = ['Decision tree', 'DT prepruned', 'DT postpruned', 'Random forest', 'XGBoost', 'Logistic regression', 'SVM abs', 'Neural Network abs']
df_models_only = df_feature_importance[model_columns]

# Pearson correlation coefficient between every pair of model columns.
correlation_matrix = df_models_only.corr()

# Zero the diagonal (r = 1 there would make the Fisher transform infinite).
# This is done on an explicit writable copy: writing through `.values` fails
# when pandas hands out a read-only array (Copy-on-Write).
corr_values = correlation_matrix.to_numpy(copy=True)
np.fill_diagonal(corr_values, 0)
correlation_matrix = pd.DataFrame(
    corr_values, index=correlation_matrix.index, columns=correlation_matrix.columns
)

# Send each off-diagonal coefficient through the Fisher z-transform and back.
# NOTE: only one coefficient exists per model pair here, so no averaging takes
# place and the round trip returns the coefficient itself (up to rounding).
average_correlation_coefficients = {
    (model1, model2): inverse_fisher_z_transform(
        fisher_z_transform(correlation_matrix.loc[model1, model2])
    )
    for model1 in model_columns
    for model2 in model_columns
    if model1 != model2
}

# Square array for plotting: ones on the diagonal, pairwise values elsewhere.
matrix_size = len(model_columns)
correlation_matrix_matrix = np.eye(matrix_size)
for i, model1 in enumerate(model_columns):
    for j, model2 in enumerate(model_columns):
        if i != j:
            correlation_matrix_matrix[i, j] = average_correlation_coefficients[(model1, model2)]

# Heatmap of the pairwise correlations.
plt.figure(figsize=(15, 15))
# Display labels, listed in the same order as `model_columns`.
ticks = ['Decision Tree', 'DT prepruned', 'DT postpruned', 'Random Forest', 'XGBoost', 'Logistic Regression', 'SVM', 'Neural Networks']
heatmap = sns.heatmap(correlation_matrix_matrix, annot=True, fmt=".2f", cmap="coolwarm", xticklabels=ticks, yticklabels=ticks, annot_kws={"size": 16, "color": 'black'}, vmin=0)
plt.xticks(rotation=90, fontsize=16)
plt.yticks(rotation=0, fontsize=16)
plt.xlabel("Models", fontsize=20)
plt.ylabel("Models", fontsize=20)

# Enlarge the tick labels of the colorbar that seaborn attached to the heatmap.
cbar = heatmap.collections[0].colorbar
cbar.ax.tick_params(labelsize=16)

# Write the figure (transparent background, tight bounding box) to disk, then display it.
plt.savefig(os.path.join(figure_path, "Average_corr_coeff_for_all_models.png"), facecolor=(1,1,1,0), bbox_inches='tight', dpi=300)
plt.show()

