# Compare classification performance for all models

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.cm import ScalarMappable
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
import sklearn
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import seaborn as sns
import plotly.express as px
from skopt.plots import plot_convergence
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import xgboost.sklearn as xgb
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPClassifier
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize
from itertools import product
from sklearn.utils import resample
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import make_scorer
from tabulate import tabulate
import warnings
import os
from pathlib import Path
from mlxai4cat.utils.data import prepare_dataset, stratified_sampling, resampling 
from mlxai4cat.utils.visualization import get_formatted_results, plot_feature_importance, plot_feature_importance_distribution
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides actionable library warnings
# (deprecations, pandas chained-assignment warnings, ...); consider narrowing it.
warnings.filterwarnings("ignore")
# Suppress UserWarnings raised from skopt modules (already covered by the
# blanket filter above, so this line is redundant while that one is present).
warnings.filterwarnings("ignore", category=UserWarning, module="skopt")

In [None]:
# IPython magics (only valid inside a notebook / IPython session):
# load the autoreload extension and switch it to mode 2, which reloads
# imported modules before each cell is executed so that edits to project
# code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Directories, relative to the notebook's working directory, used by the cells
# below: metric CSVs are read from / written to `storing_path`, figures are
# written to `figure_path`.
storing_path, figure_path = Path('../results'), Path('../figures')

## Load performance metrics of all models and combine in single DataFrame

### With resampling

In [None]:
# Load the per-model metric tables produced WITH resampling from
# `storing_path`. Each frame keeps its own name so later cells can still
# refer to a single model family.
df_dt_metrics = pd.read_csv(storing_path / 'DT_metrics_results.csv')

df_rf_metrics = pd.read_csv(storing_path / 'RF_metrics_results.csv')

df_nn_metrics = pd.read_csv(storing_path / 'NN_metrics_results.csv')

df_lr_metrics = pd.read_csv(storing_path / 'LR_metrics_results.csv')

df_svm_metrics = pd.read_csv(storing_path / 'SVM_metrics_results.csv')

# Stack all model families into one table. ignore_index=True gives the result
# a fresh 0..n-1 index; without it every CSV contributes its own 0-based row
# labels, so the combined frame would carry duplicate index labels, which makes
# label-based selection ambiguous and can break index alignment.
df_metrics = pd.concat(
    [df_dt_metrics, df_rf_metrics, df_nn_metrics, df_lr_metrics, df_svm_metrics],
    ignore_index=True,
)

### Without resampling

In [None]:
# Load the per-model metric tables produced WITHOUT resampling from
# `storing_path`. Each frame keeps its own name so later cells can still
# refer to a single model family.
df_dt_metrics_nr = pd.read_csv(storing_path / 'DT_metrics_NO_Resampling_results.csv')

df_rf_metrics_nr = pd.read_csv(storing_path / 'RF_metrics_NO_Resampling_results.csv')

df_nn_metrics_nr = pd.read_csv(storing_path / 'NN_metrics_NO_Resampling_results.csv')

df_lr_metrics_nr = pd.read_csv(storing_path / 'LR_metrics_NO_Resampling_results.csv')

df_svm_metrics_nr = pd.read_csv(storing_path / 'SVM_metrics_NO_Resampling_results.csv')

# Stack all model families into one table. ignore_index=True gives the result
# a fresh 0..n-1 index; without it every CSV contributes its own 0-based row
# labels, so the combined frame would carry duplicate index labels, which makes
# label-based selection ambiguous and can break index alignment.
df_metrics_nr = pd.concat(
    [df_dt_metrics_nr, df_rf_metrics_nr, df_nn_metrics_nr, df_lr_metrics_nr, df_svm_metrics_nr],
    ignore_index=True,
)

## Round the metrics in df_metrics and save them to CSV for export

In [None]:
# Export a compact accuracy / F1 summary (models trained with resampling).
summary_cols = ['Accuracy_Mean', 'Accuracy_Std', 'F1_Mean', 'F1_Std']

# Work on an explicit copy: assigning into a bare column selection of
# df_metrics is the chained-assignment pattern pandas flags with
# SettingWithCopyWarning. The copy makes it unambiguous that only the summary
# table is modified and df_metrics keeps its unrounded values.
df_metrics_accandF1 = df_metrics[['Model'] + summary_cols].copy()

# Round the numeric columns to two decimal places
df_metrics_accandF1[summary_cols] = df_metrics_accandF1[summary_cols].round(2)

# Save the new DataFrame to a CSV file (row index is not written)
file_path = os.path.join(storing_path, 'Model_Comparison_accuracyandF1.csv')
df_metrics_accandF1.to_csv(file_path, index=False)


In [None]:
# Export the complete metric table (with resampling), rounded to two decimals.
# A copy is used so df_metrics itself keeps the unrounded values.
df_metrics_complete = df_metrics.copy()

# Every column after the first is treated as a metric column; this assumes the
# first column is the non-numeric model name.
numeric_cols = df_metrics_complete.columns[1:]

# Cast the metric columns to float and round them to two decimal places
df_metrics_complete[numeric_cols] = (
    df_metrics_complete[numeric_cols].astype(float).round(2)
)

# Show the rounded table in the cell output
print(df_metrics_complete)

# Write the rounded table to the results directory (row index is not written)
file_path = os.path.join(storing_path, 'data_for_model_comparision_complete.csv')
df_metrics_complete.to_csv(file_path, index=False)

## Plot the performance results for models with resampling

In [None]:

# Grouped bar chart of Accuracy / F1 / Precision / Recall (mean, with the
# standard deviation as error bar) for the selected models trained with
# resampling. The figure is saved to `figure_path` and then shown.

# Models to plot, in the left-to-right order they should appear on the x-axis
selected_models = ['Decision Tree','DT prepruned', 'DT postpruned', 'Random forest', 'XGBoost', 'Logistic regression','SVM', 'Neural Networks']

# Tick-label spelling for models whose stored name differs from the one wanted
# on the figure; any model not listed here is labelled with its stored name
display_names = {'Random forest': 'Random Forest', 'Logistic regression': 'Logistic Regression'}

# Filter the dataframe to include only the selected models
df_selected_metrics = df_metrics[df_metrics['Model'].isin(selected_models)]

# isin() keeps the row order of df_metrics (the order in which the CSVs were
# concatenated), not the order of selected_models. Reorder the rows explicitly
# so that each group of bars is drawn in the intended position; the stable sort
# keeps the original relative order of rows that share a model name.
model_rank = df_selected_metrics['Model'].map(
    {name: position for position, name in enumerate(selected_models)}
)
df_selected_metrics = df_selected_metrics.iloc[np.argsort(model_rank.values, kind='stable')]

# Define colors for each metric
colors = ['#c0c0c0', '#3cb371', '#ffc1a1', '#b0c4de']

# Extract data for plotting
models = df_selected_metrics['Model']
metrics = ['Accuracy', 'F1', 'Precision', 'Recall']
mean_columns = [f'{metric}_Mean' for metric in metrics]
std_columns = [f'{metric}_Std' for metric in metrics]
mean_values = df_selected_metrics[mean_columns].values
std_values = df_selected_metrics[std_columns].values

# Plotting
bar_width = 0.2
opacity = 0.8
index = np.arange(len(models))

fig, ax = plt.subplots(figsize=(16, 8))

# One bar series per metric, each shifted right by one bar width inside a group
for i, metric in enumerate(metrics):
    mean_data = mean_values[:, i]
    std_data = std_values[:, i]
    ax.bar(index + i * bar_width, mean_data, bar_width,
           alpha=opacity, label=f'{metric}_Mean', yerr=std_data, capsize=5, color=colors[i])

# Adjust font sizes and labels for selected models
fontsize = 14
ax.set_xlabel('Models', fontsize=16)
ax.set_ylabel('Performance Metrics', fontsize=16)
# Centre each tick under its group of bars
ax.set_xticks(index + (bar_width * (len(metrics) - 1)) / 2)
# Derive the labels from the rows that are actually plotted instead of a
# hard-coded list, so labels cannot drift out of step with the bars and the
# label count always equals the tick count (even if a model is missing).
ax.set_xticklabels([display_names.get(name, name) for name in models], fontsize=fontsize)
plt.yticks(fontsize=14)
ax.legend(fontsize=fontsize)

plt.tight_layout()

# Save the figure (facecolor alpha 0 -> transparent background)
fig.savefig(os.path.join(figure_path, 'Model_Performance_Comparision_with_Resampling_new.png'), facecolor=(1,1,1,0), bbox_inches='tight')

# Show the plot
plt.show()


## Plot the performance results for models with no resampling

In [None]:
# Grouped bar chart of the four metrics (mean, with the standard deviation as
# error bar) for every model trained without resampling. The figure is saved
# to `figure_path` and then shown.
metrics = ['Accuracy', 'F1', 'Precision', 'Recall']
colors = ['#c0c0c0', '#3cb371', '#ffc1a1', '#b0c4de']
models = df_metrics_nr['Model']

# Metric columns follow the '<metric>_Mean' / '<metric>_Std' naming pattern
mean_columns = [f'{metric}_Mean' for metric in metrics]
std_columns = [f'{metric}_Std' for metric in metrics]
mean_values = df_metrics_nr[mean_columns].values
std_values = df_metrics_nr[std_columns].values

# Bar geometry: one group per model, one bar per metric inside the group
bar_width, opacity = 0.2, 0.8
index = np.arange(len(models))

fig, ax = plt.subplots(figsize=(20, 8))

# Column i of mean_values / std_values belongs to metrics[i]; each series is
# shifted right by i bar widths so the bars of one model sit side by side
for i, (metric, bar_color) in enumerate(zip(metrics, colors)):
    ax.bar(index + i * bar_width, mean_values[:, i], bar_width,
           alpha=opacity, label=f'{metric}_Mean', yerr=std_values[:, i],
           capsize=5, color=bar_color)

ax.set_title('Comparison of Model Performance Metrics without resampling')
ax.set_ylabel('Performance Metrics')
ax.set_xlabel('Models')
# Centre each tick under its group of bars and label it with the model name
ax.set_xticks(index + (bar_width * (len(metrics) - 1)) / 2)
ax.set_xticklabels(models)
ax.legend()

plt.tight_layout()
fig.savefig(
    os.path.join(figure_path, 'Model_Performance_Comparision_without_Resampling_new.png'),
    facecolor=(1,1,1,0),
    bbox_inches='tight',
)

plt.show()


In [None]:
# Export a compact accuracy / F1 summary (models trained without resampling).
summary_cols = ['Accuracy_Mean', 'Accuracy_Std', 'F1_Mean', 'F1_Std']

# Work on an explicit copy: assigning into a bare column selection of
# df_metrics_nr is the chained-assignment pattern pandas flags with
# SettingWithCopyWarning. The copy makes it unambiguous that only the summary
# table is modified and df_metrics_nr keeps its unrounded values.
df_metrics_accandF1_nr = df_metrics_nr[['Model'] + summary_cols].copy()

# Round the numeric columns to two decimal places
df_metrics_accandF1_nr[summary_cols] = df_metrics_accandF1_nr[summary_cols].round(2)

# Save the new DataFrame to a CSV file (row index is not written)
file_path = os.path.join(storing_path, 'Model_Comparison_accuracyandF1_nr_new.csv')
df_metrics_accandF1_nr.to_csv(file_path, index=False)

In [None]:
# Give every metric column of the no-resampling table an '_nr' suffix so it
# stays distinguishable from the same-named column of df_metrics after merging
nr_column_names = {
    column: f'{column}_nr'
    for column in (
        'Accuracy_Mean', 'Accuracy_Std',
        'F1_Mean', 'F1_Std',
        'Precision_Mean', 'Precision_Std',
        'Recall_Mean', 'Recall_Std',
    )
}
df_metrics_nr_renamed = df_metrics_nr.rename(columns=nr_column_names)

# Join the two tables on 'Model'. pd.merge defaults to an inner join, so only
# models present in both tables end up in merged_df.
merged_df = pd.merge(df_metrics, df_metrics_nr_renamed, on='Model')


In [None]:
# 2x2 grid of bar charts, one per metric, showing for every model the
# difference (mean with resampling) - (mean without resampling). The figure is
# saved to `figure_path` and then shown.

# Define the metrics and the colour used for each metric's panel
metrics = ['Accuracy', 'F1', 'Precision', 'Recall']
colors = ['#c0c0c0', '#3cb371', '#ffc1a1', '#b0c4de']

# Create one subplot per metric; flatten the 2x2 axes array so it can be
# walked in step with `metrics`
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
axes = axes.flatten()

# Model names and bar positions are the same for every panel, so compute them
# once instead of on every loop iteration
models = merged_df['Model']
index = np.arange(len(models))

for panel, metric, panel_color in zip(axes, metrics, colors):
    # Positive values mean the metric is higher with resampling than without.
    # Only the means are compared; no error bars are drawn for the difference.
    mean_diff = merged_df[f'{metric}_Mean'] - merged_df[f'{metric}_Mean_nr']

    panel.bar(index, mean_diff, alpha=0.7, label=f'{metric} Difference', color=panel_color)

    panel.set_xlabel('Models')
    panel.set_ylabel(f'{metric} Difference')
    panel.set_title(f'Difference in {metric} Mean with and without Resampling')
    panel.set_xticks(index)
    panel.set_xticklabels(models, rotation=45, ha='right')  # Rotate x-axis labels for better visibility
    panel.legend()

    # Add grid lines for better readability
    panel.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()

# Save at 300 dpi (facecolor alpha 0 -> transparent background)
fig.savefig(os.path.join(figure_path, 'Difference_between_Metrics_with_and_without_Resampling.png'), facecolor=(1,1,1,0), bbox_inches='tight', dpi=300)
plt.show()
