# Evaluate ML framework for SVM and Logistic regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.cm import ScalarMappable
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
import sklearn
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import seaborn as sns
import plotly.express as px
from skopt.plots import plot_convergence
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import xgboost.sklearn as xgb
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize
from itertools import product
from sklearn.utils import resample
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import make_scorer
from tabulate import tabulate
import warnings
import os
from pathlib import Path

from mlxai4cat.utils.data import prepare_dataset, stratified_sampling, resampling 
from mlxai4cat.utils.visualization import get_formatted_results, plot_feature_importance, plot_feature_importance_distribution
from mlxai4cat.models.neuralized_svm import NeuralizedSVM
from mlxai4cat.utils.LRP_tools import LRPAnalyzer
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=UserWarning, module="skopt")

In [None]:
# IPython autoreload extension, mode 2: imported modules are reloaded before
# each cell runs, so edits to local packages are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Output locations, relative to the notebook's working directory:
# tabular results (CSV) and rendered figures.
_parent_dir = Path('..')
storing_path = _parent_dir / 'results'
figure_path = _parent_dir / 'figures'

## Import data

In [None]:
# Load the dataset. The first return value is not used in this notebook.
# X_pos / X_neg (with y_pos / y_neg) are presumably the per-class subsets of
# X / y -- confirm against prepare_dataset.
(
    _,
    X,
    y,
    X_pos,
    y_pos,
    X_neg,
    y_neg,
    feature_names,
) = prepare_dataset('../data/ocm_cat_data.csv')

## Logistic regression

In [None]:
# Repeated hold-out evaluation of logistic regression: n random stratified
# splits, with the training part of each split resampled before fitting.
n = 100
acc_logit, precision_logit, recall_logit, f1_logit = [], [], [], []

# One vector of absolute coefficients per split, used as feature importances
feature_importances_logit = []

for rs in range(n):
    # A distinct but reproducible seed for every split
    X_train, y_train, X_test, y_test = stratified_sampling(
        X_pos, X_neg, y_pos, y_neg, rs * 1234 + 567
    )
    # Only the training data is resampled; the test data is left untouched
    X_train, y_train = resampling(
        X_train, y_train, overratio=0.6, underratio=1, randomstate=123
    )

    clf_logit = LogisticRegression(random_state=0)
    clf_logit.fit(X_train, y_train)
    y_pred = clf_logit.predict(X_test)

    # Test-set scores for this split
    acc_logit.append(accuracy_score(y_test, y_pred))
    precision_logit.append(precision_score(y_test, y_pred, zero_division=1))
    recall_logit.append(recall_score(y_test, y_pred))
    f1_logit.append(f1_score(y_test, y_pred))

    # coef_[0] is the single coefficient row of the fitted model
    feature_importances_logit.append(np.abs(clf_logit.coef_[0]))

### Display different performance metrics

In [None]:
# Collect the per-split scores of the resampled logistic-regression runs into
# a results table (starting a new table: df_metrics=None) and display it.
df_metrics = get_formatted_results(
    acc_logit,
    f1_logit,
    precision_logit,
    recall_logit,
    model_name="Logistic regression",
    verbose=True,
    df_metrics=None,
)
df_metrics

In [None]:
# Plot the logistic-regression feature importances gathered over all splits;
# the returned frame is written to CSV in a later cell.
df_feature_importance = plot_feature_importance(
    feature_importances_logit,
    feature_names,
    model_name="Logistic regression",
    df_feature_importance=None,
    savedir=figure_path,
)

In [None]:
# Distribution of the per-split feature importances for the resampled
# logistic-regression runs. The output directory is the shared `figure_path`
# (rather than a hard-coded copy of it), so changing the figure location in
# one place affects every plot of this notebook consistently.
plot_feature_importance_distribution(
    feature_importances_logit,
    feature_names,
    "Logistic Regression",
    color='gray',
    savedir=figure_path,
)

## Logistic regression without resampling

In [None]:
# Same repeated hold-out evaluation of logistic regression as in the resampled
# run, but the training data of each split is used as-is (no resampling).
n = 100
acc_logit_nr, precision_logit_nr, recall_logit_nr, f1_logit_nr = [], [], [], []

# One vector of absolute coefficients per split, used as feature importances
feature_importances_logit_nr = []

for rs in range(n):
    # Same seed formula as the resampled run
    X_train, y_train, X_test, y_test = stratified_sampling(
        X_pos, X_neg, y_pos, y_neg, rs * 1234 + 567
    )

    clf_logit_nr = LogisticRegression(random_state=0)
    clf_logit_nr.fit(X_train, y_train)
    y_pred_nr = clf_logit_nr.predict(X_test)

    # Test-set scores for this split
    acc_logit_nr.append(accuracy_score(y_test, y_pred_nr))
    precision_logit_nr.append(precision_score(y_test, y_pred_nr, zero_division=1))
    recall_logit_nr.append(recall_score(y_test, y_pred_nr))
    f1_logit_nr.append(f1_score(y_test, y_pred_nr))

    # coef_[0] is the single coefficient row of the fitted model
    feature_importances_logit_nr.append(np.abs(clf_logit_nr.coef_[0]))


### Display different performance metrics

In [None]:
# Results table for the logistic-regression runs without resampling
# (a new table is started: df_metrics=None); displayed as the cell output.
df_metrics_nr = get_formatted_results(
    acc_logit_nr,
    f1_logit_nr,
    precision_logit_nr,
    recall_logit_nr,
    model_name="Logistic regression",
    verbose=True,
    df_metrics=None,
)
df_metrics_nr

In [None]:
# Feature-importance plot for the logistic-regression runs without resampling.
# NOTE(review): model_name and savedir are identical to the resampled plot; if
# plot_feature_importance derives the output file name from model_name, this
# call would overwrite the earlier figure -- confirm against the helper.
df_feature_importance_nr = plot_feature_importance(
    feature_importances_logit_nr,
    feature_names,
    model_name="Logistic regression",
    df_feature_importance=None,
    savedir=figure_path,
)
df_feature_importance_nr

In [None]:
# Distribution of the per-split feature importances for the logistic-regression
# runs trained without resampling.
plot_feature_importance_distribution(
    feature_importances_logit_nr,
    feature_names,
    "Logistic Regression (no resampling)",
    color='gray',
    savedir=figure_path,
)

### Save logistic regression models evaluation and importance score results

In [None]:
## SAVING ANALYSIS RESULTS
# Each logistic-regression result frame is written to its own CSV file under
# storing_path, without the index column.
for _csv_name, _frame in (
    ('LR_metrics_results.csv', df_metrics),
    ('LR_metrics_NO_Resampling_results.csv', df_metrics_nr),
    ('LR_feature_imp_with_sklearn_results.csv', df_feature_importance),
    ('LR_feature_imp_with_sklearn_NO_Resampling_results.csv', df_feature_importance_nr),
):
    _frame.to_csv(os.path.join(storing_path, _csv_name), index=False)


## SVM with resampling

### Training and nested cross-validation

In [None]:
# Repeated hold-out evaluation of an SVM trained on resampled data.
# For every random split, the hyper-parameters are tuned with gp_minimize on
# the training part only (objective: negative mean 5-fold CV F1). The tuned
# model is then refitted on the training part, scored on the held-out test
# part, and explained with NeuralizedSVM.
n = 100
acc_svm = []
precision_svm = []
recall_svm = []
f1_svm = []
kernels = []
Cs = []
gammas = []
feature_importances_svm = []  # declared for parity with the LR cells; not filled in this cell
R_svr_accumulated_all = []

# Loop through different random splits
for rs in range(n):
    X_train, y_train, X_test, y_test = stratified_sampling(X_pos, X_neg, y_pos, y_neg, rs * 1234 + 567)
    X_train, y_train = resampling(X_train, y_train, overratio=0.6, underratio=1, randomstate=123)

    clf_svm = SVC(random_state=0)

    # Cache of objective values for THIS split only. It must be reset for each
    # split: the key holds only the hyper-parameters, while the score depends
    # on the split's training data. gp_minimize is seeded identically for every
    # split, so it revisits the same points; a cache shared across splits would
    # return scores computed on an earlier split's data instead of re-evaluating.
    evaluated_points = {}

    # Search space: the kernel is fixed to 'rbf'; C is always tuned and gamma
    # is tuned only when the rbf kernel is part of the space.
    space = [Categorical(['rbf'], name='kernel')]
    uses_rbf = 'rbf' in space[0]
    space.append(Real(10**-2, 10**2, "log-uniform", name='C'))
    if uses_rbf:
        space.append(Real(10**-4, 10**1, "log-uniform", name='gamma'))

    @use_named_args(space)
    def objective(**params):
        """Return the negative mean 5-fold CV F1 score for ``params`` on the current split."""
        params_key = tuple(sorted(params.items()))

        # Reuse the score if this exact point was already evaluated for this split
        if params_key in evaluated_points:
            return evaluated_points[params_key]

        # The kernel is passed explicitly; the remaining entries are C (and gamma)
        kernel = params.pop('kernel')
        clf_svm.set_params(kernel=kernel, **params)

        # gp_minimize minimises, hence the sign flip on the F1 score
        score = -np.mean(cross_val_score(clf_svm, X_train, y_train, cv=5, n_jobs=8, scoring="f1"))

        evaluated_points[params_key] = score
        return score

    res_gp = gp_minimize(objective, space, n_calls=30, random_state=0)

    # res_gp.x follows the order of `space`: kernel, C and (rbf only) gamma.
    # Without a tuned gamma, sklearn's 'auto' setting is used instead.
    best_kernel, best_C = res_gp.x[0], res_gp.x[1]
    best_gamma = res_gp.x[2] if uses_rbf else 'auto'

    clf_svm_optim = SVC(random_state=0, kernel=best_kernel, C=best_C, gamma=best_gamma)
    Cs.append(best_C)
    gammas.append(best_gamma)
    kernels.append(best_kernel)

    # Refit with the selected hyper-parameters and score on the held-out test set
    clf_svm_optim.fit(X_train, y_train)
    y_pred = clf_svm_optim.predict(X_test)
    acc_svm.append(accuracy_score(y_test, y_pred))
    precision_svm.append(precision_score(y_test, y_pred, zero_division=1))
    recall_svm.append(recall_score(y_test, y_pred))
    f1_svm.append(f1_score(y_test, y_pred))

    # Perform neuralisation and collect the test-set explanations of this split
    svm_neural = NeuralizedSVM(clf_svm_optim)
    R_svr_accumulated = svm_neural.explain(X_test, first_rule="GI", with_intercept=False, reweight_explanation=True)
    R_svr_accumulated_all.append(R_svr_accumulated)

    # Progress report every 10 splits
    if rs % 10 == 0:
        print("Split %s" % rs)


### Display different performance metrics

In [None]:
# Results table for the resampled SVM runs. This rebinds df_metrics, which
# previously held the logistic-regression table.
df_metrics = get_formatted_results(
    acc_svm,
    f1_svm,
    precision_svm,
    recall_svm,
    model_name="SVM",
    verbose=True,
    df_metrics=None,
)
df_metrics

In [None]:
# res_gp is reassigned on every split, so these are the hyper-parameters
# selected for the LAST split only. The per-split values are stored in
# kernels / Cs / gammas.
optimal_kernel, optimal_C, optimal_gamma = res_gp.x[0], res_gp.x[1], res_gp.x[2]

for _label, _value in (
    ("Optimal Kernel:", optimal_kernel),
    ("Optimal C:", optimal_C),
    ("Optimal Gamma:", optimal_gamma),
):
    print(_label, _value)


### Signed and absolute average feature importances

In [None]:
# Flatten the per-split relevance arrays into one 2-D array with one column
# per feature, then compute, plot and export the mean signed and mean absolute
# scores via LRPAnalyzer.
_relevances_2d = np.array(R_svr_accumulated_all).reshape(-1, len(feature_names))
analyzer = LRPAnalyzer(_relevances_2d, feature_names)

analyzer.calculate_mean_lrp_scores()
analyzer.calculate_mean_abs_lrp_scores()

analyzer.plot_lrp_scores(os.path.join(figure_path, 'sorted_mean_lrp_SVM_GI.png'))
analyzer.plot_abs_lrp_scores(os.path.join(figure_path, 'sorted_mean_abs_lrp_SVM_GI.png'))

analyzer.save_scores_to_csv(
    os.path.join(storing_path, 'sorted_mean_lrp_SVM.csv'),
    os.path.join(storing_path, 'sorted_mean_abs_lrp_SVM.csv'),
)

In [None]:
# Collect the per-split relevance arrays for plotting.
# The shape of the first split's array is printed as a sanity check.
print(R_svr_accumulated_all[0].shape)

# Stack along a new leading axis (one entry per split); this requires every
# split to produce a relevance array of the same shape.
svr_rels = np.stack(R_svr_accumulated_all, 0)

# Average the absolute relevances over axis 1 to get one importance vector per
# split. These relevances come from the SVM trained WITH resampling, so the
# plot is titled "SVM" (the previous "(no resampling)" suffix mislabelled it).
plot_feature_importance_distribution(
    np.abs(svr_rels).mean(1),
    feature_names,
    "SVM",
    color='gray',
    savedir=figure_path,
)

## SVM without resampling

### Training and nested cross-validation

In [None]:
# Repeated hold-out evaluation of an SVM trained WITHOUT resampling.
# For every random split, the hyper-parameters are tuned with gp_minimize on
# the training part only (objective: negative mean 5-fold CV F1). The tuned
# model is then refitted on the training part, scored on the held-out test
# part, and explained with NeuralizedSVM.
n = 100
acc_svm_nr = []
precision_svm_nr = []
recall_svm_nr = []
f1_svm_nr = []
kernels_nr = []
Cs_nr = []
gammas_nr = []
feature_importances_svm_nr = []  # declared for parity with the LR cells; not filled in this cell
R_svr_accumulated_all_nr = []

# Loop through different random splits
for rs in range(n):
    # The training data is used as-is in this variant (no resampling step)
    X_train, y_train, X_test, y_test = stratified_sampling(X_pos, X_neg, y_pos, y_neg, rs * 1234 + 567)

    clf_svm_nr = SVC(random_state=0)

    # Cache of objective values for THIS split only. It must be reset for each
    # split: the key holds only the hyper-parameters, while the score depends
    # on the split's training data. gp_minimize is seeded identically for every
    # split, so it revisits the same points; a cache shared across splits would
    # return scores computed on an earlier split's data instead of re-evaluating.
    evaluated_points = {}

    # Search space: the kernel is fixed to 'rbf'; C is always tuned and gamma
    # is tuned only when the rbf kernel is part of the space.
    space = [Categorical(['rbf'], name='kernel')]
    uses_rbf = 'rbf' in space[0]
    space.append(Real(10**-2, 10**2, "log-uniform", name='C'))
    if uses_rbf:
        space.append(Real(10**-4, 10**1, "log-uniform", name='gamma'))

    @use_named_args(space)
    def objective(**params):
        """Return the negative mean 5-fold CV F1 score for ``params`` on the current split."""
        params_key = tuple(sorted(params.items()))

        # Reuse the score if this exact point was already evaluated for this split
        if params_key in evaluated_points:
            return evaluated_points[params_key]

        # The kernel is passed explicitly; the remaining entries are C (and gamma)
        kernel = params.pop('kernel')
        clf_svm_nr.set_params(kernel=kernel, **params)

        # gp_minimize minimises, hence the sign flip on the F1 score
        score = -np.mean(cross_val_score(clf_svm_nr, X_train, y_train, cv=5, n_jobs=8, scoring="f1"))

        evaluated_points[params_key] = score
        return score

    res_gp_svm_nr = gp_minimize(objective, space, n_calls=30, random_state=0)

    # res_gp_svm_nr.x follows the order of `space`: kernel, C and (rbf only)
    # gamma. Without a tuned gamma, sklearn's 'auto' setting is used instead.
    best_kernel_nr, best_C_nr = res_gp_svm_nr.x[0], res_gp_svm_nr.x[1]
    best_gamma_nr = res_gp_svm_nr.x[2] if uses_rbf else 'auto'

    clf_svm_optim_nr = SVC(random_state=0, kernel=best_kernel_nr, C=best_C_nr, gamma=best_gamma_nr)
    Cs_nr.append(best_C_nr)
    gammas_nr.append(best_gamma_nr)
    kernels_nr.append(best_kernel_nr)

    # Refit with the selected hyper-parameters and score on the held-out test set
    clf_svm_optim_nr.fit(X_train, y_train)
    y_pred = clf_svm_optim_nr.predict(X_test)
    acc_svm_nr.append(accuracy_score(y_test, y_pred))
    precision_svm_nr.append(precision_score(y_test, y_pred, zero_division=1))
    recall_svm_nr.append(recall_score(y_test, y_pred))
    f1_svm_nr.append(f1_score(y_test, y_pred))

    # Perform neuralisation and collect the test-set explanations of this split
    svm_neural_nr = NeuralizedSVM(clf_svm_optim_nr)
    R_svr_accumulated_nr = svm_neural_nr.explain(X_test, first_rule="GI", with_intercept=False, reweight_explanation=True)
    R_svr_accumulated_all_nr.append(R_svr_accumulated_nr)

    # Progress report every 10 splits
    if rs % 10 == 0:
        print("Split %s" % rs)


In [None]:
# res_gp_svm_nr is reassigned on every split, so these are the
# hyper-parameters selected for the LAST split only. The per-split values are
# stored in kernels_nr / Cs_nr / gammas_nr.
optimal_kernel, optimal_C, optimal_gamma = (
    res_gp_svm_nr.x[0],
    res_gp_svm_nr.x[1],
    res_gp_svm_nr.x[2],
)

for _label, _value in (
    ("Optimal Kernel:", optimal_kernel),
    ("Optimal C:", optimal_C),
    ("Optimal Gamma:", optimal_gamma),
):
    print(_label, _value)


In [None]:
# Flatten the per-split relevance arrays of the no-resampling run into one 2-D
# array with one column per feature, then compute, plot and export the mean
# signed and mean absolute scores via LRPAnalyzer.
_relevances_2d_nr = np.array(R_svr_accumulated_all_nr).reshape(-1, len(feature_names))
analyzer = LRPAnalyzer(_relevances_2d_nr, feature_names)

analyzer.calculate_mean_lrp_scores()
analyzer.calculate_mean_abs_lrp_scores()

analyzer.plot_lrp_scores(os.path.join(figure_path, 'sorted_mean_lrp_SVM_NO_Resampling_GI.png'))
analyzer.plot_abs_lrp_scores(os.path.join(figure_path, 'sorted_mean_abs_lrp_SVM_NO_Resampling_GI.png'))

analyzer.save_scores_to_csv(
    os.path.join(storing_path, 'sorted_mean_lrp_SVM_NO_Resampling.csv'),
    os.path.join(storing_path, 'sorted_mean_abs_lrp_SVM_NO_Resampling.csv'),
)

### Display different performance metrics

In [None]:
# Results table for the SVM runs without resampling. This rebinds
# df_metrics_nr, which previously held the logistic-regression table.
df_metrics_nr = get_formatted_results(
    acc_svm_nr,
    f1_svm_nr,
    precision_svm_nr,
    recall_svm_nr,
    model_name="SVM",
    verbose=True,
    df_metrics=None,
)
df_metrics_nr

In [None]:
# Display df_metrics (last assigned from the resampled SVM run). Nothing is
# modified here: no row is dropped and the index is not reset.
df_metrics

### Save SVM models evaluation and importance score results

In [None]:
## SAVING ANALYSIS RESULTS
# Write the SVM metrics tables (with and without resampling) to CSV under
# storing_path, without the index column.
for _csv_name, _frame in (
    ('SVM_metrics_results.csv', df_metrics),
    ('SVM_metrics_NO_Resampling_results.csv', df_metrics_nr),
):
    _frame.to_csv(os.path.join(storing_path, _csv_name), index=False)