# Evaluate ML framework for tree ensemble models

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.cm import ScalarMappable
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
import sklearn
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import seaborn as sns
import plotly.express as px
from skopt.plots import plot_convergence
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import xgboost.sklearn as xgb
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPClassifier
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize
from itertools import product
from sklearn.utils import resample
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import make_scorer
from tabulate import tabulate
import warnings
import os
from mlxai4cat.utils.data import prepare_dataset, stratified_sampling, resampling 
from mlxai4cat.utils.visualization import get_formatted_results, plot_feature_importance, plot_feature_importance_distribution
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=UserWarning, module="skopt")

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes (edits to project code are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
# Echo the interpreter's module search path as the cell output.
# NOTE(review): presumably a sanity check that the mlxai4cat package is
# importable from this kernel — confirm or drop before publishing.
import sys
sys.path

## Import data

In [None]:
# Load the dataset from CSV. prepare_dataset returns eight values; the first
# one is discarded here.
# NOTE(review): X_pos/y_pos and X_neg/y_neg are presumably the positive- and
# negative-class subsets of X/y, and feature_names the column labels of X —
# confirm against mlxai4cat.utils.data.prepare_dataset.
_, X, y, X_pos, y_pos, X_neg, y_neg, feature_names = prepare_dataset('../data/ocm_cat_data.csv')

## Random forest

### Training and nested cross-validation

In [None]:
# Nested evaluation of a random forest on resampled training data: the outer
# loop draws n train/test splits, and inside each split a Gaussian-process
# search picks the hyperparameters that maximise the mean 5-fold CV F1 score.
n = 5

# Test-set metrics, one entry per split
acc_forest, precision_forest, recall_forest, f1_forest = [], [], [], []
# Hyperparameters selected by the search, one entry per split
max_depth_rf, n_estimators_rf = [], []
min_samples_split_rf, min_samples_leaf_rf = [], []
# feature_importances_ of the refitted forest, one array per split
feature_importances_forest = []

for rs in range(n):
    # The last argument varies with the split index (presumably the split
    # seed — confirm in stratified_sampling); the resampling seed is fixed.
    X_train, y_train, X_test, y_test = stratified_sampling(
        X_pos, X_neg, y_pos, y_neg, rs * 1234 + 567
    )
    X_train, y_train = resampling(
        X_train, y_train, overratio=0.6, underratio=1, randomstate=123
    )

    clf_forest = RandomForestClassifier(random_state=0)
    space = [
        Integer(1, 10, name='max_depth'),
        Integer(50, 500, name='n_estimators'),
        Integer(2, 20, name='min_samples_split'),
        Integer(1, 20, name='min_samples_leaf'),
    ]

    @use_named_args(space)
    def objective(**params):
        """Return the negated mean 5-fold CV F1 for one candidate setting."""
        clf_forest.set_params(**params)
        fold_scores = cross_val_score(
            clf_forest, X_train, y_train, cv=5, n_jobs=-1, scoring="f1"
        )
        # gp_minimize minimises, so the score is negated
        return -np.mean(fold_scores)

    res_gp = gp_minimize(objective, space, n_calls=30, random_state=0)

    # res_gp.x follows the order of the dimensions in `space`
    best_depth, best_n_trees, best_split, best_leaf = res_gp.x
    max_depth_rf.append(best_depth)
    n_estimators_rf.append(best_n_trees)
    min_samples_split_rf.append(best_split)
    min_samples_leaf_rf.append(best_leaf)

    # Refit on the full training split with the selected hyperparameters
    clf_forest_optim = RandomForestClassifier(
        random_state=0,
        max_depth=best_depth,
        n_estimators=best_n_trees,
        min_samples_split=best_split,
        min_samples_leaf=best_leaf,
    )
    clf_forest_optim.fit(X_train, y_train)
    feature_importances_forest.append(clf_forest_optim.feature_importances_)

    # Score on the held-out test split
    y_pred = clf_forest_optim.predict(X_test)
    acc_forest.append(accuracy_score(y_test, y_pred))
    precision_forest.append(precision_score(y_test, y_pred, zero_division=1))
    recall_forest.append(recall_score(y_test, y_pred))
    f1_forest.append(f1_score(y_test, y_pred))

    # Progress message every 25th split
    if rs % 25 == 0:
        print(f"Split {rs}")

### Display different performance metrics

In [None]:
# Summarise the per-split random-forest scores. df_metrics=None is passed,
# presumably to start a fresh results table (confirm in get_formatted_results).
df_metrics = get_formatted_results(
    acc_forest,
    f1_forest,
    precision_forest,
    recall_forest,
    model_name="Random forest",
    verbose=True,
    df_metrics=None,
)
df_metrics

In [None]:
# Plot the random-forest feature importances collected over the splits and
# keep the returned table; figures are written under ../figures.
df_feature_importance = plot_feature_importance(
    feature_importances_forest,
    feature_names,
    model_name="Random forest",
    df_feature_importance=None,
    savedir='../figures',
)

In [None]:
# Plot the per-split distribution of the random-forest feature importances.
plot_feature_importance_distribution(
    feature_importances_forest,
    feature_names,
    "Random Forest",
    color='gray',
    savedir='../figures',
)

## Random Forest without Resampling

In [None]:
# Nested evaluation of a random forest WITHOUT resampling: same procedure as
# the resampled run, but the classifier is tuned and fitted directly on the
# training split returned by stratified_sampling. Feature importances are
# collected per split so their distribution can be inspected afterwards.
n = 5
# Test-set metrics, one entry per split
acc_forest_nr = []
precision_forest_nr = []
recall_forest_nr = []
f1_forest_nr = []
# Hyperparameters selected by the search, one entry per split
max_depth_rf_nr = []
n_estimators_rf_nr = []
min_samples_split_rf_nr = []
min_samples_leaf_rf_nr = []
# feature_importances_ of the refitted forest, one array per split
feature_importances_forest_nr = []

# Loop through different random splits
for rs in range(n):
    # Stratified sampling without resampling
    X_train, y_train, X_test, y_test = stratified_sampling(X_pos, X_neg, y_pos, y_neg, rs * 1234 + 567)

    clf_forest = RandomForestClassifier(random_state=0)
    space = [Integer(1, 10, name='max_depth'),
             Integer(50, 500, name='n_estimators'),
             Integer(2, 20, name='min_samples_split'),
             Integer(1, 20, name='min_samples_leaf')]

    # Objective for the Gaussian-process optimisation: the negated mean
    # 5-fold cross-validation F1 score (gp_minimize minimises).
    @use_named_args(space)
    def objective(**params):
        clf_forest.set_params(**params)
        # n_jobs=-1 (all available cores) instead of a hard-coded 8, matching
        # the other hyperparameter searches in this notebook.
        return -np.mean(cross_val_score(clf_forest, X_train, y_train, cv=5, n_jobs=-1, scoring="f1"))

    # Optimise hyperparameters with Gaussian-process minimisation
    res_gp = gp_minimize(objective, space, n_calls=30, random_state=0)

    # res_gp.x follows the order of the dimensions in `space`
    clf_forest_optim = RandomForestClassifier(random_state=0, max_depth=res_gp.x[0], n_estimators=res_gp.x[1],
                                              min_samples_split=res_gp.x[2], min_samples_leaf=res_gp.x[3])

    max_depth_rf_nr.append(res_gp.x[0])
    n_estimators_rf_nr.append(res_gp.x[1])
    min_samples_split_rf_nr.append(res_gp.x[2])
    min_samples_leaf_rf_nr.append(res_gp.x[3])

    # Refit on the full training split and predict the held-out test split
    clf_forest_optim.fit(X_train, y_train)
    y_pred = clf_forest_optim.predict(X_test)

    # Store feature importance for this split
    feature_importances_forest_nr.append(clf_forest_optim.feature_importances_)

    acc_forest_nr.append(accuracy_score(y_test, y_pred))
    precision_forest_nr.append(precision_score(y_test, y_pred, zero_division=1))
    recall_forest_nr.append(recall_score(y_test, y_pred))
    f1_forest_nr.append(f1_score(y_test, y_pred))



### Display different performance metrics

In [None]:
# Summarise the per-split random-forest scores of the no-resampling run.
# df_metrics=None is passed, presumably to start a fresh results table
# (confirm in get_formatted_results).
df_metrics_nr = get_formatted_results(
    acc_forest_nr,
    f1_forest_nr,
    precision_forest_nr,
    recall_forest_nr,
    model_name="Random forest",
    verbose=True,
    df_metrics=None,
)

In [None]:
# Plot the random-forest feature importances of the no-resampling run and
# keep the returned table; figures are written under ../figures.
df_feature_importance_nr = plot_feature_importance(
    feature_importances_forest_nr,
    feature_names,
    model_name="Random forest",
    df_feature_importance=None,
    savedir='../figures',
)
df_feature_importance_nr

## XGBoost

In [None]:
# Nested evaluation of XGBoost on resampled training data: the outer loop
# draws n train/test splits, and inside each split a Gaussian-process search
# picks the hyperparameters that maximise the mean 5-fold CV F1 score.
n = 5
# Test-set metrics, one entry per split
acc_XGBT = []
precision_XGBT = []
recall_XGBT = []
f1_XGBT = []
# Hyperparameters selected by the search, one entry per split.
# max_depth is tuned as well, so it is recorded alongside the others.
max_depth_XGBT = []
learning_rate_XGBT = []
n_estimators_XGBT = []
reg_alphas_XGBT = []
reg_lambdas_XGBT = []
# feature_importances_ of the refitted model, one array per split
feature_importances_XGBT = []

# Loop through different random splits
for rs in range(n):
    X_train, y_train, X_test, y_test = stratified_sampling(X_pos, X_neg, y_pos, y_neg, rs * 1234 + 567)
    # NOTE(review): the training split is resampled before the inner
    # cross-validation, so resampled data may be shared across CV folds —
    # confirm against the resampling helper whether this is intended.
    X_train, y_train = resampling(X_train, y_train, overratio=0.6, underratio=1, randomstate=123)

    clf_XGBT = xgb.XGBClassifier(random_state=0)
    space = [Integer(1, 10, name='max_depth'),
             Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
             Real(10**-5, 10**0, "log-uniform", name='reg_alpha'),
             Real(10**-5, 10**0, "log-uniform", name='reg_lambda'),
             Integer(50, 500, name='n_estimators')]

    # Objective for the Gaussian-process optimisation: the negated mean
    # 5-fold cross-validation F1 score (gp_minimize minimises).
    @use_named_args(space)
    def objective(**params):
        clf_XGBT.set_params(**params)
        return -np.mean(cross_val_score(clf_XGBT, X_train, y_train, cv=5, n_jobs=-1, scoring="f1"))

    res_gp = gp_minimize(objective, space, n_calls=30, random_state=0)

    # res_gp.x follows the order of the dimensions in `space`
    clf_XGBT_optim = xgb.XGBClassifier(random_state=0, max_depth=res_gp.x[0], learning_rate=res_gp.x[1],
                                       reg_alpha=res_gp.x[2], reg_lambda=res_gp.x[3], n_estimators=res_gp.x[4])

    max_depth_XGBT.append(res_gp.x[0])
    learning_rate_XGBT.append(res_gp.x[1])
    n_estimators_XGBT.append(res_gp.x[4])
    reg_alphas_XGBT.append(res_gp.x[2])
    reg_lambdas_XGBT.append(res_gp.x[3])

    # Refit on the full training split and predict the held-out test split
    clf_XGBT_optim.fit(X_train, y_train)
    y_pred = clf_XGBT_optim.predict(X_test)

    acc_XGBT.append(accuracy_score(y_test, y_pred))
    precision_XGBT.append(precision_score(y_test, y_pred, zero_division=1))
    recall_XGBT.append(recall_score(y_test, y_pred))
    f1_XGBT.append(f1_score(y_test, y_pred))

    # Save feature importances for this split
    feature_importances_XGBT.append(clf_XGBT_optim.feature_importances_)

    # Progress message every 10th split
    if rs % 10 == 0:
        print("Split %s" % rs)


### Display different performance metrics

In [None]:
# Summarise the per-split XGBoost scores. The existing df_metrics is passed
# back in, presumably so the XGBoost results are added to the same table
# (confirm in get_formatted_results).
df_metrics = get_formatted_results(
    acc_XGBT,
    f1_XGBT,
    precision_XGBT,
    recall_XGBT,
    model_name="XGBoost",
    verbose=True,
    df_metrics=df_metrics,
)
df_metrics

In [None]:
# Plot the XGBoost feature importances collected over the splits. The
# existing df_feature_importance is passed back in, presumably so both models
# end up in one table (confirm in plot_feature_importance).
df_feature_importance = plot_feature_importance(
    feature_importances_XGBT,
    feature_names,
    model_name="XGBoost",
    df_feature_importance=df_feature_importance,
    savedir='../figures',
)
df_feature_importance

In [None]:
# Plot the per-split distribution of the XGBoost feature importances.
plot_feature_importance_distribution(
    feature_importances_XGBT,
    feature_names,
    "XGBoost",
    color='gray',
    savedir='../figures',
)

## XGBoost without Resampling

In [None]:
# Nested evaluation of XGBoost WITHOUT resampling: the classifier is tuned
# and fitted directly on the training split returned by stratified_sampling.
# Inside each split a Gaussian-process search picks the hyperparameters that
# maximise the mean 5-fold CV F1 score.
n = 5
# Test-set metrics, one entry per split
acc_XGBT_nr = []
precision_XGBT_nr = []
recall_XGBT_nr = []
f1_XGBT_nr = []
# Hyperparameters selected by the search, one entry per split.
# max_depth is tuned as well, so it is recorded alongside the others.
max_depth_XGBT_nr = []
learning_rate_XGBT_nr = []
n_estimators_XGBT_nr = []
reg_alphas_XGBT_nr = []
reg_lambdas_XGBT_nr = []
# feature_importances_ of the refitted model, one array per split
feature_importances_XGBT_nr = []

# Loop through different random splits
for rs in range(n):
    X_train, y_train, X_test, y_test = stratified_sampling(X_pos, X_neg, y_pos, y_neg, rs * 1234 + 567)

    clf_XGBT = xgb.XGBClassifier(random_state=0)
    space = [Integer(1, 10, name='max_depth'),
             Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
             Real(10**-5, 10**0, "log-uniform", name='reg_alpha'),
             Real(10**-5, 10**0, "log-uniform", name='reg_lambda'),
             Integer(50, 500, name='n_estimators')]

    # Objective for the Gaussian-process optimisation: the negated mean
    # 5-fold cross-validation F1 score (gp_minimize minimises).
    @use_named_args(space)
    def objective(**params):
        clf_XGBT.set_params(**params)
        return -np.mean(cross_val_score(clf_XGBT, X_train, y_train, cv=5, n_jobs=-1, scoring="f1"))

    res_gp = gp_minimize(objective, space, n_calls=30, random_state=0)

    # res_gp.x follows the order of the dimensions in `space`
    clf_XGBT_optim = xgb.XGBClassifier(random_state=0, max_depth=res_gp.x[0], learning_rate=res_gp.x[1],
                                       reg_alpha=res_gp.x[2], reg_lambda=res_gp.x[3], n_estimators=res_gp.x[4])

    max_depth_XGBT_nr.append(res_gp.x[0])
    learning_rate_XGBT_nr.append(res_gp.x[1])
    n_estimators_XGBT_nr.append(res_gp.x[4])
    reg_alphas_XGBT_nr.append(res_gp.x[2])
    reg_lambdas_XGBT_nr.append(res_gp.x[3])

    # Refit on the full training split and predict the held-out test split
    clf_XGBT_optim.fit(X_train, y_train)
    y_pred = clf_XGBT_optim.predict(X_test)

    acc_XGBT_nr.append(accuracy_score(y_test, y_pred))
    precision_XGBT_nr.append(precision_score(y_test, y_pred, zero_division=1))
    recall_XGBT_nr.append(recall_score(y_test, y_pred))
    f1_XGBT_nr.append(f1_score(y_test, y_pred))

    # Save feature importances for this split
    feature_importances_XGBT_nr.append(clf_XGBT_optim.feature_importances_)

    # Progress message every 10th split
    if rs % 10 == 0:
        print("Split %s" % rs)


### Display different performance metrics

In [None]:
# Summarise the per-split XGBoost scores of the no-resampling run. The
# existing df_metrics_nr is passed back in, presumably so the XGBoost results
# are added to the same table (confirm in get_formatted_results).
df_metrics_nr = get_formatted_results(
    acc_XGBT_nr,
    f1_XGBT_nr,
    precision_XGBT_nr,
    recall_XGBT_nr,
    model_name="XGBoost",
    verbose=True,
    df_metrics=df_metrics_nr,
)

In [None]:
# Plot the XGBoost feature importances of the no-resampling run. The existing
# df_feature_importance_nr is passed back in, presumably so both models end
# up in one table (confirm in plot_feature_importance).
df_feature_importance_nr = plot_feature_importance(
    feature_importances_XGBT_nr,
    feature_names,
    model_name="XGBoost",
    df_feature_importance=df_feature_importance_nr,
    savedir='../figures',
)
df_feature_importance_nr

## Save tree ensemble model evaluation and importance score results

In [None]:
# Write the metric and feature-importance tables (with and without
# resampling) to CSV files under ../results.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is a single
# call that does not fail if the directory already exists or is created
# between the check and the mkdir.
os.makedirs('../results', exist_ok=True)
df_metrics.to_csv('../results/RF_metrics_results.csv', index=False)
df_metrics_nr.to_csv('../results/RF_metrics_NO_Resampling_results.csv', index=False)
df_feature_importance.to_csv('../results/RF_feature_imp_with_sklearn_results.csv', index=False)
df_feature_importance_nr.to_csv('../results/RF_feature_imp_with_sklearn_NO_Resampling_results.csv', index=False)
