# Notebook for model evaluation


Notes on this notebook

1. Currently, the notebook reads the required feature files from the results folder. This may not be sufficient if the evaluation notebook needs to run in parallel with the feature generation notebook.

In [None]:
import pandas as pd
import numpy as np

# Make the project's src/ directory (a sibling of the current working directory)
# importable so that the self-written modules can be loaded further down.
import os
import sys
src_dir = os.path.join(os.getcwd(), '..', 'src')
# Re-running this cell must not keep appending the same entry to sys.path,
# so the directory is only registered when it is not already present.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from d04_model_evaluation.model_evaluation import *

In [None]:
# Load the complete trace-link feature tables (non-normalized and normalized).
# NOTE(review): a later cell in this notebook assigns the same four names from the
# feature-selection subsets, so whichever of the two cells ran last determines
# what the evaluation sections below operate on.
features_all_df = pd.read_excel(
    '../results/1. Trace Link Feature Data/features_non-normalized.xlsx'
)
features_all_normalized_df = pd.read_excel(
    '../results/1. Trace Link Feature Data/features_normalized.xlsx'
)

# Column counts of both tables; handed to generate_evaluation_metrics as n_features
# (original note: needed for importance_array in model_evaluation.py)
n_features_df = len(features_all_df.columns)
n_features_normalized_df = len(features_all_normalized_df.columns)

In [None]:
# Load the feature tables of the feature-selection subsets.
# NOTE(review): this rebinds the same names as the cell that loads the full
# feature tables; run only the cell whose data should be evaluated (on a full
# top-to-bottom run this cell wins).
# NOTE(review): the two file names spell "normalised"/"normalized" differently —
# confirm both match the files actually written to the results folder.
features_all_df = pd.read_excel(
    '../results/5. Feature selection subsets/features_subset_non-normalised.xlsx'
)
features_all_normalized_df = pd.read_excel(
    '../results/5. Feature selection subsets/features_subset_normalized.xlsx'
)

# Column counts of both tables; handed to generate_evaluation_metrics as n_features
# (original note: needed for importance_array in model_evaluation.py)
n_features_df = len(features_all_df.columns)
n_features_normalized_df = len(features_all_normalized_df.columns)

Perform additional preprocessing

In [None]:
# --- Non-normalized table ---------------------------------------------------
# Missing values become 0, the column names are remembered for later use, and
# the table is converted to a plain numpy array.
features_all_df = features_all_df.fillna(0)
feature_name_df = features_all_df.columns.tolist()
features_all_array = np.array(features_all_df)

# --- Normalized table -------------------------------------------------------
# Same three steps for the normalized variant.
features_all_normalized_df = features_all_normalized_df.fillna(0)
feature_name_normalised_df = features_all_normalized_df.columns.tolist()
features_all_normalized_array = np.array(features_all_normalized_df)

# --- Labels -----------------------------------------------------------------
# The "is_valid" column of the pickled label frame is the label vector.
# NOTE(review): unpickling can execute arbitrary code; only load pickles that
# were produced by this project's own pipeline.
# NOTE(review): presumably the label rows are in the same order as the feature
# rows — confirm against the feature generation notebook.
labels_df = pd.read_pickle(r'../data/03_processed/labels_df.pkl')
labels_array = np.array(labels_df["is_valid"])

# 4 Evaluation - Non-normalized
## Random Forest

In [None]:
# Random forest on the non-normalized features.
# The four evaluations differ only in the rebalancing strategy, so the arguments
# they have in common are collected once and unpacked into every call.
rf_raw_settings = dict(
    classification_algorithm='random_forests',
    data=features_all_array,
    labels=labels_array,
    feature_names=feature_name_df,
    is_normalized=False,
    n_runs=2,
    n_features=n_features_df,
)

generate_evaluation_metrics(rebalancing_strategy='none', **rf_raw_settings)
generate_evaluation_metrics(rebalancing_strategy='over', **rf_raw_settings)
generate_evaluation_metrics(rebalancing_strategy='under', **rf_raw_settings)
# Deliberately left as the final expression of the cell, like the original last call
generate_evaluation_metrics(rebalancing_strategy='5050', **rf_raw_settings)

## XGBoost

In [None]:
# XGBoost on the non-normalized features.
# The four evaluations differ only in the rebalancing strategy, so the arguments
# they have in common are collected once and unpacked into every call.
xgb_raw_settings = dict(
    classification_algorithm='xg_boost',
    data=features_all_array,
    labels=labels_array,
    feature_names=feature_name_df,
    is_normalized=False,
    n_runs=2,
    n_features=n_features_df,
)

generate_evaluation_metrics(rebalancing_strategy='none', **xgb_raw_settings)
generate_evaluation_metrics(rebalancing_strategy='over', **xgb_raw_settings)
generate_evaluation_metrics(rebalancing_strategy='under', **xgb_raw_settings)
# Deliberately left as the final expression of the cell, like the original last call
generate_evaluation_metrics(rebalancing_strategy='5050', **xgb_raw_settings)

## LightGBM

In [None]:
# LightGBM on the non-normalized features.
# The four evaluations differ only in the rebalancing strategy, so the arguments
# they have in common are collected once and unpacked into every call.
lgbm_raw_settings = dict(
    classification_algorithm='light_gbm',
    data=features_all_array,
    labels=labels_array,
    feature_names=feature_name_df,
    is_normalized=False,
    n_runs=2,
    n_features=n_features_df,
)

generate_evaluation_metrics(rebalancing_strategy='none', **lgbm_raw_settings)
generate_evaluation_metrics(rebalancing_strategy='over', **lgbm_raw_settings)
generate_evaluation_metrics(rebalancing_strategy='under', **lgbm_raw_settings)
# Deliberately left as the final expression of the cell, like the original last call
generate_evaluation_metrics(rebalancing_strategy='5050', **lgbm_raw_settings)

# 4 Evaluation - Normalized
## Random Forest

In [None]:
# Random forest on the normalized features.
# The four evaluations differ only in the rebalancing strategy, so the arguments
# they have in common are collected once and unpacked into every call.
rf_norm_settings = dict(
    classification_algorithm='random_forests',
    data=features_all_normalized_array,
    labels=labels_array,
    feature_names=feature_name_normalised_df,
    is_normalized=True,
    n_runs=2,
    n_features=n_features_normalized_df,
)

generate_evaluation_metrics(rebalancing_strategy='none', **rf_norm_settings)
generate_evaluation_metrics(rebalancing_strategy='over', **rf_norm_settings)
generate_evaluation_metrics(rebalancing_strategy='under', **rf_norm_settings)
# Deliberately left as the final expression of the cell, like the original last call
generate_evaluation_metrics(rebalancing_strategy='5050', **rf_norm_settings)

## XGBoost

In [None]:
# XGBoost on the normalized features.
# The four evaluations differ only in the rebalancing strategy, so the arguments
# they have in common are collected once and unpacked into every call.
xgb_norm_settings = dict(
    classification_algorithm='xg_boost',
    data=features_all_normalized_array,
    labels=labels_array,
    feature_names=feature_name_normalised_df,
    is_normalized=True,
    n_runs=2,
    n_features=n_features_normalized_df,
)

generate_evaluation_metrics(rebalancing_strategy='none', **xgb_norm_settings)
generate_evaluation_metrics(rebalancing_strategy='over', **xgb_norm_settings)
generate_evaluation_metrics(rebalancing_strategy='under', **xgb_norm_settings)
# Deliberately left as the final expression of the cell, like the original last call
generate_evaluation_metrics(rebalancing_strategy='5050', **xgb_norm_settings)

## LightGBM

In [None]:
# LightGBM on the normalized features.
# The four evaluations differ only in the rebalancing strategy, so the arguments
# they have in common are collected once and unpacked into every call.
lgbm_norm_settings = dict(
    classification_algorithm='light_gbm',
    data=features_all_normalized_array,
    labels=labels_array,
    feature_names=feature_name_normalised_df,
    is_normalized=True,
    n_runs=2,
    n_features=n_features_normalized_df,
)

generate_evaluation_metrics(rebalancing_strategy='none', **lgbm_norm_settings)
generate_evaluation_metrics(rebalancing_strategy='over', **lgbm_norm_settings)
generate_evaluation_metrics(rebalancing_strategy='under', **lgbm_norm_settings)
# Deliberately left as the final expression of the cell, like the original last call
generate_evaluation_metrics(rebalancing_strategy='5050', **lgbm_norm_settings)