In [None]:
import pandas as pd
import numpy as np
from mrmr import mrmr_classif

import operator
import numpy as np
from itertools import chain
import heapq

#Import self-written functions
import os
import sys
src_dir = os.path.join(os.getcwd(), '..', 'src')
sys.path.append(src_dir)

from d04_model_evaluation.model_evaluation import *

# ---------------------------------------------------------------------------
# Parameters to configure
# ---------------------------------------------------------------------------

# Dataset name; used as a prefix in the names of the exported result files.
dataset = 'example'

# Number of evaluation rounds to perform (passed as n_runs to generate_evaluation_metrics).
n_runs = 10

# Number of features listed per row in the top-n feature importance export.
top_n = 10


In [None]:
# Get processed feature data
features_all_df = pd.read_excel('../results/1. Trace Link Feature Data/features_non-normalized.xlsx')

# Load label dataframe
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
labels_df = pd.read_pickle(r'../data/03_processed/labels_df.pkl')

# Attach the labels by position: `.values` strips the index, so rows are matched in file
# order and no index alignment takes place (an index-aligned assignment would only be
# overwritten by this one, and could fail on an index with duplicate labels).
features_all_df['is_valid'] = labels_df['is_valid'].values

# Encode Bool values of label to integers
features_all_df['is_valid'] = features_all_df['is_valid'].astype(int)

# Set the NaN to 0
features_all_df = features_all_df.fillna(0)



Start MRMR feature selection for 40 features

In [None]:
# Split the feature table into predictors (X) and the target column (y)
X = features_all_df.drop(['is_valid'], axis=1)
y = features_all_df['is_valid']
K_features = 40

# Pass K_features instead of repeating the literal, so the number of selected
# features cannot drift from the value used in the output file names below.
selected_features = mrmr_classif(X, y, K = K_features)

# Create subset of original dataframe from the output of the selection process
feature_subset_df = features_all_df[selected_features]

# Export subset of features as .xlsx file
feature_subset_df.to_excel(excel_writer = f"../results/5. Feature selection subsets/{dataset}_subset_K{K_features}.xlsx", index = False)

# Write the selected feature names, one per line. The context manager closes the
# file even when a write raises.
with open(f"../results/5. Feature selection subsets/{dataset}_K{K_features}_selected_features.txt", "w") as feature_file:
    feature_file.writelines(f"{feature}\n" for feature in selected_features)

Perform additional processing to prepare for evaluation

In [None]:
# Numpy versions of the selected feature subset and of the labels, as handed to
# generate_evaluation_metrics in the evaluation cell
labels_array = np.array(y)
features_all_array = np.array(feature_subset_df)

# Column names of the feature subset
feature_name_df = [column_name for column_name in feature_subset_df.columns]

# Number of feature columns (passed as n_features to generate_evaluation_metrics)
n_features_df = len(feature_subset_df.columns)

## Evaluation for K = 40

In [None]:
# Evaluate the current feature subset with every (rebalancing, classifier) setup and
# collect the averaged metrics, their standard deviation and the mean feature importance.
id_columns = ['algorithm', 'Rebalancing', 'K (#features)']
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1', 'F2', 'F0.5', 'Average Precision']

# results_df is created here and extended by the evaluation cells of the other K values.
results_df = pd.DataFrame(columns=id_columns + metric_columns)

df_columns = id_columns + feature_name_df
importance_df_40 = pd.DataFrame(columns=df_columns)

# (rebalancing strategy, classifier id given to generate_evaluation_metrics, label written to the tables)
evaluation_setups = [
    ('none', 'xg_boost', 'xgboost'),
    ('5050', 'light_gbm', 'lightgbm'),
]

for rebalancing, algorithm, algorithm_label in evaluation_setups:
    feature_results, feature_importance = generate_evaluation_metrics(
        rebalancing_strategy=rebalancing,
        classification_algorithm=algorithm,
        data=features_all_array,
        labels=labels_array,
        feature_names=feature_name_df,
        is_normalized=False,
        n_runs=n_runs,
        n_features=n_features_df,
        K_features=K_features,
    )

    # Identifying prefix of every row; the K label is derived from K_features so it
    # always matches the value that was passed to the evaluation.
    row_id = [algorithm_label, rebalancing, str(K_features)]

    # Averages of the evaluation (over all columns of feature_results)
    results_df.loc[len(results_df)] = row_id + feature_results.mean(axis=0).tolist()

    # Separator row announcing that the next row holds standard deviations
    results_df.loc[len(results_df)] = ['std below'] + ['std'] * (len(results_df.columns) - 1)

    # Standard deviation of each metric
    results_df.loc[len(results_df)] = row_id + feature_results[metric_columns].std().tolist()

    # Mean importance per feature
    importance_df_40.loc[len(importance_df_40)] = row_id + feature_importance.mean(axis=0).tolist()

importance_df_40.to_excel(excel_writer = f"../results/5. Feature selection subsets/{dataset}_importance_subset_K{K_features}.xlsx", index = False)

## Evaluation for K = 50

In [None]:
# Split the feature table into predictors (X) and the target column (y)
X = features_all_df.drop(['is_valid'], axis=1)
y = features_all_df['is_valid']
K_features = 50

# Pass K_features instead of repeating the literal, so the number of selected
# features cannot drift from the value used in the output file names below.
selected_features = mrmr_classif(X, y, K = K_features)

# Create subset of original dataframe from the output of the selection process
feature_subset_df = features_all_df[selected_features]

# Export subset of features as .xlsx file
feature_subset_df.to_excel(excel_writer = f"../results/5. Feature selection subsets/{dataset}_subset_K{K_features}.xlsx", index = False)

# Write the selected feature names, one per line. The context manager closes the
# file even when a write raises.
with open(f"../results/5. Feature selection subsets/{dataset}_K{K_features}_selected_features.txt", "w") as feature_file:
    feature_file.writelines(f"{feature}\n" for feature in selected_features)

In [None]:
# Numpy versions of the selected feature subset and of the labels, as handed to
# generate_evaluation_metrics in the evaluation cell
labels_array = np.array(y)
features_all_array = np.array(feature_subset_df)

# Column names of the feature subset
feature_name_df = [column_name for column_name in feature_subset_df.columns]

# Number of feature columns (passed as n_features to generate_evaluation_metrics)
n_features_df = len(feature_subset_df.columns)

In [None]:
# Evaluate the current feature subset with every (rebalancing, classifier) setup and
# append the averaged metrics, their standard deviation and the mean feature importance.
# results_df must already exist (it is created in the first evaluation cell).
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1', 'F2', 'F0.5', 'Average Precision']

df_columns = ['algorithm'] + ['Rebalancing'] + ['K (#features)'] + feature_name_df
importance_df_50 = pd.DataFrame(columns=df_columns)

# (rebalancing strategy, classifier id given to generate_evaluation_metrics, label written to the tables)
evaluation_setups = [
    ('none', 'xg_boost', 'xgboost'),
    ('5050', 'light_gbm', 'lightgbm'),
]

for rebalancing, algorithm, algorithm_label in evaluation_setups:
    feature_results, feature_importance = generate_evaluation_metrics(
        rebalancing_strategy=rebalancing,
        classification_algorithm=algorithm,
        data=features_all_array,
        labels=labels_array,
        feature_names=feature_name_df,
        is_normalized=False,
        n_runs=n_runs,
        n_features=n_features_df,
        K_features=K_features,
    )

    # Identifying prefix of every row; the K label is derived from K_features so it
    # always matches the value that was passed to the evaluation.
    row_id = [algorithm_label, rebalancing, str(K_features)]

    # Averages of the evaluation (over all columns of feature_results)
    results_df.loc[len(results_df)] = row_id + feature_results.mean(axis=0).tolist()

    # Separator row announcing that the next row holds standard deviations
    results_df.loc[len(results_df)] = ['std below'] + ['std'] * (len(results_df.columns) - 1)

    # Standard deviation of each metric
    results_df.loc[len(results_df)] = row_id + feature_results[metric_columns].std().tolist()

    # Mean importance per feature
    importance_df_50.loc[len(importance_df_50)] = row_id + feature_importance.mean(axis=0).tolist()

importance_df_50.to_excel(excel_writer = f"../results/5. Feature selection subsets/{dataset}_importance_subset_K{K_features}.xlsx", index = False)

## Evaluation for K = 60

In [None]:
# Split the feature table into predictors (X) and the target column (y)
X = features_all_df.drop(['is_valid'], axis=1)
y = features_all_df['is_valid']
K_features = 60

# Pass K_features instead of repeating the literal, so the number of selected
# features cannot drift from the value used in the output file names below.
selected_features = mrmr_classif(X, y, K = K_features)

# Create subset of original dataframe from the output of the selection process
feature_subset_df = features_all_df[selected_features]

# Export subset of features as .xlsx file
feature_subset_df.to_excel(excel_writer = f"../results/5. Feature selection subsets/{dataset}_subset_K{K_features}.xlsx", index = False)

# Write the selected feature names, one per line. The context manager closes the
# file even when a write raises.
with open(f"../results/5. Feature selection subsets/{dataset}_K{K_features}_selected_features.txt", "w") as feature_file:
    feature_file.writelines(f"{feature}\n" for feature in selected_features)

In [None]:
# Numpy versions of the selected feature subset and of the labels, as handed to
# generate_evaluation_metrics in the evaluation cell
labels_array = np.array(y)
features_all_array = np.array(feature_subset_df)

# Column names of the feature subset
feature_name_df = [column_name for column_name in feature_subset_df.columns]

# Number of feature columns (passed as n_features to generate_evaluation_metrics)
n_features_df = len(feature_subset_df.columns)

In [None]:
# Evaluate the current feature subset with every (rebalancing, classifier) setup and
# append the averaged metrics, their standard deviation and the mean feature importance.
# results_df must already exist (it is created in the first evaluation cell).
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1', 'F2', 'F0.5', 'Average Precision']

df_columns = ['algorithm'] + ['Rebalancing'] + ['K (#features)'] + feature_name_df
importance_df_60 = pd.DataFrame(columns=df_columns)

# (rebalancing strategy, classifier id given to generate_evaluation_metrics, label written to the tables)
evaluation_setups = [
    ('none', 'xg_boost', 'xgboost'),
    ('5050', 'light_gbm', 'lightgbm'),
]

for rebalancing, algorithm, algorithm_label in evaluation_setups:
    feature_results, feature_importance = generate_evaluation_metrics(
        rebalancing_strategy=rebalancing,
        classification_algorithm=algorithm,
        data=features_all_array,
        labels=labels_array,
        feature_names=feature_name_df,
        is_normalized=False,
        n_runs=n_runs,
        n_features=n_features_df,
        K_features=K_features,
    )

    # Identifying prefix shared by the mean, std and importance rows of this setup.
    # Building it once from the loop values and K_features keeps the rows consistent:
    # hand-typed labels had the std row tagged with the wrong K and a mean row tagged
    # with a misspelled rebalancing strategy.
    row_id = [algorithm_label, rebalancing, str(K_features)]

    # Averages of the evaluation (over all columns of feature_results)
    results_df.loc[len(results_df)] = row_id + feature_results.mean(axis=0).tolist()

    # Separator row announcing that the next row holds standard deviations
    results_df.loc[len(results_df)] = ['std below'] + ['std'] * (len(results_df.columns) - 1)

    # Standard deviation of each metric
    results_df.loc[len(results_df)] = row_id + feature_results[metric_columns].std().tolist()

    # Mean importance per feature
    importance_df_60.loc[len(importance_df_60)] = row_id + feature_importance.mean(axis=0).tolist()

importance_df_60.to_excel(excel_writer = f"../results/5. Feature selection subsets/{dataset}_importance_subset_K{K_features}.xlsx", index = False)

In [None]:
# Export results of all runs together
results_df.to_excel(excel_writer = f"../results/5. Feature selection subsets/{dataset}_results_subset_K40_50_60.xlsx", index = False)

# Create top-n file for feature importance
importance_dfs = [importance_df_40, importance_df_50, importance_df_60]

# The leading columns of every importance frame are algorithm, rebalancing and K;
# all remaining columns hold one importance value per feature.
n_id_columns = 3
row_width = n_id_columns + 2 * top_n

top_feature_rows = []
for df_iteration in importance_dfs:
    feature_columns = list(df_iteration.columns[n_id_columns:])
    # name=None yields plain tuples, index=False drops the row label
    for row in df_iteration.itertuples(index=False, name=None):
        importances = row[n_id_columns:]
        # Rank column positions rather than values: looking a value up again with
        # list.index() returns the first match, which repeats the same feature name
        # whenever several features share an importance value.
        top_positions = heapq.nlargest(top_n, range(len(importances)), key=importances.__getitem__)

        feature_list = list(row[:n_id_columns])
        for position in top_positions:
            feature_list.append(feature_columns[position])
            feature_list.append(importances[position])

        # Pad rows of frames with fewer than top_n features so every row has the same width
        feature_list.extend([None] * (row_width - len(feature_list)))
        top_feature_rows.append(feature_list)

# Two columns per rank (feature name, importance value), generated from top_n so the
# header always matches the row width.
rank_columns = [str(rank) for rank in range(1, top_n + 1) for _repeat in range(2)]
feature_importance = pd.DataFrame(columns=['algorithm', 'balancing', 'K (#features)'] + rank_columns, data=top_feature_rows)
feature_importance.to_excel(excel_writer= f"../results/5. Feature selection subsets/{dataset}_feature_importance_top{top_n}_K40_50_60.xlsx", index=False)
        