In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from utils import utils_gn, utils_sig, utils_dgrd, utils_models
from config.definitions import ROOT_DIR
import importlib
# Re-execute the project helper modules so that edits made to them are picked
# up without restarting the kernel (same reload order as before).
for _module in (utils_gn, utils_sig, utils_models, utils_dgrd):
    importlib.reload(_module)

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
# Load the raw training data
# NOTE(review): the .pkl extension suggests a pickle file — only load trusted files; confirm in utils_gn.read_data
train_raw = utils_gn.read_data('train_1238.pkl')

In [None]:
# Load the raw test data
test_raw = utils_gn.read_data('test_1238.pkl')

In [None]:
# Load the true labels (targets) for the test set
test_target = utils_gn.read_data('true_test_labels_1238.pkl')

In [None]:
# Define the list of numbers of selected features: 10%, 20%, ..., 90% of the
# feature set, each truncated to a whole number of features.
N_FEATURES = 66   # we have a total of 66 features

# Integer arithmetic (pct * N // 100) gives the same truncated counts as
# int(fraction * N) without depending on floating-point rounding, which could
# otherwise land just below a whole number and silently drop one feature.
k_list = [pct * N_FEATURES // 100 for pct in range(10, 100, 10)]
k_list

In [None]:
# Train the model on the high-frequency data using the features selected by
# the RRCT algorithm, once for each selection threshold in `k_list`.
# Shown here for the 'EOL' target; the commented alternatives hold the
# 'IRatEOL' target and a second hyper-parameter set.

selection_params = {
    'n_estimators': 100,
    'reg_alpha': 0.1,
    'max_depth': 2,
    'min_samples_split': 3,
}  # alternative: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.1}

df_k = utils_models.model_feature_selection(
    train_raw=train_raw,
    test_raw=test_raw,
    y_test_df=test_target,
    target_list=['EOL'],  # alternative: target_list=['IRatEOL']
    k_list=k_list,
    params=selection_params,
)

In [None]:
# Work on a copy so the results held in `df_k` are left untouched
df = df_k.copy()

In [None]:
# Display the copied results table
df

In [None]:
# Rebuild the plotting frame from the untouched results in `df_k` instead of
# from `df` itself: the cell is then idempotent, so re-running it can never
# multiply the MAPE columns by 100 a second time.
df = df_k.apply(pd.to_numeric)
# One row per entry of k_list; relabel the rows as 10, 20, ..., 90.
df.index = np.arange(10, 100, 10)  # change index to percentages
df['MAPE_train'] = df['MAPE_train'] * 100  # change MAPE to percentages
df['MAPE_test'] = df['MAPE_test'] * 100    # change MAPE to percentages
df

In [None]:
# Bar charts of every error column in `df` against the percentage of
# features retained, laid out on a 2 x 3 grid and saved as a PDF.

fig = plt.figure(figsize=(10, 5))

# Unit suffix appended to the title of each top-row panel.
# (An r" ($\Omega$)" suffix was previously used in place of " (cycles)" —
# presumably for the IRatEOL target.)
title_suffix = {0: " (cycles)", 1: r" ($\%$)", 2: " (cycles)"}
# y-axis caption shown on the first panel of each row.
row_caption = {0: 'Train errors', 3: 'Test errors'}
# Panels on the bottom row keep their x tick labels and get an x-axis label.
bottom_row = (3, 4, 5)

for panel, column in enumerate(df.columns):
    ax = fig.add_subplot(2, 3, panel + 1)

    if panel in title_suffix:
        ax.set_title(column.split('_')[0] + title_suffix[panel], fontsize=16)

    ax.bar(df.index.map(str), df[column].values, color='brown', ec='black', alpha=0.78)
    ax.tick_params(axis='x', rotation=90, labelsize=14)
    ax.tick_params(axis='y', labelsize=14)

    if panel in bottom_row:
        ax.set_xlabel('Feature percentage (%)', fontsize=14)
    else:
        ax.set_xticklabels([])

    if panel in row_caption:
        ax.set_ylabel(row_caption[panel], size=16)

plt.tight_layout()
plt.savefig(fname=f"{ROOT_DIR}/plots/rrct_feature_selection_n_100_tabs12.pdf", bbox_inches='tight')

In [None]:
# Refit the model using only the top 10% of the features, on data
# sub-sampled to one record every 4 minutes.

subsample_params = {
    'n_estimators': 100,
    'reg_alpha': 0.1,
    'max_depth': 2,
    'min_samples_split': 3,
}  # alternative: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.1}

res_10p_4min = utils_models.model_feature_selection(
    train_raw=train_raw,
    test_raw=test_raw,
    y_test_df=test_target,
    target_list=['EOL'],
    k_list=[6],     # integer part of 10% of 66
    step_size=80,   # key for 4 mins sub-sampling
    params=subsample_params,
)

In [None]:
# Display the results for the top-10% features / 4-minute sub-sampling run
res_10p_4min

In [None]:
# Function to plot a bar chart of feature rankings (corresponding to the top 10% of features selected by the RRCT algorithm)

def plot_rank_bar_chart(x, y, labels, fname=None):
    """Draw one feature-ranking bar chart per feature group and save the figure.

    The panels are placed side by side on a single row; the grid width follows
    ``len(x)``, so any number of feature groups can be plotted.

    Parameters
    ----------
    x : sequence of sequences of str
        One collection of feature names per panel; the names are used as the
        bar positions (x tick labels).
    y : sequence
        Rank values shared by every panel. Bars are drawn with heights
        ``y[::-1]`` and bar ``j`` is annotated with ``y[j]``, so the first
        feature of a group gets the tallest bar and the first rank label.
    labels : sequence of str
        Panel captions; ``labels[i]`` is written inside the panel of ``x[i]``.
    fname : str or path-like, optional
        Where to save the figure. Defaults to
        ``{ROOT_DIR}/plots/sig-rank-10p-4min.pdf``.

    Returns
    -------
    None
    """

    if fname is None:
        fname = f"{ROOT_DIR}/plots/sig-rank-10p-4min.pdf"

    fig = plt.figure(figsize=(10, 5))
    n_panels = len(x)

    for i, features in enumerate(x):

        ax = fig.add_subplot(1, n_panels, i+1)
        # Caption placed in axes-fraction coordinates, near the top of the panel.
        ax.text(0.6, 0.95, labels[i], transform=ax.transAxes, fontsize=16, fontweight='bold', va='top')

        # Heights are the ranks reversed, so rank 1 is the tallest bar.
        ax.bar(features, y[::-1], color='brown', ec='black', alpha=0.78)

        # Write the rank itself 10 points above the top of each bar.
        for j, p in enumerate(ax.patches):
            ax.annotate(y[j], (p.get_x()+p.get_width()/2., p.get_height()), ha='left',
                        va='center', xytext=(0, 10), textcoords='offset points', size=14)

        ax.set_xlabel('Top 10% selected features', fontsize=16)
        ax.tick_params(axis='x', rotation=90, labelsize=14)
        ax.set_ylabel('Rankings')
        # The y axis is hidden: the annotations above the bars carry the ranks.
        ax.yaxis.set_visible(False)
        ax.set_frame_on(False)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)

    plt.savefig(fname=fname, bbox_inches='tight')

In [None]:
# Top-10% feature names for each target, listed from rank 1 to rank 6.
eol_features = ['S1-Sig-diff', 'Min-S1', 'Var-S1', 'S21-Sig-100', 'Kurt-S21', 'S21-Sig-diff']
irateol_features = ['Mean-S22', 'Max-S2', 'S12-Sig-50', 'S11-Sig-diff', 'S21-Sig-1', 'S2-Sig-100']

# Panel captions, in the same order as the feature groups in `x`.
labels = ['EOL', 'IRatEOL']
x = [eol_features, irateol_features]
# Ranks 1..6, shared by both panels.
y = list(range(1, 7))

In [None]:
# Draw the ranking chart for both targets and save it to disk
plot_rank_bar_chart(x=x, y=y, labels=labels)