In [None]:
import os
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from tqdm import tqdm

from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, normalize

from utils.data_utils import mod_df, drop_features, inverse_mod_X, inverse_mod_y, fill_with_mean, apply_standard_scale, apply_std, apply_is_zero
# from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [None]:
# Read both CSV splits from the local ``data`` directory (path is relative to
# the notebook's working directory).
train_df, test_df = (
    pd.read_csv(os.path.join('data', csv_name))
    for csv_name in ('unionTrain.csv', 'unionTest.csv')
)

In [None]:
# mod_df returns a pair per frame; it is unpacked here following the usual
# X (features) / y (labels) naming — presumably that is what the helper
# produces, confirm against utils.data_utils.
(X_train, y_train), (X_test, y_test) = (
    mod_df(split_df) for split_df in (train_df, test_df)
)

In [None]:
def feature_engineering(df):
    """Drop the bookkeeping columns and append a scaled copy of every feature.

    Steps:
      1. Work on a copy of ``df`` so the caller's frame is never modified.
      2. Drop the ``id`` and ``timestep`` columns.
      3. For every remaining column ``c`` append a new column
         ``f'{c}_standard_scale'`` holding ``apply_standard_scale`` of that
         column's NumPy values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id`` and ``timestep`` columns; ``DataFrame.drop``
        raises ``KeyError`` if either is missing.

    Returns
    -------
    pandas.DataFrame
        The original feature columns (unchanged, original order) followed by
        their ``*_standard_scale`` counterparts in the same order.
    """
    # Copy first: the column arrays handed to the helper below must never
    # alias the caller's data.
    result = df.copy().drop(['id', 'timestep'], axis=1)

    # Snapshot the column list before the loop, because the loop adds columns.
    # NOTE(review): apply_standard_scale only sees the column of the frame it
    # is given, so train and test are presumably scaled with their own
    # statistics rather than with statistics fitted on train — confirm this
    # is intended.
    for col in tqdm(result.columns.to_list()):
        result[f'{col}_standard_scale'] = apply_standard_scale(result[col].to_numpy())

    return result

# Engineered feature matrices for both splits.
X_train_1 = feature_engineering(X_train)
X_test_1 = feature_engineering(X_test)

# Reduce each label row to the 1-based position of its largest entry
# (presumably the rows are one-hot class indicators — TODO confirm).
y_train_1 = y_train.to_numpy().argmax(axis=1) + 1
y_test_1 = y_test.to_numpy().argmax(axis=1) + 1

In [None]:
# X_train_1['0X'].hist()

In [None]:
# Column names of the engineered training frame. This is captured after
# feature_engineering, so it also contains the added *_standard_scale columns.
DEFAULT_FEATURE = X_train_1.columns.to_list()
# y_train_1.shape

# Sanity check on one column. A bare expression is only echoed when it is the
# last statement of the cell, so it has to come after the assignment above.
X_train_1['0X'].to_numpy().shape

In [None]:
def plot_feature_hist(pd_series, bins=100, feature_name=None, range=None):
    """Draw a histogram of ``pd_series`` with mean and +/-1, +/-2 std markers.

    Everything is drawn on the current pyplot figure; the function neither
    creates a figure nor calls ``plt.show()``, so the caller decides when the
    figure is rendered.

    Parameters
    ----------
    pd_series : object exposing ``.mean()`` and ``.std()`` (e.g. a pandas Series)
        Values to histogram.
    bins : int, default 100
        Forwarded to ``plt.hist``.
    feature_name : str or None
        Used as the plot title.
    range : tuple or None
        Forwarded to ``plt.hist`` as its ``range`` argument. The name shadows
        the builtin inside this function; it is kept because callers pass it
        by keyword.

    Returns
    -------
    None
    """
    plt.hist(pd_series, bins=bins, range=range)

    centre = pd_series.mean()
    spread = pd_series.std()

    # Red: mean. Blue: mean -/+ 1 std. Green: mean -/+ 2 std.
    plt.axvline(centre, color='red')
    for offset, colour in ((spread, 'blue'), (2*spread, 'green')):
        plt.axvline(centre - offset, color=colour)
        plt.axvline(centre + offset, color=colour)

    plt.title(feature_name)
    plt.grid()

In [None]:
# hist = X_train_1[X_train_1_feature>10].hist(feature, bins=100)
# Histogram of a single column of the engineered training frame.
feature = '1Y'
X_train_1_feature_mod = X_train_1_feature = X_train_1[feature]
# Optional filter kept for experimentation (strictly positive values only):
# X_train_1_feature_mod = X_train_1_feature[X_train_1_feature>0]
plot_feature_hist(
    X_train_1_feature_mod,
    bins=100,
    feature_name=feature,
)

In [None]:
# X_train_1.columns

In [None]:
# One figure per selected column, showing the histogram of its
# *_standard_scale version. The histogram range is fixed to (-8, 8), so values
# outside that interval are not binned. The title is the base column name.
for feature in [
    '4X', '4Y',
    '5X', '5Y',
    '6X', '6Y',
    '8X', '8Y',
    '9X', '9Y',
]:
    # Kept as notebook globals, exactly as before, even though the plot below
    # reads the scaled column directly.
    X_train_1_feature_mod = X_train_1_feature = X_train_1[feature]
    plot_feature_hist(
        X_train_1[f"{feature}_standard_scale"],
        bins=100,
        feature_name=feature,
        range=(-8, 8),
    )
    plt.show()