In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from tqdm import tqdm
plt.style.use('default')

sys.path.append(os.path.join(".."))

from utils.data_utils import mod_df, drop_features, inverse_mod_X, inverse_mod_y, convert_to_numpy, apply_standard_scale, apply_std, apply_mean
from my_model import create_model_1, create_model_1_1, create_model_1_2, create_model_2, create_model_2_1
# tf.__version__

In [None]:
# Both splits live side by side in ../data; read each CSV into a DataFrame.
data_dir = os.path.join('..', 'data')
train_df = pd.read_csv(os.path.join(data_dir, 'unionTrain.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'unionTest.csv'))

In [None]:
# Run the project helper `mod_df` on each raw frame (train first, then test);
# it yields an (X, y) pair per frame — presumably features and labels.
(X_train, y_train), (X_test, y_test) = (mod_df(frame) for frame in (train_df, test_df))

In [None]:
def feature_engineering(df):
    """Drop the bookkeeping columns and standard-scale every remaining column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It must contain the ``id`` and ``timestep`` columns,
        which are removed. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns, each one replaced by
        ``apply_standard_scale`` applied to its values.

    Notes
    -----
    ``apply_standard_scale`` is imported from ``utils.data_utils``; whether it
    normalises per sample or over the whole column is not visible from here
    (TODO confirm). It only receives the column values of the frame passed
    in, so the train and test frames are presumably scaled independently.
    """
    # DataFrame.drop returns a new frame, so no explicit copy of df is needed.
    result = df.drop(columns=['id', 'timestep'])

    # Feature-subset experiments kept for reference (all currently disabled):
    # result = drop_features(result,[0,1,2,5,15,16,17,18,21,20,23,24])
    # result = drop_features(result, [15, 16, 17, 18, 20, 21, 23, 24])
    # result = result[['0X', '1Y', '4Y', '5Y', '7X', '8Y', '12Y', '13Y', '14Y', '17Y', '20X', '24X', '24Y']] # ts = 0.09
    # result = result[['1Y','2Y','4Y','5Y','6Y','7X','8Y','11Y','12Y','13Y','14Y','20X','22Y','23X','24X','24Y']] # ts = 0.07 bug
    # result = result[['1Y', '2Y', '4X', '4Y', '5Y', '6Y', '7X', '8Y', '10X', '11Y', '12Y', '13Y', '14X', '14Y', '20X', '21Y', '22Y', '23X', '24X', '24Y']] # ts = 0.04 bug
    # result = result[['1Y', '2Y', '3Y', '4X', '4Y', '5Y', '6Y', '7X', '7Y', '8Y', '10X', '11Y', '12X', '12Y', '13Y', '14X', '14Y', '16Y', '20X', '21Y', '22X', '22Y', '23X', '23Y', '24X', '24Y']] # ts = 0.001 bug

    # Derived per-column features that were tried and are currently disabled:
    # `{col}_max`, `{col}_min`, `{col}_mean`, `{col}_std`, `{col}_is_zero`,
    # `{col}_min_max_scale`, `{col}_standard_scale`, `{col}_fill_with_mean`.
    # The loop that hosted them did no work once they were commented out, so
    # it is not executed any more (it only produced an empty progress bar).

    # Replace every remaining column by its standard-scaled values.
    for col in tqdm(result.columns.to_list()):
        result[col] = apply_standard_scale(result[col].to_numpy())

    return result

# Engineered feature frames for both splits (train first, then test).
X_train_1, X_test_1 = feature_engineering(X_train), feature_engineering(X_test)

# Collapse each label row to a 1-based class id: position of the row maximum,
# plus one (presumably the label frames are one-hot encoded).
y_train_1 = y_train.to_numpy().argmax(axis=1) + 1
y_test_1 = y_test.to_numpy().argmax(axis=1) + 1

In [None]:
def clip_with_each_sample_sd(np_array, sd_mul=2):
    """Clip values to ``mean ± sd_mul * std`` using the project's statistics helpers.

    The input is first passed through ``convert_to_numpy``; the centre and
    spread come from ``apply_mean`` and ``apply_std`` (both imported from
    ``utils.data_utils``). The function name suggests those helpers compute
    the statistics per sample, but that is not visible here — TODO confirm.

    Parameters
    ----------
    np_array : array-like
        Values to clip.
    sd_mul : float, default 2
        Half-width of the allowed band, in standard deviations.

    Returns
    -------
    The clipped array produced by ``np.clip``.
    """
    values = convert_to_numpy(np_array)
    center = apply_mean(values)
    half_width = sd_mul * apply_std(values)
    return np.clip(values, center - half_width, center + half_width)

def clip_with_all_sample_sd(np_array, sd_mul=2):
    """Clip values to ``mean ± sd_mul * std`` using statistics over the whole array.

    The input is first passed through ``convert_to_numpy``. ``np.mean`` and
    ``np.std`` are called without an axis, so a single mean and a single
    standard deviation are used for every element.

    Parameters
    ----------
    np_array : array-like
        Values to clip.
    sd_mul : float, default 2
        Half-width of the allowed band, in standard deviations.

    Returns
    -------
    The clipped array produced by ``np.clip``.
    """
    values = convert_to_numpy(np_array)
    center, spread = np.mean(values), np.std(values)
    lower = center - sd_mul * spread
    upper = center + sd_mul * spread
    return np.clip(values, lower, upper)

In [None]:
def plot_feature_hist(pd_series, bins=100, feature_name=None, range=None, show_avxline=False):
    """Draw a histogram of ``pd_series`` on the current matplotlib axes.

    Parameters
    ----------
    pd_series : array-like exposing ``.mean()`` and ``.std()``
        Values to histogram. This notebook passes both pandas Series and
        NumPy arrays; their ``.std()`` defaults differ (``ddof=1`` for
        pandas, ``ddof=0`` for NumPy), so the guide lines differ slightly
        between the two.
    bins : int, default 100
        Forwarded to ``plt.hist``.
    feature_name : str or None
        Passed to ``plt.title``.
    range : tuple or None
        Forwarded to ``plt.hist``. The name shadows the builtin inside this
        function only; it is kept so existing keyword calls keep working.
    show_avxline : bool, default False
        When true, draw vertical guides at the mean (red), mean ± 1 std
        (blue) and mean ± 2 std (green).

    Returns
    -------
    None. ``plt.show()`` is intentionally not called so that several
    histograms can be overlaid on the same axes by calling this repeatedly.
    """
    plt.hist(pd_series, bins=bins, range=range)
    if show_avxline:
        # Compute the statistics once instead of once per guide line.
        mean, std = pd_series.mean(), pd_series.std()
        plt.axvline(mean, color='red')
        for n_std, color in ((1, 'blue'), (2, 'green')):
            plt.axvline(mean - n_std * std, color=color)
            plt.axvline(mean + n_std * std, color=color)
    plt.title(feature_name)
    # A bare plt.grid() *toggles* the grid, so a second call on the same axes
    # (overlaying two histograms) would switch it off again. Force it on.
    plt.grid(True)

In [None]:
# Overlay two histograms of column '0X' on one figure: the scaled feature with
# mean/std guide lines, and the same feature clipped to mean ± 1 std computed
# over all rows.
feature = X_train_1['0X']
plot_feature_hist(feature, range=(-8, 8), show_avxline=True)
# No plt.plot() between the two calls: without arguments it draws nothing.
plot_feature_hist(clip_with_all_sample_sd(feature, sd_mul=1), range=(-8, 8))
plt.show()

In [None]:
# Number of consecutive rows plotted per figure.
# NOTE(review): presumably the number of timesteps belonging to one sample id
# — confirm against the data.
ROWS_PER_SAMPLE = 854

# For the first five row blocks of column '0X', overlay the raw histogram
# (with mean/std guides) and the histogram clipped to mean ± 0.5 std.
feature = X_train_1['0X']
for i in range(5):
    plt.figure(i)
    # .iloc makes the positional slicing explicit, independent of index labels.
    sample_feature = feature.iloc[i * ROWS_PER_SAMPLE:(i + 1) * ROWS_PER_SAMPLE].to_numpy()
    plot_feature_hist(sample_feature, range=(-8, 8), show_avxline=True)
    plot_feature_hist(clip_with_each_sample_sd(sample_feature, sd_mul=0.5), range=(-8, 8))
# A bare plt.plot() draws nothing; render all five figures explicitly instead.
plt.show()