### FEATURE ENGINEERING

In [17]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from helpers import *
from implementations import *
from crossvalidation import *
from preprocessing import *
from dataset_splitting import *
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Loading train data

In [18]:
# Location of the training CSV, relative to the notebook's working directory.
data_folder = './data/'
filename = 'train.csv'
file_path = ''.join([data_folder, filename])

# load_train_data is a project helper; its four return values are unpacked here.
# Presumably: y = output vector, tx = feature matrix, ids = event ids,
# features = column names -- TODO confirm against the helper's documentation.
y, tx, ids, features = load_train_data(file_path)

Retrieving logical masks to divide the dataset

In [19]:
# Element-wise comparison: True only where the feature name is 'PRI_jet_num'.
is_jet_num = features == 'PRI_jet_num'
# np.where returns a tuple of index arrays; keep the first matching index.
# Raises IndexError if the name is not present in `features`.
categorical_column = np.where(is_jet_num)[0][0]

In [20]:
# Ask the project helper for the three masks derived from the categorical column,
# then name them individually (the helper must return exactly three values).
subset_masks = divide_indices_in_subsets(tx, categorical_column)
mask_0, mask_1, mask_2_3 = subset_masks

Removing the categorical column, since it is no longer needed once the masks have been computed

In [21]:
# Keep the feature names aligned with the data: remove the name of the categorical
# feature used to divide the dataset, then remove the matching column from tx.
# NOTE: both variables are overwritten, so re-running this cell deletes whatever
# column now sits at index `categorical_column`.
features = np.delete(features, categorical_column, axis=None)
tx = np.delete(tx, categorical_column, axis=1)

Splitting the dataset, the output vector and the ids according to each mask

In [22]:
# Call the project helper once per mask, in the order mask_0, mask_1, mask_2_3.
# Each call returns three values, unpacked as (subset, output vector, ids).
(subset_0, y_0, ids_0), (subset_1, y_1, ids_1), (subset_2_3, y_2_3, ids_2_3) = (
    divide_dataset_in_subsets(tx, y, ids, mask)
    for mask in (mask_0, mask_1, mask_2_3)
)

Defining a list containing each subset 

In [23]:
# Positional container for the three subsets: index 0 -> subset_0,
# index 1 -> subset_1, index 2 -> subset_2_3.
list_subsets = [
    subset_0,
    subset_1,
    subset_2_3,
]

Defining a list containing the feature names for each subset

In [24]:
# One entry of feature names per subset.
# `[features]*3` would store the same array object three times, so an in-place
# edit of one entry would silently change the other two; give each slot its own copy.
# `features` is a NumPy array at this point (it is the result of np.delete above).
list_features = [features.copy() for _ in range(3)]

Managing missing values in each subset of data

In [25]:
# Run the project's missing-value handling on every subset. The helper returns the
# processed subset together with its feature names, which replace the entries
# at the same position in both lists.
for idx in range(3):
    cleaned_subset, kept_features = managing_missing_values(list_subsets[idx], features)
    list_subsets[idx] = cleaned_subset
    list_features[idx] = kept_features

The last column in subset_0 is a vector of zeros (see the documentation). Therefore, we drop it to avoid problems when
standardizing (a constant column has zero standard deviation)

In [26]:
# Remove the last column of subset 0 and the matching last feature name.
# NOTE: both entries are overwritten, so re-running this cell drops one more column.
last_column = -1
list_subsets[0] = np.delete(list_subsets[0], last_column, axis=1)
list_features[0] = np.delete(list_features[0], last_column)

Capping outliers in each subset by replacing them with the 5th or 95th percentile

In [27]:
# Replace every subset with the result of the project's outlier-capping helper.
for idx in range(3):
    capped_subset = capping_outliers(list_subsets[idx])
    list_subsets[idx] = capped_subset

Defining trigonometric features (sine and cosine) starting from columns related to angle values

In [28]:
# Column indices passed to `trigonometrics` for each subset.
# NOTE(review): these are hard-coded positions; they presumably point at the angle
# columns after the earlier column removals -- confirm against list_features.
columns_angles_0 = [11, 14, 16]
columns_angles_1 = [11, 14, 16, 20]
columns_angles_2 = [15, 18, 20, 27]

# Same call for every subset: the helper returns the updated subset and feature
# names, which replace the entries at the same position in both lists.
for idx, angle_columns in enumerate((columns_angles_0, columns_angles_1, columns_angles_2)):
    list_subsets[idx], list_features[idx] = trigonometrics(
        list_subsets[idx], angle_columns, list_features[idx]
    )

Applying logarithmic transformation to skewed distributions in each subset

In [None]:
# TODO: do not run until Fede has re-checked which columns are the right ones to transform

#to_log_c0 = [1, 2, 3, 4,6,7,8,10,12,15] 
#to_log_c1 = [1,2,3,4,6,7,8,10,12,15,16,18]
#to_log_c2 = [1,2,3,4,6,9,10,11,14,16,19,20,23,25]

#list_subsets[0][:,to_log_c0] =log_transform(list_subsets[0][:,to_log_c0])
#list_subsets[1][:,to_log_c1] = log_transform(list_subsets[1][:,to_log_c1])
#list_subsets[2][:,to_log_c2] = log_transform(list_subsets[2][:,to_log_c2])

In [None]:
# TODO (Fede): repeat the feature plots and review which features are useless

Introducing interaction factors in each subset by multiplying pairs of columns

In [33]:
# Build interaction factors between columns. The subsets do not have an offset column yet; it will be added later.
# This cell is slow because the number of columns grows a lot: some columns must be removed before doing this expansion.
# Disabled for now:
#for idx in range(3):
    #list_subsets[idx] = build_interaction_factors(list_subsets[idx])

Standardizing data before using them

In [None]:
# Standardize each subset and remember the per-subset statistics returned by the
# helper; list_means[i] and list_std[i] belong to list_subsets[i].
list_means, list_std = [], []
for idx in range(3):
    standardized_subset, mean, std = standardize(list_subsets[idx])
    list_subsets[idx] = standardized_subset
    list_means += [mean]
    list_std += [std]