### PREPROCESSING

Executing preprocessing routine

In [67]:
# IPython magic: execute the notebook feature_engineering.ipynb before continuing.
# NOTE(review): the cells below use names that are not defined in this notebook
# (e.g. np, load_test_data, columns_to_drop_in_subsets, list_means, list_std,
# list_subsets, y_0) -- presumably they are all created by this run; confirm.
%run ./feature_engineering.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Defining the optimal hyperparameter values (see the notebook choosing_hyperparameters.ipynb for more details)

In [68]:
# One polynomial degree and one ridge penalty per subset (three subsets in total).
best_degrees = [7] * 3
best_lambdas = [1e-6] * 3

### LOADING TEST DATA

Loading test data

In [69]:
# Build the path of the test CSV and load it: the loader returns the data matrix,
# the ids and the feature names, in that order.
data_folder = './data/'
filename = 'test.csv'
file_path = f'{data_folder}{filename}'
tx_test, test_ids, features_test = load_test_data(file_path)

Retrieving logical masks to divide test dataset

In [70]:
# Position of the first feature name equal to 'PRI_jet_num'
# (raises IndexError if the name is not present).
categorical_column = np.nonzero(features_test == 'PRI_jet_num')[0][0]

In [71]:
# One mask per subset, computed from the categorical column of the test matrix.
(
    mask_0_test,
    mask_1_test,
    mask_2_3_test,
) = divide_indices_in_subsets(tx_test, categorical_column)

Removing categorical column since it is now useless

In [72]:
# The masks have been built, so the categorical column is no longer needed.
# Drop its name and its data column together so that feature names stay aligned
# with the columns of tx_test.
features_test = np.delete(features_test, categorical_column)
tx_test = np.delete(tx_test, categorical_column, axis=1)

Splitting the test dataset and the corresponding ids according to the masks

In [73]:
# Apply each mask in turn (0, 1, 2_3) to get the matching (subset, ids) pair.
(
    (subset_0_test, ids_0_test),
    (subset_1_test, ids_1_test),
    (subset_2_3_test, ids_2_3_test),
) = (
    divide_test_dataset_in_subsets(tx_test, test_ids, subset_mask)
    for subset_mask in (mask_0_test, mask_1_test, mask_2_3_test)
)

Defining a list containing each subset 

In [74]:
# Test subsets gathered in a fixed order: index 0 -> subset 0, 1 -> subset 1,
# 2 -> subset 2_3.
list_subsets_test = [
    subset_0_test,
    subset_1_test,
    subset_2_3_test,
]

Define a list containing features for each subset

In [75]:
# Feature names for each of the three test subsets.
# Use the names read from the test file (features_test): they are the ones
# aligned with the columns of tx_test -- the categorical column was located
# through them and removed from both together. The previous code started from
# `features`, a name that is not defined in this notebook.
# All three entries start out as the same array; the cells below rebind each
# entry to a new array instead of modifying it in place.
list_features_test = [features_test] * 3

Dropping columns as done for train dataset and managing remaining missing values

In [76]:
# For every subset: keep only the columns listed in columns_to_drop_in_subsets[idx]
# (the entry is used as a column selector for both the data and the names), then
# replace each NaN with the NaN-ignoring median of its own column.
for idx in range(3):
    selected = columns_to_drop_in_subsets[idx]
    current = list_subsets_test[idx][:, selected]
    list_features_test[idx] = list_features_test[idx][selected]
    missing = np.isnan(current)
    for col in range(current.shape[1]):
        # the median is computed on this subset's own values for the column
        current[missing[:, col], col] = np.nanmedian(current[:, col])
    list_subsets_test[idx] = current

The last column in subset_0 is a vector of zeros (see the documentation). Therefore, we drop it to avoid problems when
standardizing (a constant column has zero standard deviation)

In [77]:
# Remove the last feature of subset 0: its name first, then its data column.
list_features_test[0] = np.delete(list_features_test[0], -1)
list_subsets_test[0] = np.delete(list_subsets_test[0], -1, axis=1)

Defining trigonometric features (sine and cosine) starting from columns related to angle values

In [78]:
# Indices of the angle columns in each subset.
columns_angles_0 = [11, 14, 16]
columns_angles_1 = [11, 14, 16, 20]
columns_angles_2 = [15, 18, 20, 27]

# trigonometrics returns the updated data matrix and the updated feature names.
for idx, angle_columns in enumerate((columns_angles_0, columns_angles_1, columns_angles_2)):
    list_subsets_test[idx], list_features_test[idx] = trigonometrics(
        list_subsets_test[idx], angle_columns, list_features_test[idx]
    )

Applying logarithmic transformation to skewed distributions in each subset

In [79]:
# Indices of the columns to log-transform in each subset.
to_log_c0 = [0, 1, 2, 3, 5, 6, 7, 9, 11, 13, 14]
to_log_c1 = [0, 1, 2, 3, 5, 6, 7, 9, 11, 13, 14, 15, 17]
to_log_c2 = [0, 1, 2, 3, 5, 8, 9, 10, 13, 15, 17, 18, 19, 22, 24]

# Overwrite the selected columns of each subset with their transformed values.
for idx, log_columns in enumerate((to_log_c0, to_log_c1, to_log_c2)):
    list_subsets_test[idx][:, log_columns] = log_transform(list_subsets_test[idx][:, log_columns])

Handling outliers by replacing them with the 5th or the 95th percentile

In [80]:
# Replace every subset with its outlier-capped version.
for idx, subset in enumerate(list_subsets_test):
    list_subsets_test[idx] = capping_outliers(subset)

Dropping columns corresponding to useless variables in each subset

In [81]:
# Subset 0: indices of all columns minus those listed in useless_c0,
# applied to both the data and the feature names.
all_columns_0 = np.arange(list_subsets_test[0].shape[1])
useful_c0 = np.delete(all_columns_0, useless_c0)
list_features_test[0] = list_features_test[0][useful_c0]
list_subsets_test[0] = list_subsets_test[0][:, useful_c0]

In [82]:
# Subset 1: indices of all columns minus those listed in useless_c1,
# applied to both the data and the feature names.
all_columns_1 = np.arange(list_subsets_test[1].shape[1])
useful_c1 = np.delete(all_columns_1, useless_c1)
list_features_test[1] = list_features_test[1][useful_c1]
list_subsets_test[1] = list_subsets_test[1][:, useful_c1]

In [83]:
# Subset 2_3: indices of all columns minus those listed in useless_c2,
# applied to both the data and the feature names.
all_columns_2 = np.arange(list_subsets_test[2].shape[1])
useful_c2 = np.delete(all_columns_2, useless_c2)
list_features_test[2] = list_features_test[2][useful_c2]
list_subsets_test[2] = list_subsets_test[2][:, useful_c2]

Standardizing test data

In [84]:
# Standardize each test subset with the matching entries of list_means and list_std.
for idx in range(3):
    centered = list_subsets_test[idx] - list_means[idx]
    list_subsets_test[idx] = centered / list_std[idx]

Expanding both the test and the train datasets according to the degrees in best_degrees

In [85]:
# Expand train and test subsets with the same degree and the same
# trigonometric-feature count so that both end up with matching columns.
# Both lists are overwritten, so re-running this cell expands the data again.
for idx in range(3):
    degree = best_degrees[idx]
    n_trig = how_many_trig_features[idx]
    list_subsets[idx] = build_poly(list_subsets[idx], degree, n_trig)
    list_subsets_test[idx] = build_poly(list_subsets_test[idx], degree, n_trig)

Training ridge regression model for each subset

In [86]:
# Fit one ridge model per subset; only the first value returned by
# ridge_regression (kept in final_ws) is used, the second is discarded.
final_ws = [0] * 3
for idx, y_subset in enumerate((y_0, y_1, y_2_3)):
    final_ws[idx], _ = ridge_regression(y_subset, list_subsets[idx], best_lambdas[idx])

Computing predictions for each test subset

In [87]:
# Predict on each test subset with the weights trained for that subset.
prediction_0, prediction_1, prediction_2 = (
    predict_ridge(subset, weights)
    for subset, weights in zip(list_subsets_test, final_ws)
)

# Stack ids and predictions in the same subset order (0, 1, 2_3) so they stay
# paired, then let reordering_predictions put them in their final order.
all_ids = np.concatenate((ids_0_test, ids_1_test, ids_2_3_test))
all_predictions = np.concatenate((prediction_0, prediction_1, prediction_2))
all_predictions, all_ids = reordering_predictions(all_predictions, all_ids)

Creating final submission

In [88]:
# Write ids and predictions to the submission file, using the column
# names 'Id' and 'Prediction'.
create_submission(
    all_ids,
    all_predictions,
    ['Id','Prediction'],
    './output/ridge_regression_final.csv',
)