In [1]:
import os
import numpy as np
import pandas as pd
from utils import *
import warnings
# to ignore pandas warning
warnings.simplefilter(action='ignore', category=FutureWarning)
from NeuralNetwork import  *
from Ensemble import *
import matplotlib.pyplot as plt

The training set is split 70% / 10% / 20%: 70% is used to perform 5-fold cross-validation on the models, 10% is used for early stopping, and 20% is kept as an internal test set.

In [2]:
# Columns kept from every CSV read below: ten inputs followed by three targets.
# Column 0 of each file is skipped via `usecols`.
col_names = ('Input1', 'Input2', 'Input3', 'Input4', 'Input5', 'Input6',
             'Input7', 'Input8', 'Input9', 'Input10', 'TARGET_x', 'TARGET_y', 'TARGET_z')

# Fixed seed for the one-off shuffle, so that deleting the cached CSVs and
# re-running the notebook recreates the same partition.
SPLIT_SEED = 42

# Cached partition, in the order (training, internal test, early stopping).
split_paths = ("train_split.csv", "test_split.csv", "es_split.csv")

if all(os.path.isfile(path) for path in split_paths):
    # The cached files were written by `to_csv` below: row 0 is the header and
    # column 0 is the saved index, hence skiprows=1 and usecols starting at 1.
    tr_data, test_data, es_data = [
        pd.read_csv(path, skiprows=1, usecols=[i for i in range(1, 14)], names=col_names)
        for path in split_paths
    ]
else:
    # At least one cached file is missing: rebuild all three together so that
    # the partitions can never come from two different shuffles.
    data = pd.read_csv("../ML-23-PRJ-Package/ML-CUP23-TR.csv",
                       skiprows=7, usecols=[i for i in range(1, 14)], names=col_names)
    data = data.sample(frac=1, random_state=SPLIT_SEED)
    # SPLIT 70/10/20 FOR TRAIN / EARLY STOPPING / TEST:
    # ten near-equal chunks -> 2 for test, 7 for training, 1 for early stopping.
    # The row positions are split (rather than the DataFrame itself) so that the
    # result does not depend on NumPy's handling of DataFrame objects.
    chunk_positions = np.array_split(np.arange(len(data)), 10)
    folds = [data.iloc[positions] for positions in chunk_positions]
    tr_data = pd.concat(folds[2:9])
    test_data = pd.concat(folds[0:2])
    es_data = folds[9]
    tr_data.to_csv("train_split.csv")
    test_data.to_csv("test_split.csv")
    es_data.to_csv("es_split.csv")



## Gridsearch
We will test some hyperparameter combinations in order to get the best model.
<p> Every model is trained with 5-fold cross-validation, using as tr-set a 70% split of the original tr-set (10% is used for early stopping). MSE is used as the score on the validation set (within the 5-fold) to select the best model.
<p> The following is only one of the grid searches performed.

In [None]:
# "Full batch" size: 80% of the training split, kept as an int because it is used
# as a mini-batch size (a sample count) in the grid below.
# NOTE(review): 0.8 presumably is the share of data each model sees inside the
# 5-fold cross-validation -- confirm against parallel_grid_search.
fullBatch = int(tr_data.shape[0] * 0.8)

# Candidate values for every hyperparameter; the search space is built from them
# by get_search_space.
grid = {
    "eta": [0.001, 0.005, 0.0001],
    "mb": [1, 8, 128, fullBatch],
    "momentum": [0.9, 0.6, 0.5, 0.7, 0.8, 0.95],
    "n_layers": [2],
    "n_neurons": [50, 100, 150, 200],
    "epochs": [500],
    "clip_value": [None],
    "hid_act_fun": ["tanh"],
    "out_act_fun": ["linear"],
    "cost_fun": ["eucl"],
    "ridge_lambda": [None, 1e-8],
    "lasso_lambda": [None],
    "decay_max_steps": [None, 100],
    "decay_min_value": [10],
    "es_patience": [30],
}

search_space = get_search_space(grid)
# Number of configurations that will be evaluated.
print(len(search_space))
parallel_grid_search(k=5, data=tr_data, es_data=es_data, search_space=search_space,
                     n_inputs=10, n_outputs=3, type="cup")

Now we pick the best 5 models out of the grid search and will use them as an ensemble model.

The best results:

1. {'eta': 0.0001, 'mb': 1, 'momentum': 0.8, 'n_layers': 3, 'n_neurons': 200, 'epochs': 500, 'clip_value': None, 'hid_act_fun': 'tanh', 'out_act_fun': 'linear', 'cost_fun': 'eucl', 'ridge_lambda': 1e-08, 'lasso_lambda': None, 'decay_max_steps': None, 'decay_min_value': 10, 'es_patience': 30}<br>
Validation mean = 0.5776027113455716, Variance = 0.0026117155270263376<br>
Training mean (ES) = 0.2442599815747089

2. {'eta': 0.0001, 'mb': 1, 'momentum': 0.6, 'n_layers': 3, 'n_neurons': 200, 'epochs': 500, 'clip_value': None, 'hid_act_fun': 'tanh', 'out_act_fun': 'linear', 'cost_fun': 'eucl', 'ridge_lambda': 1e-08, 'lasso_lambda': None, 'decay_max_steps': None, 'decay_min_value': 10, 'es_patience': 30}<br>
Validation mean = 0.6003179533234724, Variance = 0.001471646127359402<br>
Training mean (ES) = 0.2625431767735682

3. {'eta': 0.0001, 'mb': 1, 'momentum': 0.9, 'n_layers': 2, 'n_neurons': 200, 'epochs': 500, 'clip_value': None, 'hid_act_fun': 'tanh', 'out_act_fun': 'linear', 'cost_fun': 'eucl', 'ridge_lambda': None, 'lasso_lambda': None, 'decay_max_steps': None, 'decay_min_value': 10, 'es_patience': 30}<br>
Validation mean = 0.6108271204955864, Variance = 0.002266016486143601<br>
Training mean (ES) = 0.25655232735599054

4. {'eta': 0.0001, 'mb': 8, 'momentum': 0.9, 'n_layers': 3, 'n_neurons': 200, 'epochs': 500, 'clip_value': None, 'hid_act_fun': 'tanh', 'out_act_fun': 'linear', 'cost_fun': 'eucl', 'ridge_lambda': None, 'lasso_lambda': None, 'decay_max_steps': None, 'decay_min_value': 10, 'es_patience': 30}<br>
Validation mean = 0.617711172518093, Variance = 0.0056700694722032425<br>
Training mean (ES) = 0.26035793552839165

5. {'eta': 0.0001, 'mb': 1, 'momentum': 0.5, 'n_layers': 3, 'n_neurons': 200, 'epochs': 500, 'clip_value': None, 'hid_act_fun': 'tanh', 'out_act_fun': 'linear', 'cost_fun': 'eucl', 'ridge_lambda': None, 'lasso_lambda': None, 'decay_max_steps': None, 'decay_min_value': 10, 'es_patience': 30}<br>
Validation mean = 0.6335390644035537, Variance = 0.004222898732823709<br>
Training mean (ES) = 0.2599176722840955


## Ensemble

Now we try to validate the ensemble model comparing the results of a 5-fold performed on the ensemble model and on the best single model (with and without using early stopping).<br>
(tr_set + es_data) which is an 80% split of the original training_set is used to perform 5-folds.<br>
The best model with respect to the average validation MEE will be retrained using the whole 80% split as the tr_set and tested on the internal test_set (the 20% split of the original training_set).

In [None]:
# lr decay is not used as a parameter because none of the best 5 models uses it

# Settings of the 5 ensemble members: list entries are per model (position i
# belongs to model i of the "best results" list), scalar entries are shared.
# The "es_stop" values are the "Training mean (ES)" figures reported for the
# grid-search winners, truncated to three decimals.
train_params = {
    "eta": [0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
    "mb": [1, 1, 1, 8, 1],
    "momentum": [0.8, 0.6, 0.9, 0.9, 0.5],
    "es_stop": [0.244, 0.262, 0.256, 0.260, 0.259],
    "hid_act_fun": "tanh",
    "out_act_fun": "linear",
    "cost_fun": "eucl",
    "ridge_lambda": [1e-8, 1e-8, None, None, None],
}

# 80% development set = training split + early-stopping split.
# It gets its own name so that `tr_data` is not mutated (re-running an earlier
# cell would otherwise silently see 800 rows instead of 700). The 700-row check
# is kept because other cells may already have merged es_data into tr_data.
dev_data = pd.concat([tr_data, es_data]) if tr_data.shape[0] == 700 else tr_data
# Both splits carry their own 0..n-1 index when read from the cached CSVs;
# reset it so that the merged frame has unique row labels.
dev_data = dev_data.reset_index(drop=True)

# now we instantiate the 5 best models
# one tuple per model -- presumably (n_layers, n_neurons), matching the list above
modelStructures = [(3, 200), (3, 200), (2, 200), (3, 200), (3, 200)]

tr_mean, valid_mean, valid_var = k_fold_ensemble(5, dev_data, modelStructures, train_params,
                                                 progress_bar=True, epochs=2000)


5-fold mee validation = **0.519**

In [None]:
# 5 fold with es
# Hyperparameters of the best single model from the grid search (entry 1 of the
# "best results" list), with the epoch budget raised to 2000.
parameters = {
    'eta': 0.0001,
    'mb': 1,
    'momentum': 0.8,
    'n_layers': 3,
    'n_neurons': 200,
    'epochs': 2000,
    'clip_value': None,
    'hid_act_fun': 'tanh',
    'out_act_fun': 'linear',
    'cost_fun': 'eucl',
    'ridge_lambda': 1e-08,
    'lasso_lambda': None,
    'decay_max_steps': None,
    'decay_min_value': 10,
    'es_patience': 30,
}

# 80% development set = training split + early-stopping split.
# It gets its own name so that `tr_data` is not mutated across cells; the
# 700-row check is kept because other cells may already have merged es_data.
dev_data = pd.concat([tr_data, es_data]) if tr_data.shape[0] == 700 else tr_data
# Reset the index so the merged frame has unique row labels (both splits carry
# their own 0..n-1 index when read from the cached CSVs).
dev_data = dev_data.reset_index(drop=True)

# "Training mean (ES)" reported for this model by the grid search, truncated.
# NOTE(review): presumably the training-error level at which k_fold stops
# training -- confirm against k_fold.
ES_STOP = 0.244
k_fold(5, dev_data, parameters, None, "cup", 10, 3, es_stop=ES_STOP)


5-fold mee validation = **0.560**

In [None]:
# 5 fold without es
# Same best single-model configuration as in the early-stopping run, evaluated
# here with es_stop=None for comparison.
parameters = {
    'eta': 0.0001,
    'mb': 1,
    'momentum': 0.8,
    'n_layers': 3,
    'n_neurons': 200,
    'epochs': 2000,
    'clip_value': None,
    'hid_act_fun': 'tanh',
    'out_act_fun': 'linear',
    'cost_fun': 'eucl',
    'ridge_lambda': 1e-08,
    'lasso_lambda': None,
    'decay_max_steps': None,
    'decay_min_value': 10,
    'es_patience': 30,
}

# 80% development set = training split + early-stopping split.
# It gets its own name so that `tr_data` is not mutated across cells; the
# 700-row check is kept because other cells may already have merged es_data.
dev_data = pd.concat([tr_data, es_data]) if tr_data.shape[0] == 700 else tr_data
# Reset the index so the merged frame has unique row labels (both splits carry
# their own 0..n-1 index when read from the cached CSVs).
dev_data = dev_data.reset_index(drop=True)

k_fold(5, dev_data, parameters, None, "cup", 10, 3, es_stop=None)

5-fold mee validation = **0.538**

So the best model is the ensemble. It will now be trained without 5-fold cross-validation and tested on the internal test set.

In [None]:
# Settings of the 5 ensemble members: list entries are per model, scalar
# entries are shared by all of them.
# NOTE(review): "cost_fun" is "mse" here, while the ensemble validated with
# 5-fold CV used "eucl" and the "es_stop" thresholds come from grid-search runs
# with cost_fun "eucl". Confirm in Ensemble.train_models that this is intended.
train_params = {
    "eta": [0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
    "mb": [1, 1, 1, 8, 1],
    "momentum": [0.8, 0.6, 0.9, 0.9, 0.5],
    "es_stop": [0.244, 0.262, 0.256, 0.260, 0.259],
    "hid_act_fun": "tanh",
    "out_act_fun": "linear",
    "cost_fun": "mse",
    "ridge_lambda": [1e-8, 1e-8, None, None, None],
}

# 80% development set = training split + early-stopping split.
# It gets its own name so that `tr_data` is not mutated across cells; the
# 700-row check is kept because other cells may already have merged es_data.
dev_data = pd.concat([tr_data, es_data]) if tr_data.shape[0] == 700 else tr_data
# Reset the index so the merged frame has unique row labels (both splits carry
# their own 0..n-1 index when read from the cached CSVs).
dev_data = dev_data.reset_index(drop=True)

# now we instantiate the 5 best models
# one tuple per model -- presumably (n_layers, n_neurons)
modelStructures = [(3, 200), (3, 200), (2, 200), (3, 200), (3, 200)]
ensemble = Ensemble(modelStructures)
# Train on the development set and evaluate on the internal test set.
test_mse, train_mse, test_mee, train_mee = ensemble.train_models(
    dev_data, train_params, test_data, "eucl", epochs=2000)


Now with the best model we can make predictions on the blind test set.

In [93]:
# The blind test set has the ten input columns only (no targets). A separate
# name is used so that the 13-column `col_names` needed by the split-loading
# cell is not overwritten.
input_col_names = ('Input1', 'Input2', 'Input3', 'Input4', 'Input5', 'Input6',
                   'Input7', 'Input8', 'Input9', 'Input10')

test = pd.read_csv("../ML-23-PRJ-Package/ML-CUP23-TS.csv",
                   skiprows=7, usecols=[i for i in range(1, 11)], names=input_col_names)

# One ensemble prediction per test pattern. The rows are collected in a list and
# turned into a DataFrame once, instead of re-concatenating a growing DataFrame
# at every iteration. `ensemble` is the model trained in the previous section.
predicted_rows = [
    np.asarray(
        ensemble.forwardPropagation(pattern, None, "tanh", "linear", None, onlyPrediction=True)
    ).reshape(-1)
    for pattern in test.itertuples(index=False, name=None)
]
predictions = pd.DataFrame(predicted_rows)
# The index is written as the first column of the output file: number rows from 1.
predictions.index += 1

header = ["# Giuseppe De Marco, Alberto Dicembre", "# Exploding gradients", "# ML-CUP23", "# 08/07/2024"]

# Write the comment header and the predictions through the same handle, so the
# file is opened only once.
with open("Exploding_gradients_ML-CUP23-TS.csv", 'w', newline='') as f:
    for line in header:
        f.write(line + '\n')
    predictions.to_csv(f, header=False)

Finally, we instantiate 5 ensemble models to compute the variance of the MEE test results.

In [None]:
# Settings of the 5 ensemble members: list entries are per model, scalar
# entries are shared by all of them.
# NOTE(review): "cost_fun" is "mse" here, while the ensemble validated with
# 5-fold CV used "eucl" -- confirm in Ensemble.train_models that this is intended.
train_params = {
    "eta": [0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
    "mb": [1, 1, 1, 8, 1],
    "momentum": [0.8, 0.6, 0.9, 0.9, 0.5],
    "es_stop": [0.244, 0.262, 0.256, 0.260, 0.259],
    "hid_act_fun": "tanh",
    "out_act_fun": "linear",
    "cost_fun": "mse",
    "ridge_lambda": [1e-8, 1e-8, None, None, None],
}

# 80% development set = training split + early-stopping split.
# It gets its own name so that `tr_data` is not mutated across cells; the
# 700-row check is kept because other cells may already have merged es_data.
dev_data = pd.concat([tr_data, es_data]) if tr_data.shape[0] == 700 else tr_data
# Reset the index so the merged frame has unique row labels (both splits carry
# their own 0..n-1 index when read from the cached CSVs).
dev_data = dev_data.reset_index(drop=True)

# Number of independently initialised ensembles to train.
N_RUNS = 5

# One entry per run, each being whatever train_models returns for that metric.
test_mse = []
train_mse = []
test_mee = []
train_mee = []

# one tuple per model -- presumably (n_layers, n_neurons)
modelStructures = [(3, 200), (3, 200), (2, 200), (3, 200), (3, 200)]
for _ in range(N_RUNS):
    # A loop-local name is used so that the global `ensemble` (the model that
    # produced the blind-test predictions) is not replaced by these runs.
    run_ensemble = Ensemble(modelStructures)
    run_test_mse, run_train_mse, run_test_mee, run_train_mee = run_ensemble.train_models(
        dev_data, train_params, test_data, "eucl", epochs=2000)
    test_mse.append(run_test_mse)
    test_mee.append(run_test_mee)
    train_mse.append(run_train_mse)
    train_mee.append(run_train_mee)

# NOTE(review): the arguments are passed as (test_mse, train_mse, test_mee,
# train_mee) but the result is unpacked as (train_mse, train_mee, test_mse,
# test_mee) means -- confirm the return order of plot_ensembles.
train_mse_mean, train_mee_mean, test_mse_mean, test_mee_mean = plot_ensembles(test_mse, train_mse, test_mee, train_mee)
# Variance, across runs, of the last test MEE value recorded in each run.
var_test_mee = np.array([run_values[-1] for run_values in test_mee]).var()
print(f"train_MSE = {train_mse_mean[-1]}, test = {test_mse_mean[-1]}\ntrain_MEE = {train_mee_mean[-1]}, test = {test_mee_mean[-1]}")
print(f"mee test variance = {var_test_mee}")
