In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [2]:
# Load the ML-CUP24 training CSV. Lines starting with "#" are skipped as
# comments and the file is read without a header row.
file_path = "./dataset/ML-CUP24-TR.csv"
data = pd.read_csv(file_path, comment="#", header=None)

# Discard the first column (not useful for training).
data = data.drop(columns=data.columns[0])

# Shuffle every row; the fixed random_state yields the same order on each run.
data = data.sample(frac=1, random_state=1)

# Row count and the (float) size of the 80% model-selection portion.
numsample = len(data)
trainingPercentage = numsample * 0.8

# Integer row position at which the data is cut in two.
split_at = int(trainingPercentage)

# First 80% of the shuffled rows: used for model selection.
selectionSet = data.iloc[:split_at, :]

# Remaining rows: held out for the final test.
testSet = data.iloc[split_at:, :]

In [3]:
# Preview the first rows of the model-selection split.
selectionSet.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
67,0.658214,-0.746251,-1.075565,0.749953,0.661491,-8.2e-05,0.06576,-0.074431,0.995056,4.954183,1.175069,-0.117254,-0.634115,-0.624596,10.122236
249,0.512589,-0.852849,-1.065483,0.857103,0.515145,5e-06,0.051254,-0.085286,0.995037,10.747492,1.001451,-0.100134,-1.483401,-0.823175,15.22603
230,-0.9106,0.401268,-1.091666,-0.403227,-0.9151,-0.000216,-0.090637,0.039703,0.995092,2.876918,1.467926,-0.145843,0.237865,0.755455,7.693303
161,-0.884819,0.45519,-1.066643,-0.457457,-0.889232,-2.1e-05,-0.088473,0.045491,0.995039,8.18751,1.021079,-0.102075,0.595585,1.224458,13.698484
91,-0.886405,-0.453609,-1.19489,0.455947,-0.889994,-0.004808,-0.079991,-0.046359,0.995717,0.398413,4.393699,-0.389459,-0.296103,0.080915,2.885263


In [4]:
# Summarise the model-selection split: row count, column dtypes, null counts.
selectionSet.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 67 to 198
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1       200 non-null    float64
 1   2       200 non-null    float64
 2   3       200 non-null    float64
 3   4       200 non-null    float64
 4   5       200 non-null    float64
 5   6       200 non-null    float64
 6   7       200 non-null    float64
 7   8       200 non-null    float64
 8   9       200 non-null    float64
 9   10      200 non-null    float64
 10  11      200 non-null    float64
 11  12      200 non-null    float64
 12  13      200 non-null    float64
 13  14      200 non-null    float64
 14  15      200 non-null    float64
dtypes: float64(15)
memory usage: 25.0 KB


In [5]:
# Preview the first rows of the held-out test split.
testSet.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
131,-0.225215,-0.797034,0.359026,0.798644,-0.601577,-0.016516,0.060769,0.053311,0.996727,-0.160117,8.388224,0.545529,-1.538677,0.09432,-2.509624
237,0.248795,-0.963758,-1.153674,0.968214,0.250117,-0.001716,0.025738,-0.092805,0.995352,0.784149,2.882221,-0.275426,-0.381338,-0.248614,3.529721
30,0.765137,-0.636133,-1.065623,0.639306,0.768952,9e-06,0.076505,-0.063618,0.995038,11.081002,1.003821,-0.100368,-0.981587,-1.324267,17.842261
200,-0.984689,0.148957,0.605423,-0.150138,-0.988645,-0.006345,0.088576,-0.019842,0.995872,-0.329656,4.993656,0.42614,0.109942,0.027452,-2.324193
121,-0.253753,0.962166,0.904337,-0.966941,-0.255,-0.000115,0.025192,-0.095973,0.995065,-4.149985,1.256344,0.125228,0.762072,0.21713,-9.108539


In [6]:
# Summarise the held-out test split: row count, column dtypes, null counts.
testSet.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 131 to 37
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1       50 non-null     float64
 1   2       50 non-null     float64
 2   3       50 non-null     float64
 3   4       50 non-null     float64
 4   5       50 non-null     float64
 5   6       50 non-null     float64
 6   7       50 non-null     float64
 7   8       50 non-null     float64
 8   9       50 non-null     float64
 9   10      50 non-null     float64
 10  11      50 non-null     float64
 11  12      50 non-null     float64
 12  13      50 non-null     float64
 13  14      50 non-null     float64
 14  15      50 non-null     float64
dtypes: float64(15)
memory usage: 6.2 KB


In [7]:
# Convert both splits to NumPy arrays so they can be sliced positionally.
# np.asarray is used instead of DataFrame.to_numpy() so this cell is
# idempotent: if it is re-run when the names already hold ndarrays it returns
# them unchanged instead of raising AttributeError.
selectionSet = np.asarray(selectionSet)
testSet = np.asarray(testSet)

In [8]:
# Every column except the last three is an input feature; the last three
# columns are the regression targets.
x_training, y_training = selectionSet[:, :-3], selectionSet[:, -3:]
x_test, y_test = testSet[:, :-3], testSet[:, -3:]


In [9]:
# Hold out 20% of the model-selection data as a validation set; the fixed
# random_state keeps the partition the same on every run.
x_train, x_val, y_train, y_val = train_test_split(
    x_training,
    y_training,
    test_size=0.2,
    random_state=42,
)



In [10]:
def create_model(hidden_layers, neurons_per_layer, activation, learning_rate,
                 input_dim=12, output_dim=3):
    """Build and compile a fully connected regression network.

    Layout: an input of ``input_dim`` features, one Dense layer of
    ``neurons_per_layer`` units, then ``hidden_layers`` further Dense layers
    of the same width, and finally a Dense output layer of ``output_dim``
    units with no activation (linear).

    Note that the first Dense layer is itself a hidden layer, so the network
    contains ``hidden_layers + 1`` hidden Dense layers in total.

    Args:
        hidden_layers: Number of Dense layers added after the first one.
        neurons_per_layer: Width of every hidden Dense layer.
        activation: Activation passed to every hidden Dense layer.
        learning_rate: Learning rate passed to the Adam optimizer.
        input_dim: Number of input features (defaults to 12).
        output_dim: Number of regression outputs (defaults to 3).

    Returns:
        A compiled ``Sequential`` model using Adam, ``'mse'`` as the loss and
        ``'mse'`` as the single metric, so ``evaluate`` returns
        ``[loss, mse]``.
    """
    model = Sequential()
    # Declare the input shape with an explicit Input object. Passing
    # `input_dim` to the first Dense layer of a Sequential model makes Keras
    # emit a UserWarning recommending this form instead.
    model.add(tf.keras.Input(shape=(input_dim,)))
    # First hidden layer (always present, independent of `hidden_layers`)
    model.add(Dense(neurons_per_layer, activation=activation))
    # Additional hidden layers
    for _ in range(hidden_layers):
        model.add(Dense(neurons_per_layer, activation=activation))
    # Output layer - no activation given, so it is linear
    model.add(Dense(output_dim))
    # Compile model
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mse'])
    return model


In [11]:
from sklearn.model_selection import ParameterGrid

# Seed applied before every configuration. The data splits are already
# seeded; without this the weight initialisation and batch shuffling differ
# on every run, so the selected "best" configuration is not repeatable.
GRID_SEARCH_SEED = 42

# Define hyperparameter grid
param_grid = {
    'hidden_layers': [1,2],
    'neurons_per_layer': [64, 32 ,8],
    'activation': ['relu','elu'],
    'learning_rate': [ 0.01, 0.001, 0.1],
    'batch_size': [32, 64],
    'epochs': [500]
}

# Every combination of the values above is tried once.
grid = ParameterGrid(param_grid)

# Best configuration seen so far, judged by validation MSE (lower is better).
best_model = None
best_mse = float('inf')
best_params = None

for params in grid:
    print(f"Testing parameters: {params}")
    # Reset the Python, NumPy and TensorFlow random generators so each
    # configuration starts from the same random state.
    # NOTE(review): bit-exact repeatability on GPU may additionally require
    # deterministic ops to be enabled - confirm if that matters here.
    tf.keras.utils.set_random_seed(GRID_SEARCH_SEED)
    # Create model
    model = create_model(params['hidden_layers'], params['neurons_per_layer'], params['activation'], params['learning_rate'])
    # Train model
    history = model.fit(x_train, y_train, 
                        validation_data=(x_val, y_val),
                        batch_size=params['batch_size'], 
                        epochs=params['epochs'], 
                        verbose=0)
    # Evaluate on validation set after the final epoch
    val_mse = model.evaluate(x_val, y_val, verbose=0)[1]  # [1] for 'mse' metric
    print(f"Validation MSE: {val_mse}")
    
    # Save best model. A NaN score (diverged training) never compares as
    # smaller, so such a configuration is never selected.
    if val_mse < best_mse:
        best_mse = val_mse
        best_model = model
        best_params = params

Testing parameters: {'activation': 'relu', 'batch_size': 32, 'epochs': 500, 'hidden_layers': 1, 'learning_rate': 0.01, 'neurons_per_layer': 64}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Validation MSE: 0.4031205177307129
Testing parameters: {'activation': 'relu', 'batch_size': 32, 'epochs': 500, 'hidden_layers': 1, 'learning_rate': 0.01, 'neurons_per_layer': 32}
Validation MSE: 0.4407014846801758
Testing parameters: {'activation': 'relu', 'batch_size': 32, 'epochs': 500, 'hidden_layers': 1, 'learning_rate': 0.01, 'neurons_per_layer': 8}
Validation MSE: 0.37223994731903076
Testing parameters: {'activation': 'relu', 'batch_size': 32, 'epochs': 500, 'hidden_layers': 1, 'learning_rate': 0.001, 'neurons_per_layer': 64}
Validation MSE: 0.39119383692741394
Testing parameters: {'activation': 'relu', 'batch_size': 32, 'epochs': 500, 'hidden_layers': 1, 'learning_rate': 0.001, 'neurons_per_layer': 32}
Validation MSE: 0.3138672113418579
Testing parameters: {'activation': 'relu', 'batch_size': 32, 'epochs': 500, 'hidden_layers': 1, 'learning_rate': 0.001, 'neurons_per_layer': 8}
Validation MSE: 0.36888784170150757
Testing parameters: {'activation': 'relu', 'batch_size': 32, 'epoc

In [12]:
# Score the model chosen by the grid search on the held-out test split.
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the 'mse' metric.
test_scores = best_model.evaluate(x_test, y_test, verbose=0)
test_mse = test_scores[1]

print(f"Best Model Test MSE: {test_mse}")
print(f"Best Hyperparameters: {best_params}")


Best Model Test MSE: 0.43930351734161377
Best Hyperparameters: {'activation': 'relu', 'batch_size': 64, 'epochs': 500, 'hidden_layers': 1, 'learning_rate': 0.001, 'neurons_per_layer': 8}
