In [1]:
import optuna
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RepeatedKFold, cross_validate
from sklearn.metrics import r2_score, mean_absolute_error
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Define Model Structure and Objective for Oprimization

In [2]:
def build_model(n_hidden_layers=5, n_hidden_nodes=6, hidden_shrink=0.5, activation='relu', learning_rate_init=1e-3, alpha=1e-5, 
                batch_size=16, random_state=0):
    const_hidden_layer_sizes = np.array([n_hidden_nodes, ] * n_hidden_layers)
    shrink_hidden_layer_sizes = np.linspace(n_hidden_nodes, 1, num=n_hidden_layers)
    hidden_layer_sizes = np.round(const_hidden_layer_sizes * (1 - hidden_shrink) + shrink_hidden_layer_sizes * hidden_shrink).astype(int).tolist()
    
    model = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, 
                         activation=activation, 
                         learning_rate_init=learning_rate_init,
                         alpha=alpha,
                         batch_size=batch_size,
                         random_state=random_state, 
                         max_iter=50_000, tol=0.00001
                        )
    return model


def count_trainable_parameters(model: MLPRegressor):
    n_params = sum(len(np.reshape(w, -1)) for w in model.coefs_)
    n_params += sum(len(np.reshape(b, -1)) for b in model.intercepts_)
    return n_params    

def score_model(**kwargs):
    X, y = df_att_aug[x_cols_std], df_att_aug[y_cols_std[1]].values.flatten()
    model = build_model(**kwargs)
    
    cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=0)
    cv_results = cross_validate(model, X, y, n_jobs=5, cv=cv, 
                                scoring=['r2', 'neg_median_absolute_error'], return_train_score=True)
    val_r2 = np.mean(cv_results['test_r2'])
    val_mae = -np.mean(cv_results['test_neg_median_absolute_error'])
    cv_train_r2 = np.mean(cv_results['train_r2'])
    cv_train_mae = -np.mean(cv_results['train_neg_median_absolute_error'])

    model.fit(X, y)
    train_r2 = r2_score(y, model.predict(X))
    train_mae = mean_absolute_error(y, model.predict(X))

    n_params = count_trainable_parameters(model)
    
    
    return val_r2, val_mae, cv_train_r2, cv_train_mae, train_r2, train_mae, n_params


import optuna


def objective(trial: optuna.Trial):
    n_hidden_layers = trial.suggest_int('n_hidden_layers', 2, 10)
    n_hidden_nodes = trial.suggest_int('n_hidden_nodes', 4, 32)
    hidden_shrink = trial.suggest_float('hidden_shrink', 0.0, 1.0)
    activation = trial.suggest_categorical('activation', ['relu', 'tanh', 'logistic'])
    learning_rate_init = trial.suggest_float('learning_rate_init', 1e-5, 1e-2, log=True)
    alpha = trial.suggest_float('alpha', 1e-7, 1e-3, log=True)
    batch_size = trial.suggest_int('batch_size', 8, 64, step=4)
    random_state = trial.suggest_categorical('random_state', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    
    val_r2, val_mae, cv_train_r2, cv_train_mae, train_r2, train_mae, n_params = score_model(
        n_hidden_layers=n_hidden_layers,
        n_hidden_nodes=n_hidden_nodes,
        hidden_shrink=hidden_shrink,
        activation=activation,
        learning_rate_init=learning_rate_init,
        alpha=alpha,
        batch_size=batch_size,
        random_state=random_state)

    return val_r2, val_mae, cv_train_r2, cv_train_mae, train_r2, train_mae, n_params

## Running Optimization

In [None]:
study_name = f'102824_model-opt'
storage_name = f'sqlite:///{study_name}.db'
study = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True, 
                            directions=['maximize', 'minimize', 'maximize', 'minimize', 'maximize', 'minimize', 'minimize'])
study.optimize(objective, n_trials=2048, )

## Optimization Results

In [3]:
study_name = f'102824_model-opt'
storage_name = f'sqlite:///{study_name}.db'
study = optuna.load_study(study_name=study_name, storage=storage_name,)

# best_numbers = [t.number for t in study.best_trials]
study_df = study.trials_dataframe()
trials = study.get_trials()

study_df = study_df.rename(columns=dict(
    values_0='values_cv_val_r2',
    values_1='values_cv_val_mae',
    values_2='values_cv_train_r2',
    values_3='values_cv_train_mae',
    values_4='values_train_r2',
    values_5='values_train_mae',
    values_6='values_n_params',
))

study_df

Unnamed: 0,number,values_cv_val_r2,values_cv_val_mae,values_cv_train_r2,values_cv_train_mae,values_train_r2,values_train_mae,values_n_params,datetime_start,datetime_complete,...,params_activation,params_alpha,params_batch_size,params_hidden_shrink,params_learning_rate_init,params_n_hidden_layers,params_n_hidden_nodes,params_random_state,system_attrs_nsga2:generation,state
0,0,-0.007949,0.186675,-0.000088,0.188931,-0.000029,0.218535,3185.0,2024-10-28 18:06:39.328254,2024-10-28 18:06:43.264184,...,logistic,5.066109e-04,32,0.319508,0.000048,7,26,1,0,COMPLETE
1,1,-0.009949,0.184858,-0.003038,0.186947,-0.001360,0.216361,375.0,2024-10-28 18:06:43.273189,2024-10-28 18:06:56.505138,...,logistic,1.140273e-07,28,0.106995,0.000015,6,8,3,0,COMPLETE
2,2,0.907653,0.045528,0.923441,0.040092,0.971963,0.034162,1421.0,2024-10-28 18:06:56.515163,2024-10-28 18:07:01.228608,...,tanh,2.020010e-06,48,0.200943,0.001671,10,13,0,0,COMPLETE
3,3,0.960278,0.032505,0.968164,0.029677,0.966705,0.038452,779.0,2024-10-28 18:07:01.237607,2024-10-28 18:07:06.376841,...,tanh,6.154950e-06,24,0.846602,0.001351,3,30,0,0,COMPLETE
4,4,0.319240,0.130861,0.336544,0.129804,0.268111,0.183712,328.0,2024-10-28 18:07:06.385734,2024-10-28 18:07:14.190673,...,tanh,2.189653e-07,32,0.500804,0.000494,5,10,2,0,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2150,2150,0.289536,0.144063,0.324065,0.134194,0.474293,0.153068,28.0,2024-10-28 22:30:48.251558,2024-10-28 22:30:52.541771,...,tanh,3.168223e-06,28,0.878494,0.003994,2,5,0,42,COMPLETE
2151,2151,0.518570,0.096402,0.541586,0.094515,0.564759,0.135732,82.0,2024-10-28 22:30:52.554772,2024-10-28 22:31:02.715457,...,tanh,9.368303e-05,12,0.878494,0.002391,7,4,10,43,COMPLETE
2152,2152,0.619305,0.086471,0.648749,0.082877,0.700165,0.102534,206.0,2024-10-28 22:31:02.728970,2024-10-28 22:31:07.255586,...,tanh,1.054571e-05,56,0.605452,0.003994,8,6,0,43,COMPLETE
2153,2153,0.355099,0.123470,0.400504,0.122880,0.296131,0.169057,138.0,2024-10-28 22:31:07.268583,2024-10-28 22:31:10.847544,...,tanh,2.928547e-05,16,0.741974,0.003994,8,5,0,43,COMPLETE


## Select Models

In [11]:
# Find weak models
selected_weak_df = study_df[(study_df['number'] > 10) & (study_df['values_train_r2'] > 0.1) & (study_df['values_cv_val_r2'] > 0.1)]
selected_weak_df = selected_weak_df.sort_values(by=['values_train_r2', 'values_cv_val_r2'], ignore_index=True)
n_models = 5
selected_weak_df = selected_weak_df.iloc[:n_models, :]

# Find mid models
selected_mid_df = study_df[(study_df['number'] > 10) & (study_df['values_train_r2'] > 0.50) & (study_df['values_cv_val_r2'] > 0.50)]
selected_mid_df = selected_mid_df.sort_values(by=['values_train_r2', 'values_cv_val_r2'], ignore_index=True)
n_models = 5
selected_mid_df = selected_mid_df.iloc[:n_models, :]

# Find strong models
study_df['lr_to_l2_ratio'] = study_df.params_learning_rate_init / study_df.params_alpha

pre_screen_df = study_df[
    # study_df.number.isin(best_numbers) & \
    (study_df.lr_to_l2_ratio < 1000) & \
    (study_df.lr_to_l2_ratio > 10) & \
    (study_df.values_cv_val_r2 >= 0.9) & \
    (study_df.values_cv_val_mae <= 0.04) & \
    (study_df.values_train_r2 >= 0.91) & \
    (study_df.values_train_mae <= 0.03) 
].sort_values('values_n_params', ignore_index=True)

display(pre_screen_df)
n_selected = len(pre_screen_df)
n_models = 5
idx = np.round(np.linspace(0, 1, num=n_models) * (n_selected - 1)).astype(int)

selected_df = pre_screen_df.iloc[idx].reset_index(drop=True)

Unnamed: 0,number,values_cv_val_r2,values_cv_val_mae,values_cv_train_r2,values_cv_train_mae,values_train_r2,values_train_mae,values_n_params,datetime_start,datetime_complete,...,params_alpha,params_batch_size,params_hidden_shrink,params_learning_rate_init,params_n_hidden_layers,params_n_hidden_nodes,params_random_state,system_attrs_nsga2:generation,state,lr_to_l2_ratio
0,421,0.950300,0.036361,0.958867,0.032876,0.980723,0.029416,293.0,2024-10-28 19:06:14.162900,2024-10-28 19:06:19.269871,...,4.069180e-06,28,0.489962,0.003994,4,11,0,8,COMPLETE,981.456197
1,1795,0.955676,0.034170,0.963949,0.031098,0.981883,0.027116,335.0,2024-10-28 21:51:53.536541,2024-10-28 21:51:57.901516,...,5.113978e-06,36,0.978747,0.003994,7,11,4,35,COMPLETE,780.942208
2,1846,0.955676,0.034170,0.963949,0.031098,0.981883,0.027116,335.0,2024-10-28 21:58:35.680420,2024-10-28 21:58:40.112878,...,5.113978e-06,36,0.978747,0.003994,7,11,4,36,COMPLETE,780.942208
3,692,0.970851,0.025777,0.974642,0.026147,0.982037,0.027624,369.0,2024-10-28 19:44:27.573527,2024-10-28 19:44:31.780794,...,4.069180e-06,12,0.057745,0.003994,4,10,4,13,COMPLETE,981.456197
4,229,0.957809,0.032002,0.967286,0.030694,0.981865,0.027932,389.0,2024-10-28 18:44:15.770224,2024-10-28 18:44:17.327115,...,2.657770e-04,56,0.118007,0.009292,3,13,4,4,COMPLETE,34.963304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,66,0.975996,0.023405,0.980749,0.021147,0.980572,0.028137,4849.0,2024-10-28 18:16:01.539331,2024-10-28 18:16:39.520810,...,3.017459e-07,8,0.214366,0.000122,10,25,8,1,COMPLETE,402.783053
148,754,0.973363,0.025282,0.980180,0.022051,0.984250,0.025106,5404.0,2024-10-28 19:52:23.626108,2024-10-28 19:52:29.320892,...,1.629964e-05,40,0.105865,0.001033,10,25,8,15,COMPLETE,63.391589
149,1293,0.925218,0.034683,0.927274,0.032047,0.981906,0.028133,5404.0,2024-10-28 20:58:40.321224,2024-10-28 20:58:45.735847,...,4.069180e-06,36,0.105865,0.001033,10,25,9,25,COMPLETE,253.923519
150,932,0.970006,0.025856,0.977597,0.023489,0.980802,0.028974,5404.0,2024-10-28 20:14:38.690271,2024-10-28 20:14:43.681416,...,1.629964e-05,44,0.105865,0.001262,10,25,6,18,COMPLETE,77.438263


In [12]:
## Best models
best_models = []
best_model_strs = []

best_model_strs.append('models = [')

for _, row in selected_df.iterrows():
    trial = trials[row.number]
    assert trial.number == row.number
    model = build_model(**trial.params)
    best_model_strs.append(f'    # Model #{row.number}:')
    best_model_strs.append(f'    #   - CV Metrics: Train R2 = {row.values_cv_train_r2:.4f}, Train MAE = {row.values_cv_train_mae:.4f};')
    best_model_strs.append(f'    #   - CV Metrics: Val R2 = {row.values_cv_val_r2:.4f}, Val MAE = {row.values_cv_val_mae:.4f};')
    best_model_strs.append(f'    #   - Train Metrics: R2 = {row.values_train_r2:.4f}, MAE = {row.values_train_mae:.4f};')
    model_str = str(model)
    best_model_strs.append('    ' + '\n    '.join(model_str.split('\n')) + ',\n')

    best_models.append(model)
    
best_model_strs.append(']')
best_model_str = '\n'.join(best_model_strs)
print(best_model_str)

best_models

models = [
    # Model #421:
    #   - CV Metrics: Train R2 = 0.9589, Train MAE = 0.0329;
    #   - CV Metrics: Val R2 = 0.9503, Val MAE = 0.0364;
    #   - Train Metrics: R2 = 0.9807, MAE = 0.0294;
    MLPRegressor(activation='tanh', alpha=4.069179548172916e-06, batch_size=28,
                 hidden_layer_sizes=[11, 9, 8, 6],
                 learning_rate_init=0.003993721484968005, max_iter=50000,
                 random_state=0, tol=1e-05),

    # Model #71:
    #   - CV Metrics: Train R2 = 0.9785, Train MAE = 0.0225;
    #   - CV Metrics: Val R2 = 0.9730, Val MAE = 0.0244;
    #   - Train Metrics: R2 = 0.9846, MAE = 0.0262;
    MLPRegressor(activation='tanh', alpha=1.061943207327572e-05, batch_size=36,
                 hidden_layer_sizes=[12, 12, 11, 11, 10, 10, 9, 9],
                 learning_rate_init=0.004877798264481361, max_iter=50000,
                 random_state=7, tol=1e-05),

    # Model #1274:
    #   - CV Metrics: Train R2 = 0.9469, Train MAE = 0.0261;
    #   - CV Me

[MLPRegressor(activation='tanh', alpha=4.069179548172916e-06, batch_size=28,
              hidden_layer_sizes=[11, 9, 8, 6],
              learning_rate_init=0.003993721484968005, max_iter=50000,
              random_state=0, tol=1e-05),
 MLPRegressor(activation='tanh', alpha=1.061943207327572e-05, batch_size=36,
              hidden_layer_sizes=[12, 12, 11, 11, 10, 10, 9, 9],
              learning_rate_init=0.004877798264481361, max_iter=50000,
              random_state=7, tol=1e-05),
 MLPRegressor(activation='tanh', alpha=5.11397827693426e-06, batch_size=28,
              hidden_layer_sizes=[25, 22, 19, 16, 13, 10, 7],
              learning_rate_init=0.003993721484968005, max_iter=50000,
              random_state=4, tol=1e-05),
 MLPRegressor(activation='tanh', alpha=1.061943207327572e-05, batch_size=12,
              hidden_layer_sizes=[23, 21, 19, 18, 16, 14, 12, 10, 9, 7],
              learning_rate_init=0.0023914925232281707, max_iter=50000,
              random_state=5, tol=

In [13]:
## Weak models
weak_models = []
weak_model_strs = []

weak_model_strs.append('models = [')

for _, row in selected_weak_df.iterrows():
    trial = trials[row.number]
    assert trial.number == row.number
    model = build_model(**trial.params)
    weak_model_strs.append(f'    # Model #{row.number}:')
    weak_model_strs.append(f'    #   - CV Metrics: Train R2 = {row.values_cv_train_r2:.4f}, Train MAE = {row.values_cv_train_mae:.4f};')
    weak_model_strs.append(f'    #   - CV Metrics: Val R2 = {row.values_cv_val_r2:.4f}, Val MAE = {row.values_cv_val_mae:.4f};')
    weak_model_strs.append(f'    #   - Train Metrics: R2 = {row.values_train_r2:.4f}, MAE = {row.values_train_mae:.4f};')
    model_str = str(model)
    weak_model_strs.append('    ' + '\n    '.join(model_str.split('\n')) + ',\n')

    weak_models.append(model)
    
weak_model_strs.append(']')
weak_model_str = '\n'.join(weak_model_strs)
print(weak_model_str)

weak_models

models = [
    # Model #1117:
    #   - CV Metrics: Train R2 = 0.1101, Train MAE = 0.1449;
    #   - CV Metrics: Val R2 = 0.1037, Val MAE = 0.1465;
    #   - Train Metrics: R2 = 0.1120, MAE = 0.2064;
    MLPRegressor(activation='tanh', alpha=4.069179548172916e-06, batch_size=64,
                 hidden_layer_sizes=[5, 5, 5, 5, 5, 5, 5, 5, 4, 4],
                 learning_rate_init=1.4229520275134514e-05, max_iter=50000,
                 random_state=0, tol=1e-05),

    # Model #1926:
    #   - CV Metrics: Train R2 = 0.1242, Train MAE = 0.1653;
    #   - CV Metrics: Val R2 = 0.1124, Val MAE = 0.1675;
    #   - Train Metrics: R2 = 0.1268, MAE = 0.2089;
    MLPRegressor(activation='tanh', alpha=9.368302866664674e-05, batch_size=24,
                 hidden_layer_sizes=[5, 1],
                 learning_rate_init=2.683342414372862e-05, max_iter=50000,
                 random_state=6, tol=1e-05),

    # Model #2052:
    #   - CV Metrics: Train R2 = 0.1220, Train MAE = 0.1572;
    #   - CV Met

[MLPRegressor(activation='tanh', alpha=4.069179548172916e-06, batch_size=64,
              hidden_layer_sizes=[5, 5, 5, 5, 5, 5, 5, 5, 4, 4],
              learning_rate_init=1.4229520275134514e-05, max_iter=50000,
              random_state=0, tol=1e-05),
 MLPRegressor(activation='tanh', alpha=9.368302866664674e-05, batch_size=24,
              hidden_layer_sizes=[5, 1],
              learning_rate_init=2.683342414372862e-05, max_iter=50000,
              random_state=6, tol=1e-05),
 MLPRegressor(activation='tanh', alpha=1.0636353020794172e-06, batch_size=24,
              hidden_layer_sizes=[5, 2],
              learning_rate_init=4.042940432014313e-05, max_iter=50000,
              random_state=4, tol=1e-05),
 MLPRegressor(activation='tanh', alpha=5.11397827693426e-06, batch_size=40,
              hidden_layer_sizes=[7, 5, 3],
              learning_rate_init=7.098088946744915e-05, max_iter=50000,
              random_state=5, tol=1e-05),
 MLPRegressor(activation='tanh', alpha=9.368

In [14]:
# Moderate models
mid_models = []
mid_model_strs = []

mid_model_strs.append('models = [')

for _, row in selected_mid_df.iterrows():
    trial = trials[row.number]
    assert trial.number == row.number
    model = build_model(**trial.params)
    mid_model_strs.append(f'    # Model #{row.number}:')
    mid_model_strs.append(f'    #   - CV Metrics: Val R2 = {row.values_cv_val_r2:.4f}, Val MAE = {row.values_cv_val_mae:.4f};')
    mid_model_strs.append(f'    #   - Train Metrics: R2 = {row.values_train_r2:.4f}, MAE = {row.values_train_mae:.4f};')
    model_str = str(model)
    mid_model_strs.append('    ' + '\n    '.join(model_str.split('\n')) + ',\n')

    mid_models.append(model)
    
mid_model_strs.append(']')
mid_model_str = '\n'.join(mid_model_strs)
print(mid_model_str)

mid_models

models = [
    # Model #163:
    #   - CV Metrics: Val R2 = 0.5705, Val MAE = 0.0892;
    #   - Train Metrics: R2 = 0.5006, MAE = 0.1400;
    MLPRegressor(activation='tanh', alpha=1.4699245951666415e-07, batch_size=24,
                 hidden_layer_sizes=[10, 8, 7, 5, 3],
                 learning_rate_init=0.0013507653838908675, max_iter=50000,
                 random_state=2, tol=1e-05),

    # Model #820:
    #   - CV Metrics: Val R2 = 0.7103, Val MAE = 0.0668;
    #   - Train Metrics: R2 = 0.5019, MAE = 0.1466;
    MLPRegressor(activation='tanh', alpha=4.1520846921736066e-06, batch_size=8,
                 hidden_layer_sizes=[6, 6, 5, 5, 5, 4, 4, 4, 3, 3],
                 learning_rate_init=0.0023914925232281707, max_iter=50000,
                 random_state=10, tol=1e-05),

    # Model #638:
    #   - CV Metrics: Val R2 = 0.5024, Val MAE = 0.1175;
    #   - Train Metrics: R2 = 0.5058, MAE = 0.1485;
    MLPRegressor(activation='tanh', alpha=4.3604728711610275e-07, batch_size=8,
  

[MLPRegressor(activation='tanh', alpha=1.4699245951666415e-07, batch_size=24,
              hidden_layer_sizes=[10, 8, 7, 5, 3],
              learning_rate_init=0.0013507653838908675, max_iter=50000,
              random_state=2, tol=1e-05),
 MLPRegressor(activation='tanh', alpha=4.1520846921736066e-06, batch_size=8,
              hidden_layer_sizes=[6, 6, 5, 5, 5, 4, 4, 4, 3, 3],
              learning_rate_init=0.0023914925232281707, max_iter=50000,
              random_state=10, tol=1e-05),
 MLPRegressor(activation='tanh', alpha=4.3604728711610275e-07, batch_size=8,
              hidden_layer_sizes=[19, 4],
              learning_rate_init=0.00027627417698698043, max_iter=50000,
              random_state=7, tol=1e-05),
 MLPRegressor(activation='tanh', alpha=3.4298152582719893e-06, batch_size=12,
              hidden_layer_sizes=[20, 8],
              learning_rate_init=0.003993721484968005, max_iter=50000,
              random_state=10, tol=1e-05),
 MLPRegressor(activation='tanh',