# Imports

In [1]:
import sys
import json
import logging
import joblib
import numpy as np
import matplotlib.pyplot as plt

import optuna

from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, BayesianRidge, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline

from typing import Union

from bb_energy_prediction import data_utils, sklearn_regressors

  from .autonotebook import tqdm as notebook_tqdm


# Data

In [2]:
no_dupl = False
if no_dupl:
    data_df = data_utils.get_data_df(data_path="../energy_data/no_dupl_data.pkl")
else:
    data_df = data_utils.get_data_df(data_path="../energy_data/data.pkl")

if "bb_embeddings" in data_df.columns:
    data_df = data_df.drop(columns="bb_embeddings")
data_df = data_df.sample(frac=1)
data_df.head(10)

Unnamed: 0,bb,energy,program_name
79203,"[movl %eax, orl %eax, orl %eax, jz 0x932]",0.175075,find_biggest
398085,"[movl %eax, mov $0x1 %ebx, mov %eax %r9d, cmp ...",0.194707,variable_name
208370,"[cmpq $0x0, jz 0x6b]",0.195748,ip
316974,"[cmp %rax %r12, jb]",0.014741,simple_sort
146151,"[movl (%r12) %r15d, test $0x8 %r15b, jnz 0x249b]",0.395857,function_pointer
147844,"[cmpl $0x10, jnz 0xa]",0.339306,function_pointer
215515,"[mov %rax %r14, jmp]",0.293623,ip
280705,"[mov %rcx, nopw %ax, mov %rdi %rax, mov %rdi %...",0.021893,quicksort
92156,"[movl %eax, mov %rdi %rbx, movl $0x0, movl $0x...",0.023618,find_dyn_sum
286259,"[mov %r14 %rdi, mov %rbx %rsi, callq, pushq %r...",0.58602,quicksort


In [3]:
if no_dupl:
    test_size = 0.15
else:
    test_size = 0.1

shuffled_data_df = data_df.sample(frac=1).reset_index(drop=True)
test_size = int(test_size * len(shuffled_data_df))
test_df = shuffled_data_df[-test_size:]
train_df = shuffled_data_df[:-test_size]

print(f"Test data size: {len(test_df)}")

Test data size: 56165


In [4]:
X_train = np.array([" ".join(bb) for bb in train_df.bb.tolist()])
y_train = train_df.energy.values

X_test = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
y_test = test_df.energy.values

In [5]:
cnt_vect = CountVectorizer()
cnt_vect.fit_transform(X_train)
vocab_len = len(cnt_vect.get_feature_names_out())
print(f"Vocab length: {vocab_len}")

Vocab length: 1266


# Regressors

In [6]:
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
if no_dupl:
    exp_dir = "/Users/thodo/Documents/sxoli/diplomatiki/basic-block-energy-prediction/model_checkpoints/regressors_no_dupl"
    storage_name = "sqlite:////Users/thodo/Documents/sxoli/diplomatiki/optuna-studies/sklearn-regressors-no-duplicates.db"
else:
    exp_dir = "/Users/thodo/Documents/sxoli/diplomatiki/basic-block-energy-prediction/model_checkpoints/regressors"
    storage_name = "sqlite:////Users/thodo/Documents/sxoli/diplomatiki/optuna-studies/sklearn-regressors.db"


## Linear Regression

Simple least squares regression.
No need for extra tuning.

In [7]:
study = optuna.create_study(study_name="linear-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-20 15:33:59,323][0m A new study created in RDB with name: linear-regression[0m


A new study created in RDB with name: linear-regression


In [None]:
def objective(trial):

    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = LinearRegression()
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=10)

In [9]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_normalization,params_scaling,params_tfidf,state
0,0,1.391,2023-02-20 15:34:01.801151,2023-02-20 15:34:02.251864,0 days 00:00:00.450713,False,False,True,COMPLETE
1,1,1.394,2023-02-20 15:34:02.276361,2023-02-20 15:34:02.498864,0 days 00:00:00.222503,False,True,True,COMPLETE
7,7,1.565,2023-02-20 15:34:04.561908,2023-02-20 15:34:04.767401,0 days 00:00:00.205493,True,True,False,COMPLETE
6,6,1.695,2023-02-20 15:34:04.305399,2023-02-20 15:34:04.545399,0 days 00:00:00.240000,True,True,False,COMPLETE
4,4,1.852,2023-02-20 15:34:03.834829,2023-02-20 15:34:04.062158,0 days 00:00:00.227329,True,True,False,COMPLETE
3,3,2.005,2023-02-20 15:34:03.323764,2023-02-20 15:34:03.816773,0 days 00:00:00.493009,False,False,True,COMPLETE
2,2,2.129,2023-02-20 15:34:02.517863,2023-02-20 15:34:03.306765,0 days 00:00:00.788902,False,False,False,COMPLETE
5,5,2.13,2023-02-20 15:34:04.078658,2023-02-20 15:34:04.287906,0 days 00:00:00.209248,True,True,True,COMPLETE
8,8,2.158,2023-02-20 15:34:04.784519,2023-02-20 15:34:04.993911,0 days 00:00:00.209392,False,True,True,COMPLETE
9,9,3.26,2023-02-20 15:34:05.010990,2023-02-20 15:34:05.221483,0 days 00:00:00.210493,False,True,True,COMPLETE


In [10]:
print(f"Best linear regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best linear regression params: {'normalization': False, 'scaling': False, 'tfidf': True}, achieving val RMSE: 1.391


In [10]:
save = True
load = False

pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': True,
}

if load:
    pipe = joblib.load(f"{exp_dir}/linear/pipe")
else:
    regressor = LinearRegression()
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "pipe_params": pipe_params,
            #"test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/linear/pipe")
        with open(f"{exp_dir}/linear/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [11]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 4.706
MAE: 0.908


## Lasso

Linear regression with L1 regularization. Tune:
* alpha (L1 regularization term).

In [10]:
study = optuna.create_study(study_name="lasso-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-23 22:56:45,573][0m A new study created in RDB with name: lasso-regression[0m


A new study created in RDB with name: lasso-regression


In [None]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0, 10.0)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = Lasso(alpha=alpha)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=20)

In [12]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_normalization,params_scaling,params_tfidf,state
4,4,0.692,2023-02-23 22:57:44.181905,2023-02-23 22:57:58.314589,0 days 00:00:14.132684,5.91738,False,False,True,COMPLETE
15,15,0.696,2023-02-23 23:00:18.974504,2023-02-23 23:00:32.370134,0 days 00:00:13.395630,6.282374,False,False,False,COMPLETE
12,12,0.697,2023-02-23 22:59:36.850902,2023-02-23 22:59:50.645707,0 days 00:00:13.794805,4.01779,False,False,True,COMPLETE
9,9,0.699,2023-02-23 22:58:54.142856,2023-02-23 22:59:09.079188,0 days 00:00:14.936332,2.461454,True,True,True,COMPLETE
1,1,0.699,2023-02-23 22:57:01.180312,2023-02-23 22:57:15.129809,0 days 00:00:13.949497,1.841672,True,False,True,COMPLETE
11,11,0.7,2023-02-23 22:59:23.022401,2023-02-23 22:59:36.833912,0 days 00:00:13.811511,0.041001,False,False,True,COMPLETE
18,18,0.7,2023-02-23 23:00:59.072141,2023-02-23 23:01:12.447933,0 days 00:00:13.375792,5.75131,False,False,False,COMPLETE
8,8,0.702,2023-02-23 22:58:40.427411,2023-02-23 22:58:54.125378,0 days 00:00:13.697967,6.811531,True,False,False,COMPLETE
0,0,0.702,2023-02-23 22:56:46.535341,2023-02-23 22:57:01.157810,0 days 00:00:14.622469,6.543619,True,True,True,COMPLETE
3,3,0.702,2023-02-23 22:57:29.344184,2023-02-23 22:57:44.164911,0 days 00:00:14.820727,1.504827,True,False,True,COMPLETE


In [13]:
print(f"Best lasso regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best lasso regression params: {'alpha': 5.91737974176493, 'normalization': False, 'scaling': False, 'tfidf': True}, achieving val RMSE: 0.692


In [15]:
save = True
load = False

regressor_params = {"alpha": 5.9}
pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': True,
}

if load:
    pipe = joblib.load(f"{exp_dir}/lasso/pipe")
else:
    regressor = Lasso(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
        }

        joblib.dump(pipe, f"{exp_dir}/lasso/pipe")
        with open(f"{exp_dir}/lasso/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [16]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 0.523
MAE: 0.374


## Ridge

Linear regression with L2 regularization. Tune:
* alpha (L2 regularization term).

In [17]:
study = optuna.create_study(study_name="ridge-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-23 23:01:59,681][0m A new study created in RDB with name: ridge-regression[0m


A new study created in RDB with name: ridge-regression


In [None]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0, 10.0)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = Ridge(alpha=alpha)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=20)

In [19]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_normalization,params_scaling,params_tfidf,state
16,16,0.57,2023-02-23 23:06:13.309209,2023-02-23 23:06:30.538625,0 days 00:00:17.229416,0.227142,False,True,False,COMPLETE
6,6,0.57,2023-02-23 23:03:39.845163,2023-02-23 23:03:54.339203,0 days 00:00:14.494040,7.558995,True,True,True,COMPLETE
9,9,0.571,2023-02-23 23:04:24.766999,2023-02-23 23:04:43.280817,0 days 00:00:18.513818,9.413899,False,False,False,COMPLETE
1,1,0.572,2023-02-23 23:02:21.435224,2023-02-23 23:02:36.612951,0 days 00:00:15.177727,6.342039,False,False,True,COMPLETE
13,13,0.575,2023-02-23 23:05:27.403146,2023-02-23 23:05:42.520299,0 days 00:00:15.117153,8.088298,True,False,True,COMPLETE
12,12,0.576,2023-02-23 23:05:12.739115,2023-02-23 23:05:27.386145,0 days 00:00:14.647030,8.129031,True,True,False,COMPLETE
0,0,0.578,2023-02-23 23:02:02.566181,2023-02-23 23:02:21.410223,0 days 00:00:18.844042,5.89382,False,False,False,COMPLETE
4,4,0.578,2023-02-23 23:03:08.138003,2023-02-23 23:03:25.196195,0 days 00:00:17.058192,1.598583,False,True,False,COMPLETE
3,3,0.578,2023-02-23 23:02:53.527112,2023-02-23 23:03:08.121002,0 days 00:00:14.593890,2.199179,False,True,True,COMPLETE
18,18,0.579,2023-02-23 23:06:45.234579,2023-02-23 23:06:59.917210,0 days 00:00:14.682631,3.955875,False,True,True,COMPLETE


In [20]:
print(f"Best ridge regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best ridge regression params: {'alpha': 7.558994936255297, 'normalization': True, 'scaling': True, 'tfidf': True}, achieving val RMSE: 0.57


In [21]:
save = True
load = False

regressor_params = {"alpha": 7.6}
pipe_params = {
    'normalization': True,
    'scaling': True,
    'tfidf': True,
}

if load:
    pipe = joblib.load(f"{exp_dir}/ridge/pipe")
else:
    regressor = Ridge(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            #"test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/ridge/pipe")
        with open(f"{exp_dir}/ridge/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [22]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 0.351
MAE: 0.271


## ElasticNet

Linear regression with both L1 and L2 regularization. Tune
* alpha
* l1_ratio. 

In [23]:
study = optuna.create_study(study_name="elasticnet-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-23 23:09:09,560][0m A new study created in RDB with name: elasticnet-regression[0m


A new study created in RDB with name: elasticnet-regression


In [None]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0, 10.0)
    l1_ratio = trial.suggest_float("l1_ratio", 0, 0.9)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=20)

In [25]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_l1_ratio,params_normalization,params_scaling,params_tfidf,state
14,14,0.678,2023-02-23 23:12:42.749424,2023-02-23 23:13:03.997340,0 days 00:00:21.247916,6.951369,0.006451,False,True,True,COMPLETE
12,12,0.681,2023-02-23 23:12:01.653850,2023-02-23 23:12:22.928126,0 days 00:00:21.274276,7.993278,0.005983,False,True,True,COMPLETE
17,17,0.691,2023-02-23 23:13:32.201532,2023-02-23 23:13:51.561341,0 days 00:00:19.359809,8.19097,0.013323,False,True,True,COMPLETE
13,13,0.695,2023-02-23 23:12:22.946183,2023-02-23 23:12:42.731923,0 days 00:00:19.785740,7.0119,0.011715,False,True,True,COMPLETE
10,10,0.699,2023-02-23 23:11:32.307638,2023-02-23 23:11:46.848537,0 days 00:00:14.540899,9.882783,0.039426,False,True,True,COMPLETE
1,1,0.7,2023-02-23 23:09:27.137007,2023-02-23 23:09:41.047906,0 days 00:00:13.910899,8.350481,0.312563,True,True,False,COMPLETE
19,19,0.703,2023-02-23 23:14:05.821041,2023-02-23 23:14:19.875400,0 days 00:00:14.054359,7.545994,0.195365,False,True,True,COMPLETE
15,15,0.703,2023-02-23 23:13:04.013340,2023-02-23 23:13:18.097159,0 days 00:00:14.083819,6.857993,0.142235,False,True,True,COMPLETE
6,6,0.703,2023-02-23 23:10:36.325045,2023-02-23 23:10:50.206146,0 days 00:00:13.881101,3.796461,0.28132,True,True,False,COMPLETE
11,11,0.704,2023-02-23 23:11:46.871537,2023-02-23 23:12:01.635939,0 days 00:00:14.764402,9.580784,0.046656,False,True,True,COMPLETE


In [27]:
print(f"Best ElasticNET regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best ElasticNET regression params: {'alpha': 6.951369371511551, 'l1_ratio': 0.006451007849770041, 'normalization': False, 'scaling': True, 'tfidf': True}, achieving val RMSE: 0.678


In [28]:
save = True
load = False

regressor_params = {"alpha": 6.95, "l1_ratio": 0.006}
pipe_params = {
    'normalization': False,
    'scaling': True,
    'tfidf': True,
}

if load:
    pipe = joblib.load(f"{exp_dir}/elasticnet/pipe")
else:
    regressor = ElasticNet(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            #"test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/elasticnet/pipe")
        with open(f"{exp_dir}/elasticnet/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [29]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 0.48
MAE: 0.348


## SGD Regression

Stochastic Gradient Descent Regression using either of the above reguliration techniques. Tune:
* penalty method
* alpha
* l1_ratio (if elasticnet penalty).

In [8]:
study = optuna.create_study(study_name="SGD-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-23 23:40:04,072][0m A new study created in RDB with name: SGD-regression[0m


A new study created in RDB with name: SGD-regression


In [None]:
def objective(trial):

    penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"])
    alpha = trial.suggest_float("alpha", 0, 10.0)
    l1_ratio = trial.suggest_float("l1_ratio", 0, 0.9)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = SGDRegressor(penalty=penalty, alpha=alpha, l1_ratio=l1_ratio)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=40)

In [10]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_l1_ratio,params_normalization,params_penalty,params_scaling,params_tfidf,state
33,33,0.631,2023-02-23 23:47:34.672118,2023-02-23 23:47:47.976857,0 days 00:00:13.304739,0.635574,0.322415,False,l2,False,False,COMPLETE
34,34,0.632,2023-02-23 23:47:47.992856,2023-02-23 23:48:01.375355,0 days 00:00:13.382499,0.632296,0.344076,False,l2,False,False,COMPLETE
32,32,0.634,2023-02-23 23:47:21.507735,2023-02-23 23:47:34.655114,0 days 00:00:13.147379,0.795358,0.326961,False,l2,False,False,COMPLETE
18,18,0.635,2023-02-23 23:44:15.096003,2023-02-23 23:44:29.493540,0 days 00:00:14.397537,1.595639,0.29564,False,l2,False,False,COMPLETE
28,28,0.636,2023-02-23 23:46:28.685720,2023-02-23 23:46:41.851265,0 days 00:00:13.165545,1.657489,0.120385,False,l2,False,False,COMPLETE
17,17,0.637,2023-02-23 23:44:00.593681,2023-02-23 23:44:15.076505,0 days 00:00:14.482824,2.002745,0.140757,False,l2,False,False,COMPLETE
38,38,0.637,2023-02-23 23:48:41.258086,2023-02-23 23:48:54.661371,0 days 00:00:13.403285,0.034489,0.302983,False,l1,False,False,COMPLETE
20,20,0.637,2023-02-23 23:44:42.707081,2023-02-23 23:44:56.398148,0 days 00:00:13.691067,1.501492,0.470841,False,l2,False,False,COMPLETE
27,27,0.639,2023-02-23 23:46:15.540754,2023-02-23 23:46:28.668447,0 days 00:00:13.127693,1.142344,0.209599,False,l2,False,False,COMPLETE
21,21,0.639,2023-02-23 23:44:56.414855,2023-02-23 23:45:09.631967,0 days 00:00:13.217112,1.967395,0.537627,False,l2,False,False,COMPLETE


In [11]:
print(f"Best SGD regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best SGD regression params: {'alpha': 0.6355739460605232, 'l1_ratio': 0.32241514893328543, 'normalization': False, 'penalty': 'l2', 'scaling': False, 'tfidf': False}, achieving val RMSE: 0.631


In [13]:
save = True
load = False

regressor_params = {"alpha": 0.63, "l1_ratio": 0.32, "penalty": "l2"}
pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': False,
}

if load:
    pipe = joblib.load(f"{exp_dir}/sgd/pipe")
else:
    regressor = SGDRegressor(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
        }

        joblib.dump(pipe, f"{exp_dir}/sgd/pipe")
        with open(f"{exp_dir}/sgd/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [14]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 0.389
MAE: 0.29


## SVR

Support Vector Regression. Tune:
* kernel
* gamma
* C

In [7]:
study = optuna.create_study(study_name="SVR-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-03-02 01:26:21,704][0m A new study created in RDB with name: SVR-regression[0m


A new study created in RDB with name: SVR-regression


In [8]:
def objective(trial):

    kernel = trial.suggest_categorical("kernel", ["rbf", "linear"])
    C = trial.suggest_float("C", 0.1, 10.0)
    gamma = trial.suggest_float("gamma", 0.01, 1)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = SVR(kernel=kernel, C=C, gamma=gamma)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train[:10000],
        y=y_train[:10000],
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=40)

[32m[I 2023-03-02 01:26:34,104][0m Trial 0 finished with value: 0.669 and parameters: {'kernel': 'rbf', 'C': 8.09189854989799, 'gamma': 0.41694319737413976, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.669.[0m


Trial 0 finished with value: 0.669 and parameters: {'kernel': 'rbf', 'C': 8.09189854989799, 'gamma': 0.41694319737413976, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.669.


[32m[I 2023-03-02 01:26:48,953][0m Trial 1 finished with value: 0.605 and parameters: {'kernel': 'linear', 'C': 7.015790408226381, 'gamma': 0.7047882900770297, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 1 with value: 0.605.[0m


Trial 1 finished with value: 0.605 and parameters: {'kernel': 'linear', 'C': 7.015790408226381, 'gamma': 0.7047882900770297, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 1 with value: 0.605.


[32m[I 2023-03-02 02:05:38,909][0m Trial 2 finished with value: 0.695 and parameters: {'kernel': 'linear', 'C': 6.2245680323114305, 'gamma': 0.4655339187819725, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.605.[0m


Trial 2 finished with value: 0.695 and parameters: {'kernel': 'linear', 'C': 6.2245680323114305, 'gamma': 0.4655339187819725, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.605.


[32m[I 2023-03-02 02:05:47,332][0m Trial 3 finished with value: 0.601 and parameters: {'kernel': 'rbf', 'C': 0.6798703145441517, 'gamma': 0.594640966384394, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 3 with value: 0.601.[0m


Trial 3 finished with value: 0.601 and parameters: {'kernel': 'rbf', 'C': 0.6798703145441517, 'gamma': 0.594640966384394, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 3 with value: 0.601.


[32m[I 2023-03-02 02:35:02,477][0m Trial 4 finished with value: 0.674 and parameters: {'kernel': 'linear', 'C': 4.415529519411901, 'gamma': 0.20842695362528302, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 3 with value: 0.601.[0m


Trial 4 finished with value: 0.674 and parameters: {'kernel': 'linear', 'C': 4.415529519411901, 'gamma': 0.20842695362528302, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 3 with value: 0.601.


[32m[I 2023-03-02 02:35:10,836][0m Trial 5 finished with value: 0.611 and parameters: {'kernel': 'rbf', 'C': 3.5931480715880646, 'gamma': 0.8774235326806479, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 3 with value: 0.601.[0m


Trial 5 finished with value: 0.611 and parameters: {'kernel': 'rbf', 'C': 3.5931480715880646, 'gamma': 0.8774235326806479, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 3 with value: 0.601.


[32m[I 2023-03-02 02:35:19,571][0m Trial 6 finished with value: 0.574 and parameters: {'kernel': 'rbf', 'C': 1.817476795238639, 'gamma': 0.4353331068442312, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 6 with value: 0.574.[0m


Trial 6 finished with value: 0.574 and parameters: {'kernel': 'rbf', 'C': 1.817476795238639, 'gamma': 0.4353331068442312, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 6 with value: 0.574.


[32m[I 2023-03-02 02:35:28,038][0m Trial 7 finished with value: 0.539 and parameters: {'kernel': 'rbf', 'C': 8.266746610677888, 'gamma': 0.9732063755520685, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 7 finished with value: 0.539 and parameters: {'kernel': 'rbf', 'C': 8.266746610677888, 'gamma': 0.9732063755520685, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:47:30,668][0m Trial 8 finished with value: 0.635 and parameters: {'kernel': 'linear', 'C': 1.7352685590467325, 'gamma': 0.6980379962810118, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 8 finished with value: 0.635 and parameters: {'kernel': 'linear', 'C': 1.7352685590467325, 'gamma': 0.6980379962810118, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:47:42,027][0m Trial 9 finished with value: 0.608 and parameters: {'kernel': 'linear', 'C': 4.457717574442739, 'gamma': 0.6577037824295383, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 9 finished with value: 0.608 and parameters: {'kernel': 'linear', 'C': 4.457717574442739, 'gamma': 0.6577037824295383, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:47:50,010][0m Trial 10 finished with value: 0.616 and parameters: {'kernel': 'rbf', 'C': 9.723763274950771, 'gamma': 0.9867944979303487, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 10 finished with value: 0.616 and parameters: {'kernel': 'rbf', 'C': 9.723763274950771, 'gamma': 0.9867944979303487, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:47:58,237][0m Trial 11 finished with value: 0.578 and parameters: {'kernel': 'rbf', 'C': 2.5848136460887137, 'gamma': 0.3020654274653549, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 11 finished with value: 0.578 and parameters: {'kernel': 'rbf', 'C': 2.5848136460887137, 'gamma': 0.3020654274653549, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:48:06,042][0m Trial 12 finished with value: 0.588 and parameters: {'kernel': 'rbf', 'C': 0.5813864600766818, 'gamma': 0.0788917613681106, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 12 finished with value: 0.588 and parameters: {'kernel': 'rbf', 'C': 0.5813864600766818, 'gamma': 0.0788917613681106, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:48:15,349][0m Trial 13 finished with value: 0.7 and parameters: {'kernel': 'rbf', 'C': 6.088963343816186, 'gamma': 0.8235040653152463, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 13 finished with value: 0.7 and parameters: {'kernel': 'rbf', 'C': 6.088963343816186, 'gamma': 0.8235040653152463, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:48:23,786][0m Trial 14 finished with value: 0.579 and parameters: {'kernel': 'rbf', 'C': 2.83931821627723, 'gamma': 0.5418771986866919, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 14 finished with value: 0.579 and parameters: {'kernel': 'rbf', 'C': 2.83931821627723, 'gamma': 0.5418771986866919, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:48:32,085][0m Trial 15 finished with value: 0.597 and parameters: {'kernel': 'rbf', 'C': 9.965298260289376, 'gamma': 0.38826394148923266, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 15 finished with value: 0.597 and parameters: {'kernel': 'rbf', 'C': 9.965298260289376, 'gamma': 0.38826394148923266, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:48:40,146][0m Trial 16 finished with value: 0.607 and parameters: {'kernel': 'rbf', 'C': 5.063590365700627, 'gamma': 0.5243217723071262, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 16 finished with value: 0.607 and parameters: {'kernel': 'rbf', 'C': 5.063590365700627, 'gamma': 0.5243217723071262, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:48:49,300][0m Trial 17 finished with value: 0.625 and parameters: {'kernel': 'rbf', 'C': 8.154357956045613, 'gamma': 0.9786834553369809, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 17 finished with value: 0.625 and parameters: {'kernel': 'rbf', 'C': 8.154357956045613, 'gamma': 0.9786834553369809, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:48:58,289][0m Trial 18 finished with value: 0.576 and parameters: {'kernel': 'rbf', 'C': 0.18543193598363206, 'gamma': 0.7929923390080663, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 18 finished with value: 0.576 and parameters: {'kernel': 'rbf', 'C': 0.18543193598363206, 'gamma': 0.7929923390080663, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:49:07,102][0m Trial 19 finished with value: 0.606 and parameters: {'kernel': 'rbf', 'C': 1.8637469242704583, 'gamma': 0.585489005436695, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 19 finished with value: 0.606 and parameters: {'kernel': 'rbf', 'C': 1.8637469242704583, 'gamma': 0.585489005436695, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:49:15,198][0m Trial 20 finished with value: 0.633 and parameters: {'kernel': 'rbf', 'C': 3.7167390551871504, 'gamma': 0.31969351832988957, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 20 finished with value: 0.633 and parameters: {'kernel': 'rbf', 'C': 3.7167390551871504, 'gamma': 0.31969351832988957, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:49:24,088][0m Trial 21 finished with value: 0.644 and parameters: {'kernel': 'rbf', 'C': 0.31900542069268045, 'gamma': 0.802289761159284, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 21 finished with value: 0.644 and parameters: {'kernel': 'rbf', 'C': 0.31900542069268045, 'gamma': 0.802289761159284, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:49:33,289][0m Trial 22 finished with value: 0.562 and parameters: {'kernel': 'rbf', 'C': 1.2121899236916738, 'gamma': 0.9094991906939753, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 22 finished with value: 0.562 and parameters: {'kernel': 'rbf', 'C': 1.2121899236916738, 'gamma': 0.9094991906939753, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:49:42,533][0m Trial 23 finished with value: 0.635 and parameters: {'kernel': 'rbf', 'C': 1.3525282703663846, 'gamma': 0.9126512684754382, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 23 finished with value: 0.635 and parameters: {'kernel': 'rbf', 'C': 1.3525282703663846, 'gamma': 0.9126512684754382, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:49:51,941][0m Trial 24 finished with value: 0.646 and parameters: {'kernel': 'rbf', 'C': 2.6050436275616224, 'gamma': 0.9207930931911099, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 24 finished with value: 0.646 and parameters: {'kernel': 'rbf', 'C': 2.6050436275616224, 'gamma': 0.9207930931911099, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:50:00,902][0m Trial 25 finished with value: 0.573 and parameters: {'kernel': 'rbf', 'C': 1.1820126126000907, 'gamma': 0.763908264757172, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 25 finished with value: 0.573 and parameters: {'kernel': 'rbf', 'C': 1.1820126126000907, 'gamma': 0.763908264757172, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:58:07,918][0m Trial 26 finished with value: 0.668 and parameters: {'kernel': 'linear', 'C': 1.1402239894427317, 'gamma': 0.9821793539363138, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 26 finished with value: 0.668 and parameters: {'kernel': 'linear', 'C': 1.1402239894427317, 'gamma': 0.9821793539363138, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:58:17,195][0m Trial 27 finished with value: 0.609 and parameters: {'kernel': 'rbf', 'C': 2.1485548223856084, 'gamma': 0.7566541093417758, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 27 finished with value: 0.609 and parameters: {'kernel': 'rbf', 'C': 2.1485548223856084, 'gamma': 0.7566541093417758, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:58:26,356][0m Trial 28 finished with value: 0.59 and parameters: {'kernel': 'rbf', 'C': 1.0846775044210684, 'gamma': 0.8623855205986186, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 28 finished with value: 0.59 and parameters: {'kernel': 'rbf', 'C': 1.0846775044210684, 'gamma': 0.8623855205986186, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:58:35,776][0m Trial 29 finished with value: 0.643 and parameters: {'kernel': 'rbf', 'C': 3.220802399639031, 'gamma': 0.8942462335733705, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 29 finished with value: 0.643 and parameters: {'kernel': 'rbf', 'C': 3.220802399639031, 'gamma': 0.8942462335733705, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:58:44,887][0m Trial 30 finished with value: 0.625 and parameters: {'kernel': 'rbf', 'C': 2.2359220751804694, 'gamma': 0.7591999578584889, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 30 finished with value: 0.625 and parameters: {'kernel': 'rbf', 'C': 2.2359220751804694, 'gamma': 0.7591999578584889, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:58:54,084][0m Trial 31 finished with value: 0.631 and parameters: {'kernel': 'rbf', 'C': 1.5823875627931132, 'gamma': 0.9484421700541625, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 31 finished with value: 0.631 and parameters: {'kernel': 'rbf', 'C': 1.5823875627931132, 'gamma': 0.9484421700541625, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:59:03,155][0m Trial 32 finished with value: 0.589 and parameters: {'kernel': 'rbf', 'C': 0.902436505094099, 'gamma': 0.8344989353513164, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 32 finished with value: 0.589 and parameters: {'kernel': 'rbf', 'C': 0.902436505094099, 'gamma': 0.8344989353513164, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 02:59:10,788][0m Trial 33 finished with value: 0.584 and parameters: {'kernel': 'rbf', 'C': 0.19125379834050782, 'gamma': 0.9054885966893408, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 33 finished with value: 0.584 and parameters: {'kernel': 'rbf', 'C': 0.19125379834050782, 'gamma': 0.9054885966893408, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 03:06:17,795][0m Trial 34 finished with value: 0.656 and parameters: {'kernel': 'linear', 'C': 1.5097502032425942, 'gamma': 0.673151458475344, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 34 finished with value: 0.656 and parameters: {'kernel': 'linear', 'C': 1.5097502032425942, 'gamma': 0.673151458475344, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 03:06:25,479][0m Trial 35 finished with value: 0.669 and parameters: {'kernel': 'rbf', 'C': 0.8420120266251017, 'gamma': 0.4668291132404676, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 35 finished with value: 0.669 and parameters: {'kernel': 'rbf', 'C': 0.8420120266251017, 'gamma': 0.4668291132404676, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 03:17:40,970][0m Trial 36 finished with value: 0.642 and parameters: {'kernel': 'linear', 'C': 2.2084255539493887, 'gamma': 0.9955202474014198, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 36 finished with value: 0.642 and parameters: {'kernel': 'linear', 'C': 2.2084255539493887, 'gamma': 0.9955202474014198, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 03:17:48,926][0m Trial 37 finished with value: 0.621 and parameters: {'kernel': 'rbf', 'C': 3.035197015066412, 'gamma': 0.7246579017306212, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 37 finished with value: 0.621 and parameters: {'kernel': 'rbf', 'C': 3.035197015066412, 'gamma': 0.7246579017306212, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 03:17:58,186][0m Trial 38 finished with value: 0.619 and parameters: {'kernel': 'rbf', 'C': 1.6311158291122412, 'gamma': 0.853774437592896, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.[0m


Trial 38 finished with value: 0.619 and parameters: {'kernel': 'rbf', 'C': 1.6311158291122412, 'gamma': 0.853774437592896, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.539.


[32m[I 2023-03-02 03:18:15,870][0m Trial 39 finished with value: 0.633 and parameters: {'kernel': 'linear', 'C': 0.7549664510757861, 'gamma': 0.6256739621022606, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.[0m


Trial 39 finished with value: 0.633 and parameters: {'kernel': 'linear', 'C': 0.7549664510757861, 'gamma': 0.6256739621022606, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.539.


In [9]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_gamma,params_kernel,params_normalization,params_scaling,params_tfidf,state
7,7,0.539,2023-03-02 02:35:19.576094,2023-03-02 02:35:28.025263,0 days 00:00:08.449169,8.266747,0.973206,rbf,False,False,True,COMPLETE
22,22,0.562,2023-03-02 02:49:24.099901,2023-03-02 02:49:33.275564,0 days 00:00:09.175663,1.21219,0.909499,rbf,True,True,True,COMPLETE
25,25,0.573,2023-03-02 02:49:51.944888,2023-03-02 02:50:00.889523,0 days 00:00:08.944635,1.182013,0.763908,rbf,True,True,True,COMPLETE
6,6,0.574,2023-03-02 02:35:10.840927,2023-03-02 02:35:19.558089,0 days 00:00:08.717162,1.817477,0.435333,rbf,False,False,False,COMPLETE
18,18,0.576,2023-03-02 02:48:49.304172,2023-03-02 02:48:58.274334,0 days 00:00:08.970162,0.185432,0.792992,rbf,True,True,True,COMPLETE
11,11,0.578,2023-03-02 02:47:50.014429,2023-03-02 02:47:58.224430,0 days 00:00:08.210001,2.584814,0.302065,rbf,False,False,False,COMPLETE
14,14,0.579,2023-03-02 02:48:15.353429,2023-03-02 02:48:23.773135,0 days 00:00:08.419706,2.839318,0.541877,rbf,False,False,False,COMPLETE
33,33,0.584,2023-03-02 02:59:03.158829,2023-03-02 02:59:10.775366,0 days 00:00:07.616537,0.191254,0.905489,rbf,False,False,True,COMPLETE
12,12,0.588,2023-03-02 02:47:58.241428,2023-03-02 02:48:06.029428,0 days 00:00:07.788000,0.581386,0.078892,rbf,False,False,False,COMPLETE
32,32,0.589,2023-03-02 02:58:54.087201,2023-03-02 02:59:03.142411,0 days 00:00:09.055210,0.902437,0.834499,rbf,True,True,False,COMPLETE


In [10]:
print(f"Best SVR regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best SVR regression params: {'C': 8.266746610677888, 'gamma': 0.9732063755520685, 'kernel': 'rbf', 'normalization': False, 'scaling': False, 'tfidf': True}, achieving val RMSE: 0.539


In [11]:
save = True
load = False

regressor_params = {"kernel": "rbf", "C": 8.3, "gamma": 0.97}
pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': True,
}

sample = 50000

if load:
    pipe = joblib.load(f"{exp_dir}/svr/pipe")
else:
    regressor = SVR(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train[:sample], y_train[:sample], **pipe_params)
    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
        }

        joblib.dump(pipe, f"{exp_dir}/svr/pipe")
        with open(f"{exp_dir}/svr/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [12]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 0.36
MAE: 0.249


## Hist Gradient Boosting Regressor

Gradient Boosting regression for large datasets. Tune:
* learning rate
* max leaf nodes
* l2 regularization

In [16]:
study = optuna.create_study(study_name="HistGBoost-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-20 18:32:50,351][0m A new study created in RDB with name: HistGBoost-regression[0m


A new study created in RDB with name: HistGBoost-regression


In [None]:
def objective(trial):

    learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.5)
    max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 10, 60)
    l2_regularization = trial.suggest_float("l2_regularization", 0.1, 10.0)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = HistGradientBoostingRegressor(learning_rate=learning_rate, max_leaf_nodes=max_leaf_nodes, l2_regularization=l2_regularization)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling,
        requires_dense=True,
    )

    return score

study.optimize(objective, n_trials=40)

In [20]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_l2_regularization,params_learning_rate,params_max_leaf_nodes,params_normalization,params_scaling,params_tfidf,state
27,27,1.159,2023-02-20 18:52:59.665193,2023-02-20 18:53:17.724195,0 days 00:00:18.059002,2.744975,0.035091,11,False,False,False,COMPLETE
0,0,1.166,2023-02-20 18:32:54.411005,2023-02-20 18:33:35.052018,0 days 00:00:40.641013,0.60428,0.017179,25,False,False,False,COMPLETE
23,23,1.171,2023-02-20 18:50:36.498693,2023-02-20 18:51:10.542193,0 days 00:00:34.043500,3.965629,0.122618,21,False,False,False,COMPLETE
17,17,1.176,2023-02-20 18:46:51.490254,2023-02-20 18:47:23.308750,0 days 00:00:31.818496,9.630193,0.102127,20,False,False,False,COMPLETE
12,12,1.193,2023-02-20 18:43:53.610250,2023-02-20 18:44:29.325749,0 days 00:00:35.715499,5.683796,0.296192,23,False,False,False,COMPLETE
22,22,1.208,2023-02-20 18:49:51.526250,2023-02-20 18:50:36.479193,0 days 00:00:44.952943,5.153518,0.061101,28,False,False,False,COMPLETE
24,24,1.21,2023-02-20 18:51:10.562693,2023-02-20 18:51:32.506193,0 days 00:00:21.943500,1.272847,0.126748,14,False,False,False,COMPLETE
33,33,1.222,2023-02-20 18:55:34.949193,2023-02-20 18:56:00.773694,0 days 00:00:25.824501,4.295648,0.084432,16,False,False,False,COMPLETE
32,32,1.224,2023-02-20 18:55:02.206195,2023-02-20 18:55:34.929193,0 days 00:00:32.722998,3.248474,0.140634,21,False,False,False,COMPLETE
35,35,1.231,2023-02-20 18:56:52.135694,2023-02-20 18:57:19.166193,0 days 00:00:27.030499,0.774214,0.173554,17,True,True,True,COMPLETE


In [21]:
print(f"Best HistGBoost regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best HistGBoost regression params: {'l2_regularization': 2.7449752057437693, 'learning_rate': 0.03509097403389229, 'max_leaf_nodes': 11, 'normalization': False, 'scaling': False, 'tfidf': False}, achieving val RMSE: 1.159


In [23]:
save = True
load = False

regressor_params = {"learning_rate": 0.035, "max_leaf_nodes": 11, "l2_regularization": 2.75}
pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': False,
    'requires_dense': True,
}

if load:
    pipe = joblib.load(f"{exp_dir}/hist_gradient_boosting/pipe")
else:
    regressor = HistGradientBoostingRegressor(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
        }

        joblib.dump(pipe, f"{exp_dir}/hist_gradient_boosting/pipe")
        with open(f"{exp_dir}/hist_gradient_boosting/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [24]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 1.325
MAE: 0.718
