# Imports

In [1]:
import sys
import json
import logging
import joblib
import numpy as np
import matplotlib.pyplot as plt

import optuna

from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, BayesianRidge, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline

from typing import Union

from bb_energy_prediction import data_utils, sklearn_regressors

  from .autonotebook import tqdm as notebook_tqdm


# Data

In [2]:
no_dupl = True
if no_dupl:
    data_df = data_utils.get_data_df(data_path="../energy_data/no_dupl_data.pkl")
else:
    data_df = data_utils.get_data_df(data_path="../energy_data/data.pkl")

if "bb_embeddings" in data_df.columns:
    data_df = data_df.drop(columns="bb_embeddings")
data_df = data_df.sample(frac=1)
data_df.head(10)

Unnamed: 0,bb,energy
1341,"[mov $0xc %r13d, xor %r14d %r14d, xor %eax %ea...",1.108462
2636,"[movq %rax, lea %r12, movq (%rbp) %rdi, lea %r...",0.776596
1855,"[movl %ebx, movq %r15, movl %r14d, movl %ebx, ...",0.293011
1809,"[movl $0x0, movl %ebx, movq %r14]",0.406841
110,"[and $0x2 %ecx, jz]",0.883783
1581,"[mov %rbp %rsi, lea %rdi, callq, test %rsi %rs...",1.766037
2232,"[movq $0x0, movl %edx, test %edx %edx, jnz 0xa0]",0.865977
850,"[lea %rbx, nopl %eax (%rax), callq, nop %edi %...",0.750367
1271,"[mov %rdi, mov $0x0 %al, callq 0xf0f2, nop %ed...",0.020873
183,"[callq, pushq %rbp, mov %rsp %rbp]",2.867793


In [3]:
if no_dupl:
    shuffled_data_df = data_df.sample(frac=1).reset_index(drop=True)
    test_size = int(0.15 * len(shuffled_data_df))
    test_df = shuffled_data_df[-test_size:]
    train_df = shuffled_data_df[:-test_size]

    print(f"Test data size: {len(test_df)}")
else:
    program_names = data_df.program_name.value_counts().index[4:]
    test_programs = np.random.choice(program_names, 3, replace=False)
    train_val_df = data_df[~data_df.program_name.isin(test_programs)]
    test_df = data_df[data_df.program_name.isin(test_programs)]

    print(f"Test programs: {test_programs}")
    print(f"Test data size: {len(test_df)}")

Test data size: 561


In [4]:
X_train = np.array([" ".join(bb) for bb in train_df.bb.tolist()])
y_train = train_df.energy.values

X_test = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
y_test = test_df.energy.values

In [5]:
cnt_vect = CountVectorizer()
cnt_vect.fit_transform(X_train)
vocab_len = len(cnt_vect.get_feature_names_out())
print(f"Vocab length: {vocab_len}")

Vocab length: 1158


# Regressors

In [6]:
exp_dir = "/Users/thodo/Documents/sxoli/diplomatiki/basic-block-energy-prediction/model_checkpoints/regressors_no_dupl"

In [7]:
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
storage_name = "sqlite:////Users/thodo/Documents/sxoli/diplomatiki/optuna-studies/sklearn-regressors-no-duplicates.db"

## Linear Regression

Simple least squares regression.
No need for extra tuning.

In [7]:
study = optuna.create_study(study_name="linear-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-20 15:33:59,323][0m A new study created in RDB with name: linear-regression[0m


A new study created in RDB with name: linear-regression


In [None]:
def objective(trial):

    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = LinearRegression()
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=10)

In [9]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_normalization,params_scaling,params_tfidf,state
0,0,1.391,2023-02-20 15:34:01.801151,2023-02-20 15:34:02.251864,0 days 00:00:00.450713,False,False,True,COMPLETE
1,1,1.394,2023-02-20 15:34:02.276361,2023-02-20 15:34:02.498864,0 days 00:00:00.222503,False,True,True,COMPLETE
7,7,1.565,2023-02-20 15:34:04.561908,2023-02-20 15:34:04.767401,0 days 00:00:00.205493,True,True,False,COMPLETE
6,6,1.695,2023-02-20 15:34:04.305399,2023-02-20 15:34:04.545399,0 days 00:00:00.240000,True,True,False,COMPLETE
4,4,1.852,2023-02-20 15:34:03.834829,2023-02-20 15:34:04.062158,0 days 00:00:00.227329,True,True,False,COMPLETE
3,3,2.005,2023-02-20 15:34:03.323764,2023-02-20 15:34:03.816773,0 days 00:00:00.493009,False,False,True,COMPLETE
2,2,2.129,2023-02-20 15:34:02.517863,2023-02-20 15:34:03.306765,0 days 00:00:00.788902,False,False,False,COMPLETE
5,5,2.13,2023-02-20 15:34:04.078658,2023-02-20 15:34:04.287906,0 days 00:00:00.209248,True,True,True,COMPLETE
8,8,2.158,2023-02-20 15:34:04.784519,2023-02-20 15:34:04.993911,0 days 00:00:00.209392,False,True,True,COMPLETE
9,9,3.26,2023-02-20 15:34:05.010990,2023-02-20 15:34:05.221483,0 days 00:00:00.210493,False,True,True,COMPLETE


In [10]:
print(f"Best linear regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best linear regression params: {'normalization': False, 'scaling': False, 'tfidf': True}, achieving val RMSE: 1.391


In [10]:
save = True
load = False

pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': True,
}

if load:
    pipe = joblib.load(f"{exp_dir}/linear/pipe")
else:
    regressor = LinearRegression()
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "pipe_params": pipe_params,
            #"test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/linear/pipe")
        with open(f"{exp_dir}/linear/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [11]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 4.706
MAE: 0.908


## Lasso

Linear regression with L1 regularization. Tune:
* alpha (L1 regularization term).

In [13]:
study = optuna.create_study(study_name="lasso-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-20 15:34:33,054][0m A new study created in RDB with name: lasso-regression[0m


A new study created in RDB with name: lasso-regression


In [None]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = Lasso(alpha=alpha)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=20)

In [15]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_normalization,params_scaling,params_tfidf,state
13,13,1.283,2023-02-20 15:34:37.334249,2023-02-20 15:34:37.511251,0 days 00:00:00.177002,8.406526,True,False,False,COMPLETE
7,7,1.329,2023-02-20 15:34:36.252813,2023-02-20 15:34:36.415310,0 days 00:00:00.162497,9.905034,True,False,True,COMPLETE
8,8,1.358,2023-02-20 15:34:36.433306,2023-02-20 15:34:36.587816,0 days 00:00:00.154510,4.451271,True,False,False,COMPLETE
17,17,1.387,2023-02-20 15:34:38.122383,2023-02-20 15:34:38.303883,0 days 00:00:00.181500,8.24698,True,False,True,COMPLETE
0,0,1.391,2023-02-20 15:34:34.972038,2023-02-20 15:34:35.149055,0 days 00:00:00.177017,4.766426,False,False,False,COMPLETE
2,2,1.394,2023-02-20 15:34:35.357539,2023-02-20 15:34:35.526089,0 days 00:00:00.168550,7.28016,True,False,False,COMPLETE
4,4,1.402,2023-02-20 15:34:35.726315,2023-02-20 15:34:35.884015,0 days 00:00:00.157700,2.382495,False,True,False,COMPLETE
10,10,1.406,2023-02-20 15:34:36.783307,2023-02-20 15:34:36.950560,0 days 00:00:00.167253,9.99227,True,False,True,COMPLETE
16,16,1.426,2023-02-20 15:34:37.930738,2023-02-20 15:34:38.104504,0 days 00:00:00.173766,9.882431,True,False,False,COMPLETE
6,6,1.44,2023-02-20 15:34:36.078317,2023-02-20 15:34:36.236313,0 days 00:00:00.157996,1.382539,True,True,True,COMPLETE


In [16]:
print(f"Best lasso regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best lasso regression params: {'alpha': 8.406525586151336, 'normalization': True, 'scaling': False, 'tfidf': False}, achieving val RMSE: 1.283


In [13]:
save = True
load = False

regressor_params = {"alpha": 8.4}
pipe_params = {
    'normalization': True,
    'scaling': False,
    'tfidf': False,
}

if load:
    pipe = joblib.load(f"{exp_dir}/lasso/pipe")
else:
    regressor = Lasso(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            #"test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/lasso/pipe")
        with open(f"{exp_dir}/lasso/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [14]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 2.198
MAE: 0.931


## Ridge

Linear regression with L2 regularization. Tune:
* alpha (L2 regularization term).

In [20]:
study = optuna.create_study(study_name="ridge-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-20 15:35:25,705][0m A new study created in RDB with name: ridge-regression[0m


A new study created in RDB with name: ridge-regression


In [None]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = Ridge(alpha=alpha)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=20)

In [22]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_normalization,params_scaling,params_tfidf,state
17,17,1.137,2023-02-20 15:35:30.496136,2023-02-20 15:35:30.663641,0 days 00:00:00.167505,3.676679,False,False,True,COMPLETE
14,14,1.204,2023-02-20 15:35:29.934280,2023-02-20 15:35:30.108909,0 days 00:00:00.174629,5.854226,False,False,True,COMPLETE
12,12,1.227,2023-02-20 15:35:29.548637,2023-02-20 15:35:29.722652,0 days 00:00:00.174015,9.533101,False,False,True,COMPLETE
15,15,1.234,2023-02-20 15:35:30.125410,2023-02-20 15:35:30.292144,0 days 00:00:00.166734,5.939947,False,False,True,COMPLETE
5,5,1.238,2023-02-20 15:35:28.167013,2023-02-20 15:35:28.335519,0 days 00:00:00.168506,8.184475,False,False,True,COMPLETE
13,13,1.258,2023-02-20 15:35:29.741653,2023-02-20 15:35:29.916161,0 days 00:00:00.174508,9.898587,False,False,True,COMPLETE
10,10,1.26,2023-02-20 15:35:29.175171,2023-02-20 15:35:29.342912,0 days 00:00:00.167741,9.538996,False,False,True,COMPLETE
2,2,1.262,2023-02-20 15:35:27.507175,2023-02-20 15:35:27.674538,0 days 00:00:00.167363,7.608997,True,False,True,COMPLETE
18,18,1.271,2023-02-20 15:35:30.681069,2023-02-20 15:35:30.850963,0 days 00:00:00.169894,3.439453,True,False,False,COMPLETE
3,3,1.276,2023-02-20 15:35:27.692210,2023-02-20 15:35:27.950722,0 days 00:00:00.258512,7.662617,False,False,False,COMPLETE


In [23]:
print(f"Best ridge regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best ridge regression params: {'alpha': 3.6766792495848, 'normalization': False, 'scaling': False, 'tfidf': True}, achieving val RMSE: 1.137


In [15]:
save = True
load = False

regressor_params = {"alpha": 3.6}
pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': True,
}

if load:
    pipe = joblib.load(f"{exp_dir}/ridge/pipe")
else:
    regressor = Ridge(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            #"test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/ridge/pipe")
        with open(f"{exp_dir}/ridge/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [16]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 1.685
MAE: 0.778


## ElasticNet

Linear regression with both L1 and L2 regularization. Tune
* alpha
* l1_ratio. 

In [27]:
study = optuna.create_study(study_name="elasticnet-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-20 15:36:06,708][0m A new study created in RDB with name: elasticnet-regression[0m


A new study created in RDB with name: elasticnet-regression


In [None]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    l1_ratio = trial.suggest_float("l1_ratio", 0.1, 0.9)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=20)

In [29]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_l1_ratio,params_normalization,params_scaling,params_tfidf,state
18,18,1.251,2023-02-20 15:36:10.884320,2023-02-20 15:36:11.199201,0 days 00:00:00.314881,0.272486,0.29562,True,True,True,COMPLETE
6,6,1.341,2023-02-20 15:36:08.474913,2023-02-20 15:36:08.656431,0 days 00:00:00.181518,4.106715,0.152726,True,False,True,COMPLETE
14,14,1.345,2023-02-20 15:36:10.068108,2023-02-20 15:36:10.245156,0 days 00:00:00.177048,1.849326,0.362417,False,False,False,COMPLETE
8,8,1.362,2023-02-20 15:36:08.868852,2023-02-20 15:36:09.045848,0 days 00:00:00.176996,4.893964,0.426713,False,False,False,COMPLETE
19,19,1.374,2023-02-20 15:36:11.218698,2023-02-20 15:36:11.541705,0 days 00:00:00.323007,0.298526,0.275196,True,True,True,COMPLETE
12,12,1.377,2023-02-20 15:36:09.661214,2023-02-20 15:36:09.844556,0 days 00:00:00.183342,6.025417,0.373996,False,False,False,COMPLETE
16,16,1.38,2023-02-20 15:36:10.474854,2023-02-20 15:36:10.669320,0 days 00:00:00.194466,1.697139,0.100542,True,False,False,COMPLETE
17,17,1.386,2023-02-20 15:36:10.686823,2023-02-20 15:36:10.866824,0 days 00:00:00.180001,1.861158,0.465428,False,False,True,COMPLETE
13,13,1.43,2023-02-20 15:36:09.862141,2023-02-20 15:36:10.051468,0 days 00:00:00.189327,3.571594,0.116734,False,False,True,COMPLETE
5,5,1.441,2023-02-20 15:36:08.278437,2023-02-20 15:36:08.458293,0 days 00:00:00.179856,8.839634,0.224013,True,True,False,COMPLETE


In [30]:
print(f"Best ElasticNET regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best ElasticNET regression params: {'alpha': 0.2724859546847187, 'l1_ratio': 0.29561996999439866, 'normalization': True, 'scaling': True, 'tfidf': True}, achieving val RMSE: 1.251


In [17]:
save = True
load = False

regressor_params = {"alpha": 0.27, "l1_ratio": 0.29}
pipe_params = {
    'normalization': True,
    'scaling': True,
    'tfidf': True,
}

if load:
    pipe = joblib.load(f"{exp_dir}/elasticnet/pipe")
else:
    regressor = ElasticNet(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            #"test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/elasticnet/pipe")
        with open(f"{exp_dir}/elasticnet/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [18]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 1.765
MAE: 0.801


## SGD Regression

Stochastic Gradient Descent Regression using either of the above reguliration techniques. Tune:
* penalty method
* alpha
* l1_ratio (if elasticnet penalty).

In [33]:
study = optuna.create_study(study_name="SGD-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-20 15:36:36,171][0m A new study created in RDB with name: SGD-regression[0m


A new study created in RDB with name: SGD-regression


In [None]:
def objective(trial):

    penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"])
    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    l1_ratio = trial.suggest_float("l1_ratio", 0.1, 0.1)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = SGDRegressor(penalty=penalty, alpha=alpha, l1_ratio=l1_ratio)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=40)

In [29]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_l1_ratio,params_normalization,params_penalty,params_scaling,params_tfidf,state
33,33,0.646,2023-02-12 20:19:40.921652,2023-02-12 20:19:55.090151,0 days 00:00:14.168499,4.324908,0.1,False,l2,False,False,COMPLETE
27,27,0.646,2023-02-12 20:18:15.011229,2023-02-12 20:18:29.342369,0 days 00:00:14.331140,4.394182,0.1,False,l2,False,False,COMPLETE
36,36,0.649,2023-02-12 20:20:23.841917,2023-02-12 20:20:38.031959,0 days 00:00:14.190042,2.82342,0.1,False,l2,False,False,COMPLETE
38,38,0.654,2023-02-12 20:20:52.464274,2023-02-12 20:21:06.633232,0 days 00:00:14.168958,2.761732,0.1,False,l2,False,False,COMPLETE
32,32,0.654,2023-02-12 20:19:26.693659,2023-02-12 20:19:40.906652,0 days 00:00:14.212993,4.428107,0.1,False,l2,False,False,COMPLETE
23,23,0.655,2023-02-12 20:17:18.110064,2023-02-12 20:17:32.220056,0 days 00:00:14.109992,6.133459,0.1,False,l2,False,False,COMPLETE
31,31,0.658,2023-02-12 20:19:12.543496,2023-02-12 20:19:26.676186,0 days 00:00:14.132690,6.027586,0.1,False,l2,False,False,COMPLETE
17,17,0.66,2023-02-12 20:15:51.947395,2023-02-12 20:16:06.275483,0 days 00:00:14.328088,5.391655,0.1,False,l2,False,False,COMPLETE
24,24,0.661,2023-02-12 20:17:32.235063,2023-02-12 20:17:46.365057,0 days 00:00:14.129994,5.782303,0.1,False,l2,False,False,COMPLETE
30,30,0.662,2023-02-12 20:18:58.337785,2023-02-12 20:19:12.528499,0 days 00:00:14.190714,6.127276,0.1,False,l2,False,False,COMPLETE


In [38]:
print(f"Best SGD regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best SGD regression params: {'alpha': 1.341812665544318, 'l1_ratio': 0.1, 'normalization': False, 'penalty': 'l2', 'scaling': False, 'tfidf': False}, achieving val RMSE: 1.245


In [19]:
save = True
load = False

regressor_params = {"alpha": 1.34, "l1_ratio": 0.1, "penalty": "l2"}
pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': False,
}

if load:
    pipe = joblib.load(f"{exp_dir}/sgd/pipe")
else:
    regressor = SGDRegressor(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            #"test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/sgd/pipe")
        with open(f"{exp_dir}/sgd/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [20]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 2.02
MAE: 0.744


## SVR

Support Vector Regression. Tune:
* kernel
* gamma
* C

In [8]:
study = optuna.create_study(study_name="SVR-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-20 17:57:42,719][0m A new study created in RDB with name: SVR-regression[0m


A new study created in RDB with name: SVR-regression


In [None]:
def objective(trial):

    kernel = trial.suggest_categorical("kernel", ["rbf", "linear"])
    C = trial.suggest_float("C", 0.1, 10.0)
    gamma = trial.suggest_float("gamma", 0.1, 0.1)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = SVR(kernel=kernel, C=C, gamma=gamma)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train[:10000],
        y=y_train[:10000],
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=40)

In [10]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_gamma,params_kernel,params_normalization,params_scaling,params_tfidf,state
22,22,1.189,2023-02-20 18:30:22.377620,2023-02-20 18:30:24.073617,0 days 00:00:01.695997,7.200131,0.1,rbf,True,False,False,COMPLETE
18,18,1.192,2023-02-20 18:30:13.887883,2023-02-20 18:30:15.588621,0 days 00:00:01.700738,9.99097,0.1,rbf,True,False,False,COMPLETE
9,9,1.198,2023-02-20 18:29:36.954581,2023-02-20 18:29:38.842582,0 days 00:00:01.888001,3.756106,0.1,linear,True,False,False,COMPLETE
25,25,1.204,2023-02-20 18:30:27.460547,2023-02-20 18:30:29.144814,0 days 00:00:01.684267,7.38977,0.1,rbf,True,False,False,COMPLETE
26,26,1.206,2023-02-20 18:30:29.161903,2023-02-20 18:30:30.893968,0 days 00:00:01.732065,9.662281,0.1,rbf,True,False,True,COMPLETE
10,10,1.223,2023-02-20 18:29:38.860089,2023-02-20 18:29:40.609037,0 days 00:00:01.748948,5.50913,0.1,rbf,False,False,False,COMPLETE
32,32,1.23,2023-02-20 18:30:40.980095,2023-02-20 18:30:42.686375,0 days 00:00:01.706280,8.393829,0.1,rbf,True,False,False,COMPLETE
31,31,1.231,2023-02-20 18:30:39.276778,2023-02-20 18:30:40.963397,0 days 00:00:01.686619,7.26642,0.1,rbf,True,False,False,COMPLETE
6,6,1.233,2023-02-20 18:09:47.242202,2023-02-20 18:09:49.150196,0 days 00:00:01.907994,3.022796,0.1,linear,True,False,True,COMPLETE
24,24,1.238,2023-02-20 18:30:25.765158,2023-02-20 18:30:27.443546,0 days 00:00:01.678388,7.606435,0.1,rbf,True,False,False,COMPLETE


In [12]:
print(f"Best SVR regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best SVR regression params: {'C': 7.2001310227775885, 'gamma': 0.1, 'kernel': 'rbf', 'normalization': True, 'scaling': False, 'tfidf': False}, achieving val RMSE: 1.189


In [14]:
save = True
load = False

regressor_params = {"kernel": "rbf", "C": 7.2, "gamma": 0.1}
pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': False,
}

sample = 50000

if load:
    pipe = joblib.load(f"{exp_dir}/svr/pipe")
else:
    regressor = SVR(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train[:sample], y_train[:sample], **pipe_params)
    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
        }

        joblib.dump(pipe, f"{exp_dir}/svr/pipe")
        with open(f"{exp_dir}/svr/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [15]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 1.375
MAE: 0.673


## Hist Gradient Boosting Regressor

Gradient Boosting regression for large datasets. Tune:
* learning rate
* max leaf nodes
* l2 regularization

In [16]:
study = optuna.create_study(study_name="HistGBoost-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-20 18:32:50,351][0m A new study created in RDB with name: HistGBoost-regression[0m


A new study created in RDB with name: HistGBoost-regression


In [None]:
def objective(trial):

    learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.5)
    max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 10, 60)
    l2_regularization = trial.suggest_float("l2_regularization", 0.1, 10.0)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = HistGradientBoostingRegressor(learning_rate=learning_rate, max_leaf_nodes=max_leaf_nodes, l2_regularization=l2_regularization)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling,
        requires_dense=True,
    )

    return score

study.optimize(objective, n_trials=40)

In [20]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_l2_regularization,params_learning_rate,params_max_leaf_nodes,params_normalization,params_scaling,params_tfidf,state
27,27,1.159,2023-02-20 18:52:59.665193,2023-02-20 18:53:17.724195,0 days 00:00:18.059002,2.744975,0.035091,11,False,False,False,COMPLETE
0,0,1.166,2023-02-20 18:32:54.411005,2023-02-20 18:33:35.052018,0 days 00:00:40.641013,0.60428,0.017179,25,False,False,False,COMPLETE
23,23,1.171,2023-02-20 18:50:36.498693,2023-02-20 18:51:10.542193,0 days 00:00:34.043500,3.965629,0.122618,21,False,False,False,COMPLETE
17,17,1.176,2023-02-20 18:46:51.490254,2023-02-20 18:47:23.308750,0 days 00:00:31.818496,9.630193,0.102127,20,False,False,False,COMPLETE
12,12,1.193,2023-02-20 18:43:53.610250,2023-02-20 18:44:29.325749,0 days 00:00:35.715499,5.683796,0.296192,23,False,False,False,COMPLETE
22,22,1.208,2023-02-20 18:49:51.526250,2023-02-20 18:50:36.479193,0 days 00:00:44.952943,5.153518,0.061101,28,False,False,False,COMPLETE
24,24,1.21,2023-02-20 18:51:10.562693,2023-02-20 18:51:32.506193,0 days 00:00:21.943500,1.272847,0.126748,14,False,False,False,COMPLETE
33,33,1.222,2023-02-20 18:55:34.949193,2023-02-20 18:56:00.773694,0 days 00:00:25.824501,4.295648,0.084432,16,False,False,False,COMPLETE
32,32,1.224,2023-02-20 18:55:02.206195,2023-02-20 18:55:34.929193,0 days 00:00:32.722998,3.248474,0.140634,21,False,False,False,COMPLETE
35,35,1.231,2023-02-20 18:56:52.135694,2023-02-20 18:57:19.166193,0 days 00:00:27.030499,0.774214,0.173554,17,True,True,True,COMPLETE


In [21]:
print(f"Best HistGBoost regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best HistGBoost regression params: {'l2_regularization': 2.7449752057437693, 'learning_rate': 0.03509097403389229, 'max_leaf_nodes': 11, 'normalization': False, 'scaling': False, 'tfidf': False}, achieving val RMSE: 1.159


In [23]:
save = True
load = False

regressor_params = {"learning_rate": 0.035, "max_leaf_nodes": 11, "l2_regularization": 2.75}
pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': False,
    'requires_dense': True,
}

if load:
    pipe = joblib.load(f"{exp_dir}/hist_gradient_boosting/pipe")
else:
    regressor = HistGradientBoostingRegressor(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
        }

        joblib.dump(pipe, f"{exp_dir}/hist_gradient_boosting/pipe")
        with open(f"{exp_dir}/hist_gradient_boosting/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [24]:
test_bbs = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
test_preds = pipe.predict(test_bbs)
true_energies = test_df.energy.values

median = np.median(true_energies)
std = np.std(true_energies)
mse = round(mean_squared_error(true_energies, test_preds), 3)
mae = round(mean_absolute_error(true_energies, test_preds), 3)

print(f"MSE: {mse}")
print(f"MAE: {mae}")

MSE: 1.325
MAE: 0.718
