# Imports

In [1]:
import sys
import json
import logging
import joblib
import numpy as np
import matplotlib.pyplot as plt

import optuna

from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, BayesianRidge, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline

from typing import Union

from bb_energy_prediction import data_utils, sklearn_regressors

  from .autonotebook import tqdm as notebook_tqdm


# Data

In [2]:
data_df = data_utils.get_data_df(data_path=f"../energy_data/data.pkl")
if "bb_embeddings" in data_df.columns:
    data_df = data_df.drop(columns="bb_embeddings")
data_df = data_df.sample(frac=1)
data_df.head(10)

Unnamed: 0,bb,energy,program_name
145775,"[movq %rcx, mov %rcx %rdx, mov %r11 %rsi, mov ...",2.884094,function_pointer
34647,"[mov %rcx, nopw %ax, mov %rdi %rax, mov %rdi %...",0.000583,find_biggest
510665,"[movq %rdx (%rsi), jmp]",0.25961,variable_name
5424,[jb 0x7],0.012634,bubblesort
440869,"[lea %rdi, movq %rdi, movq %rdi, movq %rax, jmp]",0.519218,variable_name
455246,"[sub $0x1 %rax, cmpb $0xa (%rax), jz 0x111]",0.151438,variable_name
317234,"[cmp %rax %r12, jb]",0.014741,simple_sort
138583,"[mov %rcx %rax, jmp]",0.508958,function_pointer
260558,"[cmpq $0x0, jz 0x4a]",0.221693,polinominal
133856,"[add $0x8 %rsp, popq %rbx, popq %rbp, popq %r1...",7.9e-05,function_pointer


In [3]:
program_names = data_df.program_name.value_counts().index[4:]
#test_programs = np.random.choice(program_names, 3, replace=False)
test_programs = ["faa", "quicksort", "declare"]
train_df = data_df[~data_df.program_name.isin(test_programs)]
test_df = data_df[data_df.program_name.isin(test_programs)]

print(f"Test programs: {test_programs}")
print(f"Test data size: {len(test_df)}")

Test programs: ['faa', 'quicksort', 'declare']
Test data size: 28575


In [4]:
X_train = np.array([" ".join(bb) for bb in train_df.bb.tolist()])
y_train = train_df.energy.values

X_test = np.array([" ".join(bb) for bb in test_df.bb.tolist()])
y_test = test_df.energy.values

In [5]:
cnt_vect = CountVectorizer()
cnt_vect.fit_transform(X_train)
vocab_len = len(cnt_vect.get_feature_names_out())
print(f"Vocab length: {vocab_len}")

Vocab length: 1258


# Regressors

In [6]:
exp_dir = "/Users/thodo/Documents/sxoli/diplomatiki/basic-block-energy-prediction/model_checkpoints/regressors"

In [7]:
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
storage_name = "sqlite:////Users/thodo/Documents/sxoli/diplomatiki/optuna-studies/sklearn-regressors.db"

## Linear Regression

Simple least squares regression.
No need for extra tuning.

In [8]:
study = optuna.create_study(study_name="linear-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-13 10:08:23,477][0m Using an existing study with name 'linear-regression' instead of creating a new one.[0m


Using an existing study with name 'linear-regression' instead of creating a new one.


In [9]:
def objective(trial):

    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = LinearRegression()
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=10)

[32m[I 2023-02-12 19:27:08,112][0m Trial 0 finished with value: 0.584 and parameters: {'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.584.[0m


Trial 0 finished with value: 0.584 and parameters: {'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.584.


[32m[I 2023-02-12 19:27:42,693][0m Trial 1 finished with value: 0.586 and parameters: {'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.584.[0m


Trial 1 finished with value: 0.586 and parameters: {'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.584.


[32m[I 2023-02-12 19:28:08,316][0m Trial 2 finished with value: 0.577 and parameters: {'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 2 with value: 0.577.[0m


Trial 2 finished with value: 0.577 and parameters: {'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 2 with value: 0.577.


[32m[I 2023-02-12 19:28:32,625][0m Trial 3 finished with value: 0.581 and parameters: {'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 2 with value: 0.577.[0m


Trial 3 finished with value: 0.581 and parameters: {'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 2 with value: 0.577.


[32m[I 2023-02-12 19:30:41,874][0m Trial 4 finished with value: 0.59 and parameters: {'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 2 with value: 0.577.[0m


Trial 4 finished with value: 0.59 and parameters: {'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 2 with value: 0.577.


[32m[I 2023-02-12 19:32:29,073][0m Trial 5 finished with value: 0.581 and parameters: {'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 2 with value: 0.577.[0m


Trial 5 finished with value: 0.581 and parameters: {'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 2 with value: 0.577.


[32m[I 2023-02-12 19:34:42,330][0m Trial 6 finished with value: 0.579 and parameters: {'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 2 with value: 0.577.[0m


Trial 6 finished with value: 0.579 and parameters: {'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 2 with value: 0.577.


[32m[I 2023-02-12 19:36:25,277][0m Trial 7 finished with value: 0.582 and parameters: {'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 2 with value: 0.577.[0m


Trial 7 finished with value: 0.582 and parameters: {'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 2 with value: 0.577.


[32m[I 2023-02-12 19:38:38,708][0m Trial 8 finished with value: 0.587 and parameters: {'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 2 with value: 0.577.[0m


Trial 8 finished with value: 0.587 and parameters: {'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 2 with value: 0.577.


[32m[I 2023-02-12 19:39:03,423][0m Trial 9 finished with value: 0.578 and parameters: {'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 2 with value: 0.577.[0m


Trial 9 finished with value: 0.578 and parameters: {'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 2 with value: 0.577.


In [9]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_normalization,params_scaling,params_tfidf,state
2,2,0.577,2023-02-12 19:27:42.697463,2023-02-12 19:28:08.303165,0 days 00:00:25.605702,True,True,True,COMPLETE
9,9,0.578,2023-02-12 19:38:38.712430,2023-02-12 19:39:03.410964,0 days 00:00:24.698534,True,True,True,COMPLETE
6,6,0.579,2023-02-12 19:32:29.077328,2023-02-12 19:34:42.318137,0 days 00:02:13.240809,False,False,False,COMPLETE
3,3,0.581,2023-02-12 19:28:08.320091,2023-02-12 19:28:32.611529,0 days 00:00:24.291438,True,True,False,COMPLETE
5,5,0.581,2023-02-12 19:30:41.877848,2023-02-12 19:32:29.060359,0 days 00:01:47.182511,True,False,True,COMPLETE
7,7,0.582,2023-02-12 19:34:42.333613,2023-02-12 19:36:25.262902,0 days 00:01:42.929289,True,False,True,COMPLETE
0,0,0.584,2023-02-12 19:25:00.574415,2023-02-12 19:27:08.093968,0 days 00:02:07.519553,True,False,False,COMPLETE
1,1,0.586,2023-02-12 19:27:08.116439,2023-02-12 19:27:42.680967,0 days 00:00:34.564528,False,True,False,COMPLETE
8,8,0.587,2023-02-12 19:36:25.280903,2023-02-12 19:38:38.695933,0 days 00:02:13.415030,False,False,False,COMPLETE
4,4,0.59,2023-02-12 19:28:32.628512,2023-02-12 19:30:41.862347,0 days 00:02:09.233835,True,False,False,COMPLETE


In [10]:
print(f"Best linear regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best linear regression params: {'normalization': True, 'scaling': True, 'tfidf': True}, achieving val RMSE: 0.577


In [11]:
save = True
load = False

pipe_params = {
    'normalization': True,
    'scaling': True,
    'tfidf': True,
}

if load:
    pipe = joblib.load(f"{exp_dir}/linear/pipe")
else:
    regressor = LinearRegression()
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "pipe_params": pipe_params,
            "test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/linear/pipe")
        with open(f"{exp_dir}/linear/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [12]:
maes = {}
prog_energy = {}

for test_program in test_programs:
    prog_df = test_df[test_df.program_name == test_program]
    prog_bbs = np.array([" ".join(bb) for bb in prog_df.bb.tolist()])
    test_preds = pipe.predict(prog_bbs)
    true_energies = prog_df.energy.values
    
    maes[test_program] = mean_absolute_error(true_energies, test_preds)
    prog_energy[test_program] = {
        "true_energy": sum(true_energies),
        "pred_energy": sum(test_preds),
    }

print("Mean absolute error for test programs")
print(f"{maes}\n")

print("Total energies for test programs")
print(prog_energy)

Mean absolute error for test programs
{'faa': 0.8564875429972671, 'quicksort': 0.2580165094097379, 'declare': 0.42751488030627116}

Total energies for test programs
{'faa': {'true_energy': 1593.1851865544543, 'pred_energy': 581.280986254267}, 'quicksort': {'true_energy': 4246.078264105911, 'pred_energy': 7350.651970431555}, 'declare': {'true_energy': 6382.83618144921, 'pred_energy': 3866.6772622437725}}


## Lasso

Linear regression with L1 regularization. Tune:
* alpha (L1 regularization term).

In [13]:
study = optuna.create_study(study_name="lasso-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-13 10:08:44,491][0m Using an existing study with name 'lasso-regression' instead of creating a new one.[0m


Using an existing study with name 'lasso-regression' instead of creating a new one.


In [13]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = Lasso(alpha=alpha)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=20)

[32m[I 2023-02-12 19:39:18,857][0m Trial 0 finished with value: 0.698 and parameters: {'alpha': 4.814427097179507, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 0 finished with value: 0.698 and parameters: {'alpha': 4.814427097179507, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:39:33,720][0m Trial 1 finished with value: 0.711 and parameters: {'alpha': 1.88409913571284, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 1 finished with value: 0.711 and parameters: {'alpha': 1.88409913571284, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:39:48,553][0m Trial 2 finished with value: 0.703 and parameters: {'alpha': 1.9321236682453649, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 2 finished with value: 0.703 and parameters: {'alpha': 1.9321236682453649, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:40:03,477][0m Trial 3 finished with value: 0.705 and parameters: {'alpha': 2.2421514399121314, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.698.[0m


Trial 3 finished with value: 0.705 and parameters: {'alpha': 2.2421514399121314, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:40:18,085][0m Trial 4 finished with value: 0.704 and parameters: {'alpha': 6.4155008735014185, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 4 finished with value: 0.704 and parameters: {'alpha': 6.4155008735014185, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:40:33,252][0m Trial 5 finished with value: 0.704 and parameters: {'alpha': 3.69903251896639, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.698.[0m


Trial 5 finished with value: 0.704 and parameters: {'alpha': 3.69903251896639, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:40:48,281][0m Trial 6 finished with value: 0.713 and parameters: {'alpha': 1.7016988574638576, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 6 finished with value: 0.713 and parameters: {'alpha': 1.7016988574638576, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:41:03,500][0m Trial 7 finished with value: 0.707 and parameters: {'alpha': 7.366511932718572, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.698.[0m


Trial 7 finished with value: 0.707 and parameters: {'alpha': 7.366511932718572, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:41:18,635][0m Trial 8 finished with value: 0.7 and parameters: {'alpha': 9.393990157968663, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 8 finished with value: 0.7 and parameters: {'alpha': 9.393990157968663, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:41:33,946][0m Trial 9 finished with value: 0.706 and parameters: {'alpha': 6.701910400002038, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.698.[0m


Trial 9 finished with value: 0.706 and parameters: {'alpha': 6.701910400002038, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:41:48,807][0m Trial 10 finished with value: 0.706 and parameters: {'alpha': 4.299597659352175, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 10 finished with value: 0.706 and parameters: {'alpha': 4.299597659352175, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:42:03,897][0m Trial 11 finished with value: 0.707 and parameters: {'alpha': 9.52087418076839, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 11 finished with value: 0.707 and parameters: {'alpha': 9.52087418076839, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:42:18,854][0m Trial 12 finished with value: 0.702 and parameters: {'alpha': 9.714165008426788, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 12 finished with value: 0.702 and parameters: {'alpha': 9.714165008426788, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:42:34,036][0m Trial 13 finished with value: 0.704 and parameters: {'alpha': 5.405069881427979, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 13 finished with value: 0.704 and parameters: {'alpha': 5.405069881427979, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:42:49,174][0m Trial 14 finished with value: 0.704 and parameters: {'alpha': 8.460035782330909, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 14 finished with value: 0.704 and parameters: {'alpha': 8.460035782330909, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:43:04,238][0m Trial 15 finished with value: 0.71 and parameters: {'alpha': 7.985507283266603, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 15 finished with value: 0.71 and parameters: {'alpha': 7.985507283266603, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:43:18,996][0m Trial 16 finished with value: 0.71 and parameters: {'alpha': 5.315801116001432, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 16 finished with value: 0.71 and parameters: {'alpha': 5.315801116001432, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:43:34,044][0m Trial 17 finished with value: 0.71 and parameters: {'alpha': 3.3599300135642705, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 17 finished with value: 0.71 and parameters: {'alpha': 3.3599300135642705, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:43:49,049][0m Trial 18 finished with value: 0.701 and parameters: {'alpha': 6.1804517447271206, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.698.[0m


Trial 18 finished with value: 0.701 and parameters: {'alpha': 6.1804517447271206, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.698.


[32m[I 2023-02-12 19:44:04,215][0m Trial 19 finished with value: 0.705 and parameters: {'alpha': 0.5037687817231662, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.[0m


Trial 19 finished with value: 0.705 and parameters: {'alpha': 0.5037687817231662, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.698.


In [14]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_normalization,params_scaling,params_tfidf,state
0,0,0.698,2023-02-12 19:39:03.656946,2023-02-12 19:39:18.840222,0 days 00:00:15.183276,4.814427,True,False,False,COMPLETE
8,8,0.7,2023-02-12 19:41:03.503741,2023-02-12 19:41:18.622516,0 days 00:00:15.118775,9.39399,True,False,True,COMPLETE
18,18,0.701,2023-02-12 19:43:34.047270,2023-02-12 19:43:49.035790,0 days 00:00:14.988520,6.180452,True,True,False,COMPLETE
12,12,0.702,2023-02-12 19:42:03.900519,2023-02-12 19:42:18.840543,0 days 00:00:14.940024,9.714165,True,False,False,COMPLETE
2,2,0.703,2023-02-12 19:39:33.723236,2023-02-12 19:39:48.541263,0 days 00:00:14.818027,1.932124,True,False,False,COMPLETE
14,14,0.704,2023-02-12 19:42:34.039209,2023-02-12 19:42:49.161452,0 days 00:00:15.122243,8.460036,True,False,True,COMPLETE
13,13,0.704,2023-02-12 19:42:18.858010,2023-02-12 19:42:34.022210,0 days 00:00:15.164200,5.40507,True,False,True,COMPLETE
4,4,0.704,2023-02-12 19:40:03.481276,2023-02-12 19:40:18.072702,0 days 00:00:14.591426,6.415501,False,False,False,COMPLETE
5,5,0.704,2023-02-12 19:40:18.088171,2023-02-12 19:40:33.238692,0 days 00:00:15.150521,3.699033,False,True,True,COMPLETE
19,19,0.705,2023-02-12 19:43:49.052268,2023-02-12 19:44:04.203400,0 days 00:00:15.151132,0.503769,True,False,True,COMPLETE


In [15]:
print(f"Best lasso regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best lasso regression params: {'alpha': 4.814427097179507, 'normalization': True, 'scaling': False, 'tfidf': False}, achieving val RMSE: 0.698


In [16]:
save = True
load = False

regressor_params = {"alpha": 4.81}
pipe_params = {
    'normalization': True,
    'scaling': False,
    'tfidf': False,
}

if load:
    pipe = joblib.load(f"{exp_dir}/lasso/pipe")
else:
    regressor = Lasso(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            "test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/lasso/pipe")
        with open(f"{exp_dir}/lasso/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [17]:
maes = {}
prog_energy = {}

for test_program in test_programs:
    prog_df = test_df[test_df.program_name == test_program]
    prog_bbs = np.array([" ".join(bb) for bb in prog_df.bb.tolist()])
    test_preds = pipe.predict(prog_bbs)
    true_energies = prog_df.energy.values
    
    maes[test_program] = mean_absolute_error(true_energies, test_preds)
    prog_energy[test_program] = {
        "true_energy": sum(true_energies),
        "pred_energy": sum(test_preds),
    }

print("Mean absolute error for test programs")
print(f"{maes}\n")

print("Total energies for test programs")
print(prog_energy)

Mean absolute error for test programs
{'faa': 0.9109510074307886, 'quicksort': 0.31704603813120613, 'declare': 0.4567457793792269}

Total energies for test programs
{'faa': {'true_energy': 1593.1851865544543, 'pred_energy': 475.4874028864406}, 'quicksort': {'true_energy': 4246.078264105911, 'pred_energy': 6599.130136312516}, 'declare': {'true_energy': 6382.83618144921, 'pred_energy': 3989.757491321512}}


## Ridge

Linear regression with L2 regularization. Tune:
* alpha (L2 regularization term).

In [18]:
study = optuna.create_study(study_name="ridge-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-13 10:08:59,535][0m Using an existing study with name 'ridge-regression' instead of creating a new one.[0m


Using an existing study with name 'ridge-regression' instead of creating a new one.


In [24]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = Ridge(alpha=alpha)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=20)

[32m[I 2023-02-13 00:17:52,008][0m Trial 0 finished with value: 0.571 and parameters: {'alpha': 4.580125801925755, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.571.[0m


Trial 0 finished with value: 0.571 and parameters: {'alpha': 4.580125801925755, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.571.


[32m[I 2023-02-13 00:18:07,805][0m Trial 1 finished with value: 0.578 and parameters: {'alpha': 0.9453533393144836, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.571.[0m


Trial 1 finished with value: 0.578 and parameters: {'alpha': 0.9453533393144836, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.571.


[32m[I 2023-02-13 00:18:25,037][0m Trial 2 finished with value: 0.586 and parameters: {'alpha': 4.883188382960704, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.571.[0m


Trial 2 finished with value: 0.586 and parameters: {'alpha': 4.883188382960704, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.571.


[32m[I 2023-02-13 00:18:40,865][0m Trial 3 finished with value: 0.583 and parameters: {'alpha': 5.550875241887742, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.571.[0m


Trial 3 finished with value: 0.583 and parameters: {'alpha': 5.550875241887742, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.571.


[32m[I 2023-02-13 00:18:55,908][0m Trial 4 finished with value: 0.574 and parameters: {'alpha': 7.909947413280453, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.571.[0m


Trial 4 finished with value: 0.574 and parameters: {'alpha': 7.909947413280453, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.571.


[32m[I 2023-02-13 00:19:11,036][0m Trial 5 finished with value: 0.572 and parameters: {'alpha': 8.685663080931134, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.571.[0m


Trial 5 finished with value: 0.572 and parameters: {'alpha': 8.685663080931134, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.571.


[32m[I 2023-02-13 00:19:26,152][0m Trial 6 finished with value: 0.579 and parameters: {'alpha': 8.485506050956728, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.571.[0m


Trial 6 finished with value: 0.579 and parameters: {'alpha': 8.485506050956728, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.571.


[32m[I 2023-02-13 00:19:41,169][0m Trial 7 finished with value: 0.58 and parameters: {'alpha': 1.4558659255882256, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.571.[0m


Trial 7 finished with value: 0.58 and parameters: {'alpha': 1.4558659255882256, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.571.


[32m[I 2023-02-13 00:19:56,137][0m Trial 8 finished with value: 0.578 and parameters: {'alpha': 7.862033032375683, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.571.[0m


Trial 8 finished with value: 0.578 and parameters: {'alpha': 7.862033032375683, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.571.


[32m[I 2023-02-13 00:20:11,516][0m Trial 9 finished with value: 0.576 and parameters: {'alpha': 5.1819958790924, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.571.[0m


Trial 9 finished with value: 0.576 and parameters: {'alpha': 5.1819958790924, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.571.


[32m[I 2023-02-13 00:20:32,329][0m Trial 10 finished with value: 0.571 and parameters: {'alpha': 2.8731494215238524, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.571.[0m


Trial 10 finished with value: 0.571 and parameters: {'alpha': 2.8731494215238524, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.571.


[32m[I 2023-02-13 00:20:52,490][0m Trial 11 finished with value: 0.569 and parameters: {'alpha': 2.964135035227021, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.[0m


Trial 11 finished with value: 0.569 and parameters: {'alpha': 2.964135035227021, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.


[32m[I 2023-02-13 00:21:12,857][0m Trial 12 finished with value: 0.569 and parameters: {'alpha': 2.705393400580099, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.[0m


Trial 12 finished with value: 0.569 and parameters: {'alpha': 2.705393400580099, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.


[32m[I 2023-02-13 00:21:33,086][0m Trial 13 finished with value: 0.576 and parameters: {'alpha': 2.5337330297298335, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.[0m


Trial 13 finished with value: 0.576 and parameters: {'alpha': 2.5337330297298335, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.


[32m[I 2023-02-13 00:21:57,419][0m Trial 14 finished with value: 0.574 and parameters: {'alpha': 0.24505083701357444, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.[0m


Trial 14 finished with value: 0.574 and parameters: {'alpha': 0.24505083701357444, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.


[32m[I 2023-02-13 00:22:17,742][0m Trial 15 finished with value: 0.573 and parameters: {'alpha': 2.8752741870100893, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.[0m


Trial 15 finished with value: 0.573 and parameters: {'alpha': 2.8752741870100893, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.


[32m[I 2023-02-13 00:22:37,617][0m Trial 16 finished with value: 0.575 and parameters: {'alpha': 3.469536101738413, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.[0m


Trial 16 finished with value: 0.575 and parameters: {'alpha': 3.469536101738413, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.


[32m[I 2023-02-13 00:22:58,228][0m Trial 17 finished with value: 0.581 and parameters: {'alpha': 1.8123402010923577, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.[0m


Trial 17 finished with value: 0.581 and parameters: {'alpha': 1.8123402010923577, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.


[32m[I 2023-02-13 00:23:18,244][0m Trial 18 finished with value: 0.574 and parameters: {'alpha': 3.814716619114115, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.[0m


Trial 18 finished with value: 0.574 and parameters: {'alpha': 3.814716619114115, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.


[32m[I 2023-02-13 00:23:39,486][0m Trial 19 finished with value: 0.575 and parameters: {'alpha': 1.9522098761230005, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.[0m


Trial 19 finished with value: 0.575 and parameters: {'alpha': 1.9522098761230005, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 11 with value: 0.569.


In [19]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_normalization,params_scaling,params_tfidf,state
12,12,0.569,2023-02-13 00:20:52.494345,2023-02-13 00:21:12.844826,0 days 00:00:20.350481,2.705393,False,False,False,COMPLETE
11,11,0.569,2023-02-13 00:20:32.332712,2023-02-13 00:20:52.477348,0 days 00:00:20.144636,2.964135,False,False,False,COMPLETE
0,0,0.571,2023-02-13 00:17:35.266633,2023-02-13 00:17:51.989490,0 days 00:00:16.722857,4.580126,True,False,True,COMPLETE
10,10,0.571,2023-02-13 00:20:11.519570,2023-02-13 00:20:32.313745,0 days 00:00:20.794175,2.873149,False,False,False,COMPLETE
5,5,0.572,2023-02-13 00:18:55.911532,2023-02-13 00:19:11.024032,0 days 00:00:15.112500,8.685663,True,True,True,COMPLETE
15,15,0.573,2023-02-13 00:21:57.422958,2023-02-13 00:22:17.729151,0 days 00:00:20.306193,2.875274,False,False,False,COMPLETE
14,14,0.574,2023-02-13 00:21:33.089674,2023-02-13 00:21:57.407457,0 days 00:00:24.317783,0.245051,False,False,False,COMPLETE
4,4,0.574,2023-02-13 00:18:40.868462,2023-02-13 00:18:55.895532,0 days 00:00:15.027070,7.909947,True,True,True,COMPLETE
18,18,0.574,2023-02-13 00:22:58.231718,2023-02-13 00:23:18.231779,0 days 00:00:20.000061,3.814717,False,False,False,COMPLETE
16,16,0.575,2023-02-13 00:22:17.745150,2023-02-13 00:22:37.604546,0 days 00:00:19.859396,3.469536,False,False,False,COMPLETE


In [20]:
print(f"Best ridge regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best ridge regression params: {'alpha': 2.964135035227021, 'normalization': False, 'scaling': False, 'tfidf': False}, achieving val RMSE: 0.569


In [21]:
save = False
load = True

regressor_params = {"alpha": 2.96}
pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': False,
}

if load:
    pipe = joblib.load(f"{exp_dir}/ridge/pipe")
else:
    regressor = Ridge(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            "test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/ridge/pipe")
        with open(f"{exp_dir}/ridge/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [22]:
maes = {}
prog_energy = {}

for test_program in test_programs:
    prog_df = test_df[test_df.program_name == test_program]
    prog_bbs = np.array([" ".join(bb) for bb in prog_df.bb.tolist()])
    test_preds = pipe.predict(prog_bbs)
    true_energies = prog_df.energy.values
    
    maes[test_program] = mean_absolute_error(true_energies, test_preds)
    prog_energy[test_program] = {
        "true_energy": sum(true_energies),
        "pred_energy": sum(test_preds),
    }

print("Mean absolute error for test programs")
print(f"{maes}\n")

print("Total energies for test programs")
print(prog_energy)

Mean absolute error for test programs
{'faa': 0.8569234326811495, 'quicksort': 0.25804184569469063, 'declare': 0.4274168206139757}

Total energies for test programs
{'faa': {'true_energy': 1593.1851865544543, 'pred_energy': 582.1524586769206}, 'quicksort': {'true_energy': 4246.078264105911, 'pred_energy': 7351.804519804801}, 'declare': {'true_energy': 6382.83618144921, 'pred_energy': 3867.893760943608}}


## ElasticNet

Linear regression with both L1 and L2 regularization. Tune
* alpha
* l1_ratio. 

In [23]:
study = optuna.create_study(study_name="elasticnet-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-13 10:09:24,875][0m Using an existing study with name 'elasticnet-regression' instead of creating a new one.[0m


Using an existing study with name 'elasticnet-regression' instead of creating a new one.


In [21]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    l1_ratio = trial.suggest_float("l1_ratio", 0.1, 0.9)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=20)

[32m[I 2023-02-12 20:04:13,106][0m Trial 0 finished with value: 0.712 and parameters: {'alpha': 7.6767592564378555, 'l1_ratio': 0.7947754314368692, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.712.[0m


Trial 0 finished with value: 0.712 and parameters: {'alpha': 7.6767592564378555, 'l1_ratio': 0.7947754314368692, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.712.


[32m[I 2023-02-12 20:04:27,854][0m Trial 1 finished with value: 0.703 and parameters: {'alpha': 5.691982379501282, 'l1_ratio': 0.8611016476247246, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 1 with value: 0.703.[0m


Trial 1 finished with value: 0.703 and parameters: {'alpha': 5.691982379501282, 'l1_ratio': 0.8611016476247246, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 1 with value: 0.703.


[32m[I 2023-02-12 20:04:42,789][0m Trial 2 finished with value: 0.712 and parameters: {'alpha': 3.0480953980362853, 'l1_ratio': 0.777522900069485, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 1 with value: 0.703.[0m


Trial 2 finished with value: 0.712 and parameters: {'alpha': 3.0480953980362853, 'l1_ratio': 0.777522900069485, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 1 with value: 0.703.


[32m[I 2023-02-12 20:04:57,715][0m Trial 3 finished with value: 0.714 and parameters: {'alpha': 9.13385301998779, 'l1_ratio': 0.32878763039201014, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 1 with value: 0.703.[0m


Trial 3 finished with value: 0.714 and parameters: {'alpha': 9.13385301998779, 'l1_ratio': 0.32878763039201014, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 1 with value: 0.703.


[32m[I 2023-02-12 20:05:12,485][0m Trial 4 finished with value: 0.708 and parameters: {'alpha': 1.7324995511446453, 'l1_ratio': 0.7949651990976754, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 1 with value: 0.703.[0m


Trial 4 finished with value: 0.708 and parameters: {'alpha': 1.7324995511446453, 'l1_ratio': 0.7949651990976754, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 1 with value: 0.703.


[32m[I 2023-02-12 20:05:27,388][0m Trial 5 finished with value: 0.703 and parameters: {'alpha': 5.673337422798416, 'l1_ratio': 0.39168741336340185, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.703.[0m


Trial 5 finished with value: 0.703 and parameters: {'alpha': 5.673337422798416, 'l1_ratio': 0.39168741336340185, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.703.


[32m[I 2023-02-12 20:05:42,927][0m Trial 6 finished with value: 0.706 and parameters: {'alpha': 5.891976190685219, 'l1_ratio': 0.3460802906431496, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 1 with value: 0.703.[0m


Trial 6 finished with value: 0.706 and parameters: {'alpha': 5.891976190685219, 'l1_ratio': 0.3460802906431496, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 1 with value: 0.703.


[32m[I 2023-02-12 20:05:57,916][0m Trial 7 finished with value: 0.697 and parameters: {'alpha': 9.692300727460283, 'l1_ratio': 0.6865284769913371, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 7 with value: 0.697.[0m


Trial 7 finished with value: 0.697 and parameters: {'alpha': 9.692300727460283, 'l1_ratio': 0.6865284769913371, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 7 with value: 0.697.


[32m[I 2023-02-12 20:06:13,299][0m Trial 8 finished with value: 0.706 and parameters: {'alpha': 5.617326529159289, 'l1_ratio': 0.5045939392128937, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.697.[0m


Trial 8 finished with value: 0.706 and parameters: {'alpha': 5.617326529159289, 'l1_ratio': 0.5045939392128937, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.697.


[32m[I 2023-02-12 20:06:28,129][0m Trial 9 finished with value: 0.699 and parameters: {'alpha': 1.2393491454924748, 'l1_ratio': 0.4344194590093555, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.697.[0m


Trial 9 finished with value: 0.699 and parameters: {'alpha': 1.2393491454924748, 'l1_ratio': 0.4344194590093555, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 7 with value: 0.697.


[32m[I 2023-02-12 20:06:43,343][0m Trial 10 finished with value: 0.701 and parameters: {'alpha': 9.87872617683266, 'l1_ratio': 0.5977550207115114, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.697.[0m


Trial 10 finished with value: 0.701 and parameters: {'alpha': 9.87872617683266, 'l1_ratio': 0.5977550207115114, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.697.


[32m[I 2023-02-12 20:07:25,059][0m Trial 11 finished with value: 0.63 and parameters: {'alpha': 0.15998231814396924, 'l1_ratio': 0.6175743573587327, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 11 with value: 0.63.[0m


Trial 11 finished with value: 0.63 and parameters: {'alpha': 0.15998231814396924, 'l1_ratio': 0.6175743573587327, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 11 with value: 0.63.


[32m[I 2023-02-12 20:07:40,192][0m Trial 12 finished with value: 0.704 and parameters: {'alpha': 2.8811898741239776, 'l1_ratio': 0.6241048180357445, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 11 with value: 0.63.[0m


Trial 12 finished with value: 0.704 and parameters: {'alpha': 2.8811898741239776, 'l1_ratio': 0.6241048180357445, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 11 with value: 0.63.


[32m[I 2023-02-12 20:08:20,427][0m Trial 13 finished with value: 0.626 and parameters: {'alpha': 0.4009095343696467, 'l1_ratio': 0.17653555342654514, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 13 with value: 0.626.[0m


Trial 13 finished with value: 0.626 and parameters: {'alpha': 0.4009095343696467, 'l1_ratio': 0.17653555342654514, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 13 with value: 0.626.


[32m[I 2023-02-12 20:09:05,072][0m Trial 14 finished with value: 0.625 and parameters: {'alpha': 0.29376941240124577, 'l1_ratio': 0.1708032477248703, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 14 with value: 0.625.[0m


Trial 14 finished with value: 0.625 and parameters: {'alpha': 0.29376941240124577, 'l1_ratio': 0.1708032477248703, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 14 with value: 0.625.


[32m[I 2023-02-12 20:09:50,273][0m Trial 15 finished with value: 0.617 and parameters: {'alpha': 0.30908933395254606, 'l1_ratio': 0.13605876528263688, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 15 with value: 0.617.[0m


Trial 15 finished with value: 0.617 and parameters: {'alpha': 0.30908933395254606, 'l1_ratio': 0.13605876528263688, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 15 with value: 0.617.


[32m[I 2023-02-12 20:10:05,300][0m Trial 16 finished with value: 0.706 and parameters: {'alpha': 2.9967459204310116, 'l1_ratio': 0.10092718262079851, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 15 with value: 0.617.[0m


Trial 16 finished with value: 0.706 and parameters: {'alpha': 2.9967459204310116, 'l1_ratio': 0.10092718262079851, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 15 with value: 0.617.


[32m[I 2023-02-12 20:10:20,399][0m Trial 17 finished with value: 0.706 and parameters: {'alpha': 1.379902294338415, 'l1_ratio': 0.23948946362395285, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 15 with value: 0.617.[0m


Trial 17 finished with value: 0.706 and parameters: {'alpha': 1.379902294338415, 'l1_ratio': 0.23948946362395285, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 15 with value: 0.617.


[32m[I 2023-02-12 20:10:35,635][0m Trial 18 finished with value: 0.708 and parameters: {'alpha': 4.025708596980282, 'l1_ratio': 0.10381172088245072, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 15 with value: 0.617.[0m


Trial 18 finished with value: 0.708 and parameters: {'alpha': 4.025708596980282, 'l1_ratio': 0.10381172088245072, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 15 with value: 0.617.


[32m[I 2023-02-12 20:11:32,678][0m Trial 19 finished with value: 0.606 and parameters: {'alpha': 0.11242693435738552, 'l1_ratio': 0.23653131327938587, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 19 with value: 0.606.[0m


Trial 19 finished with value: 0.606 and parameters: {'alpha': 0.11242693435738552, 'l1_ratio': 0.23653131327938587, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 19 with value: 0.606.


In [24]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_l1_ratio,params_normalization,params_scaling,params_tfidf,state
19,19,0.606,2023-02-12 20:10:35.639328,2023-02-12 20:11:32.665827,0 days 00:00:57.026499,0.112427,0.236531,False,True,False,COMPLETE
15,15,0.617,2023-02-12 20:09:05.075636,2023-02-12 20:09:50.259511,0 days 00:00:45.183875,0.309089,0.136059,False,True,False,COMPLETE
14,14,0.625,2023-02-12 20:08:20.431011,2023-02-12 20:09:05.059881,0 days 00:00:44.628870,0.293769,0.170803,False,True,False,COMPLETE
13,13,0.626,2023-02-12 20:07:40.196028,2023-02-12 20:08:20.414043,0 days 00:00:40.218015,0.40091,0.176536,False,True,False,COMPLETE
11,11,0.63,2023-02-12 20:06:43.347274,2023-02-12 20:07:25.046306,0 days 00:00:41.699032,0.159982,0.617574,False,True,False,COMPLETE
7,7,0.697,2023-02-12 20:05:42.930764,2023-02-12 20:05:57.903260,0 days 00:00:14.972496,9.692301,0.686528,True,False,False,COMPLETE
9,9,0.699,2023-02-12 20:06:13.302679,2023-02-12 20:06:28.116305,0 days 00:00:14.813626,1.239349,0.434419,False,False,False,COMPLETE
10,10,0.701,2023-02-12 20:06:28.132778,2023-02-12 20:06:43.331306,0 days 00:00:15.198528,9.878726,0.597755,True,True,False,COMPLETE
5,5,0.703,2023-02-12 20:05:12.488259,2023-02-12 20:05:27.375760,0 days 00:00:14.887501,5.673337,0.391687,False,True,False,COMPLETE
1,1,0.703,2023-02-12 20:04:13.109187,2023-02-12 20:04:27.838393,0 days 00:00:14.729206,5.691982,0.861102,True,False,False,COMPLETE


In [25]:
print(f"Best ElasticNET regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best ElasticNET regression params: {'alpha': 0.11242693435738552, 'l1_ratio': 0.23653131327938587, 'normalization': False, 'scaling': True, 'tfidf': False}, achieving val RMSE: 0.606


In [26]:
save = False
load = True

regressor_params = {"alpha": 0.11, "l1_ratio": 0.24}
pipe_params = {
    'normalization': False,
    'scaling': True,
    'tfidf': False,
}

if load:
    pipe = joblib.load(f"{exp_dir}/elasticnet/pipe")
else:
    regressor = ElasticNet(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            "test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/elasticnet/pipe")
        with open(f"{exp_dir}/elasticnet/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [27]:
maes = {}
prog_energy = {}

for test_program in test_programs:
    prog_df = test_df[test_df.program_name == test_program]
    prog_bbs = np.array([" ".join(bb) for bb in prog_df.bb.tolist()])
    test_preds = pipe.predict(prog_bbs)
    true_energies = prog_df.energy.values
    
    maes[test_program] = mean_absolute_error(true_energies, test_preds)
    prog_energy[test_program] = {
        "true_energy": sum(true_energies),
        "pred_energy": sum(test_preds),
    }

print("Mean absolute error for test programs")
print(f"{maes}\n")

print("Total energies for test programs")
print(prog_energy)

Mean absolute error for test programs
{'faa': 0.89147165008725, 'quicksort': 0.25386099978710697, 'declare': 0.43216138903587825}

Total energies for test programs
{'faa': {'true_energy': 1593.1851865544543, 'pred_energy': 500.42793924663556}, 'quicksort': {'true_energy': 4246.078264105911, 'pred_energy': 6705.826639080588}, 'declare': {'true_energy': 6382.83618144921, 'pred_energy': 3819.1135006007057}}


## SGD Regression

Stochastic Gradient Descent Regression using either of the above reguliration techniques. Tune:
* penalty method
* alpha
* l1_ratio (if elasticnet penalty).

In [28]:
study = optuna.create_study(study_name="SGD-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-13 10:09:42,950][0m Using an existing study with name 'SGD-regression' instead of creating a new one.[0m


Using an existing study with name 'SGD-regression' instead of creating a new one.


In [25]:
def objective(trial):

    penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"])
    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    l1_ratio = trial.suggest_float("l1_ratio", 0.1, 0.1)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = SGDRegressor(penalty=penalty, alpha=alpha, l1_ratio=l1_ratio)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=40)

[32m[I 2023-02-12 20:11:48,285][0m Trial 0 finished with value: 0.675 and parameters: {'penalty': 'l2', 'alpha': 8.126047195718966, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.675.[0m


Trial 0 finished with value: 0.675 and parameters: {'penalty': 'l2', 'alpha': 8.126047195718966, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.675.


[32m[I 2023-02-12 20:12:03,083][0m Trial 1 finished with value: 0.713 and parameters: {'penalty': 'l1', 'alpha': 8.840943103159544, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.675.[0m


Trial 1 finished with value: 0.713 and parameters: {'penalty': 'l1', 'alpha': 8.840943103159544, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.675.


[32m[I 2023-02-12 20:12:17,886][0m Trial 2 finished with value: 1773240610374.694 and parameters: {'penalty': 'l1', 'alpha': 0.24060479728254555, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.675.[0m


Trial 2 finished with value: 1773240610374.694 and parameters: {'penalty': 'l1', 'alpha': 0.24060479728254555, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.675.


[32m[I 2023-02-12 20:12:34,145][0m Trial 3 finished with value: 0.695 and parameters: {'penalty': 'l2', 'alpha': 3.3482433415553117, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.675.[0m


Trial 3 finished with value: 0.695 and parameters: {'penalty': 'l2', 'alpha': 3.3482433415553117, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.675.


[32m[I 2023-02-12 20:12:48,672][0m Trial 4 finished with value: 0.702 and parameters: {'penalty': 'l1', 'alpha': 2.7672922815066747, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.675.[0m


Trial 4 finished with value: 0.702 and parameters: {'penalty': 'l1', 'alpha': 2.7672922815066747, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.675.


[32m[I 2023-02-12 20:13:03,132][0m Trial 5 finished with value: 0.704 and parameters: {'penalty': 'l1', 'alpha': 8.771042710873411, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.675.[0m


Trial 5 finished with value: 0.704 and parameters: {'penalty': 'l1', 'alpha': 8.771042710873411, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.675.


[32m[I 2023-02-12 20:13:18,849][0m Trial 6 finished with value: 0.665 and parameters: {'penalty': 'l2', 'alpha': 9.150106587750104, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 6 with value: 0.665.[0m


Trial 6 finished with value: 0.665 and parameters: {'penalty': 'l2', 'alpha': 9.150106587750104, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 6 with value: 0.665.


[32m[I 2023-02-12 20:13:35,108][0m Trial 7 finished with value: 0.745 and parameters: {'penalty': 'l2', 'alpha': 6.59286855825322, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 6 with value: 0.665.[0m


Trial 7 finished with value: 0.745 and parameters: {'penalty': 'l2', 'alpha': 6.59286855825322, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 6 with value: 0.665.


[32m[I 2023-02-12 20:13:49,342][0m Trial 8 finished with value: 0.666 and parameters: {'penalty': 'l2', 'alpha': 7.578612199954453, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 6 with value: 0.665.[0m


Trial 8 finished with value: 0.666 and parameters: {'penalty': 'l2', 'alpha': 7.578612199954453, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 6 with value: 0.665.


[32m[I 2023-02-12 20:14:04,606][0m Trial 9 finished with value: 13299025423.464 and parameters: {'penalty': 'l2', 'alpha': 1.0622968651710285, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 6 with value: 0.665.[0m


Trial 9 finished with value: 13299025423.464 and parameters: {'penalty': 'l2', 'alpha': 1.0622968651710285, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 6 with value: 0.665.


[32m[I 2023-02-12 20:14:19,569][0m Trial 10 finished with value: 0.7 and parameters: {'penalty': 'elasticnet', 'alpha': 9.79180717107954, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 6 with value: 0.665.[0m


Trial 10 finished with value: 0.7 and parameters: {'penalty': 'elasticnet', 'alpha': 9.79180717107954, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 6 with value: 0.665.


[32m[I 2023-02-12 20:14:33,818][0m Trial 11 finished with value: 0.711 and parameters: {'penalty': 'l2', 'alpha': 6.77797372390831, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 6 with value: 0.665.[0m


Trial 11 finished with value: 0.711 and parameters: {'penalty': 'l2', 'alpha': 6.77797372390831, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 6 with value: 0.665.


[32m[I 2023-02-12 20:14:48,248][0m Trial 12 finished with value: 0.699 and parameters: {'penalty': 'elasticnet', 'alpha': 6.85957293706055, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 6 with value: 0.665.[0m


Trial 12 finished with value: 0.699 and parameters: {'penalty': 'elasticnet', 'alpha': 6.85957293706055, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 6 with value: 0.665.


[32m[I 2023-02-12 20:15:03,028][0m Trial 13 finished with value: 0.699 and parameters: {'penalty': 'l2', 'alpha': 9.674899651377821, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 6 with value: 0.665.[0m


Trial 13 finished with value: 0.699 and parameters: {'penalty': 'l2', 'alpha': 9.674899651377821, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 6 with value: 0.665.


[32m[I 2023-02-12 20:15:20,633][0m Trial 14 finished with value: 0.746 and parameters: {'penalty': 'l2', 'alpha': 7.786631759116358, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 6 with value: 0.665.[0m


Trial 14 finished with value: 0.746 and parameters: {'penalty': 'l2', 'alpha': 7.786631759116358, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 6 with value: 0.665.


[32m[I 2023-02-12 20:15:36,979][0m Trial 15 finished with value: 0.666 and parameters: {'penalty': 'l2', 'alpha': 5.534293227052803, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 6 with value: 0.665.[0m


Trial 15 finished with value: 0.666 and parameters: {'penalty': 'l2', 'alpha': 5.534293227052803, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 6 with value: 0.665.


[32m[I 2023-02-12 20:15:51,944][0m Trial 16 finished with value: 0.706 and parameters: {'penalty': 'elasticnet', 'alpha': 7.859896346906717, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 6 with value: 0.665.[0m


Trial 16 finished with value: 0.706 and parameters: {'penalty': 'elasticnet', 'alpha': 7.859896346906717, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 6 with value: 0.665.


[32m[I 2023-02-12 20:16:06,289][0m Trial 17 finished with value: 0.66 and parameters: {'penalty': 'l2', 'alpha': 5.391655466034191, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 17 with value: 0.66.[0m


Trial 17 finished with value: 0.66 and parameters: {'penalty': 'l2', 'alpha': 5.391655466034191, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 17 with value: 0.66.


[32m[I 2023-02-12 20:16:20,783][0m Trial 18 finished with value: 0.713 and parameters: {'penalty': 'l2', 'alpha': 5.04509379048451, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 17 with value: 0.66.[0m


Trial 18 finished with value: 0.713 and parameters: {'penalty': 'l2', 'alpha': 5.04509379048451, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 17 with value: 0.66.


[32m[I 2023-02-12 20:16:35,341][0m Trial 19 finished with value: 0.707 and parameters: {'penalty': 'elasticnet', 'alpha': 5.458404424617722, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 17 with value: 0.66.[0m


Trial 19 finished with value: 0.707 and parameters: {'penalty': 'elasticnet', 'alpha': 5.458404424617722, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 17 with value: 0.66.


[32m[I 2023-02-12 20:16:49,609][0m Trial 20 finished with value: 0.706 and parameters: {'penalty': 'l2', 'alpha': 9.971021985029886, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 17 with value: 0.66.[0m


Trial 20 finished with value: 0.706 and parameters: {'penalty': 'l2', 'alpha': 9.971021985029886, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 17 with value: 0.66.


[32m[I 2023-02-12 20:17:03,837][0m Trial 21 finished with value: 0.672 and parameters: {'penalty': 'l2', 'alpha': 8.7391020308791, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 17 with value: 0.66.[0m


Trial 21 finished with value: 0.672 and parameters: {'penalty': 'l2', 'alpha': 8.7391020308791, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 17 with value: 0.66.


[32m[I 2023-02-12 20:17:18,107][0m Trial 22 finished with value: 0.671 and parameters: {'penalty': 'l2', 'alpha': 7.180920204865433, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 17 with value: 0.66.[0m


Trial 22 finished with value: 0.671 and parameters: {'penalty': 'l2', 'alpha': 7.180920204865433, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 17 with value: 0.66.


[32m[I 2023-02-12 20:17:32,231][0m Trial 23 finished with value: 0.655 and parameters: {'penalty': 'l2', 'alpha': 6.133458824319162, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 23 with value: 0.655.[0m


Trial 23 finished with value: 0.655 and parameters: {'penalty': 'l2', 'alpha': 6.133458824319162, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 23 with value: 0.655.


[32m[I 2023-02-12 20:17:46,382][0m Trial 24 finished with value: 0.661 and parameters: {'penalty': 'l2', 'alpha': 5.782302775253823, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 23 with value: 0.655.[0m


Trial 24 finished with value: 0.661 and parameters: {'penalty': 'l2', 'alpha': 5.782302775253823, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 23 with value: 0.655.


[32m[I 2023-02-12 20:18:00,628][0m Trial 25 finished with value: 0.668 and parameters: {'penalty': 'l2', 'alpha': 6.157293122913626, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 23 with value: 0.655.[0m


Trial 25 finished with value: 0.668 and parameters: {'penalty': 'l2', 'alpha': 6.157293122913626, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 23 with value: 0.655.


[32m[I 2023-02-12 20:18:15,008][0m Trial 26 finished with value: 0.706 and parameters: {'penalty': 'l2', 'alpha': 4.803254553913317, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 23 with value: 0.655.[0m


Trial 26 finished with value: 0.706 and parameters: {'penalty': 'l2', 'alpha': 4.803254553913317, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 23 with value: 0.655.


[32m[I 2023-02-12 20:18:29,355][0m Trial 27 finished with value: 0.646 and parameters: {'penalty': 'l2', 'alpha': 4.394181615429851, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 27 finished with value: 0.646 and parameters: {'penalty': 'l2', 'alpha': 4.394181615429851, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:18:43,786][0m Trial 28 finished with value: 0.711 and parameters: {'penalty': 'l1', 'alpha': 4.642118412320941, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 28 finished with value: 0.711 and parameters: {'penalty': 'l1', 'alpha': 4.642118412320941, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:18:58,334][0m Trial 29 finished with value: 0.708 and parameters: {'penalty': 'elasticnet', 'alpha': 4.014005753530335, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 29 finished with value: 0.708 and parameters: {'penalty': 'elasticnet', 'alpha': 4.014005753530335, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:19:12,540][0m Trial 30 finished with value: 0.662 and parameters: {'penalty': 'l2', 'alpha': 6.127275965662736, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 30 finished with value: 0.662 and parameters: {'penalty': 'l2', 'alpha': 6.127275965662736, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:19:26,690][0m Trial 31 finished with value: 0.658 and parameters: {'penalty': 'l2', 'alpha': 6.02758604271719, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 31 finished with value: 0.658 and parameters: {'penalty': 'l2', 'alpha': 6.02758604271719, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:19:40,918][0m Trial 32 finished with value: 0.654 and parameters: {'penalty': 'l2', 'alpha': 4.428106778123649, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 32 finished with value: 0.654 and parameters: {'penalty': 'l2', 'alpha': 4.428106778123649, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:19:55,107][0m Trial 33 finished with value: 0.646 and parameters: {'penalty': 'l2', 'alpha': 4.324907804857163, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 33 finished with value: 0.646 and parameters: {'penalty': 'l2', 'alpha': 4.324907804857163, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:20:09,502][0m Trial 34 finished with value: 0.705 and parameters: {'penalty': 'l2', 'alpha': 4.188653225648363, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 34 finished with value: 0.705 and parameters: {'penalty': 'l2', 'alpha': 4.188653225648363, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:20:23,838][0m Trial 35 finished with value: 0.707 and parameters: {'penalty': 'l1', 'alpha': 3.534612704964779, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 35 finished with value: 0.707 and parameters: {'penalty': 'l1', 'alpha': 3.534612704964779, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:20:38,044][0m Trial 36 finished with value: 0.649 and parameters: {'penalty': 'l2', 'alpha': 2.8234199594516722, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 36 finished with value: 0.649 and parameters: {'penalty': 'l2', 'alpha': 2.8234199594516722, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:20:52,461][0m Trial 37 finished with value: 0.706 and parameters: {'penalty': 'l1', 'alpha': 2.50972994638051, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 37 finished with value: 0.706 and parameters: {'penalty': 'l1', 'alpha': 2.50972994638051, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:21:06,646][0m Trial 38 finished with value: 0.654 and parameters: {'penalty': 'l2', 'alpha': 2.7617317641419845, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 38 finished with value: 0.654 and parameters: {'penalty': 'l2', 'alpha': 2.7617317641419845, 'l1_ratio': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


[32m[I 2023-02-12 20:21:20,906][0m Trial 39 finished with value: 0.703 and parameters: {'penalty': 'l2', 'alpha': 3.3353359835785104, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.[0m


Trial 39 finished with value: 0.703 and parameters: {'penalty': 'l2', 'alpha': 3.3353359835785104, 'l1_ratio': 0.1, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 27 with value: 0.646.


In [29]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_l1_ratio,params_normalization,params_penalty,params_scaling,params_tfidf,state
33,33,0.646,2023-02-12 20:19:40.921652,2023-02-12 20:19:55.090151,0 days 00:00:14.168499,4.324908,0.1,False,l2,False,False,COMPLETE
27,27,0.646,2023-02-12 20:18:15.011229,2023-02-12 20:18:29.342369,0 days 00:00:14.331140,4.394182,0.1,False,l2,False,False,COMPLETE
36,36,0.649,2023-02-12 20:20:23.841917,2023-02-12 20:20:38.031959,0 days 00:00:14.190042,2.82342,0.1,False,l2,False,False,COMPLETE
38,38,0.654,2023-02-12 20:20:52.464274,2023-02-12 20:21:06.633232,0 days 00:00:14.168958,2.761732,0.1,False,l2,False,False,COMPLETE
32,32,0.654,2023-02-12 20:19:26.693659,2023-02-12 20:19:40.906652,0 days 00:00:14.212993,4.428107,0.1,False,l2,False,False,COMPLETE
23,23,0.655,2023-02-12 20:17:18.110064,2023-02-12 20:17:32.220056,0 days 00:00:14.109992,6.133459,0.1,False,l2,False,False,COMPLETE
31,31,0.658,2023-02-12 20:19:12.543496,2023-02-12 20:19:26.676186,0 days 00:00:14.132690,6.027586,0.1,False,l2,False,False,COMPLETE
17,17,0.66,2023-02-12 20:15:51.947395,2023-02-12 20:16:06.275483,0 days 00:00:14.328088,5.391655,0.1,False,l2,False,False,COMPLETE
24,24,0.661,2023-02-12 20:17:32.235063,2023-02-12 20:17:46.365057,0 days 00:00:14.129994,5.782303,0.1,False,l2,False,False,COMPLETE
30,30,0.662,2023-02-12 20:18:58.337785,2023-02-12 20:19:12.528499,0 days 00:00:14.190714,6.127276,0.1,False,l2,False,False,COMPLETE


In [30]:
print(f"Best SGD regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best SGD regression params: {'alpha': 4.394181615429851, 'l1_ratio': 0.1, 'normalization': False, 'penalty': 'l2', 'scaling': False, 'tfidf': False}, achieving val RMSE: 0.646


In [31]:
save = False
load = True

regressor_params = {"alpha": 4.39, "l1_ratio": 0.1, "penalty": "l2"}
pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': False,
}

if load:
    pipe = joblib.load(f"{exp_dir}/sgd/pipe")
else:
    regressor = SGDRegressor(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            "test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/sgd/pipe")
        with open(f"{exp_dir}/sgd/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [32]:
maes = {}
prog_energy = {}

for test_program in test_programs:
    prog_df = test_df[test_df.program_name == test_program]
    prog_bbs = np.array([" ".join(bb) for bb in prog_df.bb.tolist()])
    test_preds = pipe.predict(prog_bbs)
    true_energies = prog_df.energy.values
    
    maes[test_program] = mean_absolute_error(true_energies, test_preds)
    prog_energy[test_program] = {
        "true_energy": sum(true_energies),
        "pred_energy": sum(test_preds),
    }

print("Mean absolute error for test programs")
print(f"{maes}\n")

print("Total energies for test programs")
print(prog_energy)

Mean absolute error for test programs
{'faa': 0.9330119034821376, 'quicksort': 0.25573919607704576, 'declare': 0.4441559337023836}

Total energies for test programs
{'faa': {'true_energy': 1593.1851865544543, 'pred_energy': 456.1074803135414}, 'quicksort': {'true_energy': 4246.078264105911, 'pred_energy': 6264.6491942112025}, 'declare': {'true_energy': 6382.83618144921, 'pred_energy': 3514.0047917998977}}


## SVR

Support Vector Regression. Tune:
* kernel
* gamma
* C

In [8]:
study = optuna.create_study(study_name="SVR-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-13 15:33:41,824][0m Using an existing study with name 'SVR-regression' instead of creating a new one.[0m


Using an existing study with name 'SVR-regression' instead of creating a new one.


In [10]:
def objective(trial):

    kernel = trial.suggest_categorical("kernel", ["rbf", "linear"])
    C = trial.suggest_float("C", 0.1, 10.0)
    gamma = trial.suggest_float("gamma", 0.1, 0.1)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = SVR(kernel=kernel, C=C, gamma=gamma)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train[:10000],
        y=y_train[:10000],
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling
    )

    return score

study.optimize(objective, n_trials=40)

[32m[I 2023-02-13 00:57:53,743][0m Trial 0 finished with value: 0.604 and parameters: {'kernel': 'linear', 'C': 5.208669556171662, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.604.[0m


Trial 0 finished with value: 0.604 and parameters: {'kernel': 'linear', 'C': 5.208669556171662, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.604.


[32m[I 2023-02-13 01:22:24,755][0m Trial 1 finished with value: 0.614 and parameters: {'kernel': 'linear', 'C': 4.936226669403213, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.604.[0m


Trial 1 finished with value: 0.614 and parameters: {'kernel': 'linear', 'C': 4.936226669403213, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 0 with value: 0.604.


[32m[I 2023-02-13 01:22:34,089][0m Trial 2 finished with value: 0.635 and parameters: {'kernel': 'rbf', 'C': 4.441320775205987, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.604.[0m


Trial 2 finished with value: 0.635 and parameters: {'kernel': 'rbf', 'C': 4.441320775205987, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 0 with value: 0.604.


[32m[I 2023-02-13 01:22:48,064][0m Trial 3 finished with value: 0.611 and parameters: {'kernel': 'linear', 'C': 7.053715745057905, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.604.[0m


Trial 3 finished with value: 0.611 and parameters: {'kernel': 'linear', 'C': 7.053715745057905, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.604.


[32m[I 2023-02-13 01:22:56,279][0m Trial 4 finished with value: 0.618 and parameters: {'kernel': 'rbf', 'C': 8.783889125360108, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.604.[0m


Trial 4 finished with value: 0.618 and parameters: {'kernel': 'rbf', 'C': 8.783889125360108, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.604.


[32m[I 2023-02-13 01:23:04,206][0m Trial 5 finished with value: 0.645 and parameters: {'kernel': 'rbf', 'C': 3.4035365610017996, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.604.[0m


Trial 5 finished with value: 0.645 and parameters: {'kernel': 'rbf', 'C': 3.4035365610017996, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 0 with value: 0.604.


[32m[I 2023-02-13 01:23:12,051][0m Trial 6 finished with value: 0.568 and parameters: {'kernel': 'rbf', 'C': 2.200024904021774, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 6 with value: 0.568.[0m


Trial 6 finished with value: 0.568 and parameters: {'kernel': 'rbf', 'C': 2.200024904021774, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 6 with value: 0.568.


[32m[I 2023-02-13 01:23:21,270][0m Trial 7 finished with value: 0.559 and parameters: {'kernel': 'rbf', 'C': 5.455871781357861, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 7 finished with value: 0.559 and parameters: {'kernel': 'rbf', 'C': 5.455871781357861, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 02:25:38,181][0m Trial 8 finished with value: 0.564 and parameters: {'kernel': 'linear', 'C': 8.705072500948557, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 8 finished with value: 0.564 and parameters: {'kernel': 'linear', 'C': 8.705072500948557, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 02:34:46,811][0m Trial 9 finished with value: 0.655 and parameters: {'kernel': 'linear', 'C': 1.5626808876325884, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 9 finished with value: 0.655 and parameters: {'kernel': 'linear', 'C': 1.5626808876325884, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 02:34:55,315][0m Trial 10 finished with value: 0.607 and parameters: {'kernel': 'rbf', 'C': 0.31892278023665366, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 10 finished with value: 0.607 and parameters: {'kernel': 'rbf', 'C': 0.31892278023665366, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 03:16:45,165][0m Trial 11 finished with value: 0.621 and parameters: {'kernel': 'linear', 'C': 9.594059165956022, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 11 finished with value: 0.621 and parameters: {'kernel': 'linear', 'C': 9.594059165956022, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 03:16:54,453][0m Trial 12 finished with value: 0.564 and parameters: {'kernel': 'rbf', 'C': 7.275444676333413, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 12 finished with value: 0.564 and parameters: {'kernel': 'rbf', 'C': 7.275444676333413, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 03:56:18,535][0m Trial 13 finished with value: 0.657 and parameters: {'kernel': 'linear', 'C': 6.807658660430857, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 13 finished with value: 0.657 and parameters: {'kernel': 'linear', 'C': 6.807658660430857, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 03:56:27,797][0m Trial 14 finished with value: 0.575 and parameters: {'kernel': 'rbf', 'C': 8.388470603822016, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 14 finished with value: 0.575 and parameters: {'kernel': 'rbf', 'C': 8.388470603822016, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 04:58:10,153][0m Trial 15 finished with value: 0.603 and parameters: {'kernel': 'linear', 'C': 9.866095469526194, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 15 finished with value: 0.603 and parameters: {'kernel': 'linear', 'C': 9.866095469526194, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 05:35:19,402][0m Trial 16 finished with value: 0.621 and parameters: {'kernel': 'linear', 'C': 6.161478303617464, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 16 finished with value: 0.621 and parameters: {'kernel': 'linear', 'C': 6.161478303617464, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 05:35:28,586][0m Trial 17 finished with value: 0.583 and parameters: {'kernel': 'rbf', 'C': 7.562994014269719, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 17 finished with value: 0.583 and parameters: {'kernel': 'rbf', 'C': 7.562994014269719, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 05:35:37,918][0m Trial 18 finished with value: 0.571 and parameters: {'kernel': 'rbf', 'C': 5.964181175350156, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 18 finished with value: 0.571 and parameters: {'kernel': 'rbf', 'C': 5.964181175350156, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 06:10:36,339][0m Trial 19 finished with value: 0.595 and parameters: {'kernel': 'linear', 'C': 8.127295656546263, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.[0m


Trial 19 finished with value: 0.595 and parameters: {'kernel': 'linear', 'C': 8.127295656546263, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 7 with value: 0.559.


[32m[I 2023-02-13 06:10:45,613][0m Trial 20 finished with value: 0.554 and parameters: {'kernel': 'rbf', 'C': 9.159041143405295, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 20 finished with value: 0.554 and parameters: {'kernel': 'rbf', 'C': 9.159041143405295, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 06:10:54,892][0m Trial 21 finished with value: 0.582 and parameters: {'kernel': 'rbf', 'C': 9.018874624519125, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 21 finished with value: 0.582 and parameters: {'kernel': 'rbf', 'C': 9.018874624519125, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 06:11:04,206][0m Trial 22 finished with value: 0.599 and parameters: {'kernel': 'rbf', 'C': 9.989513734454905, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 22 finished with value: 0.599 and parameters: {'kernel': 'rbf', 'C': 9.989513734454905, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 06:11:13,497][0m Trial 23 finished with value: 0.634 and parameters: {'kernel': 'rbf', 'C': 8.0577220239305, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 23 finished with value: 0.634 and parameters: {'kernel': 'rbf', 'C': 8.0577220239305, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 06:11:22,813][0m Trial 24 finished with value: 0.606 and parameters: {'kernel': 'rbf', 'C': 9.153839677560727, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 24 finished with value: 0.606 and parameters: {'kernel': 'rbf', 'C': 9.153839677560727, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 06:11:32,119][0m Trial 25 finished with value: 0.557 and parameters: {'kernel': 'rbf', 'C': 8.021215400267241, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 25 finished with value: 0.557 and parameters: {'kernel': 'rbf', 'C': 8.021215400267241, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 06:11:39,970][0m Trial 26 finished with value: 0.601 and parameters: {'kernel': 'rbf', 'C': 7.6872657552973775, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.554.[0m


Trial 26 finished with value: 0.601 and parameters: {'kernel': 'rbf', 'C': 7.6872657552973775, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 06:11:49,382][0m Trial 27 finished with value: 0.629 and parameters: {'kernel': 'rbf', 'C': 6.557935718749707, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 27 finished with value: 0.629 and parameters: {'kernel': 'rbf', 'C': 6.557935718749707, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 06:11:58,779][0m Trial 28 finished with value: 0.568 and parameters: {'kernel': 'rbf', 'C': 7.8142911755692115, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 28 finished with value: 0.568 and parameters: {'kernel': 'rbf', 'C': 7.8142911755692115, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 06:12:06,569][0m Trial 29 finished with value: 0.61 and parameters: {'kernel': 'rbf', 'C': 5.60071055149684, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.554.[0m


Trial 29 finished with value: 0.61 and parameters: {'kernel': 'rbf', 'C': 5.60071055149684, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 06:12:15,920][0m Trial 30 finished with value: 0.597 and parameters: {'kernel': 'rbf', 'C': 7.095491179819373, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 30 finished with value: 0.597 and parameters: {'kernel': 'rbf', 'C': 7.095491179819373, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 06:58:40,530][0m Trial 31 finished with value: 0.613 and parameters: {'kernel': 'linear', 'C': 8.601060317979337, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 31 finished with value: 0.613 and parameters: {'kernel': 'linear', 'C': 8.601060317979337, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 07:48:42,730][0m Trial 32 finished with value: 0.749 and parameters: {'kernel': 'linear', 'C': 9.212900272877636, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 32 finished with value: 0.749 and parameters: {'kernel': 'linear', 'C': 9.212900272877636, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 07:48:52,083][0m Trial 33 finished with value: 0.615 and parameters: {'kernel': 'rbf', 'C': 8.6271594003484, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 33 finished with value: 0.615 and parameters: {'kernel': 'rbf', 'C': 8.6271594003484, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 08:18:53,240][0m Trial 34 finished with value: 0.649 and parameters: {'kernel': 'linear', 'C': 5.124616718997896, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 34 finished with value: 0.649 and parameters: {'kernel': 'linear', 'C': 5.124616718997896, 'gamma': 0.1, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 08:19:02,464][0m Trial 35 finished with value: 0.586 and parameters: {'kernel': 'rbf', 'C': 9.31668175652011, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 35 finished with value: 0.586 and parameters: {'kernel': 'rbf', 'C': 9.31668175652011, 'gamma': 0.1, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 08:19:16,683][0m Trial 36 finished with value: 0.625 and parameters: {'kernel': 'linear', 'C': 8.25727945087857, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 20 with value: 0.554.[0m


Trial 36 finished with value: 0.625 and parameters: {'kernel': 'linear', 'C': 8.25727945087857, 'gamma': 0.1, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 08:19:25,982][0m Trial 37 finished with value: 0.618 and parameters: {'kernel': 'rbf', 'C': 4.619650988104117, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.[0m


Trial 37 finished with value: 0.618 and parameters: {'kernel': 'rbf', 'C': 4.619650988104117, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.554.


[32m[I 2023-02-13 08:19:33,769][0m Trial 38 finished with value: 0.547 and parameters: {'kernel': 'rbf', 'C': 7.399229326147877, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 38 with value: 0.547.[0m


Trial 38 finished with value: 0.547 and parameters: {'kernel': 'rbf', 'C': 7.399229326147877, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 38 with value: 0.547.


[32m[I 2023-02-13 08:19:41,676][0m Trial 39 finished with value: 0.597 and parameters: {'kernel': 'rbf', 'C': 6.6553410056005475, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 38 with value: 0.547.[0m


Trial 39 finished with value: 0.597 and parameters: {'kernel': 'rbf', 'C': 6.6553410056005475, 'gamma': 0.1, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 38 with value: 0.547.


In [9]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_gamma,params_kernel,params_normalization,params_scaling,params_tfidf,state
38,38,0.547,2023-02-13 08:19:25.985788,2023-02-13 08:19:33.755813,0 days 00:00:07.770025,7.399229,0.1,rbf,False,False,False,COMPLETE
20,20,0.554,2023-02-13 06:10:36.342999,2023-02-13 06:10:45.600992,0 days 00:00:09.257993,9.159041,0.1,rbf,False,True,False,COMPLETE
25,25,0.557,2023-02-13 06:11:22.816078,2023-02-13 06:11:32.107077,0 days 00:00:09.290999,8.021215,0.1,rbf,False,True,False,COMPLETE
7,7,0.559,2023-02-13 01:23:12.054681,2023-02-13 01:23:21.257705,0 days 00:00:09.203024,5.455872,0.1,rbf,False,True,False,COMPLETE
8,8,0.564,2023-02-13 01:23:21.273674,2023-02-13 02:25:38.167492,0 days 01:02:16.893818,8.705073,0.1,linear,True,True,True,COMPLETE
12,12,0.564,2023-02-13 03:16:45.168369,2023-02-13 03:16:54.440032,0 days 00:00:09.271663,7.275445,0.1,rbf,True,True,False,COMPLETE
28,28,0.568,2023-02-13 06:11:49.385882,2023-02-13 06:11:58.766880,0 days 00:00:09.380998,7.814291,0.1,rbf,False,True,False,COMPLETE
6,6,0.568,2023-02-13 01:23:04.210359,2023-02-13 01:23:12.038704,0 days 00:00:07.828345,2.200025,0.1,rbf,False,False,False,COMPLETE
18,18,0.571,2023-02-13 05:35:28.589634,2023-02-13 05:35:37.904512,0 days 00:00:09.314878,5.964181,0.1,rbf,True,True,True,COMPLETE
14,14,0.575,2023-02-13 03:56:18.538148,2023-02-13 03:56:27.784050,0 days 00:00:09.245902,8.388471,0.1,rbf,False,True,True,COMPLETE


In [10]:
print(f"Best SVR regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best SVR regression params: {'C': 7.399229326147877, 'gamma': 0.1, 'kernel': 'rbf', 'normalization': False, 'scaling': False, 'tfidf': False}, achieving val RMSE: 0.547


In [17]:
save = True
load = False

regressor_params = {"kernel": "rbf", "C": 7.4, "gamma": 0.1}
pipe_params = {
    'normalization': False,
    'scaling': False,
    'tfidf': False,
}

sample = 50000

if load:
    pipe = joblib.load(f"{exp_dir}/svr/pipe")
else:
    regressor = SVR(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train[:sample], y_train[:sample], **pipe_params)
    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            "test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/svr/pipe")
        with open(f"{exp_dir}/svr/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [18]:
maes = {}
prog_energy = {}

for test_program in test_programs:
    prog_df = test_df[test_df.program_name == test_program]
    prog_bbs = np.array([" ".join(bb) for bb in prog_df.bb.tolist()])
    test_preds = pipe.predict(prog_bbs)
    true_energies = prog_df.energy.values
    
    maes[test_program] = mean_absolute_error(true_energies, test_preds)
    prog_energy[test_program] = {
        "true_energy": sum(true_energies),
        "pred_energy": sum(test_preds),
    }

print("Mean absolute error for test programs")
print(f"{maes}\n")

print("Total energies for test programs")
print(prog_energy)

Mean absolute error for test programs
{'faa': 0.947999546789649, 'quicksort': 0.20917131737962452, 'declare': 0.46181885270920847}

Total energies for test programs
{'faa': {'true_energy': 1593.1851865544559, 'pred_energy': 457.6059564138028}, 'quicksort': {'true_energy': 4246.078264105905, 'pred_energy': 6267.793849637768}, 'declare': {'true_energy': 6382.836181449226, 'pred_energy': 2851.4726615175578}}


## Hist Gradient Boosting Regressor

Gradient Boosting regression for large datasets. Tune:
* learning rate
* max leaf nodes
* l2 regularization

In [8]:
study = optuna.create_study(study_name="HistGBoost-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-02-13 15:34:41,326][0m A new study created in RDB with name: HistGBoost-regression[0m


A new study created in RDB with name: HistGBoost-regression


In [9]:
def objective(trial):

    learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.5)
    max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 10, 60)
    l2_regularization = trial.suggest_float("l2_regularization", 0.1, 10.0)
    tfidf = trial.suggest_categorical("tfidf", [True, False])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = HistGradientBoostingRegressor(learning_rate=learning_rate, max_leaf_nodes=max_leaf_nodes, l2_regularization=l2_regularization)
    score = sklearn_regressors.evaluate_regressor(
        regressor=regressor,
        X=X_train,
        y=y_train,
        tfidf=tfidf,
        normalization=normalization, 
        scaling=scaling,
        requires_dense=True,
    )

    return score

study.optimize(objective, n_trials=40)

[32m[I 2023-02-13 15:36:56,474][0m Trial 0 finished with value: 0.564 and parameters: {'learning_rate': 0.2942079553055097, 'max_leaf_nodes': 16, 'l2_regularization': 5.512124349723596, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.564.[0m


Trial 0 finished with value: 0.564 and parameters: {'learning_rate': 0.2942079553055097, 'max_leaf_nodes': 16, 'l2_regularization': 5.512124349723596, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 0 with value: 0.564.


[32m[I 2023-02-13 15:38:53,078][0m Trial 1 finished with value: 0.561 and parameters: {'learning_rate': 0.30633465499519946, 'max_leaf_nodes': 13, 'l2_regularization': 8.880231849204621, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 1 finished with value: 0.561 and parameters: {'learning_rate': 0.30633465499519946, 'max_leaf_nodes': 13, 'l2_regularization': 8.880231849204621, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 15:41:52,067][0m Trial 2 finished with value: 0.582 and parameters: {'learning_rate': 0.017378811465498512, 'max_leaf_nodes': 31, 'l2_regularization': 0.1338996789187556, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 2 finished with value: 0.582 and parameters: {'learning_rate': 0.017378811465498512, 'max_leaf_nodes': 31, 'l2_regularization': 0.1338996789187556, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 15:44:22,462][0m Trial 3 finished with value: 0.564 and parameters: {'learning_rate': 0.08615391625331668, 'max_leaf_nodes': 24, 'l2_regularization': 5.432855802619206, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 1 with value: 0.561.[0m


Trial 3 finished with value: 0.564 and parameters: {'learning_rate': 0.08615391625331668, 'max_leaf_nodes': 24, 'l2_regularization': 5.432855802619206, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 15:46:50,208][0m Trial 4 finished with value: 0.572 and parameters: {'learning_rate': 0.07108130204140424, 'max_leaf_nodes': 18, 'l2_regularization': 3.9700205170161, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 1 with value: 0.561.[0m


Trial 4 finished with value: 0.572 and parameters: {'learning_rate': 0.07108130204140424, 'max_leaf_nodes': 18, 'l2_regularization': 3.9700205170161, 'tfidf': False, 'normalization': False, 'scaling': False}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 15:49:42,364][0m Trial 5 finished with value: 0.565 and parameters: {'learning_rate': 0.11973467901858023, 'max_leaf_nodes': 34, 'l2_regularization': 2.4590683432617326, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 5 finished with value: 0.565 and parameters: {'learning_rate': 0.11973467901858023, 'max_leaf_nodes': 34, 'l2_regularization': 2.4590683432617326, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 15:52:30,034][0m Trial 6 finished with value: 0.576 and parameters: {'learning_rate': 0.02174984379863535, 'max_leaf_nodes': 28, 'l2_regularization': 3.350232714316225, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 6 finished with value: 0.576 and parameters: {'learning_rate': 0.02174984379863535, 'max_leaf_nodes': 28, 'l2_regularization': 3.350232714316225, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 15:55:08,649][0m Trial 7 finished with value: 0.562 and parameters: {'learning_rate': 0.457671667104507, 'max_leaf_nodes': 60, 'l2_regularization': 5.685665191887072, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 7 finished with value: 0.562 and parameters: {'learning_rate': 0.457671667104507, 'max_leaf_nodes': 60, 'l2_regularization': 5.685665191887072, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 15:57:15,033][0m Trial 8 finished with value: 0.564 and parameters: {'learning_rate': 0.44295139383359927, 'max_leaf_nodes': 25, 'l2_regularization': 2.3440541250601514, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 1 with value: 0.561.[0m


Trial 8 finished with value: 0.564 and parameters: {'learning_rate': 0.44295139383359927, 'max_leaf_nodes': 25, 'l2_regularization': 2.3440541250601514, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 15:58:57,814][0m Trial 9 finished with value: 0.568 and parameters: {'learning_rate': 0.43636562963668124, 'max_leaf_nodes': 10, 'l2_regularization': 6.464801158890502, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 9 finished with value: 0.568 and parameters: {'learning_rate': 0.43636562963668124, 'max_leaf_nodes': 10, 'l2_regularization': 6.464801158890502, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 16:01:55,414][0m Trial 10 finished with value: 0.567 and parameters: {'learning_rate': 0.27056471177068536, 'max_leaf_nodes': 45, 'l2_regularization': 9.327395151488126, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 10 finished with value: 0.567 and parameters: {'learning_rate': 0.27056471177068536, 'max_leaf_nodes': 45, 'l2_regularization': 9.327395151488126, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 16:04:54,919][0m Trial 11 finished with value: 0.562 and parameters: {'learning_rate': 0.4975456500998111, 'max_leaf_nodes': 60, 'l2_regularization': 8.459157633797098, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 11 finished with value: 0.562 and parameters: {'learning_rate': 0.4975456500998111, 'max_leaf_nodes': 60, 'l2_regularization': 8.459157633797098, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 16:07:34,199][0m Trial 12 finished with value: 0.571 and parameters: {'learning_rate': 0.359015102825519, 'max_leaf_nodes': 47, 'l2_regularization': 7.780671411842416, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 12 finished with value: 0.571 and parameters: {'learning_rate': 0.359015102825519, 'max_leaf_nodes': 47, 'l2_regularization': 7.780671411842416, 'tfidf': False, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 16:11:17,523][0m Trial 13 finished with value: 0.562 and parameters: {'learning_rate': 0.19246495943040753, 'max_leaf_nodes': 56, 'l2_regularization': 7.394104872801991, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 13 finished with value: 0.562 and parameters: {'learning_rate': 0.19246495943040753, 'max_leaf_nodes': 56, 'l2_regularization': 7.394104872801991, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 16:14:12,008][0m Trial 14 finished with value: 0.565 and parameters: {'learning_rate': 0.3481833757358719, 'max_leaf_nodes': 43, 'l2_regularization': 9.669110623791262, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 14 finished with value: 0.565 and parameters: {'learning_rate': 0.3481833757358719, 'max_leaf_nodes': 43, 'l2_regularization': 9.669110623791262, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 16:16:56,366][0m Trial 15 finished with value: 0.566 and parameters: {'learning_rate': 0.3489329591881863, 'max_leaf_nodes': 39, 'l2_regularization': 6.74434207323936, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 15 finished with value: 0.566 and parameters: {'learning_rate': 0.3489329591881863, 'max_leaf_nodes': 39, 'l2_regularization': 6.74434207323936, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 16:20:37,926][0m Trial 16 finished with value: 0.564 and parameters: {'learning_rate': 0.18533567223628303, 'max_leaf_nodes': 52, 'l2_regularization': 9.929047109456913, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 16 finished with value: 0.564 and parameters: {'learning_rate': 0.18533567223628303, 'max_leaf_nodes': 52, 'l2_regularization': 9.929047109456913, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 16:22:29,821][0m Trial 17 finished with value: 0.575 and parameters: {'learning_rate': 0.40172644669091645, 'max_leaf_nodes': 10, 'l2_regularization': 8.148799196825088, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 17 finished with value: 0.575 and parameters: {'learning_rate': 0.40172644669091645, 'max_leaf_nodes': 10, 'l2_regularization': 8.148799196825088, 'tfidf': False, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 16:24:48,366][0m Trial 18 finished with value: 0.563 and parameters: {'learning_rate': 0.49801016176557056, 'max_leaf_nodes': 51, 'l2_regularization': 6.502855385122128, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 1 with value: 0.561.[0m


Trial 18 finished with value: 0.563 and parameters: {'learning_rate': 0.49801016176557056, 'max_leaf_nodes': 51, 'l2_regularization': 6.502855385122128, 'tfidf': False, 'normalization': True, 'scaling': False}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 16:27:53,025][0m Trial 19 finished with value: 0.568 and parameters: {'learning_rate': 0.2969827356669297, 'max_leaf_nodes': 38, 'l2_regularization': 8.79824421503589, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.[0m


Trial 19 finished with value: 0.568 and parameters: {'learning_rate': 0.2969827356669297, 'max_leaf_nodes': 38, 'l2_regularization': 8.79824421503589, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 1 with value: 0.561.


[32m[I 2023-02-13 16:30:13,566][0m Trial 20 finished with value: 0.558 and parameters: {'learning_rate': 0.23595804024556916, 'max_leaf_nodes': 20, 'l2_regularization': 8.803675229357003, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.[0m


Trial 20 finished with value: 0.558 and parameters: {'learning_rate': 0.23595804024556916, 'max_leaf_nodes': 20, 'l2_regularization': 8.803675229357003, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:32:26,996][0m Trial 21 finished with value: 0.563 and parameters: {'learning_rate': 0.23764408191595043, 'max_leaf_nodes': 17, 'l2_regularization': 8.862525323334934, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.[0m


Trial 21 finished with value: 0.563 and parameters: {'learning_rate': 0.23764408191595043, 'max_leaf_nodes': 17, 'l2_regularization': 8.862525323334934, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:34:51,497][0m Trial 22 finished with value: 0.566 and parameters: {'learning_rate': 0.21805688116073704, 'max_leaf_nodes': 22, 'l2_regularization': 7.731875622912549, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.[0m


Trial 22 finished with value: 0.566 and parameters: {'learning_rate': 0.21805688116073704, 'max_leaf_nodes': 22, 'l2_regularization': 7.731875622912549, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:36:55,395][0m Trial 23 finished with value: 0.56 and parameters: {'learning_rate': 0.31505292162702436, 'max_leaf_nodes': 14, 'l2_regularization': 9.057125130908094, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.[0m


Trial 23 finished with value: 0.56 and parameters: {'learning_rate': 0.31505292162702436, 'max_leaf_nodes': 14, 'l2_regularization': 9.057125130908094, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:38:59,021][0m Trial 24 finished with value: 0.567 and parameters: {'learning_rate': 0.3051630526554213, 'max_leaf_nodes': 14, 'l2_regularization': 9.986748687814384, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.[0m


Trial 24 finished with value: 0.567 and parameters: {'learning_rate': 0.3051630526554213, 'max_leaf_nodes': 14, 'l2_regularization': 9.986748687814384, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:41:18,475][0m Trial 25 finished with value: 0.571 and parameters: {'learning_rate': 0.2542746257139527, 'max_leaf_nodes': 20, 'l2_regularization': 8.732735849892915, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.[0m


Trial 25 finished with value: 0.571 and parameters: {'learning_rate': 0.2542746257139527, 'max_leaf_nodes': 20, 'l2_regularization': 8.732735849892915, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:43:16,509][0m Trial 26 finished with value: 0.565 and parameters: {'learning_rate': 0.3191102588767558, 'max_leaf_nodes': 13, 'l2_regularization': 9.23813638342909, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 20 with value: 0.558.[0m


Trial 26 finished with value: 0.565 and parameters: {'learning_rate': 0.3191102588767558, 'max_leaf_nodes': 13, 'l2_regularization': 9.23813638342909, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:45:49,105][0m Trial 27 finished with value: 0.568 and parameters: {'learning_rate': 0.26174404015668484, 'max_leaf_nodes': 26, 'l2_regularization': 8.104617421186699, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.[0m


Trial 27 finished with value: 0.568 and parameters: {'learning_rate': 0.26174404015668484, 'max_leaf_nodes': 26, 'l2_regularization': 8.104617421186699, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:47:44,574][0m Trial 28 finished with value: 0.567 and parameters: {'learning_rate': 0.3824867947347591, 'max_leaf_nodes': 13, 'l2_regularization': 7.345889519815389, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 20 with value: 0.558.[0m


Trial 28 finished with value: 0.567 and parameters: {'learning_rate': 0.3824867947347591, 'max_leaf_nodes': 13, 'l2_regularization': 7.345889519815389, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:49:56,281][0m Trial 29 finished with value: 0.561 and parameters: {'learning_rate': 0.28681667462170857, 'max_leaf_nodes': 20, 'l2_regularization': 8.222416647401545, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.[0m


Trial 29 finished with value: 0.561 and parameters: {'learning_rate': 0.28681667462170857, 'max_leaf_nodes': 20, 'l2_regularization': 8.222416647401545, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:52:00,098][0m Trial 30 finished with value: 0.566 and parameters: {'learning_rate': 0.31984087400456973, 'max_leaf_nodes': 16, 'l2_regularization': 9.303089932380201, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.[0m


Trial 30 finished with value: 0.566 and parameters: {'learning_rate': 0.31984087400456973, 'max_leaf_nodes': 16, 'l2_regularization': 9.303089932380201, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:54:14,077][0m Trial 31 finished with value: 0.566 and parameters: {'learning_rate': 0.2754935879052238, 'max_leaf_nodes': 21, 'l2_regularization': 8.50694991533685, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.[0m


Trial 31 finished with value: 0.566 and parameters: {'learning_rate': 0.2754935879052238, 'max_leaf_nodes': 21, 'l2_regularization': 8.50694991533685, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:56:23,158][0m Trial 32 finished with value: 0.566 and parameters: {'learning_rate': 0.27999826927850524, 'max_leaf_nodes': 19, 'l2_regularization': 9.365130601439098, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.[0m


Trial 32 finished with value: 0.566 and parameters: {'learning_rate': 0.27999826927850524, 'max_leaf_nodes': 19, 'l2_regularization': 9.365130601439098, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 16:58:56,747][0m Trial 33 finished with value: 0.569 and parameters: {'learning_rate': 0.32869310506985494, 'max_leaf_nodes': 31, 'l2_regularization': 8.483828492335654, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.[0m


Trial 33 finished with value: 0.569 and parameters: {'learning_rate': 0.32869310506985494, 'max_leaf_nodes': 31, 'l2_regularization': 8.483828492335654, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 17:01:02,692][0m Trial 34 finished with value: 0.564 and parameters: {'learning_rate': 0.2309580242265282, 'max_leaf_nodes': 15, 'l2_regularization': 7.090122997305471, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.[0m


Trial 34 finished with value: 0.564 and parameters: {'learning_rate': 0.2309580242265282, 'max_leaf_nodes': 15, 'l2_regularization': 7.090122997305471, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 17:03:26,024][0m Trial 35 finished with value: 0.565 and parameters: {'learning_rate': 0.2904351125348219, 'max_leaf_nodes': 22, 'l2_regularization': 7.696852907461228, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.[0m


Trial 35 finished with value: 0.565 and parameters: {'learning_rate': 0.2904351125348219, 'max_leaf_nodes': 22, 'l2_regularization': 7.696852907461228, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 17:06:01,770][0m Trial 36 finished with value: 0.568 and parameters: {'learning_rate': 0.19441114282683633, 'max_leaf_nodes': 28, 'l2_regularization': 9.00347612636283, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.[0m


Trial 36 finished with value: 0.568 and parameters: {'learning_rate': 0.19441114282683633, 'max_leaf_nodes': 28, 'l2_regularization': 9.00347612636283, 'tfidf': True, 'normalization': False, 'scaling': False}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 17:07:55,112][0m Trial 37 finished with value: 0.565 and parameters: {'learning_rate': 0.15251306407309845, 'max_leaf_nodes': 10, 'l2_regularization': 8.029144757963675, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 20 with value: 0.558.[0m


Trial 37 finished with value: 0.565 and parameters: {'learning_rate': 0.15251306407309845, 'max_leaf_nodes': 10, 'l2_regularization': 8.029144757963675, 'tfidf': True, 'normalization': True, 'scaling': False}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 17:10:45,476][0m Trial 38 finished with value: 0.561 and parameters: {'learning_rate': 0.2270912819968195, 'max_leaf_nodes': 32, 'l2_regularization': 9.58297881151908, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.[0m


Trial 38 finished with value: 0.561 and parameters: {'learning_rate': 0.2270912819968195, 'max_leaf_nodes': 32, 'l2_regularization': 9.58297881151908, 'tfidf': True, 'normalization': False, 'scaling': True}. Best is trial 20 with value: 0.558.


[32m[I 2023-02-13 17:12:55,185][0m Trial 39 finished with value: 0.567 and parameters: {'learning_rate': 0.28839988443284753, 'max_leaf_nodes': 18, 'l2_regularization': 5.72787286872817, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 20 with value: 0.558.[0m


Trial 39 finished with value: 0.567 and parameters: {'learning_rate': 0.28839988443284753, 'max_leaf_nodes': 18, 'l2_regularization': 5.72787286872817, 'tfidf': True, 'normalization': True, 'scaling': True}. Best is trial 20 with value: 0.558.


In [10]:
study.trials_dataframe().sort_values(by="value").head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_l2_regularization,params_learning_rate,params_max_leaf_nodes,params_normalization,params_scaling,params_tfidf,state
20,20,0.558,2023-02-13 16:27:53.031437,2023-02-13 16:30:13.552935,0 days 00:02:20.521498,8.803675,0.235958,20,False,True,True,COMPLETE
23,23,0.56,2023-02-13 16:34:51.510108,2023-02-13 16:36:55.377005,0 days 00:02:03.866897,9.057125,0.315053,14,False,True,True,COMPLETE
1,1,0.561,2023-02-13 15:36:56.492012,2023-02-13 15:38:53.064597,0 days 00:01:56.572585,8.880232,0.306335,13,True,True,True,COMPLETE
38,38,0.561,2023-02-13 17:07:55.122314,2023-02-13 17:10:45.452352,0 days 00:02:50.330038,9.582979,0.227091,32,False,True,True,COMPLETE
29,29,0.561,2023-02-13 16:47:44.578250,2023-02-13 16:49:56.268404,0 days 00:02:11.690154,8.222417,0.286817,20,False,False,True,COMPLETE
7,7,0.562,2023-02-13 15:52:30.038867,2023-02-13 15:55:08.636643,0 days 00:02:38.597776,5.685665,0.457672,60,False,True,False,COMPLETE
11,11,0.562,2023-02-13 16:01:55.418044,2023-02-13 16:04:54.906149,0 days 00:02:59.488105,8.459158,0.497546,60,False,True,False,COMPLETE
13,13,0.562,2023-02-13 16:07:34.214612,2023-02-13 16:11:17.505659,0 days 00:03:43.291047,7.394105,0.192465,56,False,True,True,COMPLETE
21,21,0.563,2023-02-13 16:30:13.570936,2023-02-13 16:32:26.975964,0 days 00:02:13.405028,8.862525,0.237644,17,False,True,True,COMPLETE
18,18,0.563,2023-02-13 16:22:29.826937,2023-02-13 16:24:48.352937,0 days 00:02:18.526000,6.502855,0.49801,51,True,False,False,COMPLETE


In [11]:
print(f"Best HistGBoost regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

Best HistGBoost regression params: {'l2_regularization': 8.803675229357003, 'learning_rate': 0.23595804024556916, 'max_leaf_nodes': 20, 'normalization': False, 'scaling': True, 'tfidf': True}, achieving val RMSE: 0.558


In [15]:
save = True
load = False

regressor_params = {"learning_rate": 0.24, "max_leaf_nodes": 20, "l2_regularization": 8.8}
pipe_params = {
    'normalization': False,
    'scaling': True,
    'tfidf': True,
    'requires_dense': True,
}

if load:
    pipe = joblib.load(f"{exp_dir}/hist_gradient_boosting/pipe")
else:
    regressor = HistGradientBoostingRegressor(**regressor_params)
    pipe = sklearn_regressors.train_pipe(regressor, X_train, y_train, **pipe_params)

    if save:
        additional_attributes = {
            "regressor_params": regressor_params,
            "pipe_params": pipe_params,
            "test_programs": test_programs,
        }

        joblib.dump(pipe, f"{exp_dir}/hist_gradient_boosting/pipe")
        with open(f"{exp_dir}/hist_gradient_boosting/additional_attributes.json", "w") as file:
            json.dump(additional_attributes, file, indent=4)

In [16]:
maes = {}
prog_energy = {}

for test_program in test_programs:
    prog_df = test_df[test_df.program_name == test_program]
    prog_bbs = np.array([" ".join(bb) for bb in prog_df.bb.tolist()])
    test_preds = pipe.predict(prog_bbs)
    true_energies = prog_df.energy.values
    
    maes[test_program] = mean_absolute_error(true_energies, test_preds)
    prog_energy[test_program] = {
        "true_energy": sum(true_energies),
        "pred_energy": sum(test_preds),
    }

print("Mean absolute error for test programs")
print(f"{maes}\n")

print("Total energies for test programs")
print(prog_energy)

Mean absolute error for test programs
{'faa': 0.8123990241181436, 'quicksort': 0.2551870738675057, 'declare': 0.4204020368187791}

Total energies for test programs
{'faa': {'true_energy': 1593.1851865544559, 'pred_energy': 596.5419792575909}, 'quicksort': {'true_energy': 4246.078264105905, 'pred_energy': 7377.537371436673}, 'declare': {'true_energy': 6382.836181449226, 'pred_energy': 3948.5523294498194}}
