# Imports

In [1]:
import os
import sys
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import optuna

from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, BayesianRidge, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.decomposition import PCA

from sklearn.preprocessing import normalize, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline

from collections import Counter
from typing import Union

sys.path.append("../../src")

import embedder
import utils

# Data

In [2]:
data_files = os.listdir("../../dataset/bb_data/")

if "energy_dataset.pkl" not in data_files:
    result_files = [f for f in os.listdir("../../../energy_dataset/") if f.endswith("results")] 
    data_df = pd.DataFrame()

    for file in result_files:
        file_df = utils.read_bb_data(f"../../../energy_dataset/{file}/breaker_code.txt", f"../../../energy_dataset/{file}/breaker_final_energy.txt")
        file_df = utils.preprocess_bb_df(file_df)
        data_df = pd.concat([data_df, file_df], ignore_index=True)

    data_df["bb_embeddings"] = data_df.bb.apply(lambda x: embedder.encode(x))
    data_df.to_pickle("../../dataset/bb_data/energy_dataset.pkl")
    
else:
    data_df = pd.read_pickle("../../dataset/bb_data/energy_dataset.pkl")
    if "bb_embeddings" not in data_df.columns:
        data_df["bb_embeddings"] = data_df.bb.apply(lambda x: embedder.encode(x))
        data_df.to_pickle("../../dataset/bb_data/energy_dataset.pkl")

In [3]:
def pad_sequence(sequence: list, max_len: int) -> list:
    sequence.extend([0 for i in range(max_len - len(sequence))])
    return sequence

def concat_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    if "bb_embeddings" not in df: 
        raise KeyError("'bb_embeddings' not in dataframe columns. Please provide a dataframe with the 'bb_embeddings' column.")

    df_util = df.copy()
    df_util.bb_embeddings = df_util.bb_embeddings.map(np.concatenate)

    #pad sequences
    max_len = max(df_util.bb_embeddings.map(len))
    df_util.bb_embeddings = df_util.bb_embeddings.apply(lambda x: pad_sequence(x.tolist(), max_len))

    return df_util

concat_df = concat_embeddings(data_df)

In [7]:
X, y = np.array(concat_df.bb_embeddings.tolist()), concat_df.energy.values

# Testing Regressors

In [None]:
def evaluate_regressor(
    regressor: Union[
        LinearRegression,
        Lasso,
        Ridge,
        ElasticNet,
        SGDRegressor,
        SVR,
        HistGradientBoostingRegressor,
    ],
    pca_components: int,
    normalization: bool = False,
    scaling: bool = False,
):

    pca = PCA(n_components=pca_components)

    if scaling:
        pipe = Pipeline([("scaler", StandardScaler()), ("pca", pca), ("reg", regressor)])
    else:
        pipe = Pipeline([("pca", pca), ("regressor", regressor)])

    if normalization:
        X_util = normalize(X, norm="l2", axis=1)
    else:
        X_util = X

    cv = ShuffleSplit(n_splits=5, test_size=0.2)
    scores = -cross_val_score(
        pipe, X_util, y, cv=cv, scoring="neg_root_mean_squared_error"
    )

    score = round(np.median(scores), 3)

    return score

In [None]:
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
storage_name = f"sqlite:////Users/thodo/Documents/sxoli/optuna-studies/sklearn-regressors-palmtree-embs.db"

## Linear Regression

Simple least squares regression.
No need for extra tuning.

In [None]:
study = optuna.create_study(study_name="linear-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

[32m[I 2023-01-26 15:09:09,833][0m A new study created in RDB with name: linear-regression[0m


A new study created in RDB with name: linear-regression


In [None]:
def objective(trial):

    pca_components = trial.suggest_int("pca_components", 32, X[0].shape[0]//2)
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = LinearRegression()
    score = evaluate_regressor(regressor=regressor, pca_components=pca_components, normalization=normalization, scaling=scaling)

    return score

study.optimize(objective, n_trials=60)

In [None]:
study.trials_dataframe().sort_values(by="value").head(10)

In [None]:
print(f"Best linear regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

## Lasso

Linear regression with L1 regularization. Tune:
* alpha (L1 regularization term).

In [None]:
study = optuna.create_study(study_name="lasso-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

In [None]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    pca_components = trial.suggest_int("pca_components", 32, X[0].shape[0])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = Lasso(alpha=alpha)
    score = evaluate_regressor(regressor=regressor, pca_components=pca_components, normalization=normalization, scaling=scaling)

    return score

study.optimize(objective, n_trials=60)

NameError: ignored

In [None]:
study.trials_dataframe().sort_values(by="value").head(10)

In [None]:
print(f"Best lasso regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

## Ridge

Linear regression with L2 regularization. Tune:
* alpha (L2 regularization term).

In [None]:
study = optuna.create_study(study_name="ridge-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

In [None]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    pca_components = trial.suggest_int("pca_components", 32, X[0].shape[0])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = Ridge(alpha=alpha)
    score = evaluate_regressor(regressor=regressor, pca_components=pca_components, normalization=normalization, scaling=scaling)

    return score

study.optimize(objective, n_trials=60)

NameError: ignored

In [None]:
study.trials_dataframe().sort_values(by="value").head(10)

In [None]:
print(f"Best ridge regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

## ElasticNet

Linear regression with both L1 and L2 regularization. Tune
* alpha
* l1_ratio. 

In [None]:
study = optuna.create_study(study_name="elasticnet-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

In [None]:
def objective(trial):

    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    l1_ratio = trial.suggest_float("l1_ratio", 0.1, 0.9)
    pca_components = trial.suggest_int("pca_components", 32, X[0].shape[0])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    score = evaluate_regressor(regressor=regressor, pca_components=pca_components, normalization=normalization, scaling=scaling)

    return score

study.optimize(objective, n_trials=60)

NameError: ignored

In [None]:
study.trials_dataframe().sort_values(by="value").head(10)

In [None]:
print(f"Best ElasticNET regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

## SGD Regression

Stochastic Gradient Descent Regression using either of the above reguliration techniques. Tune:
* penalty method
* alpha
* l1_ratio (if elasticnet penalty).

In [None]:
study = optuna.create_study(study_name="SGD-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

In [None]:
def objective(trial):

    penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"])
    alpha = trial.suggest_float("alpha", 0.1, 10.0)
    l1_ratio = trial.suggest_float("l1_ratio", 0.1, 0.1)
    pca_components = trial.suggest_int("pca_components", 32, X[0].shape[0])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = SGDRegressor(penalty=penalty, alpha=alpha, l1_ratio=l1_ratio)
    score = evaluate_regressor(regressor=regressor, pca_components=pca_components, normalization=normalization, scaling=scaling)

    return score

study.optimize(objective, n_trials=60)

NameError: ignored

In [None]:
study.trials_dataframe().sort_values(by="value").head(10)

In [None]:
print(f"Best SGD regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

## SVR

Support Vector Regression. Tune:
* kernel
* gamma
* C

In [None]:
study = optuna.create_study(study_name="SVR-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

In [None]:
def objective(trial):

    kernel = trial.suggest_categorical("kernel", ["l1", "l2", "elasticnet"])
    C = trial.suggest_float("C", 0.1, 10.0)
    gamma = trial.suggest_float("gamma", 0.1, 0.1)
    pca_components = trial.suggest_int("pca_components", 32, X[0].shape[0])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = SVR(kernel=kernel, C=C, gamma=gamma)
    score = evaluate_regressor(regressor=regressor, pca_components=pca_components, normalization=normalization, scaling=scaling)

    return score

study.optimize(objective, n_trials=60)

NameError: ignored

In [None]:
study.trials_dataframe().sort_values(by="value").head(10)

In [None]:
print(f"Best SVR regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

## Hist Gradient Boosting Regressor

Gradient Boosting regression for large datasets. Tune:
* learning rate
* max leaf nodes
* l2 regularization

In [None]:
study = optuna.create_study(study_name="HistGBoost-regression", storage=storage_name, load_if_exists=True, direction="minimize")
study.set_user_attr("Loss", "RMSE")

In [None]:
def objective(trial):

    learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.5)
    max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 10, 60)
    l2_regularization = trial.suggest_float("l2_regularization", 0.1, 10.0)
    pca_components = trial.suggest_int("pca_components", 32, X[0].shape[0])
    normalization = trial.suggest_categorical("normalization", [True, False])
    scaling = trial.suggest_categorical("scaling", [True, False])

    regressor = HistGradientBoostingRegressor(learning_rate=learning_rate, max_leaf_nodes=max_leaf_nodes, l2_regularization=l2_regularization)
    score = evaluate_regressor(regressor=regressor, pca_components=pca_components, normalization=normalization, scaling=scaling)

    return score

study.optimize(objective, n_trials=60)

NameError: ignored

In [None]:
study.trials_dataframe().sort_values(by="value").head(10)

In [None]:
print(f"Best HistGBoost regression params: {study.best_params}, achieving val RMSE: {study.best_value}")

# Results

# Draft

In [6]:
regressor = Lasso()
regressor_name = "Lasso regressor"

reg = regressor.fit(x_train, y_train)
preds = reg.predict(x_test)

mse = mean_squared_error(y_test, preds)
mae = mean_absolute_error(y_test, preds)

print(f"\033[1m{regressor_name} \033[0m")
print(f"Test MSE: {round(mse, 3)}")
print(f"Test RMSE: {round(np.sqrt(mse), 3)}")
print(f"Test MAE: {round(mae, 3)}\n")

[1mLasso regressor [0m
Test MSE: 0.499
Test RMSE: 0.706
Test MAE: 0.361



In [7]:
#regressors = [LinearRegression(n_jobs=-1), SGDRegressor(), ElasticNet(), BayesianRidge(), SVR(), GradientBoostingRegressor()]
regressors = [ElasticNet(), BayesianRidge()]
regressor_names = ["ElasticNet Regressor", "BayesianRidge Regressor"]

for reg, reg_name in zip(regressors, regressor_names):
    
    reg = reg.fit(x_train, y_train)
    preds = reg.predict(x_test)

    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)

    print(f"\033[1m{reg_name} \033[0m")
    print(f"Test MSE: {round(mse, 3)}")
    print(f"Test RMSE: {round(np.sqrt(mse), 3)}")
    print(f"Test MAE: {round(mae, 3)}\n")

[1mElasticNet Regressor [0m
Test MSE: 0.499
Test RMSE: 0.706
Test MAE: 0.361

[1mBayesianRidge Regressor [0m
Test MSE: 0.322
Test RMSE: 0.567
Test MAE: 0.28



In [8]:
regressor = LinearRegression()
regressor_name = "Linear Regressor"

for components in [5, 50, 100, 200, 500]:
    pipe = Pipeline([('pca', PCA(n_components=components)), (regressor_name, regressor)])
    pipe.fit(x_train, y_train)
    preds = pipe.predict(x_test)

    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)

    print(f"\033[1mPCA components: {components} \033[0m")
    print(f"Test MSE: {round(mse, 3)}")
    print(f"Test RMSE: {round(np.sqrt(mse), 3)}")
    print(f"Test MAE: {round(mae, 3)}\n")

[1mPCA components: 5 [0m
Test MSE: 0.454
Test RMSE: 0.674
Test MAE: 0.334

[1mPCA components: 50 [0m
Test MSE: 0.437
Test RMSE: 0.661
Test MAE: 0.325

[1mPCA components: 100 [0m
Test MSE: 0.376
Test RMSE: 0.613
Test MAE: 0.319

[1mPCA components: 200 [0m
Test MSE: 0.334
Test RMSE: 0.578
Test MAE: 0.29

[1mPCA components: 500 [0m
Test MSE: 0.325
Test RMSE: 0.57
Test MAE: 0.282

