# Critical Temperature of Superconductors

- To compare the results of different hyperparameter configurations in detail, a system based on GridSearchCV[*](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) and Pipeline[*](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) was developed that runs a single configuration at a time and saves its result to a CSV file. A separate CSV file is generated for each model.
    - Another advantage of this system is that the program can be stopped at any time without losing the configurations that have already been trained.
    - The only downside is that the configurations are not executed in parallel; however, the dataset is relatively small, so each configuration takes little time to run.
- 

In [17]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor

from sklearn import preprocessing
from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score, mean_squared_error

from utils import Step, Pipe, Combination, extract_combinations, combination_already_tested, print_results

In [18]:
# Apply the "seaborn-v0_8" matplotlib style sheet to every plot drawn afterwards.
plt.style.use("seaborn-v0_8")

# Folder the input CSV files are read from; file names are appended by plain
# string concatenation, hence the trailing slash.
DATA_FOLDER = "data/"
# Folder the per-estimator "<estimator_tag>_output.csv" result files live in.
OUTPUT_FOLDER = "outputs/"

In [19]:
# Seed passed to the train/test split and to the seeded estimators (PCA,
# random forest, XGBoost, gradient boosting).
RANDOM_STATE = 42

# When True, the "Outlier removal" cell filters outliers out of the training set.
OUTLIER_REMOVAL = False

---
---
## Data Load

In [20]:
# Start from the formula table without its copy of the target, then append the
# properties table column-wise so "critical_temp" appears exactly once.
df = pd.read_csv(DATA_FOLDER + "formula_train.csv").drop(columns=["critical_temp"])
df = pd.concat([df, pd.read_csv(DATA_FOLDER + "train.csv")], axis=1)
print("Shapes of Properties+Formula df: ", df.shape)

Shapes of Properties+Formula df:  (17010, 169)


In [21]:
# Remove the "material" feature so it is not passed on to the models
# (presumably the textual chemical formula — TODO confirm against the CSV).
df = df.drop(columns="material")

---
---
## Split

In [22]:
# Hold out 20% of the rows as the test set; the split is seeded with RANDOM_STATE.
train, test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)

# Features are every column except the target; the targets are kept as
# single-column DataFrames.
X_train, y_train = train.drop(columns=["critical_temp"]), train[["critical_temp"]]
X_test, y_test = test.drop(columns=["critical_temp"]), test[["critical_temp"]]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13608, 167), (3402, 167), (13608, 1), (3402, 1))

---
---
## Remove Highly correlated features

In [23]:
class HighCorrFeaturesRemover:
    """
    Removes features with high correlation, according to the 'corr_threshold' parameter.

    Class that provides the fit, transform, get_params and set_params methods,
    in order to be used as a "transformer" into the Pipeline class.

    The columns to remove are learned in `fit` from the data given to `fit`
    only, and the very same columns are removed by every following `transform`
    call. This keeps the train and test feature sets identical and prevents
    any information from data not seen in `fit` leaking into the selection.

    ## Parameters
    corr_threshold: float (0,1]
        Minimum absolute pairwise correlation at or above which the later
        feature of a correlated pair is removed
    """

    def __init__(self, corr_threshold=0.95):
        self.corr_threshold = corr_threshold

    @staticmethod
    def _as_frame(X):
        """Return X unchanged when it is a DataFrame, otherwise wrap it in one."""
        return X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    def _correlated_columns(self, X):
        """List the columns of X whose correlation with an earlier column reaches the threshold."""
        corr_matrix = self._as_frame(X).corr().abs()
        # Keep only the strict upper triangle so every pair is inspected once
        # and only the later column of a correlated pair is flagged.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        return [column for column in upper.columns if (upper[column] >= self.corr_threshold).any()]

    def fit(self, X, y=None):
        """Learn which columns to drop from X; y is ignored. Returns self."""
        self.cols_to_drop_ = self._correlated_columns(X)
        return self

    def transform(self, X):
        """
        Return X (as a DataFrame) without the highly correlated columns.

        Uses the columns learned by `fit`; when the instance has not been
        fitted, the columns are computed from X itself.
        """
        X = self._as_frame(X)
        cols_to_drop = getattr(self, "cols_to_drop_", None)
        if cols_to_drop is None:
            cols_to_drop = self._correlated_columns(X)

        # print("{} Cols Removed: {}".format(len(cols_to_drop), cols_to_drop))
        return X.drop(columns=cols_to_drop)

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict ('deep' is accepted but unused)."""
        return {"corr_threshold": self.corr_threshold}

    def set_params(self, corr_threshold):
        """Set the correlation threshold and discard any previously learned columns. Returns self."""
        self.corr_threshold = corr_threshold
        # A selection learned with the old threshold is no longer valid.
        vars(self).pop("cols_to_drop_", None)
        return self

---
---
## Only Properties dataset

In [24]:
class OnlyProperties:
    """
    Keeps only the properties features, by dropping every feature column that
    comes from the formula dataset ("formula_train.csv" in DATA_FOLDER).

    Provides fit and transform so it can be used as a "transformer" step.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Return X without the formula columns."""
        # Only the column names are needed, so read just the header (nrows=0)
        # instead of parsing the whole file on every call.
        formula_columns = (
            pd.read_csv(DATA_FOLDER + "formula_train.csv", nrows=0)
            .drop(columns=["critical_temp", "material"])
            .columns
        )
        return X.drop(columns=formula_columns)

## Only Formula dataset

In [25]:
class OnlyFormula:
    """
    Keeps only the formula features, by dropping every feature column that
    comes from the properties dataset ("train.csv" in DATA_FOLDER).

    Provides fit and transform so it can be used as a "transformer" step.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Return X without the properties columns."""
        # Only the column names are needed, so read just the header (nrows=0)
        # instead of parsing the whole file on every call.
        properties_columns = (
            pd.read_csv(DATA_FOLDER + "train.csv", nrows=0).drop(columns=["critical_temp"]).columns
        )
        return X.drop(columns=properties_columns)

---
---
## Outlier removal

In [26]:
# Optional outlier filtering of the training set only (the test set is left
# untouched). Runs on the full training frame, target column included.
if OUTLIER_REMOVAL:
    columns = train.columns

    clf = LocalOutlierFactor(n_jobs=-1)
    # clf = IsolationForest(
    #     max_samples=1.0,
    #     contamination=0.001,
    #     n_jobs=-1,
    #     random_state=RANDOM_STATE,
    # )
    # fit_predict labels every row; -1 marks the rows flagged as outliers.
    outliers = clf.fit_predict(train) == -1

    print("Outliers removed: {}".format(outliers.sum()))
    train = train[~outliers]

    # Rebuild the training features/target from the filtered frame.
    X_train = train.drop(columns=["critical_temp"])
    y_train = train[["critical_temp"]]
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [27]:
# TODO: Remove, wrong bc not applicable
# class OutliersRemover:
#     def __init__(self) -> None:
#         self.outliers_vector = None

#     def fit(self, X, y):
#         self.outliers_vector = pd.Series(index=X.index, dtype=bool)

#         clf = LocalOutlierFactor(n_jobs=-1)
#         # clf = IsolationForest(
#         #     max_samples=1.0,
#         #     contamination=0.001,
#         #     n_jobs=-1,
#         #     random_state=random_state,
#         # )
#         self.outliers_vector = clf.fit_predict(np.column_stack((X, y))) == -1
#         print("Outliers removed: {}".format(self.outliers_vector.sum()))
#         return self

#     def transform(self, X, y=None):

#         X = X[~self.outliers_vector]
#         # y = y[~self.outliers_vector]

#         print("Outliers removed: {}".format(self.outliers_vector.sum()))
#         print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
#         return X  # , y

#     def set_params(self):
#         return self

---
---
## Preprocessing

In [28]:
# Reusable preprocessing steps. Each Step receives a tag (used in the
# combination tags and as the prefix of the "<tag>__<param>" result columns),
# a transformer instance and, optionally, the hyperparameter values for it.
# NOTE(review): the exact Step semantics live in utils — confirm there.

# Custom transformer: drops features correlated at or above the threshold.
high_corr_features_remover_step = Step(
    "high_corr_features_remover",
    HighCorrFeaturesRemover(),
    {"corr_threshold": 0.95},
)
# Custom transformers: restrict the features to one of the two source datasets.
only_properties_step = Step(
    "only_properties",
    OnlyProperties(),
)
only_formula_step = Step(
    "only_formula",
    OnlyFormula(),
)
# Feature-wise scalers.
std_step = Step(
    "std",
    preprocessing.StandardScaler(),
)
minmax_step = Step(
    "minmax",
    preprocessing.MinMaxScaler(),
)
# Normalizers with the l1, l2 and max norms.
l1_step = Step(
    "l1",
    preprocessing.Normalizer(norm="l1"),
)
l2_step = Step(
    "l2",
    preprocessing.Normalizer(norm="l2"),
)
lmax_step = Step(
    "lmax",
    preprocessing.Normalizer(norm="max"),
)
# PCA with n_components=0.95 (a float in (0, 1) selects components by
# explained-variance ratio in scikit-learn).
pca_step = Step(
    "pca",
    PCA(random_state=RANDOM_STATE),
    {
        "n_components": [0.95],
        # "whiten": [True, False],
    },
)

---
---
## Search

In [29]:
def grid_search(combinations: list[Combination], estimator_tag: str, save_results=True):
    """
    Fit and evaluate every combination, one at a time, optionally saving each result.

    For each combination a GridSearchCV (3-fold CV, all cores) is fitted on the
    module-level X_train / y_train and the refitted model is scored on the
    module-level X_test / y_test with MSE and R2.

    When save_results is True, the result of a combination is written to
    OUTPUT_FOLDER + estimator_tag + "_output.csv" as soon as it is available,
    and combinations already recorded in that file are skipped, so an
    interrupted run can be resumed without repeating finished work.

    ## Parameters
    combinations: list[Combination]
        Combinations to evaluate; each one provides `tag`, `pipeline`,
        `parameters`, `set_MSE`, `set_R2` and `as_df`
    estimator_tag: str
        Name of the estimator, used to build the results file name
    save_results: bool
        Whether to skip already tested combinations and persist new results
    """
    # The results file depends only on the estimator, so resolve it once.
    file_name = OUTPUT_FOLDER + estimator_tag + "_output.csv"
    if save_results:
        # Make sure the folder exists up front: otherwise the first result
        # would be lost at write time, after the whole fit has been paid for.
        output_dir = os.path.dirname(file_name)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Iterate over *all* combinations
    total = len(combinations)
    for index, combination in enumerate(combinations):
        print("\nCombination {}/{}  |  {}".format(index + 1, total, combination.tag))

        # Check if this combination is already tested
        if save_results and combination_already_tested(file_name, combination):
            print("  ==> Already done. Skipped.")
            continue

        gs = GridSearchCV(
            estimator=combination.pipeline,
            param_grid=combination.parameters,
            n_jobs=-1,
            cv=3,
            verbose=0,
        )

        # Fit (the estimators expect a 1-D target)
        gs.fit(X_train, np.ravel(y_train))
        # Predict
        y_pred = gs.predict(X_test)
        # Test scores
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print("  ==> R2: {}\tMSE: {}".format(r2, mse))

        # Save results
        if save_results:
            results = combination.set_MSE(mse).set_R2(r2).as_df()
            # Re-read and concatenate instead of appending raw rows: different
            # combinations can carry different hyperparameter columns.
            if os.path.isfile(file_name):
                outputs = pd.read_csv(file_name)
                if not outputs.empty:
                    results = pd.concat([outputs, results], axis=0)
            results.to_csv(file_name, index=False)

In [33]:
def best_hyperparameters(file_name, percentage):
    """
    Display the hyperparameter values found among the best-scoring results.

    Reads the results CSV, keeps the top `percentage` percent of rows by R2
    (rounded down) and displays, for every remaining column, the list of
    distinct values appearing in those rows.

    ## Parameters
    file_name: str
        Path of a results CSV containing at least the "R2" and "MSE" columns
    percentage: int or float
        Share of the best rows to inspect, expressed in percent (e.g. 20)
    """
    outputs = pd.read_csv(file_name)
    # Cast to int: a float percentage would otherwise produce a float slice
    # bound, which positional indexing rejects.
    samples = int(outputs.shape[0] * percentage // 100)
    best = outputs.sort_values(by="R2", ascending=False).drop(columns=["R2", "MSE"]).iloc[:samples]
    results = {hyperparameter: list(best[hyperparameter].unique()) for hyperparameter in best.columns}
    # `display` is provided by the notebook (IPython) environment.
    display(results)

---
## Linear Regression

In [35]:
# Linear regression has no hyperparameters to tune here, so the search only
# compares preprocessing pipelines.
estimator_tag = "linear_regression"
linear_regression_step = Step(estimator_tag, LinearRegression())

# Each Pipe lists the preprocessing steps in order, ending with the estimator.
combinations = extract_combinations(
    Pipe(linear_regression_step),
    Pipe(l2_step, linear_regression_step),
    #
    Pipe(minmax_step, linear_regression_step),
    Pipe(high_corr_features_remover_step, minmax_step, linear_regression_step),
    #
    Pipe(std_step, linear_regression_step),
    Pipe(std_step, pca_step, linear_regression_step),
    Pipe(l2_step, std_step, pca_step, linear_regression_step),
    #
    Pipe(l1_step, minmax_step, linear_regression_step),
    Pipe(l2_step, minmax_step, linear_regression_step),
    Pipe(lmax_step, minmax_step, linear_regression_step),
    Pipe(lmax_step, std_step, linear_regression_step),
    #
    Pipe(high_corr_features_remover_step, std_step, pca_step, linear_regression_step),
    Pipe(high_corr_features_remover_step, lmax_step, minmax_step, linear_regression_step),
    #
    Pipe(only_properties_step, linear_regression_step),
    Pipe(only_formula_step, linear_regression_step),
)
# Evaluate the combinations that are not yet in the results file.
grid_search(combinations, estimator_tag=estimator_tag)

# Hyperparameter values appearing in the best 20% of the recorded results.
best_hyperparameters(OUTPUT_FOLDER + estimator_tag + "_output.csv", 20)

# NOTE(review): print_results comes from utils; the second argument is
# presumably a row limit — confirm there.
print_results(OUTPUT_FOLDER + estimator_tag + "_output.csv", 15)


Combination 1/15  |  linear_regression
  ==> Already done. Skipped.

Combination 2/15  |  l2 + linear_regression
  ==> Already done. Skipped.

Combination 3/15  |  minmax + linear_regression
  ==> Already done. Skipped.

Combination 4/15  |  high_corr_features_remover + minmax + linear_regression
  ==> Already done. Skipped.

Combination 5/15  |  std + linear_regression
  ==> Already done. Skipped.

Combination 6/15  |  std + pca + linear_regression
  ==> Already done. Skipped.

Combination 7/15  |  l2 + std + pca + linear_regression
  ==> Already done. Skipped.

Combination 8/15  |  l1 + minmax + linear_regression
  ==> Already done. Skipped.

Combination 9/15  |  l2 + minmax + linear_regression
  ==> Already done. Skipped.

Combination 10/15  |  lmax + minmax + linear_regression
  ==> Already done. Skipped.

Combination 11/15  |  lmax + std + linear_regression
  ==> Already done. Skipped.

Combination 12/15  |  high_corr_features_remover + std + pca + linear_regression
  ==> Already

{'tag': ['minmax + linear_regression',
  'linear_regression',
  'std + linear_regression'],
 'high_corr_features_remover__corr_threshold': [nan],
 'pca__n_components': [nan]}

Unnamed: 0,tag,R2,MSE,high_corr_features_remover__corr_threshold,pca__n_components
2,minmax + linear_regression,0.7538,280.1071,,
0,linear_regression,0.7538,280.1071,,
4,std + linear_regression,0.7538,280.1704,,
7,l1 + minmax + linear_regression,0.7491,285.4663,,
1,l2 + linear_regression,0.7396,296.326,,
8,l2 + minmax + linear_regression,0.7396,296.326,,
3,high_corr_features_remover + minmax + linear_regression,0.7384,297.6406,0.95,
13,only_properties + linear_regression,0.7264,311.3427,,
10,lmax + std + linear_regression,0.7043,336.4186,,
9,lmax + minmax + linear_regression,0.7043,336.4332,,


---
## Random Forest


In [38]:
# Random forest search: 4 x 2 = 8 hyperparameter settings per pipeline
# (only max_features and ccp_alpha vary; the other values are fixed).
estimator_tag = "random_forest"
random_forest_step = Step(
    estimator_tag,
    RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE),
    {
        "max_features": [0.2, 0.4, 0.6, 0.8],
        "max_samples": [0.66],
        "n_estimators": [200],
        "max_depth": [25],
        "ccp_alpha": [0.0, 0.01],
        "criterion": ["squared_error"],
    },
)

# Each Pipe lists the preprocessing steps in order, ending with the estimator.
combinations = extract_combinations(
    Pipe(random_forest_step),
    Pipe(l2_step, random_forest_step),
    #
    Pipe(minmax_step, random_forest_step),
    Pipe(high_corr_features_remover_step, minmax_step, random_forest_step),
    #
    Pipe(std_step, random_forest_step),
    Pipe(std_step, pca_step, random_forest_step),
    Pipe(l2_step, std_step, pca_step, random_forest_step),
    #
    Pipe(l1_step, minmax_step, random_forest_step),
    Pipe(l2_step, minmax_step, random_forest_step),
    Pipe(lmax_step, minmax_step, random_forest_step),
    Pipe(lmax_step, std_step, random_forest_step),
    #
    Pipe(high_corr_features_remover_step, std_step, pca_step, random_forest_step),
    Pipe(high_corr_features_remover_step, lmax_step, minmax_step, random_forest_step),
    #
    Pipe(only_properties_step, random_forest_step),
    Pipe(only_formula_step, random_forest_step),
    #
    Pipe(only_properties_step, minmax_step, random_forest_step),
    Pipe(only_formula_step, minmax_step, random_forest_step),
    #
    Pipe(only_properties_step, lmax_step, minmax_step, random_forest_step),
    Pipe(only_formula_step, lmax_step, minmax_step, random_forest_step),
)
# Evaluate the combinations that are not yet in the results file.
grid_search(combinations, estimator_tag=estimator_tag)

# Hyperparameter values appearing in the best 20% of the recorded results.
best_hyperparameters(OUTPUT_FOLDER + estimator_tag + "_output.csv", 20)

# NOTE(review): print_results comes from utils; the second argument is
# presumably a row limit — confirm there.
print_results(OUTPUT_FOLDER + estimator_tag + "_output.csv", 15)


Combination 1/152  |  random_forest
  ==> Already done. Skipped.

Combination 2/152  |  random_forest
  ==> Already done. Skipped.

Combination 3/152  |  random_forest
  ==> Already done. Skipped.

Combination 4/152  |  random_forest
  ==> Already done. Skipped.

Combination 5/152  |  random_forest
  ==> Already done. Skipped.

Combination 6/152  |  random_forest
  ==> Already done. Skipped.

Combination 7/152  |  random_forest
  ==> Already done. Skipped.

Combination 8/152  |  random_forest
  ==> Already done. Skipped.

Combination 9/152  |  l2 + random_forest
  ==> Already done. Skipped.

Combination 10/152  |  l2 + random_forest
  ==> Already done. Skipped.

Combination 11/152  |  l2 + random_forest
  ==> Already done. Skipped.

Combination 12/152  |  l2 + random_forest
  ==> Already done. Skipped.

Combination 13/152  |  l2 + random_forest
  ==> Already done. Skipped.

Combination 14/152  |  l2 + random_forest
  ==> Already done. Skipped.

Combination 15/152  |  l2 + random_fores

{'tag': ['high_corr_features_remover + minmax + random_forest',
  'lmax + minmax + random_forest',
  'lmax + std + random_forest',
  'high_corr_features_remover + lmax + std + random_forest',
  'high_corr_features_remover + lmax + minmax + random_forest',
  'lmax + random_forest',
  'std + random_forest',
  'random_forest',
  'minmax + random_forest',
  'minmax + lmax + random_forest'],
 'random_forest__max_samples': [0.66],
 'random_forest__criterion': ['squared_error'],
 'random_forest__n_estimators': [200],
 'random_forest__max_depth': [25, 35, 50],
 'random_forest__max_features': [0.2, 0.7, 0.6, 0.3, 0.5, 0.4, 0.8, 0.9, 0.1],
 'pca__n_components': [nan],
 'high_corr_features_remover__corr_threshold': [0.95, nan],
 'random_forest__max_leaf_nodes': [nan, "'None'"],
 'features_remover__corr_threshold': [nan],
 'random_forest__ccp_alpha': [0.0, nan, 0.01, 0.001]}

Unnamed: 0,tag,R2,MSE,random_forest__max_samples,random_forest__criterion,random_forest__n_estimators,random_forest__max_depth,random_forest__max_features,pca__n_components,high_corr_features_remover__corr_threshold,random_forest__max_leaf_nodes,features_remover__corr_threshold,random_forest__ccp_alpha
418,high_corr_features_remover + minmax + random_forest,0.9278,82.18,0.66,squared_error,200,25,0.2,,0.95,,,0.0
108,lmax + minmax + random_forest,0.9275,82.5286,0.66,squared_error,200,25,0.7,,,,,
598,lmax + std + random_forest,0.9274,82.5686,0.66,squared_error,200,25,0.6,,,,,0.0
161,lmax + std + random_forest,0.9274,82.5686,0.66,squared_error,200,25,0.6,,,,,
167,high_corr_features_remover + lmax + std + random_forest,0.9274,82.578,0.66,squared_error,200,25,0.3,,0.95,,,
187,high_corr_features_remover + lmax + minmax + random_forest,0.9274,82.6101,0.66,squared_error,200,25,0.5,,0.95,,,
424,high_corr_features_remover + minmax + random_forest,0.9273,82.7082,0.66,squared_error,200,25,0.4,,0.95,,,0.0
160,lmax + std + random_forest,0.9272,82.8147,0.66,squared_error,200,25,0.5,,,,,
172,high_corr_features_remover + lmax + std + random_forest,0.9272,82.8284,0.66,squared_error,200,25,0.8,,0.95,,,
185,high_corr_features_remover + lmax + minmax + random_forest,0.9272,82.8345,0.66,squared_error,200,25,0.3,,0.95,,,


---
## XGBoost

[Documentation](https://xgboost.readthedocs.io/en/stable/parameter.html)

In [42]:
from xgboost import XGBRegressor

# XGBoost search: 2*2*2*2*2*3*3 = 288 hyperparameter settings per pipeline
# (n_estimators and max_depth are fixed).
estimator_tag = "xg_boost"
xg_boost_step = Step(
    estimator_tag,
    XGBRegressor(n_jobs=-1, random_state=RANDOM_STATE),
    {
        "n_estimators": [300],
        "learning_rate": [0.01, 0.3],
        "max_depth": [6],  # default
        "min_child_weight": [1, 3],
        "gamma": [0.0, 0.5],
        "subsample": [0.66, 1.0],
        "colsample_bytree": [0.66, 1.0],
        "reg_lambda": [0.01, 0.1, 1.0],
        "reg_alpha": [0, 0.1, 1.0],
    },
)

# Each Pipe lists the preprocessing steps in order, ending with the estimator.
combinations = extract_combinations(
    Pipe(xg_boost_step),
    #
    Pipe(minmax_step, xg_boost_step),
    Pipe(high_corr_features_remover_step, minmax_step, xg_boost_step),
    #
    Pipe(std_step, xg_boost_step),
    Pipe(std_step, pca_step, xg_boost_step),
    #
    Pipe(high_corr_features_remover_step, std_step, pca_step, xg_boost_step),
    #
    Pipe(only_properties_step, xg_boost_step),
    Pipe(only_formula_step, xg_boost_step),
)
# Evaluate the combinations that are not yet in the results file.
grid_search(combinations, estimator_tag=estimator_tag)

# Hyperparameter values appearing in the best 20% of the recorded results.
best_hyperparameters(OUTPUT_FOLDER + estimator_tag + "_output.csv", 20)

# NOTE(review): print_results comes from utils; the second argument is
# presumably a row limit — confirm there.
print_results(OUTPUT_FOLDER + estimator_tag + "_output.csv", 15)


Combination 1/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 2/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 3/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 4/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 5/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 6/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 7/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 8/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 9/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 10/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 11/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 12/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 13/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 14/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 15/2304  |  xg_boost
  ==> Already done. Skipped.

Combination 16/2304  |  xg_boost
  ==> Already done. Skipped.



KeyboardInterrupt: 

---
## Gradient Boosting

In [None]:
# Gradient boosting search.
# GradientBoostingRegressor builds its trees sequentially and accepts no
# `n_jobs` argument (passing one raises a TypeError), so only the seed is set.
estimator_tag = "gradient_boost"
gradient_boost_step = Step(
    estimator_tag,
    GradientBoostingRegressor(random_state=RANDOM_STATE),
    {
        "loss": ["squared_error", "absolute_error", "huber", "quantile"],
        "learning_rate": [0.001, 0.01, 0.1, 1],
        "n_estimators": [100, 500, 1000],
        "criterion": ["friedman_mse", "squared_error"],
        "min_samples_leaf": [1, 3],
        "max_depth": [25, 50],
        "max_leaf_nodes": [30, 50],
        "ccp_alpha": [0.0, 0.001, 0.01, 0.1, 1],
    },
)

# Each Pipe lists the preprocessing steps in order, ending with the estimator.
combinations = extract_combinations(
    Pipe(gradient_boost_step),
    Pipe(l2_step, gradient_boost_step),
    #
    Pipe(minmax_step, gradient_boost_step),
    Pipe(high_corr_features_remover_step, minmax_step, gradient_boost_step),
    #
    Pipe(std_step, gradient_boost_step),
    Pipe(std_step, pca_step, gradient_boost_step),
    Pipe(l2_step, std_step, pca_step, gradient_boost_step),
    #
    Pipe(l1_step, minmax_step, gradient_boost_step),
    Pipe(l2_step, minmax_step, gradient_boost_step),
    Pipe(lmax_step, minmax_step, gradient_boost_step),
    Pipe(lmax_step, std_step, gradient_boost_step),
    #
    Pipe(high_corr_features_remover_step, std_step, pca_step, gradient_boost_step),
    Pipe(high_corr_features_remover_step, lmax_step, minmax_step, gradient_boost_step),
    #
    Pipe(only_properties_step, gradient_boost_step),
    Pipe(only_formula_step, gradient_boost_step),
)
# Evaluate the combinations that are not yet in the results file.
grid_search(combinations, estimator_tag=estimator_tag)

# Hyperparameter values appearing in the best 20% of the recorded results.
best_hyperparameters(OUTPUT_FOLDER + estimator_tag + "_output.csv", 20)

print_results(OUTPUT_FOLDER + estimator_tag + "_output.csv", 15)

---
## KNN

In [None]:
# K-nearest-neighbours regression search: 4 * 2 * 4 = 32 hyperparameter
# settings per pipeline.
estimator_tag = "knr"
knr_step = Step(
    estimator_tag,
    KNeighborsRegressor(n_jobs=-1),
    {
        "n_neighbors": [5, 15, 25, 35],
        "weights": ["distance", "uniform"],
        "metric": ["cosine", "euclidean", "cityblock", "nan_euclidean"],
    },
)

# Each Pipe lists the preprocessing steps in order, ending with the estimator.
combinations = extract_combinations(
    Pipe(knr_step),
    Pipe(l2_step, knr_step),
    #
    Pipe(minmax_step, knr_step),
    Pipe(high_corr_features_remover_step, minmax_step, knr_step),
    #
    Pipe(std_step, knr_step),
    Pipe(std_step, pca_step, knr_step),
    Pipe(l2_step, std_step, pca_step, knr_step),
    #
    Pipe(l1_step, minmax_step, knr_step),
    Pipe(l2_step, minmax_step, knr_step),
    Pipe(lmax_step, minmax_step, knr_step),
    Pipe(lmax_step, std_step, knr_step),
    #
    Pipe(high_corr_features_remover_step, std_step, pca_step, knr_step),
    Pipe(high_corr_features_remover_step, lmax_step, minmax_step, knr_step),
    #
    Pipe(only_properties_step, knr_step),
    Pipe(only_formula_step, knr_step),
)
# Evaluate the combinations that are not yet in the results file.
grid_search(combinations, estimator_tag=estimator_tag)

# Hyperparameter values appearing in the best 20% of the recorded results.
best_hyperparameters(OUTPUT_FOLDER + estimator_tag + "_output.csv", 20)

# NOTE(review): print_results comes from utils; the second argument is
# presumably a row limit — confirm there.
print_results(OUTPUT_FOLDER + estimator_tag + "_output.csv", 15)

---
## SVR

In [None]:
# Support vector regression search.
# Only parameters that sklearn.svm.SVR actually accepts are used:
#   - SVR has no `n_jobs` argument, so it is constructed with its defaults;
#   - `loss` belongs to LinearSVR, not SVR (SVR always uses the
#     epsilon-insensitive loss), so it is not part of the grid;
#   - the "precomputed" kernel expects a square kernel matrix instead of a
#     feature matrix, so it cannot be searched with these pipelines.
estimator_tag = "svr"
svr_step = Step(
    estimator_tag,
    SVR(),
    {
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
        "epsilon": [0.01],  # 0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        "tol": [0.0001],  # [1e-5, 1e-4, 1e-3, 1e-2],
        "C": [10],  # 0.1, 1, 100, 500, 5, 20, 8, 9, 15
        "max_iter": [2000],
    },
)

# Each Pipe lists the preprocessing steps in order, ending with the estimator.
combinations = extract_combinations(
    Pipe(svr_step),
    Pipe(l2_step, svr_step),
    #
    Pipe(minmax_step, svr_step),
    Pipe(high_corr_features_remover_step, minmax_step, svr_step),
    #
    Pipe(std_step, svr_step),
    Pipe(std_step, pca_step, svr_step),
    Pipe(l2_step, std_step, pca_step, svr_step),
    #
    Pipe(l1_step, minmax_step, svr_step),
    Pipe(l2_step, minmax_step, svr_step),
    Pipe(lmax_step, minmax_step, svr_step),
    Pipe(lmax_step, std_step, svr_step),
    #
    Pipe(high_corr_features_remover_step, std_step, pca_step, svr_step),
    Pipe(high_corr_features_remover_step, lmax_step, minmax_step, svr_step),
    #
    Pipe(only_properties_step, svr_step),
    Pipe(only_formula_step, svr_step),
)
# Evaluate the combinations that are not yet in the results file.
grid_search(combinations, estimator_tag=estimator_tag)

# Hyperparameter values appearing in the best 20% of the recorded results.
best_hyperparameters(OUTPUT_FOLDER + estimator_tag + "_output.csv", 20)

print_results(OUTPUT_FOLDER + estimator_tag + "_output.csv", 15)

In [None]:
import hiplot as hip

# Load the results CSV of the most recently assigned `estimator_tag` into a
# HiPlot experiment and display it.
hip.Experiment.from_csv(OUTPUT_FOLDER + estimator_tag + "_output.csv").display()