# Critical Temperature of Superconductors

In [46]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor

from sklearn import preprocessing

from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

import itertools
from sklearn.model_selection import GridSearchCV

from utils import Step, Pipe, MultiplePipes

import os
from sklearn.metrics import r2_score, mean_squared_error

In [47]:
# Apply matplotlib's bundled "seaborn-v0_8" style sheet to every figure in this notebook.
plt.style.use("seaborn-v0_8")

# Folder holding the input CSV files. File names are appended with plain string
# concatenation, so the trailing slash is required.
DATA_FOLDER = "data/"

In [48]:
# Seed passed to the train/test split, PCA and the random forest so runs are repeatable.
RANDOM_STATE = 42

# Former switches for the correlation filter, kept for reference only: the threshold
# is now given to the FeaturesRemover step as a pipeline parameter.
# REMOVE_HIGH_CORR_FEATURES = True
# CORR_THRESHOLD = 0.95


# When True, the "Outlier removal" cell drops the flagged rows from the training split.
OUTLIER_REMOVAL = False

---
---
## Data Load

In [49]:
# Place the formula table and the properties table side by side.
# The target is dropped from the formula table so "critical_temp" appears only once.
# pd.concat aligns the two frames on their row index.
# NOTE(review): this assumes both CSV files list the samples in the same order - confirm.
df = pd.concat(
    objs=(
        pd.read_csv(DATA_FOLDER + "formula_train.csv").drop(columns="critical_temp"),
        pd.read_csv(DATA_FOLDER + "train.csv"),
    ),
    axis="columns",
)
print("Shapes of Properties+Formula df: ", df.shape)

Shapes of Properties+Formula df:  (17010, 169)


In [50]:
# Remove the "material" column.
# `df` is overwritten in place, so on a second run of this cell the column is already
# gone; errors="ignore" turns that re-run into a no-op instead of a KeyError.
df = df.drop(columns="material", errors="ignore")

---
---
## Split

In [51]:
# Hold out 20% of the rows; the fixed seed keeps the partition reproducible.
train, test = train_test_split(df, random_state=RANDOM_STATE, test_size=0.2)

# Features are every column except the target; the target is kept as a one-column frame.
X_train, y_train = train.drop(columns=["critical_temp"]), train[["critical_temp"]]
X_test, y_test = test.drop(columns=["critical_temp"]), test[["critical_temp"]]

# Cell output: the shapes of the four partitions.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((13608, 167), (3402, 167), (13608, 1), (3402, 1))

---
---
## Remove highly correlated features

In [52]:
# if REMOVE_HIGH_CORR_FEATURES:
#     corr_matrix = df.corr().abs()
#     upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
#     cols_to_drop = [column for column in upper.columns if any(upper[column] >= CORR_THRESHOLD)]

#     print("{} Cols Removed: {}".format(len(cols_to_drop), cols_to_drop))
#     X_train = X_train.drop(columns=cols_to_drop)
#     X_test = X_test.drop(columns=cols_to_drop)
#     print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [53]:
class FeaturesRemover:
    """Pipeline step that drops features highly correlated with an earlier feature.

    The columns to drop are learned in ``fit`` from the data passed to it and are
    then removed, unchanged, from whatever is passed to ``transform``. Nothing is
    read from notebook globals, so the target column and held-out rows cannot
    influence which features are removed.

    Parameters
    ----------
    corr_threshold : float, default=0.95
        A column is dropped when its absolute correlation with any column that
        comes before it is greater than or equal to this value.

    Attributes
    ----------
    cols_to_drop_ : list
        Column labels selected by the last call to ``fit``.
    """

    def __init__(self, corr_threshold=0.95):
        self.corr_threshold = corr_threshold

    @staticmethod
    def _as_frame(X):
        """Return ``X`` as a DataFrame so that ``corr`` and ``drop`` are available."""
        return X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    def fit(self, X, y=None):
        """Learn which columns of ``X`` are redundant.

        ``y`` is accepted for pipeline compatibility and ignored. Returns ``self``.
        """
        corr_matrix = self._as_frame(X).corr().abs()
        # Keep only the strict upper triangle: each pair is looked at once and it
        # is always the later column of a correlated pair that gets dropped.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        # Masked cells are NaN and compare as False, so they never trigger a drop.
        self.cols_to_drop_ = [
            column for column in upper.columns if (upper[column] >= self.corr_threshold).any()
        ]
        return self

    def transform(self, X):
        """Return ``X`` without the columns selected in ``fit``.

        Raises
        ------
        ValueError
            If ``fit`` has not been called yet.
        """
        if not hasattr(self, "cols_to_drop_"):
            raise ValueError("FeaturesRemover is not fitted yet; call fit before transform.")
        return self._as_frame(X).drop(columns=self.cols_to_drop_)

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (scikit-learn estimator protocol)."""
        return {"corr_threshold": self.corr_threshold}

    def set_params(self, corr_threshold):
        """Set the correlation threshold and return ``self``."""
        self.corr_threshold = corr_threshold
        return self

---
---
## Outlier removal

In [54]:
if OUTLIER_REMOVAL:
    # Not read anywhere in this cell; kept in case a later cell relies on it.
    columns = train.columns

    # Local Outlier Factor is fitted on the whole training frame, target column included.
    # An IsolationForest(max_samples=1.0, contamination=0.001, n_jobs=-1,
    # random_state=RANDOM_STATE) was tried here as an alternative detector.
    clf = LocalOutlierFactor(n_jobs=-1)

    # fit_predict labels outliers with -1; turn that into a boolean row mask.
    outliers = clf.fit_predict(train) == -1

    print("Outliers removed: {}".format(outliers.sum()))
    train = train[~outliers]

    # Rebuild the training features/target from the filtered frame; the test split is untouched.
    X_train = train.drop(columns=["critical_temp"])
    y_train = train[["critical_temp"]]
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [55]:
# class OutliersRemover:
#     def __init__(self) -> None:
#         self.outliers_vector = None

#     def fit(self, X, y):
#         self.outliers_vector = pd.Series(index=X.index, dtype=bool)

#         clf = LocalOutlierFactor(n_jobs=-1)
#         # clf = IsolationForest(
#         #     max_samples=1.0,
#         #     contamination=0.001,
#         #     n_jobs=-1,
#         #     random_state=random_state,
#         # )
#         self.outliers_vector = clf.fit_predict(np.column_stack((X, y))) == -1
#         print("Outliers removed: {}".format(self.outliers_vector.sum()))
#         return self

#     def transform(self, X, y=None):

#         X = X[~self.outliers_vector]
#         y = y[~self.outliers_vector]

#         print("Outliers removed: {}".format(self.outliers_vector.sum()))
#         print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
#         return X, y

#     def set_params(self):
#         return self

---
---
## Preprocessing

In [56]:
# Correlation filter.
# NOTE(review): "corr_threshold" is a bare float while every other grid in this
# notebook uses a list of candidates - confirm how utils.Step treats scalar values.
features_remover_step = Step("features_remover", FeaturesRemover(), {"corr_threshold": 0.95})

# Scalers that work feature by feature.
std_step = Step("std", preprocessing.StandardScaler())
minmax_step = Step("minmax", preprocessing.MinMaxScaler())

# Normalizers that rescale each sample to unit l1 / l2 / max norm.
l1_step = Step("l1", preprocessing.Normalizer(norm="l1"))
l2_step = Step("l2", preprocessing.Normalizer(norm="l2"))
lmax_step = Step("lmax", preprocessing.Normalizer(norm="max"))

# PCA; candidates not searched at the moment: "whiten": [True, False], "svd_solver": "full".
pca_step = Step("pca", PCA(random_state=RANDOM_STATE), {"n_components": [0.95]})

In [57]:
def _is_missing(value):
    """Return True for None and for float NaN (how None comes back from a CSV file)."""
    return value is None or (isinstance(value, float) and value != value)


def _already_evaluated(outputs, parameters, tag):
    """Tell whether ``outputs`` holds a row for this exact combination.

    A row matches only when its tag equals ``tag`` (if the file has a tag column)
    and every parameter column holds one of the candidate values of that
    parameter. None candidates match empty cells.
    """
    if any(key not in outputs.columns for key in parameters):
        return False

    if "tag" in outputs.columns:
        mask = outputs["tag"] == tag
    else:
        mask = pd.Series(True, index=outputs.index)

    for key, values in parameters.items():
        # Grids are lists of candidates; tolerate a bare scalar by wrapping it.
        candidates = list(values) if isinstance(values, (list, tuple, set, np.ndarray)) else [values]
        column_mask = outputs[key].isin([c for c in candidates if not _is_missing(c)])
        if any(_is_missing(c) for c in candidates):
            column_mask = column_mask | outputs[key].isna()
        # Every parameter has to match, not just one of them.
        mask = mask & column_mask

    return bool(mask.any())


def grid_search(combinations: "MultiplePipes", estimator_tag: str, save_results=True):
    """Run a 3-fold, R2-scored grid search for every pipeline combination.

    Each search is fitted on the notebook-level ``X_train`` / ``y_train`` and then
    scored on ``X_test`` / ``y_test`` (MSE and R2 are printed).

    Parameters
    ----------
    combinations : MultiplePipes
        Object whose ``combinations`` attribute is a sized sequence of
        ``(pipeline, parameters, tag)`` triples.
    estimator_tag : str
        Used to build the results file name ``outputs/1_<estimator_tag>_output.csv``.
    save_results : bool, default=True
        When True, combinations already present in the results file are skipped
        and every new result is appended to it. The ``outputs`` folder is created
        if it does not exist.

    Notes
    -----
    The stored row holds the best value found for each parameter, so grids with
    more than one candidate per parameter can be recorded too.
    """
    file_name = "outputs/" + "1_" + estimator_tag + "_output.csv"
    total = len(combinations.combinations)

    # Iterate over *all* combinations
    for index, (pipeline, parameters, tag) in enumerate(combinations.combinations):
        print(
            "Combination {}/{}. Steps: {}\n\tParams:{}".format(
                index + 1, total, tag, parameters
            )
        )

        # Results saved so far; read once and reused below when appending.
        previous = None
        if save_results and os.path.isfile(file_name):
            previous = pd.read_csv(file_name)
            if _already_evaluated(previous, parameters, tag):
                print("\tAlready done. Skipped.")
                continue

        gs = GridSearchCV(
            estimator=pipeline,
            param_grid=parameters,
            scoring="r2",
            n_jobs=-1,
            cv=3,
            verbose=0,
        )

        # Fit
        gs.fit(X_train, np.ravel(y_train))
        # Predict
        y_pred = gs.predict(X_test)
        # Test scores
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print("MSE: {}\tR2: {}".format(mse, r2))

        if save_results:
            # One row per combination, columns in the order the grid was written.
            best = {key: gs.best_params_[key] for key in parameters}
            row = pd.DataFrame([best]).assign(tag=tag, MSE=mse, R2=r2)
            outputs = row if previous is None else pd.concat([previous, row], axis=0)
            os.makedirs(os.path.dirname(file_name), exist_ok=True)
            outputs.to_csv(file_name, index=False)

In [58]:
estimator_tag = "random_forest"

# Random-forest step with its search grid. Only max_samples has more than one candidate.
random_forest_step = Step(
    estimator_tag,
    RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE),
    dict(
        max_samples=[0.66, 0.33],
        criterion=["squared_error"],
        n_estimators=[200],
        max_depth=[25],  # other values considered: 5, 10, 15, 20
        max_leaf_nodes=[None],  # other values considered: 50, 100, 200, 300, 400
        ccp_alpha=[0.01],  # other values considered: 0.0, 0.00001, 0.0001, 0.001, 0.1, 1.0
        max_features=[0.4],  # other values considered: "sqrt", "log2", 0.2 to 0.9
    ),
)

# Same scaling + PCA + forest chain, without and with the correlation filter in front.
combinations = MultiplePipes(
    Pipe(minmax_step, pca_step, random_forest_step),
    Pipe(features_remover_step, minmax_step, pca_step, random_forest_step),
)
grid_search(combinations, estimator_tag=estimator_tag)

Combination 1/4. Steps: minmax + pca + random_forest
	Params:{'pca__n_components': [0.95], 'random_forest__max_samples': [0.66], 'random_forest__criterion': ['squared_error'], 'random_forest__n_estimators': [200], 'random_forest__max_depth': [25], 'random_forest__max_leaf_nodes': [None], 'random_forest__ccp_alpha': [0.01], 'random_forest__max_features': [0.4]}


TypeError: argument of type 'float' is not iterable

---
---
## Search