# Critical Temperature of Superconductors

In [39]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor

from sklearn.preprocessing import MinMaxScaler

In [40]:
# Folder holding the input CSV files (relative path). Kept as a plain string
# because the loading cell builds file names with `+`.
DATA_FOLDER = "data/"

# Select the seaborn-like matplotlib style sheet for the notebook's figures.
plt.style.use("seaborn-v0_8")

In [41]:
# Seed handed to train_test_split so the split is reproducible.
RANDOM_STATE = 42

# Switch for the outlier-removal step.
OUTLIER_REMOVAL = True

# Correlation filter: when enabled, a column is dropped if its absolute
# correlation with a preceding column is at least CORR_THRESHOLD.
CORR_THRESHOLD = 0.95
REMOVE_HIGH_CORR_FEATURES = True

---
---
## Data Load

In [42]:
# Formula table: its own "critical_temp" column is discarded before merging
# (presumably train.csv carries the same target -- the split cell reads it from df).
formula_df = pd.read_csv(DATA_FOLDER + "formula_train.csv")
formula_df = formula_df.drop(columns=["critical_temp"])

# Properties table, loaded unchanged.
properties_df = pd.read_csv(DATA_FOLDER + "train.csv")

# Side-by-side join (axis=1): formula columns first, then the property columns.
# NOTE(review): this relies on both files listing the same rows in the same order.
df = pd.concat([formula_df, properties_df], axis=1)
print("Shapes of Properties+Formula df: ", df.shape)

# Drop the "material" column; it is not used as a feature.
df = df.drop(columns="material")

Shapes of Properties+Formula df:  (17010, 169)


---
---
## Remove highly correlated features

In [43]:
if REMOVE_HIGH_CORR_FEATURES:
    # Absolute pairwise correlations of every column in df (target included);
    # kept under its original name for any later inspection.
    corr_matrix = df.corr().abs()

    # The redundancy filter must only look at predictor-vs-predictor pairs.
    # With the target in the matrix, a strong feature/target correlation would
    # put a column on the drop list -- possibly "critical_temp" itself, which
    # the split cell needs -- purely because of its relation to the label.
    feature_corr = corr_matrix.drop(index="critical_temp", columns="critical_temp")

    # Strict upper triangle: each pair is examined once, and a column is only
    # compared against the columns that precede it, so the first member of a
    # correlated group is the one that survives.
    upper = feature_corr.where(np.triu(np.ones(feature_corr.shape, dtype=bool), k=1))
    cols_to_drop = [column for column in upper.columns if (upper[column] >= CORR_THRESHOLD).any()]

    print("{} Cols Removed: {}".format(len(cols_to_drop), cols_to_drop))
    df = df.drop(columns=cols_to_drop)

23 Cols Removed: ['wtd_gmean_atomic_mass', 'std_atomic_mass', 'gmean_fie', 'wtd_gmean_fie', 'entropy_fie', 'std_fie', 'wtd_gmean_atomic_radius', 'entropy_atomic_radius', 'wtd_entropy_atomic_radius', 'std_atomic_radius', 'wtd_std_atomic_radius', 'wtd_gmean_Density', 'std_Density', 'std_ElectronAffinity', 'wtd_gmean_FusionHeat', 'std_FusionHeat', 'std_ThermalConductivity', 'wtd_std_ThermalConductivity', 'gmean_Valence', 'wtd_gmean_Valence', 'entropy_Valence', 'wtd_entropy_Valence', 'std_Valence']


---
---
## Split

In [44]:
# Predictors are every column except the target; the target is selected with
# double brackets so it stays a one-column DataFrame.
features = df.drop(columns="critical_temp")
target = df[["critical_temp"]]

# 80/20 hold-out split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((13608, 144), (3402, 144), (13608, 1), (3402, 1))

---
---
## Outlier removal

In [47]:
# NOTE(review): this step is disabled -- every line below is commented out, so
# OUTLIER_REMOVAL has no effect in this cell. The output recorded under the cell
# presumably dates from an earlier run when the code was active.
# Problems to resolve before re-enabling it:
#   * Only X_train is filtered; y_train keeps the removed rows, so features and
#     targets would no longer line up. Apply the same mask to y_train.
#   * The IsolationForest alternative passes `random_state`, but the seed
#     constant defined in the config cell is spelled RANDOM_STATE.
#   * `columns` is never used, and the pd.Series placeholder assigned to
#     `outliers` is immediately overwritten by the fit_predict result.
#   * The cell reassigns X_train, so running it more than once would strip
#     additional rows each time.
# if OUTLIER_REMOVAL:
#     columns = X_train.columns
#     outliers = pd.Series(index=X_train.index, dtype=bool)

#     clf = LocalOutlierFactor(n_neighbors=25, contamination=0.001, n_jobs=-1)
#     # clf = IsolationForest(
#     #     max_samples=1.0,
#     #     contamination=0.001,
#     #     n_jobs=-1,
#     #     random_state=random_state,
#     # )
#     outliers = clf.fit_predict(X_train) == -1

#     print("Outliers removed: {}".format(outliers.sum()))
#     X_train = X_train[~outliers]

Outliers removed: 14


(13566, 144)

---
---
## Preprocessing