In [4]:
"""
Please install all of the packages listed below in a new conda environment:
conda install numpy pandas scikit-learn
pip install ucimlrepo
conda install matplotlib jupyterlab
"""




import pandas as pd
import numpy as np

from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

In [2]:
# Fetch the dataset registered under id 464 in the UCI ML repository.
superconductivty_data = fetch_ucirepo(id=464)

# Feature and target tables (as pandas dataframes).
X, y = superconductivty_data.data.features, superconductivty_data.data.targets

# Show the dataset-level metadata first, then the per-variable information.
print(superconductivty_data.metadata, superconductivty_data.variables, sep="\n")

{'uci_id': 464, 'name': 'Superconductivty Data', 'repository_url': 'https://archive.ics.uci.edu/dataset/464/superconductivty+data', 'data_url': 'https://archive.ics.uci.edu/static/public/464/data.csv', 'abstract': 'Two file s contain data on 21263 superconductors and their relevant features.', 'area': 'Physics and Chemistry', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 21263, 'num_features': 81, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['critical_temp'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2018, 'last_updated': 'Fri Mar 29 2024', 'dataset_doi': '10.24432/C53P47', 'creators': ['Kam Hamidieh'], 'intro_paper': {'ID': 452, 'type': 'NATIVE', 'title': 'A data-driven statistical model for predicting the critical temperature of a superconductor', 'authors': 'K. Hamidieh', 'venue': 'Computational materials science', 'year': 2018, 'journal': None, 'DOI': None, 'URL': 'http

In [6]:
import pandas as pd
import numpy as np

from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

# ============================================================
# Helper: map feature name -> human-readable English description
# ============================================================
# Human-readable phrase for each elemental-property token that can follow an
# aggregation prefix in a feature name.
# NOTE(review): the dataset's source paper may define "Valence" as the typical
# number of chemical bonds rather than an electron count — confirm the wording.
_PROPERTY_PHRASES = {
    "atomic_mass": "atomic mass of constituent elements",
    "fie": "first ionization energy of constituent elements",
    "atomic_radius": "atomic radius of constituent elements",
    "Density": "mass density of constituent elements",
    "ElectronAffinity": "electron affinity of constituent elements",
    "FusionHeat": "heat of fusion (fusion enthalpy) of constituent elements",
    "ThermalConductivity": "thermal conductivity of constituent elements",
    "Valence": "valence electron count of constituent elements",
}

# Description template for each aggregation prefix. "{prop_phrase}" is filled
# in with the phrase for the property token that follows the prefix.
_PREFIX_TEMPLATES = {
    "wtd_mean_": "Atomic-fraction-weighted arithmetic mean of the {prop_phrase} in the compound.",
    "mean_": "Unweighted arithmetic mean of the {prop_phrase} in the compound.",
    "wtd_gmean_": "Atomic-fraction-weighted geometric mean of the {prop_phrase} in the compound.",
    "gmean_": "Unweighted geometric mean of the {prop_phrase} in the compound.",
    "wtd_entropy_": "Atomic-fraction-weighted entropy-like measure of how heterogeneous the {prop_phrase} are across different elements in the compound.",
    "entropy_": "Entropy-like measure of how heterogeneous the {prop_phrase} are across different elements in the compound (higher = more diverse).",
    "wtd_range_": "Range-like measure of the {prop_phrase}, taking stoichiometric fractions into account.",
    "range_": "Range (max minus min) of the {prop_phrase} among the elements in the compound.",
    "wtd_std_": "Atomic-fraction-weighted standard deviation of the {prop_phrase} among the elements in the compound.",
    "std_": "Standard deviation of the {prop_phrase} among the elements in the compound.",
}


def describe_feature(name: str) -> str:
    """Return a one-sentence English description of a feature column name.

    Names are expected to look like ``<prefix><property>``, where the prefix
    is one of the keys of ``_PREFIX_TEMPLATES`` (e.g. ``"wtd_mean_"``) and the
    property is ideally a key of ``_PROPERTY_PHRASES`` (e.g. ``"atomic_mass"``).

    Args:
        name: Feature column name.

    Returns:
        A description string. ``"number_of_elements"`` has a dedicated
        description. A recognised prefix followed by an unknown property
        token gets a description that quotes the token. A name with no
        recognised prefix, or with nothing after the prefix, gets a generic
        description that quotes the full name.
    """
    # Special case: the only feature that is not "<prefix><property>".
    if name == "number_of_elements":
        return "Number of distinct chemical elements present in the compound."

    for prefix, template in _PREFIX_TEMPLATES.items():
        if not name.startswith(prefix):
            continue
        prop_token = name[len(prefix):]
        if not prop_token:
            # A bare prefix such as "mean_" names no property; use the
            # generic description instead of describing an empty token.
            break
        prop_phrase = _PROPERTY_PHRASES.get(
            prop_token,
            f"property '{prop_token}' of constituent elements",
        )
        return template.format(prop_phrase=prop_phrase)

    # Fallback: generic description for names that match no known pattern.
    return f"Descriptor derived from elemental property '{name}', computed over the compound's composition."


# ============================================================
# 0. Load superconductivity dataset
# ============================================================
# Fetch UCI dataset id 464; keep the full feature table and only the
# "critical_temp" column of the target table.
superconductivty_data = fetch_ucirepo(id=464)
X, y = (
    superconductivty_data.data.features,
    superconductivty_data.data.targets["critical_temp"],
)

# ============================================================
# 1. Train/test split
# ============================================================
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ============================================================
# 2. Pearson correlation (filter-based importance)
# ============================================================
# Training features with the target attached as an extra column, so a single
# correlation matrix yields every feature's correlation with the target.
df_train = X_train.assign(critical_temp=y_train)

# Target column of the correlation matrix, without the target's own entry.
corr_series = df_train.corr()["critical_temp"].drop("critical_temp")

# Order by absolute correlation (strongest first) while keeping signed values.
corr_sorted = corr_series.sort_values(key=lambda s: s.abs(), ascending=False)

top20_corr = set(corr_sorted.index[:20])
top40_corr = set(corr_sorted.index[:40])

# ============================================================
# 3. Random Forest feature_importances_
# ============================================================
# 400 unbounded-depth trees, fixed seed, using all available cores.
rf = RandomForestRegressor(n_estimators=400, max_depth=None, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

feat_names = X.columns
rf_importances = rf.feature_importances_

# One row per feature, most important first.
rf_imp_df = pd.DataFrame(
    dict(feature=feat_names, rf_importance=rf_importances)
).sort_values(by="rf_importance", ascending=False)

top20_rf = set(rf_imp_df["feature"].iloc[:20])
top40_rf = set(rf_imp_df["feature"].iloc[:40])

# ============================================================
# 4. Permutation importance (model-agnostic)
# ============================================================
# Scored on the held-out test split with the already-fitted forest,
# using 10 repeats and a fixed seed.
perm_result = permutation_importance(
    rf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
)

# One row per feature (mean and std across repeats), most important first.
perm_imp_df = pd.DataFrame(
    dict(
        feature=feat_names,
        perm_importance_mean=perm_result.importances_mean,
        perm_importance_std=perm_result.importances_std,
    )
).sort_values(by="perm_importance_mean", ascending=False)

top20_perm = set(perm_imp_df["feature"].iloc[:20])
top40_perm = set(perm_imp_df["feature"].iloc[:40])

# ============================================================
# 5. Merge all three importance views into one table
# ============================================================
# Turn the correlation Series into a two-column frame keyed by feature name.
corr_df = corr_sorted.rename_axis("feature").reset_index(name="pearson_corr_with_Tc")

# Outer joins on "feature" keep every feature that appears in any of the views.
summary_df = pd.merge(
    pd.merge(rf_imp_df, perm_imp_df, on="feature", how="outer"),
    corr_df,
    on="feature",
    how="outer",
)

# ============================================================
# 6-7. Export the shortlist and the longlist
# ============================================================
# Column order shared by both exported tables (reordered for readability).
cols_order = [
    "feature",
    "description",
    "rf_importance",
    "perm_importance_mean",
    "perm_importance_std",
    "pearson_corr_with_Tc",
]


def _export_feature_table(summary, features, columns, path):
    """Filter the summary to `features`, add descriptions, and save as CSV.

    Args:
        summary: Frame with a "feature" column plus the importance columns.
        features: Feature names to keep.
        columns: Output column order; may include "description", which is
            added here via `describe_feature`.
        path: Destination CSV path (written without the index).

    Returns:
        The filtered frame, restricted to `columns`, as it was written.
    """
    table = summary[summary["feature"].isin(features)].copy()
    table["description"] = table["feature"].map(describe_feature)
    table = table[columns]
    table.to_csv(path, index=False)
    return table


# ------------------------------------------------------------
# 6. Shortlist = intersection of Top-40 sets from all three methods
# ------------------------------------------------------------
shortlist_features = sorted(top40_corr & top40_rf & top40_perm)
print(f"Shortlist size (intersection of Top-40): {len(shortlist_features)}")

shortlist_path = "important_features_shortlist_superconductivity.csv"
shortlist_df = _export_feature_table(summary_df, shortlist_features, cols_order, shortlist_path)
print(f"Saved shortlist CSV to: {shortlist_path}")

# ------------------------------------------------------------
# 7. Longlist = union of Top-20 sets from all three methods
# ------------------------------------------------------------
longlist_features = sorted(top20_corr | top20_rf | top20_perm)
print(f"Longlist size (union of Top-20): {len(longlist_features)}")

longlist_path = "important_features_longlist_superconductivity.csv"
longlist_df = _export_feature_table(summary_df, longlist_features, cols_order, longlist_path)
print(f"Saved longlist CSV to: {longlist_path}")


Shortlist size (intersection of Top-40): 16
Saved shortlist CSV to: important_features_shortlist_superconductivity.csv
Longlist size (union of Top-20): 37
Saved longlist CSV to: important_features_longlist_superconductivity.csv
