In [1]:
# Make the parent directory importable so the project module can be found.
import sys
sys.path.append('..')
# NOTE(review): star import — later cells use `randomforests` and `os` without
# importing them, so they presumably arrive through this line; confirm and
# consider importing the needed names explicitly.
from PHF_RF_code import *

# NOTE(review): this silences every warning for the whole notebook, including
# pandas chained-assignment and library deprecation warnings.
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
from pymatgen.io.cif import CifParser

## Functions to run the code

In [3]:
def plot_predicted_vs_actual(target_tc, predicted_tc, df, save_image, width, height, name):
    """Scatter predicted against measured ``tc``, one colour per superconductor class.

    Parameters
    ----------
    target_tc : sequence of float
        Measured values. Used for the x-axis range and the extent of the y=x line;
        the plotted x values themselves are read from ``df["tc"]``.
    predicted_tc : sequence of float
        Model predictions, one per row of ``df`` in the same row order.
    df : pandas.DataFrame
        Must contain the columns ``tc``, ``sc_class`` and ``formula_sc``.
        The frame is modified in place: the columns ``predicted_tc`` and
        ``sc_class_name`` are added (or overwritten). A later cell of this
        notebook names both columns when dropping non-feature columns, so the
        side effect is kept deliberately.
    save_image : bool
        When true, the figure is also written to ``plots/actvspred_<name>.png``.
    width, height : int
        Figure size in pixels (also used for the exported image).
    name : str
        Suffix of the exported file name.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Local import: the notebook never imports `os` itself.
    import os

    df["predicted_tc"] = predicted_tc

    # Map the raw class labels to human-readable legend names.
    name_mapping = {
        'Other': 'Other',
        'Not_supercon': 'Not Superconductor',
        'Cuprate': 'Cuprate',
        'Ferrite': 'Ferrite',
        'Heavy_fermion': 'Heavy Fermion',
        'Oxide': 'Oxide',
        'Chevrel': 'Chevrel',
        'Carbon': 'Carbon',
        'Heavy_fermionChevrel': 'Heavy Fermion Chevrel',
        'OxideHeavy_fermion': 'Oxide Heavy Fermion'
    }

    # Labels missing from the mapping keep their raw name. A plain `.map`
    # would turn them into NaN, and `== NaN` never matches, so those rows
    # would silently disappear from the plot.
    df['sc_class_name'] = df['sc_class'].map(name_mapping).fillna(df['sc_class'])

    unique_categories = df["sc_class_name"].unique()

    # Cycle through the palette if there are more categories than colours.
    colors = px.colors.qualitative.G10
    color_map = {
        category: colors[i % len(colors)]
        for i, category in enumerate(unique_categories)
    }

    traces = []
    for category, color in color_map.items():
        filtered_df = df[df["sc_class_name"] == category]
        traces.append(go.Scatter(
            x=filtered_df["tc"],
            y=filtered_df["predicted_tc"],
            mode='markers',
            text=filtered_df["formula_sc"],
            hoverinfo='text',
            marker=dict(
                color=color,
                size=8,
                opacity=0.8
            ),
            name=category  # legend entry
        ))

    target_min, target_max = np.min(target_tc), np.max(target_tc)
    predicted_min, predicted_max = np.min(predicted_tc), np.max(predicted_tc)

    # The y=x reference line has to cover both axes, not only the predicted range.
    diagonal = np.linspace(min(target_min, predicted_min), max(target_max, predicted_max), 100)
    traces.append(go.Scatter(
        x=diagonal,
        y=diagonal,
        mode='lines',
        name='y=x',
        line=dict(color='black', width=5, dash='dash')
    ))

    layout = go.Layout(
        xaxis=dict(title="Real Value", showline=True, linewidth=5, linecolor='black',
                ticks='inside', tickwidth=4, ticklen=5, range=[target_min-0.5, target_max+10]),
        yaxis=dict(title="Predicted Value", showline=True, linewidth=5, linecolor='black',
                ticks='inside', tickwidth=4, ticklen=5, range=[predicted_min-0.5, predicted_max+10]),
        width=width,
        height=height,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family='Helvetica', size=24, color='black'),
        margin=dict(l=10, r=10, b=10, t=10),
        legend=dict(orientation="v", yanchor="bottom", y=0, xanchor="right", x=1.5)
    )

    fig = go.Figure(data=traces, layout=layout)
    fig.show()

    if save_image:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs("plots", exist_ok=True)
        fig.write_image(f"plots/actvspred_{name}.png", width=width, height=height, scale=3)

# Reading in the Data

In [4]:
# Read the table (first line of the file skipped), rewrite the stored CIF
# paths so they resolve relative to this notebook, and remove every "_2"
# occurrence from the column names.
df_MP = (
    pd.read_csv("../3DSC_MP.csv", skiprows=1)
    .assign(cif=lambda frame: frame["cif"].str.replace("data/final/MP/", "../", regex=False))
    .rename(columns=lambda column: column.replace("_2", ""))
)
df_MP.head()

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,monoclinic,orthorhombic,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight
0,Ag0.02Ge2Pd1.98Sr1,2,0.008,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.64,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
1,Ag0.15Sn0.85Te1,3,0.15,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.15,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.04,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.62,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
3,Ag0.1In0.9Te1,3,0.1,1.0,True,Ag0.1In0.9Te1,In1Te1,1.2,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.2,Other,True,...,0,0,0,0,0,1,0,0,0,1.0


In [5]:
# Parse every distinct CIF file once and look the result up per row. The
# previous per-row loop re-parsed duplicate paths and ran a full-column
# boolean scan for each row.
# Only the first structure of each file is used (assumes one structure per file).
# NOTE(review): `composition.weight` is the total weight of the parsed
# composition, not a per-atom average, despite the column name. The value is
# left unchanged here so the trained models stay comparable — confirm intent.
weight_by_cif = {
    cif_path: CifParser(cif_path).get_structures()[0].composition.weight
    for cif_path in df_MP["cif"].unique()
}

df_MP["average_atomic_weight"] = df_MP["cif"].map(weight_by_cif)

In [6]:
# Rows with tc equal to zero are relabelled as non-superconductors.
df_MP.loc[df_MP["tc"].eq(0), "sc_class"] = "Not_supercon"

## Entire Dataset

In [7]:
# Tabulated (non-PH) descriptors used as model inputs.
physical_features = ['num_elements_sc',
                     'lata',
                     'latb',
                     'latc',
                     'density',
                     'e_above_hull',
                     'efermi',
                     'final_energy',
                     'final_energy_per_atom',
                     'formation_energy_per_atom',
                     'nsites',
                     'cell_volume',
                     'exchange_symmetry',
                     'true_total_magnetization',
                     'average_atomic_weight',
                     'totreldiff']
PH_features = []

# Forward slash instead of "\": "\P" is an invalid escape sequence in a normal
# string literal, and a backslash path only resolves on Windows. A forward
# slash works on every platform.
asymcell_features = np.load("featurised_datasets/PHF_AsymCell.npy")

df_all = df_MP.copy()

# One column per feature; assumes the array rows are in the same order as df_MP.
for i, feature in enumerate(asymcell_features.T):
    df_all[f"Feature {i}"] = np.squeeze(feature)
    PH_features.append(f"Feature {i}")

feature_names_all = physical_features + PH_features

df_all.head()

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17
0,Ag0.02Ge2Pd1.98Sr1,2,0.008,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.64,Other,True,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.0,0.0
1,Ag0.15Sn0.85Te1,3,0.15,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.15,Other,True,...,0.0,2.778744,0.0,0.0,3.781856,0.0,0.0,0.0,0.0,0.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.04,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.62,Other,True,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.0,0.0
3,Ag0.1In0.9Te1,3,0.1,1.0,True,Ag0.1In0.9Te1,In1Te1,1.2,Other,True,...,0.0,2.721158,0.0,0.0,3.664907,0.0,0.0,0.0,0.0,0.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.2,Other,True,...,0.0,4.866506,0.682774,0.0,1.839699,0.182906,0.0,12.958865,3.481724,0.0


## Only superconductors

In [8]:
# Keep only rows that are not labelled "Not_supercon". The explicit copy makes
# this an independent frame: plot_predicted_vs_actual later assigns new columns
# to it, which on an uncopied filter result is a chained assignment.
df_supercon_all = df_all[df_all["sc_class"] != "Not_supercon"].copy()
df_supercon_all.head()

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17
0,Ag0.02Ge2Pd1.98Sr1,2,0.008,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.64,Other,True,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.0,0.0
1,Ag0.15Sn0.85Te1,3,0.15,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.15,Other,True,...,0.0,2.778744,0.0,0.0,3.781856,0.0,0.0,0.0,0.0,0.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.04,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.62,Other,True,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.0,0.0
3,Ag0.1In0.9Te1,3,0.1,1.0,True,Ag0.1In0.9Te1,In1Te1,1.2,Other,True,...,0.0,2.721158,0.0,0.0,3.664907,0.0,0.0,0.0,0.0,0.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.2,Other,True,...,0.0,4.866506,0.682774,0.0,1.839699,0.182906,0.0,12.958865,3.481724,0.0


# Regressor Models

## PHFs Only

In [9]:
# Regress tc on the PH feature columns only, then tune by grid search.
PHF_only = randomforests(
    df_all,
    PH_features,
    "tc",
    test_size=0.2,
    random_state=42,
    name="Only PH Features",
    stratify=False,
)

best_model_PHFonly, best_parameters_PHFonly = PHF_only.train_regressor_model_grid_search()
print(best_parameters_PHFonly)

{'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [10]:
# Metrics reported by the project helper for the tuned PH-only model.
mae, mse, r2 = PHF_only.evaluate_regressor_model(best_model_PHFonly)
print(f"Mean Absolute Error: {mae}, Mean Squared Error: {mse}, R^2 Score: {r2}")

# Cross-validation scores (cv=10, scoring="r2"), summarised by mean and spread.
cv_scores = PHF_only.calc_cross_val_score(best_model_PHFonly, cv=10, scoring="r2")
mean_cv_score, std_cv_score = np.mean(cv_scores), np.std(cv_scores)
print(f"Mean CV Score: {mean_cv_score}, Standard Deviation of CV Scores: {std_cv_score}")

Mean Absolute Error: 4.76656803019643, Mean Squared Error: 74.69829143357735, R^2 Score: 0.7927879501764883
Mean CV Score: 0.7158556359877906, Standard Deviation of CV Scores: 0.06159239772028609


In [11]:
# Feature importances of the tuned PH-only model.
PHF_only.plot_feature_importance(
    best_model_PHFonly.feature_importances_,
    show=True,
    width=1000,
    height=500,
    save_image=True,
)

In [12]:
# Predictions are made for every row of df_all.
# NOTE(review): that presumably includes the rows the model was fitted on.
plot_predicted_vs_actual(
    target_tc=df_all["tc"],
    predicted_tc=best_model_PHFonly.predict(df_all[PH_features]),
    df=df_all,
    save_image=True,
    width=1000,
    height=700,
    name="PHF_only",
)

## Combined: Physical + PH Features

In [13]:
# Regress tc on the physical and PH feature columns together, then tune by grid search.
allFeat_allMat = randomforests(
    df_all,
    feature_names_all,
    "tc",
    test_size=0.2,
    random_state=42,
    name="All Features",
    stratify=False,
)

best_model_allFallM, best_parameters_allFallM = allFeat_allMat.train_regressor_model_grid_search()
print(best_parameters_allFallM)

{'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}


In [14]:
# Metrics reported by the project helper for the tuned all-features model.
mae, mse, r2 = allFeat_allMat.evaluate_regressor_model(best_model_allFallM)
print(f"Mean Absolute Error: {mae}, Mean Squared Error: {mse}, R^2 Score: {r2}")

# Cross-validation scores (cv=10, scoring="r2"), summarised by mean and spread.
cv_scores = allFeat_allMat.calc_cross_val_score(best_model_allFallM, cv=10, scoring="r2")
mean_cv_score, std_cv_score = np.mean(cv_scores), np.std(cv_scores)
print(f"Mean CV Score: {mean_cv_score}, Standard Deviation of CV Scores: {std_cv_score}")

Mean Absolute Error: 3.96742470031129, Mean Squared Error: 64.03827991351217, R^2 Score: 0.8223586779109924
Mean CV Score: 0.7664736586066236, Standard Deviation of CV Scores: 0.05211961915047858


In [15]:
# Feature importances of the tuned all-features model.
allFeat_allMat.plot_feature_importance(
    best_model_allFallM.feature_importances_,
    show=True,
    width=1500,
    height=700,
    save_image=True,
)

In [16]:
# Predictions are made for every row of df_all.
# NOTE(review): that presumably includes the rows the model was fitted on.
plot_predicted_vs_actual(
    target_tc=df_all["tc"],
    predicted_tc=best_model_allFallM.predict(df_all[feature_names_all]),
    df=df_all,
    save_image=True,
    width=1000,
    height=700,
    name="allFeat_allMat",
)

## Physical Features Only

In [17]:
# Regress tc on the physical feature columns only, then tune by grid search.
Phys_only = randomforests(
    df_all,
    physical_features,
    "tc",
    test_size=0.2,
    random_state=42,
    name="Only Physical Features",
    stratify=False,
)

best_model_Phys_only, best_parameters_Phys_only = Phys_only.train_regressor_model_grid_search()
print(best_parameters_Phys_only)

{'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}


In [18]:
# Metrics reported by the project helper for the tuned physical-features model.
mae, mse, r2 = Phys_only.evaluate_regressor_model(best_model_Phys_only)
print(f"Mean Absolute Error: {mae}, Mean Squared Error: {mse}, R^2 Score: {r2}")

# Cross-validation scores (cv=10, scoring="r2"), summarised by mean and spread.
cv_scores = Phys_only.calc_cross_val_score(best_model_Phys_only, cv=10, scoring="r2")
mean_cv_score, std_cv_score = np.mean(cv_scores), np.std(cv_scores)
print(f"Mean CV Score: {mean_cv_score}, Standard Deviation of CV Scores: {std_cv_score}")

Mean Absolute Error: 4.080021217021993, Mean Squared Error: 66.49560340730959, R^2 Score: 0.8155420957849874
Mean CV Score: 0.7659150752613652, Standard Deviation of CV Scores: 0.04621779090373402


In [19]:
# Feature importances of the tuned physical-features model.
Phys_only.plot_feature_importance(
    best_model_Phys_only.feature_importances_,
    show=True,
    width=1000,
    height=700,
    save_image=True,
)

In [20]:
# Predictions are made for every row of df_all.
# NOTE(review): that presumably includes the rows the model was fitted on.
plot_predicted_vs_actual(
    target_tc=df_all["tc"],
    predicted_tc=best_model_Phys_only.predict(df_all[physical_features]),
    df=df_all,
    save_image=True,
    width=1000,
    height=700,
    name="Phys_only",
)

## All Features, Superconductors Only

In [21]:
# Regress tc on all features, restricted to rows not labelled "Not_supercon".
allFeat_SupMat = randomforests(
    df_supercon_all,
    feature_names_all,
    "tc",
    test_size=0.2,
    random_state=42,
    name="All Features Only Supercon",
    stratify=False,
)

best_model_allFeat_SupMat, best_parameters_allFeat_SupMat = allFeat_SupMat.train_regressor_model_grid_search()
# Report the parameters of THIS grid search; the cell previously printed
# best_parameters_Phys_only, i.e. the result of a different model.
print(best_parameters_allFeat_SupMat)

{'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}


In [22]:
# Metrics reported by the project helper for the tuned superconductor-only model.
mae, mse, r2 = allFeat_SupMat.evaluate_regressor_model(best_model_allFeat_SupMat)
print(f"Mean Absolute Error: {mae}, Mean Squared Error: {mse}, R^2 Score: {r2}")

# Cross-validation scores (cv=10, scoring="r2"), summarised by mean and spread.
cv_scores = allFeat_SupMat.calc_cross_val_score(best_model_allFeat_SupMat, cv=10, scoring="r2")
mean_cv_score, std_cv_score = np.mean(cv_scores), np.std(cv_scores)
print(f"Mean CV Score: {mean_cv_score}, Standard Deviation of CV Scores: {std_cv_score}")

Mean Absolute Error: 3.735670345422754, Mean Squared Error: 50.73883701494927, R^2 Score: 0.8894527167470981
Mean CV Score: 0.8922703770147601, Standard Deviation of CV Scores: 0.024994662293464812


In [23]:
# Feature importances of the tuned superconductor-only model.
allFeat_SupMat.plot_feature_importance(
    best_model_allFeat_SupMat.feature_importances_,
    show=True,
    width=1500,
    height=700,
    save_image=True,
)

In [24]:
# Predictions are made for every row of df_supercon_all.
# NOTE(review): that presumably includes the rows the model was fitted on.
plot_predicted_vs_actual(
    target_tc=df_supercon_all["tc"],
    predicted_tc=best_model_allFeat_SupMat.predict(df_supercon_all[feature_names_all]),
    df=df_supercon_all,
    save_image=True,
    width=1000,
    height=700,
    name="allFeat_SupMat",
)

## Plots to compare models

In [25]:
def run_cvscores(list_of_models, list_of_vars, save_image=True, width=1000, height=700, name="cvscores"):
    """Draw one box plot of cross-validation scores per model.

    Parameters
    ----------
    list_of_models : sequence
        Fitted estimators, paired positionally with ``list_of_vars``.
    list_of_vars : sequence
        Objects exposing ``calc_cross_val_score(model, cv=..., scoring=...)`` and a
        ``name`` attribute (the ``randomforests`` wrappers in this notebook).
    save_image : bool, default True
        When true, the figure is also written to ``plots/box_<name>.png``.
    width, height : int
        Figure size in pixels (also used for the exported image).
    name : str, default "cvscores"
        Suffix of the exported file name.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Local import: the notebook never imports `os` itself.
    import os

    colors = ["rgb(94, 201, 98)", "rgb(33, 145, 140)", "rgb(59, 82, 139)", "rgb(68, 1, 84)"]
    traces = []
    # `dataset` instead of `vars`, which shadowed the builtin of that name.
    for position, (model, dataset) in enumerate(zip(list_of_models, list_of_vars)):
        cv_scores = dataset.calc_cross_val_score(model, cv=10, scoring='r2')
        traces.append(go.Box(
            y=cv_scores,
            name=dataset.name,
            jitter=0.5,
            whiskerwidth=0.2,
            marker=dict(
                size=2,
                # Wrap around so more than len(colors) models no longer raise IndexError.
                color=colors[position % len(colors)],
            ),
            line=dict(width=1),
        ))

    layout = go.Layout(
        xaxis=dict(title="Model", showline=True, linewidth=5, linecolor='black',
                ticks='inside', tickwidth=4, ticklen=5),
        yaxis=dict(title="Cross Validation Regression Score", showline=True, linewidth=5, linecolor='black',
                ticks='inside', tickwidth=4, ticklen=5),
        width=width,
        height=height,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family='Helvetica', size=24, color='black'),
        margin=dict(l=10, r=10, b=10, t=10),
        legend=dict(orientation="v", yanchor="bottom", y=0, xanchor="right", x=1),
    )

    fig = go.Figure(data=traces, layout=layout)
    fig.show()

    if save_image:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs("plots", exist_ok=True)
        fig.write_image(f"plots/box_{name}.png", width=width, height=height, scale=3)

# Compare the three regressors trained on the full dataset.
run_cvscores(
    list_of_models=[best_model_PHFonly, best_model_Phys_only, best_model_allFallM],
    list_of_vars=[PHF_only, Phys_only, allFeat_allMat],
    save_image=True,
)

In [26]:
# Cross-validation box plot for the superconductor-only regressor.
run_cvscores(
    list_of_models=[best_model_allFeat_SupMat],
    list_of_vars=[allFeat_SupMat],
    save_image=True,
    width=700,
    height=700,
    name="cvscores_SupMat",
)

# Classification Model

In [27]:
df_superornot = df_all.copy()
# Binary target: 0 for rows labelled "Not_supercon", 1 for everything else.
df_superornot["supercon_ornot"] = df_superornot["sc_class"].ne("Not_supercon").astype("int64")

# Dataset columns excluded from the candidate feature set. They are dropped
# strictly, so a misspelt name still raises KeyError.
dataset_non_features = ['tc','formula_sc', 'formula', 'orig_formula_cif', 'norm_formula_sc', 'chemical_composition_sc','origin_sc', 'old_formula_sc', 'database_id',
                'original_formula','chemical_composition', 'norm_formula', 'spacegroup', 'crystal_system',
                'cif', 'original_cif', 'material_id', 'band_structure','created_at',
                'doi','doi_bibtex','dos','exp','has', 'has_bandstructure', 'icsd_ids', 'last_updated',
                'magnetic_type', 'ntask_ids', 'original_task_id', 'oxide_type', 'pretty_formula', 'pseudo_potential', 'reduced_cell_formula',
                'run_type','task_id','task_ids','unit_cell_formula','warnings','ordering','magmoms','origin','cif_before_synthetic_doping',
                'Reason for exclusion','graph','crystal_temp','no_crystal_temp_given','point_group',"weight",
                "energy_per_atom", "energy", "total_magnetization", "total_magnetization_normalized_vol", "total_magnetization_normalized_formula_units",
                'sc_class', 'sc_class_unique_sc']

# These two columns exist in df_all only after plot_predicted_vs_actual has been
# called on it, so they are dropped only if present. The cell then no longer
# depends on the plotting cells having been run first.
plot_side_effect_columns = ['predicted_tc', 'sc_class_name']

# Full exclusion list, kept under its original name.
not_features = dataset_non_features + plot_side_effect_columns

df_superornot = df_superornot.drop(columns=dataset_non_features)
df_superornot = df_superornot.drop(columns=plot_side_effect_columns, errors="ignore")

In [28]:
# Candidate feature columns: every column except the last one of df_superornot
# (the "supercon_ornot" target is the last column added to the frame).
df_superornot.columns[:-1]

Index(['formula_similarity', 'totreldiff', 'formula_frac',
       'correct_formula_frac', 'num_elements_sc', 'lata', 'latb', 'latc',
       'band_gap', 'density', 'e_above_hull', 'efermi', 'encut',
       'final_energy', 'final_energy_per_atom', 'formation_energy_per_atom',
       'is_ordered', 'nsites', 'cell_volume', 'is_magnetic',
       'exchange_symmetry', 'num_unique_magnetic_sites', 'num_magnetic_sites',
       'true_total_magnetization', 'synth_doped', 'cubic', 'hexagonal',
       'monoclinic', 'orthorhombic', 'tetragonal', 'triclinic', 'trigonal',
       'primitive', 'base-centered', 'body-centered', 'face-centered',
       'average_atomic_weight', 'Feature 0', 'Feature 1', 'Feature 2',
       'Feature 3', 'Feature 4', 'Feature 5', 'Feature 6', 'Feature 7',
       'Feature 8', 'Feature 9', 'Feature 10', 'Feature 11', 'Feature 12',
       'Feature 13', 'Feature 14', 'Feature 15', 'Feature 16', 'Feature 17'],
      dtype='object')

In [29]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Every column except the last one (the "supercon_ornot" target).
candidate_frame = df_superornot.iloc[:, :-1]

# Seed the mutual-information estimate so that the 15 selected features are
# the same on every run; unseeded, the selection could vary between runs.
selector = SelectKBest(partial(mutual_info_classif, random_state=42), k=15)
selector.fit(candidate_frame, df_superornot["supercon_ornot"])

# Positions of the kept columns, resolved against the frame the selector was
# fitted on so that index and name always refer to the same column.
cols_idxs = selector.get_support(indices=True)
features_names_class = candidate_frame.columns[cols_idxs].tolist()

In [30]:
# Display the feature names chosen by SelectKBest (k=15).
features_names_class

['lata',
 'latb',
 'latc',
 'density',
 'efermi',
 'final_energy',
 'final_energy_per_atom',
 'formation_energy_per_atom',
 'cell_volume',
 'true_total_magnetization',
 'Feature 0',
 'Feature 6',
 'Feature 9',
 'Feature 12',
 'Feature 15']

In [36]:
# Classify superconductor vs. not from the selected features, with
# stratify=True passed to the project wrapper.
supercon_ornot = randomforests(
    df_superornot,
    features_names_class,
    "supercon_ornot",
    test_size=0.2,
    random_state=42,
    name="classification",
    stratify=True,
)

# Fixed hyperparameters for the classifier (no grid search in this cell).
model_supercon_ornot = supercon_ornot.train_classifier_model(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
)

In [37]:
# Classification metrics and confusion matrix from the project helper.
# NOTE(review): how precision/recall/F1 are averaged is decided inside the helper — confirm.
(accuracy, f1, precision, recall, conf_matrix) = supercon_ornot.evaluate_classifier_model(
    model_supercon_ornot
)
print(f"Accuracy: {accuracy}, F1 Score: {f1}, Precision: {precision}, Recall: {recall}")

Accuracy: 0.793073593073593, F1 Score: 0.78427289527768, Precision: 0.7856498974580156, Recall: 0.793073593073593


In [38]:
# Display the class labels stored on the fitted classifier.
model_supercon_ornot.classes_

array([0, 1], dtype=int64)

In [39]:
# Display names are given in the order label 0, label 1
# (0 = "Not_supercon" rows, 1 = all other rows).
supercon_ornot.plot_confusion_matrix(
    ["Not Superconductor", "Superconductor"],
    conf_matrix,
    show=True,
    width=700,
    height=550,
    save_image=True,
)

In [41]:
# Feature importances of the fitted classifier.
supercon_ornot.plot_feature_importance(
    model_supercon_ornot.feature_importances_,
    show=True,
    width=1000,
    height=700,
    save_image=True,
)