In [1]:
import sys
sys.path.append('..')
from PHF_RF_code import *

import warnings
# Installs an "ignore" filter with no category or module restriction, so every
# warning raised after this cell runs is suppressed.
# NOTE(review): this also hides pandas and deprecation warnings from the cells
# below; consider restricting the filter with `category=` / `module=`.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
import pickle
from pymatgen.io.cif import CifParser

In [3]:
# Column names that are removed from the cuprate frame before modelling
# (they are passed to `DataFrame.drop(columns=...)` in the data-loading cell).
# The list includes the target column "tc" itself.
not_features = [
    "tc",
    "formula_sc",
    "formula",
    "orig_formula_cif",
    "norm_formula_sc",
    "chemical_composition_sc",
    "origin_sc",
    "old_formula_sc",
    "database_id",
    "original_formula",
    "chemical_composition",
    "norm_formula",
    "spacegroup",
    "crystal_system",
    "cif",
    "original_cif",
    "material_id",
    "band_structure",
    "created_at",
    "doi",
    "doi_bibtex",
    "dos",
    "exp",
    "has",
    "has_bandstructure",
    "icsd_ids",
    "last_updated",
    "magnetic_type",
    "ntask_ids",
    "original_task_id",
    "oxide_type",
    "pretty_formula",
    "pseudo_potential",
    "reduced_cell_formula",
    "run_type",
    "task_id",
    "task_ids",
    "unit_cell_formula",
    "warnings",
    "ordering",
    "magmoms",
    "origin",
    "cif_before_synthetic_doping",
    "Reason for exclusion",
    "graph",
    "crystal_temp",
    "no_crystal_temp_given",
    "point_group",
    "weight",
    "energy_per_atom",
    "energy",
    "total_magnetization",
    "total_magnetization_normalized_vol",
    "total_magnetization_normalized_formula_units",
    "sc_class",
    "sc_class_unique_sc",
]

In [4]:
# Load the 3DSC_MP table. `skiprows=1` drops the first line of the file so the
# second line is used as the header.
df_MP = pd.read_csv("../3DSC_MP.csv", skiprows=1)

# Re-root the CIF paths relative to this notebook and remove every "_2"
# occurrence from the column names. `regex=False` makes both replacements
# literal regardless of the pandas version's default.
df_MP['cif'] = df_MP['cif'].str.replace('data/final/MP/', '../', regex=False)
df_MP.columns = df_MP.columns.str.replace('_2', '', regex=False)

# Rows with tc == 0 are relabelled so they are not counted in any
# superconductor class.
df_MP.loc[df_MP["tc"] == 0, "sc_class"] = "Not_supercon"

# Explicit copy: the subset gets a new column below, and writing into a
# filtered view of df_MP is a chained assignment with undefined propagation.
df_cuprates = df_MP[df_MP["sc_class"] == "Cuprate"].copy()


def _composition_weight(cif_path):
    """Return `composition.weight` of the first structure parsed from `cif_path`."""
    # Only the first structure returned by the parser is used.
    structure = CifParser(cif_path).get_structures()[0]
    return float(structure.composition.weight)


# Parse each distinct CIF file once and broadcast the result to all rows that
# reference it, instead of re-parsing per row and masking against df_MP.
# NOTE(review): the stored value is `composition.weight`, which pymatgen
# documents as the total weight of the composition, not a per-atom average;
# dividing by `composition.num_atoms` would give the average. The value and the
# column name are kept unchanged so downstream results stay comparable.
weight_by_cif = {
    cif_path: _composition_weight(cif_path)
    for cif_path in df_cuprates["cif"].unique()
}
df_cuprates["average_atomic_weight"] = df_cuprates["cif"].map(weight_by_cif)

# Keep the target and the formulas before they are dropped with the other
# non-feature columns.
df_cuprates_tc = df_cuprates["tc"]
df_formulas = df_cuprates["formula_sc"]

df_cuprates = df_cuprates.drop(columns=not_features)

In [5]:
# Load the pre-computed unit-cell data.
# NOTE(review): pickle executes arbitrary code on load; only use trusted files.
with open('featurised_datasets/unitcell.pkl', 'rb') as pkl_file:
    unitcell = pickle.load(pkl_file)

# Pick the entries whose keys/positions equal the index labels of the cuprate
# rows (assumes `unitcell` is addressable by df_MP's row labels — TODO confirm).
cuprate_row_labels = df_cuprates.index.tolist()
unitcell_cuperates = [unitcell[row_label] for row_label in cuprate_row_labels]

# `PresistentHomologyFeatures` is not defined in this notebook; presumably it
# comes from the `PHF_RF_code` star import.
features = PresistentHomologyFeatures(coords=unitcell_cuperates)
topol_feat_mat, topol_feat_list = features.featurising_coords()

# One "Feature <n>" column per row of the transposed feature matrix.
feature_columns = topol_feat_mat.T
PH_features = [f"Feature {col}" for col in range(len(feature_columns))]
for feature_name, feature_values in zip(PH_features, feature_columns):
    df_cuprates[feature_name] = np.squeeze(feature_values)

df_cuprates.head()

Unnamed: 0,formula_similarity,totreldiff,formula_frac,correct_formula_frac,num_elements_sc,lata,latb,latc,band_gap,density,...,Feature 8,Feature 9,Feature 10,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17
932,2,0.008889,4.0,False,5,7.877744,7.877744,9.467551,0.0,4.624821,...,0.460601,18.241141,3.908913,1.810337,1.338293,0.361756,0.37671,344.363172,393.591399,115.291351
933,2,0.011111,4.0,False,5,7.877744,7.877744,9.467551,0.0,4.624821,...,0.460601,18.241141,3.908913,1.810337,1.338293,0.361756,0.37671,344.363172,393.591399,115.291351
935,2,0.013333,4.0,False,5,7.877744,7.877744,9.467551,0.0,4.624821,...,0.460601,18.241141,3.908913,1.810337,1.338293,0.361756,0.37671,344.363172,393.591399,115.291351
936,2,0.015556,4.0,False,5,7.877744,7.877744,9.467551,0.0,4.624821,...,0.460601,18.241141,3.908913,1.810337,1.338293,0.361756,0.37671,344.363172,393.591399,115.291351
937,2,0.017778,4.0,False,5,7.877744,7.877744,9.467551,0.0,4.624821,...,0.460601,18.241141,3.908913,1.810337,1.338293,0.361756,0.37671,344.363172,393.591399,115.291351


In [6]:
# Display the column labels of the cuprate feature frame (the columns left
# after the non-feature drop plus the added "Feature <n>" columns).
df_cuprates.columns

Index(['formula_similarity', 'totreldiff', 'formula_frac',
       'correct_formula_frac', 'num_elements_sc', 'lata', 'latb', 'latc',
       'band_gap', 'density', 'e_above_hull', 'efermi', 'encut',
       'final_energy', 'final_energy_per_atom', 'formation_energy_per_atom',
       'is_ordered', 'nsites', 'cell_volume', 'is_magnetic',
       'exchange_symmetry', 'num_unique_magnetic_sites', 'num_magnetic_sites',
       'true_total_magnetization', 'synth_doped', 'cubic', 'hexagonal',
       'monoclinic', 'orthorhombic', 'tetragonal', 'triclinic', 'trigonal',
       'primitive', 'base-centered', 'body-centered', 'face-centered',
       'average_atomic_weight', 'Feature 0', 'Feature 1', 'Feature 2',
       'Feature 3', 'Feature 4', 'Feature 5', 'Feature 6', 'Feature 7',
       'Feature 8', 'Feature 9', 'Feature 10', 'Feature 11', 'Feature 12',
       'Feature 13', 'Feature 14', 'Feature 15', 'Feature 16', 'Feature 17'],
      dtype='object')

In [7]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression


def _seeded_mutual_info(X, y):
    """Score features by mutual information with a fixed random seed.

    `mutual_info_regression` is randomised unless `random_state` is given, so
    without a seed the selected feature set can differ between runs.
    The seed matches the `random_state=42` used for the random-forest wrapper.
    """
    return mutual_info_regression(X, y, random_state=42)


# Keep the 15 columns with the highest mutual information with tc.
# NOTE(review): the selector is fitted on every cuprate row, before the
# random-forest wrapper receives `test_size=0.2`. If that wrapper holds out a
# test split internally, the selection has already seen those rows' targets;
# consider selecting on the training split only.
selector = SelectKBest(_seeded_mutual_info, k=15)
selector.fit(df_cuprates, df_cuprates_tc)

# Positions of the retained columns, translated back to column labels.
cols_idxs = selector.get_support(indices=True)
features_names = df_cuprates.columns[cols_idxs].tolist()
features_names

['latc',
 'final_energy',
 'final_energy_per_atom',
 'nsites',
 'cell_volume',
 'average_atomic_weight',
 'Feature 3',
 'Feature 4',
 'Feature 5',
 'Feature 9',
 'Feature 10',
 'Feature 11',
 'Feature 13',
 'Feature 15',
 'Feature 17']

In [8]:
# Modelling table: the selected feature columns followed by the tc target,
# placed side by side on the shared row index.
selected_feature_frame = df_cuprates[features_names]
df_cup_all = pd.concat([selected_feature_frame, df_cuprates_tc], axis="columns")

# Regressor

In [9]:
# `randomforests` is not defined in this notebook; presumably it comes from the
# `PHF_RF_code` star import. It is configured with the selected features as
# inputs and "tc" as the target.
PHF_only = randomforests(
    df_cup_all,
    features_names,
    "tc",
    test_size=0.2,
    random_state=42,
    name="cuperates",
    stratify=False,
)

# The grid search returns the fitted best model together with its parameters.
grid_search_result = PHF_only.train_regressor_model_grid_search()
best_model_PHFonly, best_parameters_PHFonly = grid_search_result
print(best_parameters_PHFonly)

{'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [10]:
# Metrics reported by the wrapper for the tuned model.
regression_metrics = PHF_only.evaluate_regressor_model(best_model_PHFonly)
mae, mse, r2 = regression_metrics
print(f"Mean Absolute Error: {mae}, Mean Squared Error: {mse}, R^2 Score: {r2}")

# 10-fold cross-validated R^2, summarised by its mean and standard deviation.
cv_scores = PHF_only.calc_cross_val_score(
    best_model_PHFonly,
    cv=10,
    scoring='r2',
)
mean_cv_score, std_cv_score = np.mean(cv_scores), np.std(cv_scores)
print(f"Mean CV Score: {mean_cv_score}, Standard Deviation of CV Scores: {std_cv_score}")

Mean Absolute Error: 8.790082975869325, Mean Squared Error: 165.10721727967027, R^2 Score: 0.8032440373649639
Mean CV Score: 0.8062360457327431, Standard Deviation of CV Scores: 0.059570529013872796


In [11]:
# Plot the fitted model's feature importances via the wrapper's plotting
# helper; the figure is shown inline and not written to disk.
importances = best_model_PHFonly.feature_importances_
PHF_only.plot_feature_importance(
    importances,
    show=True,
    width=1300,
    height=700,
    save_image=False,
)

In [12]:
# Parity plot: predicted tc against the tc values stored in the table.
# NOTE(review): predictions are made for every row of df_cup_all. If the
# wrapper trained on a subset of these rows, the plot mixes fitted and
# held-out points and will look better than the test metrics.
predicted_tc = best_model_PHFonly.predict(df_cup_all[features_names])
target_tc = df_cup_all["tc"]

# Shared limits for both axes and for the reference line, covering the real and
# the predicted values. With separate per-axis ranges the y=x line is not a
# true diagonal and stops short of points outside the predicted range.
axis_min = float(min(target_tc.min(), predicted_tc.min())) - 0.5
axis_max = float(max(target_tc.max(), predicted_tc.max())) + 10

# One marker per material; hovering shows its formula.
trace = go.Scatter(
    x=target_tc,
    y=predicted_tc,
    mode='markers',
    text=df_formulas,
    hoverinfo='text',
    marker=dict(
        color='rgb(33, 145, 140)',
        size=8,
        opacity=0.8,
    ),
    name='Scatter Plot',
)

# Reference line for perfect predictions; two end points define it.
trace2 = go.Scatter(
    x=[axis_min, axis_max],
    y=[axis_min, axis_max],
    mode='lines',
    name='y=x',
    line=dict(color='black', width=5, dash='dash'),
)

# Axis styling shared by both axes so they stay visually identical.
axis_style = dict(
    showline=True,
    linewidth=5,
    linecolor='black',
    ticks='inside',
    tickwidth=4,
    ticklen=5,
    range=[axis_min, axis_max],
)

layout = go.Layout(
    xaxis=dict(title="Real Value", **axis_style),
    yaxis=dict(title="Predicted Value", **axis_style),
    width=800,
    height=700,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Helvetica', size=24, color='black'),
    margin=dict(l=10, r=10, b=10, t=10),
    showlegend=False,
)

# Assemble and display the figure (fig.write_image("scatter_plot.png") would
# save it to a file instead).
fig = go.Figure(data=[trace, trace2], layout=layout)
fig.show()