In [None]:
import pandas as pd
from pymatgen.io.cif import CifParser
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy
from gtda.diagrams import NumberOfPoints
from gtda.diagrams import Amplitude

from sklearn.pipeline import make_union, Pipeline

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [None]:
def persistence_diagrams(coords):
    """Compute the Vietoris-Rips persistence diagram of a single point cloud.

    The diagram tracks homology dimensions 0, 1 and 2 (connected components,
    loops and voids) under the Euclidean metric. The same diagrams are meant
    to be analysed and plotted (alongside barcode plots) for the paper.

    Parameters
    ----------
    coords : array
        2-D array holding one point per row (it is indexed as
        ``coords[None, :, :]``). Presumably the Cartesian atom positions of
        one structure -- confirm against the caller.

    Returns
    -------
    tuple
        ``(coords, diagrams)``: the input, unchanged, and the output of
        ``VietorisRipsPersistence.fit_transform`` for a batch that contains
        only this point cloud.
    """
    # The transformer expects a batch of point clouds, so add a leading
    # axis of length 1 in front of the single cloud.
    single_cloud_batch = coords[None, :, :]

    # collapse_edges=True speeds up the H2 persistence calculation.
    rips = VietorisRipsPersistence(
        homology_dimensions=[0, 1, 2],
        metric="euclidean",
        collapse_edges=True,
        n_jobs=1,
    )

    return coords, rips.fit_transform(single_cloud_batch)

def make_pipeline():
    """Build the pipeline that turns persistence diagrams into feature vectors.

    The single ``"features"`` step is a feature union of:

    * ``PersistenceEntropy(normalize=True)``
    * ``NumberOfPoints``
    * one ``Amplitude`` per metric: bottleneck, wasserstein, landscape and
      persistence_image

    With three homology dimensions this concatenates to
    3 + 3 + (4 x 3) = 18 topological features per diagram.

    Returns
    -------
    sklearn.pipeline.Pipeline
        An unfitted pipeline; call ``fit_transform`` on persistence diagrams.
    """
    amplitude_metrics = ("bottleneck", "wasserstein", "landscape", "persistence_image")

    # Order matters: it fixes the column order of the resulting feature vector.
    extractors = [PersistenceEntropy(normalize=True), NumberOfPoints(n_jobs=1)]
    extractors.extend(
        Amplitude(metric=metric_name, n_jobs=1) for metric_name in amplitude_metrics
    )

    return Pipeline([("features", make_union(*extractors))])

def featurising_coords(coords_of_structures):
    """Compute the topological feature vector of every structure.

    For each point cloud the persistence diagram is computed with
    ``persistence_diagrams`` and then pushed through the pipeline from
    ``make_pipeline``. The pipeline is re-fitted on each structure's diagram
    (``fit_transform`` is called inside the loop).

    Parameters
    ----------
    coords_of_structures : iterable of arrays
        One 2-D coordinate array per structure.

    Returns
    -------
    topol_feat_mat : numpy.ndarray
        Matrix with one row of topological features per structure.
    topol_feat_list : list of list of float
        The same rows as plain Python lists of built-in floats.
    """
    pipe = make_pipeline()

    topol_feat_list = []
    for coords in coords_of_structures:
        _, diagrams_basic = persistence_diagrams(coords)
        # A single diagram goes in, so the result has exactly one row.
        X_basic = pipe.fit_transform(diagrams_basic)
        # Store built-in floats rather than NumPy scalars: these lists end up
        # serialised as text in a CSV, and the repr of a NumPy scalar
        # (``np.float64(0.5)`` from NumPy 2 on) cannot be parsed back into a
        # list of numbers, whereas the repr of a built-in float can.
        topol_feat_list.append(np.asarray(X_basic[0]).tolist())

    # topol_feat_mat is the matrix form of the per-structure feature lists
    topol_feat_mat = np.array(topol_feat_list)

    return topol_feat_mat, topol_feat_list

In [None]:
# Load the 3DSC_MP table. skiprows=1 drops the first line of the file, so the
# column names are read from the second line.
MP_df = pd.read_csv('3DSC_MP.csv', skiprows=1)
# Remove the 'data/final/MP/' prefix from every CIF path
# (presumably the CIF files sit in the working directory -- confirm).
MP_df['cif'] = MP_df['cif'].str.replace('data/final/MP/', '')

# Parse the first CIF on its own to check that loading works before looping
# over the whole table.
cif_path = MP_df['cif'][0]
parser = CifParser(cif_path)

structure = parser.get_structures(primitive=True)[0]

Experiments I want to run: 

Asymmetric unit RF and NN, 

3x3 Supercell RF and NN, 

Whole Dataset vs Excluding Doped materials, 

With Physical properties vs ONLY PHFs (working with BCS theory vs working without)

Asymmetric Unit

In [None]:
# Collect one coordinate array (structure.cart_coords) per row of MP_df,
# in table order, taking the first structure found in each CIF file.
# NOTE(review): primitive=True requests the primitive cell, which is not
# necessarily the asymmetric unit named in the section heading -- confirm
# this is the intended cell.
coords = []
for item in MP_df["cif"]:
    cif_path = item
    parser = CifParser(cif_path)
    structure = parser.get_structures(primitive=True)[0]
    coords.append(structure.cart_coords)

In [None]:
# featurising_coords returns (feature matrix as a NumPy array, the same rows
# as a list of lists); despite its name, matrix_list is the array.
matrix_list, feat_cryst_list = featurising_coords(coords_of_structures=coords)
# Keep each structure's whole feature vector in a single column, row-aligned
# with the table the coordinates were read from.
MP_df['featurised_crystals'] = feat_cryst_list

In [None]:
# Cache the featurised table on disk so the modelling cells can reload it
# without recomputing the persistence diagrams. The list-valued
# 'featurised_crystals' column is written as text and has to be parsed when
# the file is read back.
MP_df.to_csv('asymmetric unit.csv', index=False)

In [None]:
# Quick look at the model inputs (one feature list per row) and the
# regression target (the 'tc' column).
features = MP_df["featurised_crystals"]
target = MP_df["tc"]

print(features)
print(target)

Random Forest Control

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [35]:
import pandas as pd
import numpy as np
# Reload the featurised table written by the featurisation step; the
# 'featurised_crystals' column comes back as strings (see the quoted lists in
# the output below).
MP_df = pd.read_csv('asymmetric unit.csv')
MP_df

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula_2,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,orthorhombic,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight,featurised_crystals
0,Ag0.02Ge2Pd1.98Sr1,2,0.008000,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.640000,Other,True,...,0,7,0,0,0,0,1,0,1.0,"[0.5739123821015334, -0.0, -1.0, 4.0, 1.0, 0.0..."
1,Ag0.15Sn0.85Te1,3,0.150000,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.150000,Other,True,...,0,0,0,0,0,0,0,1,1.0,"[0.0, -1.0, -1.0, 1.0, 0.0, 0.0, 2.77874374389..."
2,Ag0.1Ge2Pd1.9Sr1,2,0.040000,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.620000,Other,True,...,0,7,0,0,0,0,1,0,1.0,"[0.5739123821015334, -0.0, -1.0, 4.0, 1.0, 0.0..."
3,Ag0.1In0.9Te1,3,0.100000,1.0,True,Ag0.1In0.9Te1,In1Te1,1.200000,Other,True,...,0,0,0,0,0,0,0,1,1.0,"[0.0, -1.0, -1.0, 1.0, 0.0, 0.0, 2.72115826606..."
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.200000,Other,True,...,0,0,0,0,1,0,0,0,1.0,"[0.6889375959460826, 1.4487280361041706, -1.0,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5768,Y1Zn1,1,0.000000,1.0,True,Y1Zn1,Y1Zn1,0.000000,Other,True,...,0,0,0,0,1,0,0,0,1.0,"[0.0, -1.0, -1.0, 1.0, 0.0, 0.0, 1.55254149436..."
5769,Yb1,1,0.000000,3.0,False,Yb3,Yb3,0.000000,Heavy_fermion,True,...,0,0,0,6,1,0,0,0,1.0,"[0.303454860932062, -1.0, -1.0, 2.0, 0.0, 0.0,..."
5770,Zn1,1,0.000000,2.0,False,Zn2,Zn2,0.850800,Other,True,...,0,0,0,0,1,0,0,0,1.0,"[0.0, -1.0, -1.0, 1.0, 0.0, 0.0, 1.50654625892..."
5771,Zn2Zr1,1,0.000000,2.0,False,Zn4Zr2,Zn4Zr2,0.296667,Other,True,...,0,0,0,0,0,0,0,1,1.0,"[0.6057948743593848, -1.0, -1.0, 5.0, 0.0, 0.0..."


In [41]:
import ast

# The CSV stores every feature vector as the text of a Python list. Parse it
# with ast.literal_eval, which only accepts literals; eval() would execute
# whatever expression the file happens to contain.
evaluated_features = [ast.literal_eval(item) for item in MP_df['featurised_crystals']]

# 18 = number of topological features produced per structure by the
# featurisation pipeline; pandas raises if the parsed rows have another width.
columns = ['feature_{}'.format(i) for i in range(18)]
df = pd.DataFrame(evaluated_features, columns=columns)

# Drop feature columns left over from an earlier run of this cell before
# concatenating, so re-running the cell does not duplicate them.
MP_df = pd.concat([MP_df.drop(columns=columns, errors='ignore'), df], axis=1)
MP_df

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula_2,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17
0,Ag0.02Ge2Pd1.98Sr1,2,0.008000,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.640000,Other,True,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
1,Ag0.15Sn0.85Te1,3,0.150000,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.150000,Other,True,...,0.0,2.778744,0.000000,0.0,3.781856,0.000000,0.0,0.000000,0.000000,0.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.040000,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.620000,Other,True,...,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
3,Ag0.1In0.9Te1,3,0.100000,1.0,True,Ag0.1In0.9Te1,In1Te1,1.200000,Other,True,...,0.0,2.721158,0.000000,0.0,3.664907,0.000000,0.0,0.000000,0.000000,0.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.200000,Other,True,...,0.0,4.866506,0.682774,0.0,1.839699,0.182906,0.0,12.958865,3.481724,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5768,Y1Zn1,1,0.000000,1.0,True,Y1Zn1,Y1Zn1,0.000000,Other,True,...,0.0,1.552541,0.000000,0.0,1.579418,0.000000,0.0,0.000000,0.000000,0.0
5769,Yb1,1,0.000000,3.0,False,Yb3,Yb3,0.000000,Heavy_fermion,True,...,0.0,3.351341,0.000000,0.0,3.703747,0.000000,0.0,0.000000,0.000000,0.0
5770,Zn1,1,0.000000,2.0,False,Zn2,Zn2,0.850800,Other,True,...,0.0,1.506546,0.000000,0.0,1.509753,0.000000,0.0,0.000000,0.000000,0.0
5771,Zn2Zr1,1,0.000000,2.0,False,Zn4Zr2,Zn4Zr2,0.296667,Other,True,...,0.0,3.177911,0.000000,0.0,1.661336,0.000000,0.0,0.000000,0.000000,0.0


In [42]:
# Model input: only the expanded feature_0 ... feature_17 columns.
features = MP_df[columns]
features

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17
0,0.573912,-0.000000,-1.0,4.0,1.0,0.0,1.699603,0.297758,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
1,0.000000,-1.000000,-1.0,1.0,0.0,0.0,2.778744,0.000000,0.0,2.778744,0.000000,0.0,3.781856,0.000000,0.0,0.000000,0.000000,0.0
2,0.573912,-0.000000,-1.0,4.0,1.0,0.0,1.699603,0.297758,0.0,2.781936,0.297758,0.0,1.809062,0.132656,0.0,4.888015,0.000000,0.0
3,0.000000,-1.000000,-1.0,1.0,0.0,0.0,2.721158,0.000000,0.0,2.721158,0.000000,0.0,3.664907,0.000000,0.0,0.000000,0.000000,0.0
4,0.688938,1.448728,-1.0,11.0,5.0,0.0,1.718738,0.368862,0.0,4.866506,0.682774,0.0,1.839699,0.182906,0.0,12.958865,3.481724,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5768,0.000000,-1.000000,-1.0,1.0,0.0,0.0,1.552541,0.000000,0.0,1.552541,0.000000,0.0,1.579418,0.000000,0.0,0.000000,0.000000,0.0
5769,0.303455,-1.000000,-1.0,2.0,0.0,0.0,2.740350,0.000000,0.0,3.351341,0.000000,0.0,3.703747,0.000000,0.0,0.000000,0.000000,0.0
5770,0.000000,-1.000000,-1.0,1.0,0.0,0.0,1.506546,0.000000,0.0,1.506546,0.000000,0.0,1.509753,0.000000,0.0,0.000000,0.000000,0.0
5771,0.605795,-1.000000,-1.0,5.0,0.0,0.0,1.605770,0.000000,0.0,3.177911,0.000000,0.0,1.661336,0.000000,0.0,0.000000,0.000000,0.0


In [43]:
# Prepare features and target
target = MP_df["tc"]

# Split data into training and testing sets (80/20, fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the RandomForestRegressor model
model = RandomForestRegressor(random_state=42)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 1.0 (use all features) is what 'auto' meant for a regressor; the string
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where every
    # candidate using it fails to fit.
    'max_features': [1.0, 'sqrt']
}

# Instantiate the grid search model (5-fold cross-validation on the training set)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=1)

# Fit the grid search to the data
grid_search.fit(x_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best model for prediction on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2) Score:", r2)

Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Mean Absolute Error (MAE): 4.779307558261787
Mean Squared Error (MSE): 75.85731831524106
Root Mean Squared Error (RMSE): 8.709610686778202
R-squared (R2) Score: 0.7895728252875389


In [69]:
import plotly.graph_objects as go

# Common bounds over actual and predicted values, used for the y = x line
max_val = max(y_test.max(), y_pred.max())
min_val = min(y_test.min(), y_pred.min())

# Roughly five ticks per axis; clamp to 1 so a value span below 2.5 cannot
# round down to an invalid tick spacing of 0.
tick_step = max(1, round((max_val - min_val) / 5))

# Both axes share the same formatting and range so the y = x line is a true
# diagonal; only the axis title differs.
axis_style = dict(
    title_font=dict(size=24),
    tickfont=dict(size=20),
    tickmode='linear',
    dtick=tick_step,
    tickformat=".1f",
    range=[-.1, max_val+5],
)

# Create a scatter plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers', name='Actual vs. Predicted'))

# Add y = x line (where a perfect prediction would fall)
fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val], mode='lines', name='y = x',
                         line=dict(
                             dash='dash',
                             color='red',
                             width=4
                             )))


# Set the layout
fig.update_layout(
    xaxis=dict(title='Actual', **axis_style),
    yaxis=dict(title='Predicted', **axis_style),
    showlegend=True,
    width=1000,
    height=800,
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend=dict(
        x=0,
        y=1,
        bgcolor='rgba(0,0,0,0)',
        font=dict(size=16)
    )
)

fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')

# Show plot
fig.show()


In [68]:
# List the compounds recorded at a few specific tc values.
tc_values_of_interest = (92.5, 100, 122, 127)

# One print call: the separator reproduces the gap between consecutive
# results (the newline ending one result plus the two emitted by print("\n")).
print(
    *(MP_df[MP_df["tc"] == tc_value]["formula_sc"] for tc_value in tc_values_of_interest),
    sep="\n\n\n",
)

1813    Ba2Ca1Cu2Hg1O6.1
1873     Ba2Cu1Hg1O4.103
Name: formula_sc, dtype: object


1807    Ba2Ca0.9Ce0.1Cu2Tl2O8
1822         Ba2Ca1Cu2Hg1O6.3
Name: formula_sc, dtype: object


1817    Ba2Ca1Cu2Hg1O6.19
Name: formula_sc, dtype: object


1818    Ba2Ca1Cu2Hg1O6.21
1819    Ba2Ca1Cu2Hg1O6.22
Name: formula_sc, dtype: object


Deep Chem NN