In [2]:
import pandas as pd
from pymatgen.io.cif import CifParser
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy
from gtda.diagrams import NumberOfPoints
from gtda.diagrams import Amplitude

from sklearn.pipeline import make_union, Pipeline

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [3]:
def persistence_diagrams(coords):
    """Compute the Vietoris-Rips persistence diagram of a single point cloud.

    Parameters
    ----------
    coords : array
        Coordinates of one structure. It is indexed as ``coords[None, :, :]``,
        so it is presumably an ``(n_points, n_dims)`` NumPy array -- confirm
        against the caller.

    Returns
    -------
    tuple
        ``(coords, diagrams)``: the input, unchanged, and the result of
        ``VietorisRipsPersistence.fit_transform`` applied to a batch that
        contains only this point cloud.
    """
    # The transformer consumes a batch of point clouds, so give the single
    # cloud a leading batch axis of length one.
    batch_of_one = coords[None, :, :]

    # Homology dimensions 0, 1 and 2 track connected components, loops and
    # voids. Edge collapse is enabled to speed up the H2 computation.
    rips = VietorisRipsPersistence(
        metric="euclidean",
        homology_dimensions=[0, 1, 2],
        n_jobs=1,
        collapse_edges=True,
    )
    return coords, rips.fit_transform(batch_of_one)

def make_pipeline():
    """Build the pipeline that turns persistence diagrams into feature vectors.

    The pipeline has one step, named ``"features"``: a feature union of
    ``PersistenceEntropy(normalize=True)``, ``NumberOfPoints`` and one
    ``Amplitude`` transformer for each of the ``"bottleneck"``,
    ``"wasserstein"``, ``"landscape"`` and ``"persistence_image"`` metrics.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    amplitude_metrics = ("bottleneck", "wasserstein", "landscape", "persistence_image")
    amplitudes = [Amplitude(metric=name, n_jobs=1) for name in amplitude_metrics]

    # Concatenated output: 3 + 3 + (4 x 3) = 18 topological features when the
    # diagrams carry three homology dimensions.
    union = make_union(
        PersistenceEntropy(normalize=True),
        NumberOfPoints(n_jobs=1),
        *amplitudes,
    )

    return Pipeline([("features", union)])

def featurising_coords(coords_of_structures):
    """Convert each structure's coordinates into a topological feature vector.

    Parameters
    ----------
    coords_of_structures : iterable
        One coordinate array per structure, each in the form accepted by
        ``persistence_diagrams``.

    Returns
    -------
    tuple
        ``(topol_feat_mat, topol_feat_list)``: the features as a NumPy array
        built from the list, and as a list holding one list of feature values
        per structure.
    """
    pipe = make_pipeline()

    def _features_for(structure_coords):
        # Only the diagram is needed; the echoed coordinates are discarded.
        _, diagram = persistence_diagrams(structure_coords)
        # NOTE(review): the pipeline is re-fit on every structure, so any
        # state learned in fit() comes from that structure alone -- confirm
        # this is the intended featurisation.
        return list(pipe.fit_transform(diagram)[0])

    topol_feat_list = [_features_for(structure_coords) for structure_coords in coords_of_structures]

    return np.array(topol_feat_list), topol_feat_list

In [10]:
# Read '3DSC_MP.csv', skipping the first line of the file, and strip the
# 'data/final/MP/' prefix from every entry of the 'cif' column.
MP_df = pd.read_csv('3DSC_MP.csv', skiprows=1).assign(
    cif=lambda table: table['cif'].str.replace('data/final/MP/', '')
)

# Parse the CIF file named in the first row and keep the first structure it yields.
cif_path = MP_df['cif'].iloc[0]
parser = CifParser(cif_path)
structure = parser.get_structures(primitive=True)[0]

Experiments I want to run:

- Asymmetric unit: RF and NN

- 3x3 supercell: RF and NN

- Whole dataset vs. excluding doped materials

- With physical properties vs. only PHFs (working with BCS theory vs. working without)

Asymmetric Unit

In [12]:
# Collect the `cart_coords` array of every structure listed in MP_df["cif"],
# one entry per row and in row order.
coords = []
for item in MP_df["cif"]:
    cif_path = item
    parser = CifParser(cif_path)
    # Only the first structure returned for each CIF file is kept.
    structure = parser.get_structures(primitive=True)[0]
    coords.append(structure.cart_coords)

In [13]:
# Featurise every structure. The function returns the features twice: as a
# NumPy array (matrix_list) and as a list of per-structure lists (feat_cryst_list).
matrix_list, feat_cryst_list = featurising_coords(coords_of_structures=coords)
# Store the list form so each row of MP_df carries its own feature vector.
MP_df['featurised_crystals'] = feat_cryst_list

In [14]:
# Save the table, including the new 'featurised_crystals' column, without the row index.
# NOTE(review): that column holds Python lists, which a CSV can only store as
# text -- reloading this file will presumably need the lists parsed back; confirm.
MP_df.to_csv('asymmetric unit.csv', index=False)

In [30]:
# Model inputs (one feature list per structure) and the regression target ("tc" column).
features = MP_df["featurised_crystals"]
target = MP_df["tc"]

# Print both series for a quick look before modelling.
print(features)
print(target)

0       [0.5739123821015334, -0.0, -1.0, 4.0, 1.0, 0.0...
1       [0.0, -1.0, -1.0, 1.0, 0.0, 0.0, 2.77874374389...
2       [0.5739123821015334, -0.0, -1.0, 4.0, 1.0, 0.0...
3       [0.0, -1.0, -1.0, 1.0, 0.0, 0.0, 2.72115826606...
4       [0.6889375959460826, 1.4487280361041706, -1.0,...
                              ...                        
5768    [0.0, -1.0, -1.0, 1.0, 0.0, 0.0, 1.55254149436...
5769    [0.303454860932062, -1.0, -1.0, 2.0, 0.0, 0.0,...
5770    [0.0, -1.0, -1.0, 1.0, 0.0, 0.0, 1.50654625892...
5771    [0.6057948743593848, -1.0, -1.0, 5.0, 0.0, 0.0...
5772    [0.0, -1.0, -1.0, 1.0, 0.0, 0.0, 1.59573698043...
Name: featurised_crystals, Length: 5773, dtype: object
0       2.640000
1       2.150000
2       2.620000
3       1.200000
4       3.200000
          ...   
5768    0.000000
5769    0.000000
5770    0.850800
5771    0.296667
5772    0.996667
Name: tc, Length: 5773, dtype: float64


Random Forest Control

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Prepare features and target
features = MP_df["featurised_crystals"]
target = MP_df["tc"]

# Stack the per-structure feature lists into a 2-D float matrix with one row
# per structure. The values are continuous measurements, so they are fed to
# the model directly. MultiLabelBinarizer must not be used here: it treats
# every distinct float as a categorical label and emits 0/1 indicator columns,
# which throws away the magnitudes the regressor needs.
features_transformed = np.array(features.tolist(), dtype=float)

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(features_transformed, target, test_size=0.2, random_state=42)

# Define the RandomForestRegressor model
model = RandomForestRegressor(random_state=42)

# Define the parameter grid for grid search.
# max_features=1.0 (consider every feature at each split) replaces the former
# 'auto' option, which meant the same thing for regressors but is no longer
# accepted by recent scikit-learn releases.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt']
}

# Instantiate the grid search model (5-fold cross-validation, all CPU cores)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the training data only; the test split stays unseen
grid_search.fit(x_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best model for prediction on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2) Score:", r2)

In [34]:
import pandas as pd

# Create a dataframe with y_test and y_pred, one row per held-out sample
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Save the dataframe as a CSV file; index=False leaves the row labels out of the file
df.to_csv('predictions.csv', index=False)


In [32]:
import plotly.graph_objects as go

# Scatter the held-out targets (x) against the model's predictions (y).
fig = go.Figure(
    data=[go.Scatter(x=y_test, y=y_pred, mode='markers', name='Actual vs. Predicted')]
)

# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title="Actual vs. Predicted",
    xaxis_title="Actual",
    yaxis_title="Predicted",
)

# Render the figure.
fig.show()


Deep Chem NN