In [1]:
import pandas as pd
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy
from gtda.diagrams import NumberOfPoints
from gtda.diagrams import Amplitude

from sklearn.pipeline import make_union, Pipeline

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [3]:
from pymatgen.io.cif import CifParser

In [4]:
def persistence_diagrams(coords):
    # these also make our nice diagrams that we like (need to analyse them and put them in my paper with barcode plots as well)
    # Track connected components, loops, and voids
    homology_dimensions = [0, 1, 2]

    # Collapse edges to speed up H2 persistence calculation!
    persistence = VietorisRipsPersistence(
        metric="euclidean",
        homology_dimensions=homology_dimensions,
        n_jobs=1,
        collapse_edges=True,
    )
    
    reshaped_coords=coords[None, :, :]
    diagrams_basic = persistence.fit_transform(reshaped_coords)
    return coords, diagrams_basic

def make_pipeline():

    metrics = [
        {"metric": metric}
        for metric in ["bottleneck", "wasserstein", "landscape", "persistence_image"]
    ]

    # Concatenate to generate 3 + 3 + (4 x 3) = 18 topological features
    feature_union = make_union(
        PersistenceEntropy(normalize=True),
        NumberOfPoints(n_jobs=1),
        *[Amplitude(**metric, n_jobs=1) for metric in metrics]
    )

    ## then we use a pipeline to transform, the data and spit i out
    # mwah hahahahaha
    pipe = Pipeline(
        [
            ("features", feature_union)
        ]
    )
        
    return pipe

def featurising_coords(coords_of_structures):
    topol_feat_list = []
    pipe = make_pipeline()

    for coords in coords_of_structures:
        _ , diagrams_basic = persistence_diagrams(coords)
        X_basic = pipe.fit_transform(diagrams_basic)
        # topology feat list stores the topological features for each structure
        topol_feat_list.append([x for x in X_basic[0]])
    
    # topol feat mat is a matrix of topological features
    topol_feat_mat = np.array(topol_feat_list)
    
    return topol_feat_mat, topol_feat_list

In [9]:
MP_df = pd.read_csv('3DSC_MP.csv', skiprows=1)
MP_df['cif'] = MP_df['cif'].str.replace('data/final/MP/', '')

cif_path = MP_df['cif'][0]
parser = CifParser(cif_path)

structure = parser.get_structures(primitive=True)
print(structure)

[Structure Summary
Lattice
    abc : 4.438672 4.438672 6.030548339846585
 angles : 111.59328751266106 111.5932875163173 90.00000001
 volume : 101.4530475362769
      A : 0.6465272925912613 -4.076209912182743 1.6335006383046717
      B : 4.127164255059669 -0.0 -1.6335006383046717
      C : -0.6465272925912613 4.076209912182743 4.397047701695328
PeriodicSite: Sr (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000]
PeriodicSite: Ge (1.2898, 1.5105, 3.2588) [0.3706, 0.3706, 0.7411]
PeriodicSite: Ge (2.8373, -1.5105, 1.1382) [0.6294, 0.6294, 0.2589]
PeriodicSite: Ag:0.010, Pd:0.990 (1.1934, -1.0191, 3.0153) [0.7500, 0.2500, 0.5000]
PeriodicSite: Ag:0.010, Pd:0.990 (2.9337, 1.0191, 1.3818) [0.2500, 0.7500, 0.5000]]


In [6]:
print(structure.make_supercell([2, 2, 2]))

None


In [None]:
coords = []
for item in MP_df["cif"]:
    cif_path = item
    parser = CifParser(cif_path)
    structure = parser.get_structures(primitive=True)[0]
    coords.append(structure.cart_coords)
    
matrix_list, feat_cryst_list = featurising_coords(coords_of_structures=coords)

MP_df['featurised_crystals'] = feat_cryst_list
MP_df.to_csv('asymmetric unit.csv', index=False)
features = MP_df["featurised_crystals"]
target = MP_df["tc"]