Trying to make a fake dataset here to be able to test how featurising these bravais lattices will work with PHF features

In [1]:
import pandas as pd
import numpy as np
import structures as st
import plotly.graph_objects as go

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy
from gtda.diagrams import NumberOfPoints
from gtda.diagrams import Amplitude

from sklearn.pipeline import make_union, Pipeline

In [2]:
def persistence_diagrams(coords):
    # Gives data to make persitenace diagrams, need to make code for that
    # Track connected components, loops, and voids
    homology_dimensions = [0, 1, 2]

    # Collapse edges to speed up H2 persistence calculation!
    persistence = VietorisRipsPersistence(
        metric="euclidean",
        homology_dimensions=homology_dimensions,
        n_jobs=1,
        collapse_edges=True,
    )
    
    reshaped_coords=coords[None, :, :]
    diagrams_basic = persistence.fit_transform(reshaped_coords)
    return coords, diagrams_basic

def make_pipeline():

    metrics = [
        {"metric": metric}
        for metric in ["bottleneck", "wasserstein", "landscape", "persistence_image"]
    ]

    # Concatenate to generate 3 + 3 + (4 x 3) = 18 topological features
    feature_union = make_union(
        PersistenceEntropy(normalize=True),
        NumberOfPoints(n_jobs=1),
        *[Amplitude(**metric, n_jobs=1) for metric in metrics]
    )

    ## then we use a pipeline to transform, the data and spit i out
    # mwah hahahahaha
    pipe = Pipeline(
        [
            ("features", feature_union)
        ]
    )
        
    return pipe

def featurising_coords(coords_of_structures):
    topol_feat_list = []
    pipe = make_pipeline()

    for coords in coords_of_structures:
        _ , diagrams_basic = persistence_diagrams(coords)
        X_basic = pipe.fit_transform(diagrams_basic)
        # topology feat list stores the topological features for each structure
        topol_feat_list.append([x for x in X_basic[0]])
    
    # topol feat mat is a matrix of topological features
    topol_feat_mat = np.array(topol_feat_list)
    
    return topol_feat_mat, topol_feat_list

def do_random_forest(features, target):
    # we need to binarize the labels
    mlb = MultiLabelBinarizer()
    features_transformed = mlb.fit_transform(features)

    x_train, x_test ,y_train, y_test = train_test_split(features_transformed,target,test_size=0.2, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)

    model.fit(x_train, y_train)

    accuracy = round(model.score(x_test, y_test) * 100, 2)
    print(f"Accuracy of the random forest: {accuracy}%")
    
    return model, x_train, x_test ,y_train, y_test

In [3]:
def randomise_sides(length, num_of_diff_sides):
    sides_df = pd.DataFrame(columns=['a', 'b', 'c'])
    
    while len(sides_df) < length:
        side_a = round(np.random.uniform(3, 9.0), 3)
        if num_of_diff_sides == 1:
            side_b = side_a
            side_c  = round(np.random.uniform(3, 9.0), 3)
            
            if side_a != side_c:
                sides_df = sides_df.append({'a': side_a, 'b': side_b, 'c': side_c}, ignore_index=True)
            
        elif num_of_diff_sides == 2:
            side_b = round(np.random.uniform(3, 9.0), 3)
            side_c  = round(np.random.uniform(3, 9.0), 3)
            
            if side_a != side_b != side_c:
                sides_df = sides_df.append({'a': side_a, 'b': side_b, 'c': side_c}, ignore_index=True)
                
        else:
            side_b = side_a
            side_c = side_a
            sides_df = sides_df.append({'a': side_a, 'b': side_b, 'c': side_c}, ignore_index=True)
            
    return sides_df

In [4]:
def randomise_angles(length, randomise_1, randomise_3, same_3_randomised):
    if randomise_1 == True:
        angles = []
        while len(angles) < length:
            ang_a = np.random.uniform(45, 80)
            ang_a = round(ang_a, 1)
            angles.append(ang_a)
            
    elif randomise_3 == True:
        angles = []
        while len(angles) < length:
            ang_a = np.random.uniform(45, 80)
            ang_a = round(ang_a, 1)
            ang_b = np.random.uniform(45, 80)
            ang_b = round(ang_b, 1)
            ang_y = np.random.uniform(45, 80)
            ang_y = round(ang_y, 1)
            
            if ang_a != ang_b != ang_y:
                angles.append([ang_a, ang_b, ang_y])
                
    elif same_3_randomised == True:
        angles = []
        while len(angles) < length:
            ang_a = np.random.uniform(45, 80)
            ang_a = round(ang_a, 1)
            ang_b = ang_a
            ang_y = ang_a
            
            angles.append([ang_a, ang_b, ang_y])
            
    else:
        print('Please choose a valid option')
        
    return angles

# try_ang = randomise_angles(10, False, True, False)
# try_ang_df = pd.DataFrame(try_ang, columns=['angle_a', 'angle_b', 'angle_y'])

In [5]:
def make_dataset(a, b, c, alpha, beta, gamma, name):
    data = {
        'a': a,
        'b': b,
        'c': c,
        'alpha': alpha,
        'beta': beta,
        'gamma': gamma,
        'structure type': name
        }
    
    df = pd.DataFrame(data)
    return df

In [26]:
length = 5000

sides = randomise_sides(length, 0)
cubic = make_dataset(sides["a"], sides["b"], sides["c"], 90, 90, 90, "cubic")

sides = randomise_sides(length, 0)
hexagonal = make_dataset(sides["a"], sides["b"], sides["c"], 90, 90, 120, "hexagonal")

sides = randomise_sides(length, 1)
tetragonal = make_dataset(sides["a"], sides["b"], sides["c"], 90, 90, 90, "tetragonal")

sides = randomise_sides(length, 2)
orthorhombic = make_dataset(sides["a"], sides["b"], sides["c"], 90, 90, 90, "orthorhombic")

sides = randomise_sides(length, 0)
angles = randomise_angles(length, False, False, True)
angles = pd.DataFrame(angles, columns=['alpha', 'beta', 'gamma'])
rhomobohedral = make_dataset(sides["a"], sides["b"], sides["c"], angles["alpha"], 
                             angles["beta"], angles["gamma"], "rhomobohedral")

sides = randomise_sides(length, 2)
angles = randomise_angles(length, True, False, False)
monoclinc = make_dataset(sides["a"], sides["b"], sides["c"], 90, angles, 90, "monoclinc")

sides = randomise_sides(length, 2)
angles = randomise_angles(length, False, True, False)
angles = pd.DataFrame(angles, columns=['alpha', 'beta', 'gamma'])

triclinic = make_dataset(sides["a"], sides["b"], sides["c"],
                             angles["alpha"], angles["beta"], angles["gamma"], "triclinic")


final_df = pd.concat([cubic, hexagonal, rhomobohedral, tetragonal, orthorhombic, monoclinc, triclinic],ignore_index=True)

print("Data set made")

coords = []
for index, row in final_df.iterrows():
    coords.append(st.Structure(2,2,2,
                               row["a"] ,row["b"], row["c"], 
                               row["alpha"], row["beta"], row["gamma"],
                               False, False, False))
    
matrix_list, feat_cryst_list = featurising_coords(coords_of_structures=coords)
final_df['Crystals Featurised'] = feat_cryst_list

final_df["Lowest distortion"] = final_df["structure type"].astype('category')
final_df["Lowest distortion"] = final_df["Lowest distortion"].cat.codes

print("featurisation done")

do_random_forest(final_df["Crystals Featurised"], final_df["Lowest distortion"])

print("random forest done")

Data set made
featurisation done
Accuracy of the random forest: 44.43%
random forest done


In [27]:
final_df

Unnamed: 0,a,b,c,alpha,beta,gamma,structure type,Crystals Featurised,Lowest distortion
0,3.581,3.581,3.581,90.0,90.0,90.0,cubic,"[0.6040287379884409, 0.8032304643141407, -1.0,...",0
1,6.307,6.307,6.307,90.0,90.0,90.0,cubic,"[0.5137621202584425, 0.6263075891044909, -1.0,...",0
2,8.244,8.244,8.244,90.0,90.0,90.0,cubic,"[0.47983235519103434, 0.5671928407232433, -1.0...",0
3,7.322,7.322,7.322,90.0,90.0,90.0,cubic,"[0.4942880713228529, 0.5919340874761424, -1.0,...",0
4,4.163,4.163,4.163,90.0,90.0,90.0,cubic,"[0.5770539195454751, 0.7470815290419072, -1.0,...",0
...,...,...,...,...,...,...,...,...,...
34995,5.141,8.511,6.153,57.3,54.1,66.8,triclinic,"[0.517576590222618, 0.8249858859266951, -0.0, ...",6
34996,5.425,7.475,8.562,62.9,69.1,53.8,triclinic,"[0.5054709846975164, 0.7582094782415253, -0.0,...",6
34997,8.975,4.527,4.205,68.6,77.8,67.7,triclinic,"[0.5349290937238138, 0.9341858024070012, -0.0,...",6
34998,7.064,8.554,3.169,69.7,76.5,70.7,triclinic,"[0.5191677849226715, 0.9042124897693417, -0.0,...",6
