Featurisation, im just going to featurise the bravais lattices here

In [1]:
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy
from gtda.diagrams import NumberOfPoints
from gtda.diagrams import Amplitude

from sklearn.pipeline import make_union, Pipeline

import numpy as np
import pandas as pd

import structures as st

import warnings
warnings.filterwarnings("ignore")

In [2]:
def persistence_diagrams(coords):
    # these also make our nice diagrams that we like (need to analyse them and put them in my paper with barcode plots as well)
    # Track connected components, loops, and voids
    homology_dimensions = [0, 1, 2]

    # Collapse edges to speed up H2 persistence calculation!
    persistence = VietorisRipsPersistence(
        metric="euclidean",
        homology_dimensions=homology_dimensions,
        n_jobs=1,
        collapse_edges=True,
    )
    
    reshaped_coords=coords[None, :, :]
    diagrams_basic = persistence.fit_transform(reshaped_coords)
    return coords, diagrams_basic

def make_pipeline():

    metrics = [
        {"metric": metric}
        for metric in ["bottleneck", "wasserstein", "landscape", "persistence_image"]
    ]

    # Concatenate to generate 3 + 3 + (4 x 3) = 18 topological features
    feature_union = make_union(
        PersistenceEntropy(normalize=True),
        NumberOfPoints(n_jobs=1),
        *[Amplitude(**metric, n_jobs=1) for metric in metrics]
    )

    ## then we use a pipeline to transform, the data and spit i out
    # mwah hahahahaha
    pipe = Pipeline(
        [
            ("features", feature_union)
        ]
    )
        
    return pipe

def featurising_coords(coords_of_structures):
    topol_feat_list = []
    pipe = make_pipeline()

    for coords in coords_of_structures:
        _ , diagrams_basic = persistence_diagrams(coords)
        X_basic = pipe.fit_transform(diagrams_basic)
        # topology feat list stores the topological features for each structure
        topol_feat_list.append([x for x in X_basic[0]])
    
    # topol feat mat is a matrix of topological features
    topol_feat_mat = np.array(topol_feat_list)
    
    return topol_feat_mat, topol_feat_list

In [3]:
df = pd.read_csv("HighthroughputDFTcalculationsofformationenergystabilityandoxygenvacancyformationenergyofABO3perovskites.csv")
(df == "-").sum()


Chemical formula                 0
A                                0
B                                0
In literature                    0
Valence A                        0
Valence B                        0
Radius A [ang]                   0
Radius B [ang]                   0
Lowest distortion               53
Formation energy [eV/atom]      53
Stability [eV/atom]             53
Magnetic moment [mu_B]        1013
Volume per atom [A^3/atom]      53
Band gap [eV]                   53
a [ang]                         53
b [ang]                         53
c [ang]                         53
alpha [deg]                     53
beta [deg]                      53
gamma [deg]                     53
Vacancy energy [eV/O atom]     415
dtype: int64

In [4]:
df = df.drop(df[["Valence A", "Valence B"]], axis = 1)
df.head()

Unnamed: 0,Chemical formula,A,B,In literature,Radius A [ang],Radius B [ang],Lowest distortion,Formation energy [eV/atom],Stability [eV/atom],Magnetic moment [mu_B],Volume per atom [A^3/atom],Band gap [eV],a [ang],b [ang],c [ang],alpha [deg],beta [deg],gamma [deg],Vacancy energy [eV/O atom]
0,Ac2O3,Ac,Ac,False,1.12,1.12,cubic,-2.732,0.848,0.0,20.836,0.332,4.705,4.705,4.705,90.0,90.0,90.0,3.15
1,AcAgO3,Ac,Ag,False,1.12,0.95,orthorhombic,-1.957,-0.055,0.0,14.485,0.0,5.779,6.077,8.248,90.0,90.0,90.0,0.817
2,AcAlO3,Ac,Al,False,1.12,0.54,cubic,-3.532,-0.11,0.0,11.487,4.307,3.858,3.858,3.858,90.0,90.0,90.0,6.695
3,AcAsO3,Ac,As,False,1.12,0.52,orthorhombic,-2.398,0.224,0.0,14.355,0.0,5.78,6.012,8.262,90.0,90.0,90.0,3.634
4,AcAuO3,Ac,Au,False,1.12,0.93,orthorhombic,-2.006,-0.056,0.0,15.19,0.745,5.899,6.75,7.63,90.0,90.0,90.0,0.807


In [5]:
inlit_df = df[df["In literature"] == True]
inlit_df.head()

Unnamed: 0,Chemical formula,A,B,In literature,Radius A [ang],Radius B [ang],Lowest distortion,Formation energy [eV/atom],Stability [eV/atom],Magnetic moment [mu_B],Volume per atom [A^3/atom],Band gap [eV],a [ang],b [ang],c [ang],alpha [deg],beta [deg],gamma [deg],Vacancy energy [eV/O atom]
109,AgNbO3,Ag,Nb,True,1.28,0.64,rhombohedral,-2.009,0.043,0.000,12.263,1.269,5.611,5.611,5.611,59.2,59.2,59.2,4.672
125,AgSbO3,Ag,Sb,True,1.28,0.6,orthorhombic,-1.157,0.109,-,12.152,0.0,5.539,5.591,7.848,90.0,90.0,90.0,2.278
131,AgTaO3,Ag,Ta,True,1.28,0.64,rhombohedral,-2.247,0.018,0.000,12.114,2.03,5.584,5.584,5.584,59.3,59.3,59.3,5.586
140,AgVO3,Ag,V,True,1.28,0.54,tetragonal,-1.582,0.125,0.000,11.419,0.341,3.684,3.684,4.208,90.0,90.0,90.0,1.832
449,BaCeO3,Ba,Ce,True,1.61,0.87,orthorhombic,-2.784,0.15,0.200,18.712,0.0,6.355,6.496,9.066,90.0,90.0,90.0,-0.006


In [6]:
(inlit_df == "-").sum()

Chemical formula               0
A                              0
B                              0
In literature                  0
Radius A [ang]                 0
Radius B [ang]                 0
Lowest distortion              0
Formation energy [eV/atom]     0
Stability [eV/atom]            0
Magnetic moment [mu_B]        13
Volume per atom [A^3/atom]     0
Band gap [eV]                  0
a [ang]                        0
b [ang]                        0
c [ang]                        0
alpha [deg]                    0
beta [deg]                     0
gamma [deg]                    0
Vacancy energy [eV/O atom]    14
dtype: int64

I need to explore what i need to do about these magnetic moments but for now we are not missing any data based on a, b, c, alpha, beta, gamma anyways so its alright

In [7]:
print("If I understood from the paper this is part of the dataset that we know is right as its in literature \n"
      ,inlit_df["Lowest distortion"].value_counts())

print("\n The 79% accurate predicted data set lots of cubic predicted\n"
      ,df["Lowest distortion"].value_counts())

If I understood from the paper this is part of the dataset that we know is right as its in literature 
 orthorhombic    183
cubic            18
rhombohedral     15
tetragonal        6
Name: Lowest distortion, dtype: int64

 The 79% accurate predicted data set lots of cubic predicted
 cubic           3253
orthorhombic    1573
rhombohedral     323
tetragonal       127
-                 53
Name: Lowest distortion, dtype: int64


In [8]:
# Get the column names that have numbers in them
numeric_columns = ["a [ang]","b [ang]", "c [ang]", "alpha [deg]", "beta [deg]", "gamma [deg]"]

# Convert the columns to float
inlit_df[numeric_columns] = inlit_df[numeric_columns].astype(float)

In [9]:
inlit_df.dtypes

Chemical formula               object
A                              object
B                              object
In literature                    bool
Radius A [ang]                float64
Radius B [ang]                float64
Lowest distortion              object
Formation energy [eV/atom]     object
Stability [eV/atom]            object
Magnetic moment [mu_B]         object
Volume per atom [A^3/atom]     object
Band gap [eV]                  object
a [ang]                       float64
b [ang]                       float64
c [ang]                       float64
alpha [deg]                   float64
beta [deg]                    float64
gamma [deg]                   float64
Vacancy energy [eV/O atom]     object
dtype: object

I need to make a dataset, this dataset will go through my lattice maker and give back coords (based on their respective unit measurements), these coords will then go through the pipelining, this will give us our X data. The Y data will be what type of bravais lattice is the compound. We can then do a train test split on this, either using a random forest or a NN. 

In [10]:
coords = []
for index, row in inlit_df.iterrows():
    coords.append(st.Structure(2,2,2,
                               row["a [ang]"] ,row["b [ang]"], row["c [ang]"], 
                               row["alpha [deg]"], row["beta [deg]"], row["gamma [deg]"],
                               False, False, False))

[array([[ 0.        ,  0.        ,  0.        ],
        [ 0.        , -0.51204286,  5.611     ],
        [-0.51204286,  5.611     ,  0.        ],
        [-0.51204286,  5.09895714,  5.611     ],
        [ 5.611     ,  0.        , -0.51204286],
        [ 5.611     , -0.51204286,  5.09895714],
        [ 5.09895714,  5.611     , -0.51204286],
        [ 5.09895714,  5.09895714,  5.09895714]]),
 array([[0.   , 0.   , 0.   ],
        [0.   , 0.   , 7.848],
        [0.   , 5.591, 0.   ],
        [0.   , 5.591, 7.848],
        [5.539, 0.   , 0.   ],
        [5.539, 0.   , 7.848],
        [5.539, 5.591, 0.   ],
        [5.539, 5.591, 7.848]]),
 array([[ 0.        ,  0.        ,  0.        ],
        [ 0.        , -0.51054292,  5.584     ],
        [-0.51054292,  5.584     ,  0.        ],
        [-0.51054292,  5.07345708,  5.584     ],
        [ 5.584     ,  0.        , -0.51054292],
        [ 5.584     , -0.51054292,  5.07345708],
        [ 5.07345708,  5.584     , -0.51054292],
        [ 5.0

In [11]:
matrix_list, feat_cryst_list = featurising_coords(coords_of_structures=coords)
inlit_df['Crystals Featurised'] = feat_cryst_list

In [12]:
class_names = inlit_df["Lowest distortion"].unique()
class_names

array(['rhombohedral', 'orthorhombic', 'tetragonal', 'cubic'],
      dtype=object)

UMMM yeah not balanced let alone not enough variety in crystal structures anyways....

In [13]:
inlit_df["Lowest distortion"] = inlit_df["Lowest distortion"].astype('category')
inlit_df["Lowest distortion"] = inlit_df["Lowest distortion"].cat.codes
inlit_df

Unnamed: 0,Chemical formula,A,B,In literature,Radius A [ang],Radius B [ang],Lowest distortion,Formation energy [eV/atom],Stability [eV/atom],Magnetic moment [mu_B],Volume per atom [A^3/atom],Band gap [eV],a [ang],b [ang],c [ang],alpha [deg],beta [deg],gamma [deg],Vacancy energy [eV/O atom],Crystals Featurised
109,AgNbO3,Ag,Nb,True,1.28,0.64,2,-2.009,0.043,0.000,12.263,1.269,5.611,5.611,5.611,59.2,59.2,59.2,4.672,"[0.5295302222328426, 0.7044194820253026, -0.0,..."
125,AgSbO3,Ag,Sb,True,1.28,0.60,1,-1.157,0.109,-,12.152,0.000,5.539,5.591,7.848,90.0,90.0,90.0,2.278,"[0.5210597052388797, 0.6997363278660437, -1.0,..."
131,AgTaO3,Ag,Ta,True,1.28,0.64,2,-2.247,0.018,0.000,12.114,2.030,5.584,5.584,5.584,59.3,59.3,59.3,5.586,"[0.5302239537023041, 0.7060157986914541, -0.0,..."
140,AgVO3,Ag,V,True,1.28,0.54,3,-1.582,0.125,0.000,11.419,0.341,3.684,3.684,4.208,90.0,90.0,90.0,1.832,"[0.5947225131079968, 0.8143470613673254, -1.0,..."
449,BaCeO3,Ba,Ce,True,1.61,0.87,1,-2.784,0.150,0.200,18.712,0.000,6.355,6.496,9.066,90.0,90.0,90.0,-0.006,"[0.501824555094652, 0.6611589176987488, -1.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5129,YbFeO3,Yb,Fe,True,1.04,0.65,1,-2.384,-0.123,0.799,10.360,0.000,5.221,5.341,7.429,90.0,90.0,90.0,1.336,"[0.5288278861978134, 0.7192337910962412, -1.0,..."
5143,YbMnO3,Yb,Mn,True,1.14,0.53,1,-2.610,-0.078,0.600,10.224,0.700,5.193,5.322,7.398,90.0,90.0,90.0,1.429,"[0.5294897466387292, 0.7211714988325235, -1.0,..."
5148,YbNiO3,Yb,Ni,True,1.04,0.60,1,-1.873,0.027,0.000,9.982,0.000,5.126,5.319,7.323,90.0,90.0,90.0,-0.017,"[0.5308019192930169, 0.7260711920959831, -1.0,..."
5173,YbTiO3,Yb,Ti,True,1.14,0.61,1,-3.528,-0.128,0.000,10.921,2.364,5.298,5.449,7.566,90.0,90.0,90.0,5.064,"[0.526379589684247, 0.7151210880429385, -1.0, ..."


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MultiLabelBinarizer

features = inlit_df["Crystals Featurised"]
target = inlit_df["Lowest distortion"]

mlb = MultiLabelBinarizer()
features_transformed = mlb.fit_transform(inlit_df["Crystals Featurised"])

x_train, x_test ,y_train, y_test = train_test_split(features_transformed,target,test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

In [21]:
accuracy = round(model.score(x_test, y_test) * 100, 2)
print(f"Accuracy of the random forest: {accuracy}%")

Accuracy of the random forest: 82.22%


Add in some more evaluation numbers here like recall precision and CV and f1score (all things i can also talk about within my thesis)