### Load data

In [1]:
import pandas as pd
import joblib

from pymatgen import core
from matminer.featurizers.composition import ElementProperty, ElementFraction, Stoichiometry, IonProperty
from matminer.featurizers.structure import JarvisCFID, DensityFeatures, StructuralComplexity, GlobalSymmetryFeatures

In [3]:
def element_property_features(df, formula_col_name='formula'):
    """
    Append matminer ElementProperty descriptors to a dataframe

    A 'composition' column holding pymatgen Composition objects is written
    onto the dataframe that was passed in (parsed from the formula strings),
    then every ElementProperty preset listed below is applied in turn.

    args:
        (1) df (pandas.DataFrame) - table with the formula strings; it receives the 'composition' column
        (2) formula_col_name (str) - name of the column holding the formula strings

    return:
        (1) dataframe returned by the last featurize_dataframe call
    """

    # Parse each formula string into a pymatgen Composition object
    df['composition'] = df[formula_col_name].apply(core.Composition)

    # ElementProperty presets, applied one after another in this order
    for preset in ('magpie', 'matminer', 'deml', 'megnet_el'):
        featurizer = ElementProperty.from_preset(preset)
        print('Generating element property using ' + preset)
        print('No. of features:', len(featurizer.feature_labels()))

        # ignore_errors=True: a row that fails to featurize does not abort the run
        df = featurizer.featurize_dataframe(df, 'composition', ignore_errors=True)

    return df



def additional_compositional_features(df, formula_col_name='formula'):
    """
    Append ElementFraction, Stoichiometry and IonProperty descriptors to a dataframe

    The featurizers read the 'composition' column. If the dataframe does not
    have one yet, it is created (on the dataframe that was passed in) by
    parsing the formula strings into pymatgen Composition objects; an existing
    'composition' column is reused as-is, in which case the formula column is
    not needed at all.

    args:
        (1) df (pandas.DataFrame) - table with a 'composition' column and/or the formula strings
        (2) formula_col_name (str) - name of the column holding the formula strings

    return:
        (1) dataframe returned by the last featurize_dataframe call

    raises:
        KeyError - if neither 'composition' nor formula_col_name is a column of df
    """

    # The guard must test for the column the featurizers consume. Testing for the
    # formula column instead (as before) could only enter the branch when the
    # formula column was missing, i.e. exactly when parsing it is impossible.
    if 'composition' not in df.columns:
        df['composition'] = df[formula_col_name].apply(core.Composition)

    # (label used in the progress message, featurizer instance)
    descriptors = [
        ('ElementFraction', ElementFraction()),
        ('Stoichiometry', Stoichiometry()),
        ('IonProperty', IonProperty()),
    ]

    for name, des in descriptors:

        print('Generating element property using ' + name)
        print('No. of features:', len(des.feature_labels()))

        # ignore_errors=True: a row that fails to featurize does not abort the run
        df = des.featurize_dataframe(df, 'composition', ignore_errors=True)

    return df


def structural_features(df, struc_col_name='structure'):
    """
    Append structure-based matminer descriptors to a dataframe

    args:
        (1) df (pandas.DataFrame) - table with one structure per row
        (2) struc_col_name (str) - name of the column handed to each featurizer

    return:
        (1) dataframe returned by the last featurize_dataframe call
    """

    # Progress-message labels paired with their featurizer instances, in run order
    labelled_featurizers = zip(
        ['DensityFeatures', 'StructuralComplexity', 'GlobalSymmetryFeatures', 'JarvisCFID'],
        [DensityFeatures(), StructuralComplexity(), GlobalSymmetryFeatures(), JarvisCFID()],
    )

    for label, featurizer in labelled_featurizers:
        print('Generating structural features using ' + label)
        print('No. of features:', len(featurizer.feature_labels()))

        # ignore_errors=True: a row that fails to featurize does not abort the run
        df = featurizer.featurize_dataframe(df, struc_col_name, ignore_errors=True)

    return df


In [4]:
# Load the pickled source table.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files from a trusted source.
df_stable = joblib.load('mag_order_data_stable.pkl')

# Throw away the stored index and renumber the rows from 0
df_stable = df_stable.reset_index(drop=True)
df_stable

Unnamed: 0,formula,formula_pretty,parent_structure,task_id,structure,total_magnetization,total_magnetization_per_formula_unit,total_magnetization_per_unit_volume,ordering,ordering_changed,...,wf_meta.wf_name,wf_meta.wf_version,magmoms.vasp,magmoms.bader,input.structure,input.ordering,input.symmetry,input.index,input.origin,input.input_index
0,La2 Fe2 O6,LaFeO3,[[ 0.00000000e+00 -2.37510023e-07 3.39600692e...,46,[[-7.82001400e-06 -1.12207100e-06 3.38669968e...,0.000000,0.000000,0.000000e+00,AFM,False,...,MagneticOrderingsWF,2.0,"[0.0, 0.0, -4.244, 4.244, -0.0, -0.0, -0.0, 0....","[2e-06, 2e-06, -4.205593, 4.205577, -5e-06, 3e...",[[4.34678088e-07 1.60971119e-06 3.39043362e+00...,FiM,R-3c,2,afm,
1,La1 Fe4 Cu3 O12,LaFe4(CuO4)3,"[[0. 0. 0.] La, [ 1.87726055 -1.87726055 -1.87...",8,"[[0. 0. 0.] La, [ 1.877008 -1.8796705 -1.8749...",0.000023,0.000023,1.110162e-07,AFM,True,...,MagneticOrderingsWF,2.0,"[-0.0, -4.269, -4.269, 4.269, 4.269, -0.0, -0....","[-7.1e-05, -4.233391, -4.233098, 4.233435, 4.2...","[[0. 0. 0.] La, [ 1.8772605 -1.8772605 -1.8772...",FiM,Im-3,14,ferri_by_Fe,
2,V2 O4,VO2,"[[2.97386648 1.34460249 1.84605295] O, [1.9365...",161,"[[1.2111995 2.2557045 1.904173 ] V, [ 1.283501...",2.006047,4.012094,3.239287e-02,FM,False,...,MagneticOrderingsWF,2.0,"[1.073, 1.073, -0.052, -0.052, -0.052, -0.052]","[1.053744, 1.053376, -0.026466, -0.026404, -0....","[[1.20935 2.252478 1.9011065] V, [ 1.282399...",FM,Pnnm,0,fm,0.0
3,La2 Mn2 Se2 O3,La2Mn2Se2O3,[[2.66008616e-07 2.66008616e-07 5.93014472e+00...,75,"[[4.16594899 4.16592861 5.98962342] La, [2.083...",0.000000,0.000000,0.000000e+00,AFM,False,...,MagneticOrderingsWF,2.0,"[-0.0, -0.0, -4.525, 4.525, -0.0, -0.0, 0.0, -...","[-0.0, -0.0, -4.608927, 4.608923, 3e-06, -7e-0...","[[4.1984979 4.1984979 5.9301405] La, [2.099252...",FiM,I4/mmm,3,afm,
4,Sr1 Fe4 Cu3 O12,SrFe4(CuO4)3,"[[0. 0. 0.] Sr, [ 1.87094019 -1.87094019 -1.87...",108,"[[0. 0. 0.] Sr, [-1.86388225 1.87658728 -5.60...",8.996959,8.996959,4.311373e-02,FiM,True,...,MagneticOrderingsWF,2.0,"[0.003, -4.196, -4.196, -4.197, 4.237, -0.014,...","[0.002875, -4.160743, -4.160419, -4.161457, 4....","[[0. 0. 0.] Sr, [ 1.87094 -1.87094 -1.87094] F...",FiM,Im-3,4,ferri_by_Fe,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4600,Li4 Cr2 Ni2 P4 O16,Li2CrNi(PO4)2,"[[0. 0. 0.] Li, [0. 3.06495227 0. ...",81795,[[6.28734271e-04 6.14384626e+00 1.50546297e-04...,4.000000,8.000000,1.341678e-02,FiM,False,...,MagneticOrderingsWF,2.0,"[0.001, 0.001, 0.0, 0.0, 3.783, 3.784, -1.81, ...",Bader analysis failed: BaderAnalysis requires ...,"[[0. 0. 0.] Li, [0. 3.0649525 0. ...",FiM,P2_1/m,1,afm,0.0
4601,Li4 Mn2 Fe2 P4 O16,Li2MnFe(PO4)2,[[1.05032040e+01 9.09615345e-04 1.70381817e-03...,82490,[[1.04912224e+01 6.10416007e+00 4.49435081e-03...,0.000000,0.000000,0.000000e+00,AFM,False,...,MagneticOrderingsWF,2.0,"[0.0, 0.0, -0.0, -0.0, -4.658, 4.657, -3.776, ...",Bader analysis failed: BaderAnalysis requires ...,[[1.04912274e+01 6.10416138e+00 4.49289357e-03...,FiM,P1,2,afm,0.0
4602,Li4 Ni8 O4 F12,LiNi2OF3,"[[ 0.9172325 -3.05482915 -3.2857203 ] Li, [ 3...",82677,"[[ 0.91477017 -3.05591818 -3.29087249] Li, [ 3...",15.999357,63.997427,5.101179e-02,FM,False,...,MagneticOrderingsWF,2.0,"[0.003, 0.003, 0.003, 0.003, 1.808, 1.808, 1.8...",Bader analysis failed: BaderAnalysis requires ...,"[[ 0.91723147 -3.05482919 -3.28572031] Li, [ 3...",FM,Pnma,0,fm,0.0
4603,Li4 Cr4 C4 O16,LiCrCO4,"[[0.76043907 2.83791863 8.38700807] Li, [0.761...",82970,"[[0.75797695 2.86381312 8.52104199] Li, [0.766...",12.022615,48.090458,4.418863e-02,FM,False,...,MagneticOrderingsWF,2.0,"[0.002, 0.002, 0.002, 0.002, 3.004, 3.005, 3.0...",Bader analysis failed: BaderAnalysis requires ...,"[[0.75797662 2.86381081 8.52104391] Li, [0.766...",FM,Pnma,0,fm,0.0


In [5]:
# Add the ElementProperty preset descriptors. The helper also writes a
# 'composition' column onto df_stable itself before featurizing.
df2 = element_property_features(df=df_stable, formula_col_name='formula')
df2.iloc[:3]

Generating element property using magpie
No. of features: 132


ElementProperty:   0%|          | 0/4605 [00:00<?, ?it/s]

Generating element property using matminer
No. of features: 65


ElementProperty:   0%|          | 0/4605 [00:00<?, ?it/s]

Generating element property using deml
No. of features: 80


ElementProperty:   0%|          | 0/4605 [00:00<?, ?it/s]

Generating element property using megnet_el
No. of features: 80


ElementProperty:   0%|          | 0/4605 [00:00<?, ?it/s]

Unnamed: 0,formula,formula_pretty,parent_structure,task_id,structure,total_magnetization,total_magnetization_per_formula_unit,total_magnetization_per_unit_volume,ordering,ordering_changed,...,MEGNetElementData minimum embedding 15,MEGNetElementData maximum embedding 15,MEGNetElementData range embedding 15,MEGNetElementData mean embedding 15,MEGNetElementData std_dev embedding 15,MEGNetElementData minimum embedding 16,MEGNetElementData maximum embedding 16,MEGNetElementData range embedding 16,MEGNetElementData mean embedding 16,MEGNetElementData std_dev embedding 16
0,La2 Fe2 O6,LaFeO3,[[ 0.00000000e+00 -2.37510023e-07 3.39600692e...,46,[[-7.82001400e-06 -1.12207100e-06 3.38669968e...,0.0,0.0,0.0,AFM,False,...,-0.105271,0.192867,0.298138,0.121537,0.154538,-0.333184,0.462238,0.795422,0.049152,0.336545
1,La1 Fe4 Cu3 O12,LaFe4(CuO4)3,"[[0. 0. 0.] La, [ 1.87726055 -1.87726055 -1.87...",8,"[[0. 0. 0.] La, [ 1.877008 -1.8796705 -1.8749...",2.3e-05,2.3e-05,1.110162e-07,AFM,True,...,-0.386133,0.192867,0.578999,0.043464,0.282576,-0.347532,0.462238,0.80977,-0.072314,0.284929
2,V2 O4,VO2,"[[2.97386648 1.34460249 1.84605295] O, [1.9365...",161,"[[1.2111995 2.2557045 1.904173 ] V, [ 1.283501...",2.006047,4.012094,0.03239287,FM,False,...,0.113411,0.192867,0.079455,0.166381,0.056183,0.038902,0.053152,0.01425,0.043652,0.010077


In [6]:
print(df2.columns)

Index(['formula', 'formula_pretty', 'parent_structure', 'task_id', 'structure',
       'total_magnetization', 'total_magnetization_per_formula_unit',
       'total_magnetization_per_unit_volume', 'ordering', 'ordering_changed',
       ...
       'MEGNetElementData minimum embedding 15',
       'MEGNetElementData maximum embedding 15',
       'MEGNetElementData range embedding 15',
       'MEGNetElementData mean embedding 15',
       'MEGNetElementData std_dev embedding 15',
       'MEGNetElementData minimum embedding 16',
       'MEGNetElementData maximum embedding 16',
       'MEGNetElementData range embedding 16',
       'MEGNetElementData mean embedding 16',
       'MEGNetElementData std_dev embedding 16'],
      dtype='object', length=386)


In [7]:
# Add the ElementFraction, Stoichiometry and IonProperty descriptors; they are
# computed from the 'composition' column that df2 carries over from the previous step.
df3 = additional_compositional_features(df=df2, formula_col_name='formula')
df3.iloc[:3]

Generating element property using ElementFraction
No. of features: 103


ElementFraction:   0%|          | 0/4605 [00:00<?, ?it/s]

Generating element property using Stoichiometry
No. of features: 6


Stoichiometry:   0%|          | 0/4605 [00:00<?, ?it/s]

Generating element property using IonProperty
No. of features: 3


IonProperty:   0%|          | 0/4605 [00:00<?, ?it/s]

Unnamed: 0,formula,formula_pretty,parent_structure,task_id,structure,total_magnetization,total_magnetization_per_formula_unit,total_magnetization_per_unit_volume,ordering,ordering_changed,...,Lr,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,compound possible,max ionic char,avg ionic char
0,La2 Fe2 O6,LaFeO3,[[ 0.00000000e+00 -2.37510023e-07 3.39600692e...,46,[[-7.82001400e-06 -1.12207100e-06 3.38669968e...,0.0,0.0,0.0,AFM,False,...,0,3,0.663325,0.614463,0.600984,0.600078,0.600002,True,0.745613,0.151694
1,La1 Fe4 Cu3 O12,LaFe4(CuO4)3,"[[0. 0. 0.] La, [ 1.87726055 -1.87726055 -1.87...",8,"[[0. 0. 0.] La, [ 1.877008 -1.8796705 -1.8749...",2.3e-05,2.3e-05,1.110162e-07,AFM,True,...,0,4,0.65192,0.610465,0.60061,0.600044,0.600001,False,0.745613,0.122247
2,V2 O4,VO2,"[[2.97386648 1.34460249 1.84605295] O, [1.9365...",161,"[[1.2111995 2.2557045 1.904173 ] V, [ 1.283501...",2.006047,4.012094,0.03239287,FM,False,...,0,2,0.745356,0.693361,0.670782,0.667408,0.666732,False,0.559139,0.124253


In [8]:
# Add the structure-based descriptors, computed from the 'structure' column
df4 = structural_features(df=df3, struc_col_name='structure')
df4.iloc[:3]

Generating structural features using DensityFeatures
No. of features: 3


DensityFeatures:   0%|          | 0/4605 [00:00<?, ?it/s]

Generating structural features using StructuralComplexity
No. of features: 2


StructuralComplexity:   0%|          | 0/4605 [00:00<?, ?it/s]

Generating structural features using GlobalSymmetryFeatures
No. of features: 5


GlobalSymmetryFeatures:   0%|          | 0/4605 [00:00<?, ?it/s]

Generating structural features using JarvisCFID
No. of features: 1557


JarvisCFID:   0%|          | 0/4605 [00:00<?, ?it/s]

Unnamed: 0,formula,formula_pretty,parent_structure,task_id,structure,total_magnetization,total_magnetization_per_formula_unit,total_magnetization_per_unit_volume,ordering,ordering_changed,...,jml_nn_91,jml_nn_92,jml_nn_93,jml_nn_94,jml_nn_95,jml_nn_96,jml_nn_97,jml_nn_98,jml_nn_99,jml_nn_100
0,La2 Fe2 O6,LaFeO3,[[ 0.00000000e+00 -2.37510023e-07 3.39600692e...,46,[[-7.82001400e-06 -1.12207100e-06 3.38669968e...,0.0,0.0,0.0,AFM,False,...,1.2,7.2,3.6,4.8,3.6,10.8,2.4,31.2,12.0,10.8
1,La1 Fe4 Cu3 O12,LaFe4(CuO4)3,"[[0. 0. 0.] La, [ 1.87726055 -1.87726055 -1.87...",8,"[[0. 0. 0.] La, [ 1.877008 -1.8796705 -1.8749...",2.3e-05,2.3e-05,1.110162e-07,AFM,True,...,0.3,0.1,0.1,0.0,0.1,0.0,0.2,0.1,0.0,0.2
2,V2 O4,VO2,"[[2.97386648 1.34460249 1.84605295] O, [1.9365...",161,"[[1.2111995 2.2557045 1.904173 ] V, [ 1.283501...",2.006047,4.012094,0.03239287,FM,False,...,17.333333,12.666667,0.0,18.666667,2.666667,14.666667,10.666667,5.333333,20.0,10.666667


In [9]:
df4

Unnamed: 0,formula,formula_pretty,parent_structure,task_id,structure,total_magnetization,total_magnetization_per_formula_unit,total_magnetization_per_unit_volume,ordering,ordering_changed,...,jml_nn_91,jml_nn_92,jml_nn_93,jml_nn_94,jml_nn_95,jml_nn_96,jml_nn_97,jml_nn_98,jml_nn_99,jml_nn_100
0,La2 Fe2 O6,LaFeO3,[[ 0.00000000e+00 -2.37510023e-07 3.39600692e...,46,[[-7.82001400e-06 -1.12207100e-06 3.38669968e...,0.000000,0.000000,0.000000e+00,AFM,False,...,1.200000,7.200000,3.600000,4.800000,3.600000,10.800000,2.400000,31.200000,12.000000,10.800000
1,La1 Fe4 Cu3 O12,LaFe4(CuO4)3,"[[0. 0. 0.] La, [ 1.87726055 -1.87726055 -1.87...",8,"[[0. 0. 0.] La, [ 1.877008 -1.8796705 -1.8749...",0.000023,0.000023,1.110162e-07,AFM,True,...,0.300000,0.100000,0.100000,0.000000,0.100000,0.000000,0.200000,0.100000,0.000000,0.200000
2,V2 O4,VO2,"[[2.97386648 1.34460249 1.84605295] O, [1.9365...",161,"[[1.2111995 2.2557045 1.904173 ] V, [ 1.283501...",2.006047,4.012094,3.239287e-02,FM,False,...,17.333333,12.666667,0.000000,18.666667,2.666667,14.666667,10.666667,5.333333,20.000000,10.666667
3,La2 Mn2 Se2 O3,La2Mn2Se2O3,[[2.66008616e-07 2.66008616e-07 5.93014472e+00...,75,"[[4.16594899 4.16592861 5.98962342] La, [2.083...",0.000000,0.000000,0.000000e+00,AFM,False,...,7.111111,3.555556,1.777778,12.444444,0.000000,0.888889,12.444444,0.000000,8.888889,3.555556
4,Sr1 Fe4 Cu3 O12,SrFe4(CuO4)3,"[[0. 0. 0.] Sr, [ 1.87094019 -1.87094019 -1.87...",108,"[[0. 0. 0.] Sr, [-1.86388225 1.87658728 -5.60...",8.996959,8.996959,4.311373e-02,FiM,True,...,0.000000,0.200000,0.100000,0.000000,0.100000,0.100000,0.200000,0.200000,0.000000,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4600,Li4 Cr2 Ni2 P4 O16,Li2CrNi(PO4)2,"[[0. 0. 0.] Li, [0. 3.06495227 0. ...",81795,[[6.28734271e-04 6.14384626e+00 1.50546297e-04...,4.000000,8.000000,1.341678e-02,FiM,False,...,11.000000,9.285714,11.142857,10.428571,13.000000,10.285714,10.857143,11.857143,12.142857,9.428571
4601,Li4 Mn2 Fe2 P4 O16,Li2MnFe(PO4)2,[[1.05032040e+01 9.09615345e-04 1.70381817e-03...,82490,[[1.04912224e+01 6.10416007e+00 4.49435081e-03...,0.000000,0.000000,0.000000e+00,AFM,False,...,7.500000,10.714286,9.071429,8.642857,10.642857,9.142857,12.214286,11.285714,11.285714,12.785714
4602,Li4 Ni8 O4 F12,LiNi2OF3,"[[ 0.9172325 -3.05482915 -3.2857203 ] Li, [ 3...",82677,"[[ 0.91477017 -3.05591818 -3.29087249] Li, [ 3...",15.999357,63.997427,5.101179e-02,FM,False,...,10.285714,12.500000,6.500000,11.000000,10.357143,9.714286,9.428571,9.285714,9.785714,12.857143
4603,Li4 Cr4 C4 O16,LiCrCO4,"[[0.76043907 2.83791863 8.38700807] Li, [0.761...",82970,"[[0.75797695 2.86381312 8.52104199] Li, [0.766...",12.022615,48.090458,4.418863e-02,FM,False,...,12.428571,10.357143,7.428571,14.428571,11.428571,12.285714,10.142857,13.857143,15.357143,12.642857


In [10]:
# Persist the fully featurized table; the "Custom features" section starts by loading this file
joblib.dump(df4, 'database_mag_order.pkl')

['database_mag_order.pkl']

In [11]:
# Read the file back and display it - presumably a round-trip check of the dump above
df_temp = joblib.load('database_mag_order.pkl')
df_temp

Unnamed: 0,formula,formula_pretty,parent_structure,task_id,structure,total_magnetization,total_magnetization_per_formula_unit,total_magnetization_per_unit_volume,ordering,ordering_changed,...,jml_nn_91,jml_nn_92,jml_nn_93,jml_nn_94,jml_nn_95,jml_nn_96,jml_nn_97,jml_nn_98,jml_nn_99,jml_nn_100
0,La2 Fe2 O6,LaFeO3,[[ 0.00000000e+00 -2.37510023e-07 3.39600692e...,46,[[-7.82001400e-06 -1.12207100e-06 3.38669968e...,0.000000,0.000000,0.000000e+00,AFM,False,...,1.200000,7.200000,3.600000,4.800000,3.600000,10.800000,2.400000,31.200000,12.000000,10.800000
1,La1 Fe4 Cu3 O12,LaFe4(CuO4)3,"[[0. 0. 0.] La, [ 1.87726055 -1.87726055 -1.87...",8,"[[0. 0. 0.] La, [ 1.877008 -1.8796705 -1.8749...",0.000023,0.000023,1.110162e-07,AFM,True,...,0.300000,0.100000,0.100000,0.000000,0.100000,0.000000,0.200000,0.100000,0.000000,0.200000
2,V2 O4,VO2,"[[2.97386648 1.34460249 1.84605295] O, [1.9365...",161,"[[1.2111995 2.2557045 1.904173 ] V, [ 1.283501...",2.006047,4.012094,3.239287e-02,FM,False,...,17.333333,12.666667,0.000000,18.666667,2.666667,14.666667,10.666667,5.333333,20.000000,10.666667
3,La2 Mn2 Se2 O3,La2Mn2Se2O3,[[2.66008616e-07 2.66008616e-07 5.93014472e+00...,75,"[[4.16594899 4.16592861 5.98962342] La, [2.083...",0.000000,0.000000,0.000000e+00,AFM,False,...,7.111111,3.555556,1.777778,12.444444,0.000000,0.888889,12.444444,0.000000,8.888889,3.555556
4,Sr1 Fe4 Cu3 O12,SrFe4(CuO4)3,"[[0. 0. 0.] Sr, [ 1.87094019 -1.87094019 -1.87...",108,"[[0. 0. 0.] Sr, [-1.86388225 1.87658728 -5.60...",8.996959,8.996959,4.311373e-02,FiM,True,...,0.000000,0.200000,0.100000,0.000000,0.100000,0.100000,0.200000,0.200000,0.000000,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4600,Li4 Cr2 Ni2 P4 O16,Li2CrNi(PO4)2,"[[0. 0. 0.] Li, [0. 3.06495227 0. ...",81795,[[6.28734271e-04 6.14384626e+00 1.50546297e-04...,4.000000,8.000000,1.341678e-02,FiM,False,...,11.000000,9.285714,11.142857,10.428571,13.000000,10.285714,10.857143,11.857143,12.142857,9.428571
4601,Li4 Mn2 Fe2 P4 O16,Li2MnFe(PO4)2,[[1.05032040e+01 9.09615345e-04 1.70381817e-03...,82490,[[1.04912224e+01 6.10416007e+00 4.49435081e-03...,0.000000,0.000000,0.000000e+00,AFM,False,...,7.500000,10.714286,9.071429,8.642857,10.642857,9.142857,12.214286,11.285714,11.285714,12.785714
4602,Li4 Ni8 O4 F12,LiNi2OF3,"[[ 0.9172325 -3.05482915 -3.2857203 ] Li, [ 3...",82677,"[[ 0.91477017 -3.05591818 -3.29087249] Li, [ 3...",15.999357,63.997427,5.101179e-02,FM,False,...,10.285714,12.500000,6.500000,11.000000,10.357143,9.714286,9.428571,9.285714,9.785714,12.857143
4603,Li4 Cr4 C4 O16,LiCrCO4,"[[0.76043907 2.83791863 8.38700807] Li, [0.761...",82970,"[[0.75797695 2.86381312 8.52104199] Li, [0.766...",12.022615,48.090458,4.418863e-02,FM,False,...,12.428571,10.357143,7.428571,14.428571,11.428571,12.285714,10.142857,13.857143,15.357143,12.642857


#### Custom features

In [1]:
import numpy as np
import pandas as pd
import joblib

from pymatgen.core import Composition, Element   

In [2]:
def OHE(df, categorical_cols):
    """
    One-hot-encoding of categorical columns

    args: 
        (1) df (pandas.DataFrame) - table containing the categorical columns
        (2) categorical_cols (list) - list of categorical columns

    return: 
        (1) df_ohe (pandas.DataFrame) - new dataframe in which every column of
            categorical_cols is replaced by indicator columns named
            '<column>_ohe_<category>' (no category is dropped)
        (2) ohe_cols (list) - labels of df_ohe that contain the '_ohe_' marker
    """

    df_ohe = pd.get_dummies(data = df, columns = categorical_cols, prefix_sep = '_ohe_', drop_first = False)

    # Only string labels can carry the marker. The isinstance check keeps a frame
    # with non-string column labels (e.g. integers) from raising TypeError on `in`.
    ohe_cols = [i for i in df_ohe.columns if isinstance(i, str) and '_ohe_' in i]

    return df_ohe, ohe_cols

def custom_features(df):
    """
    Add composition-derived columns to a dataframe

    Each entry of the 'formula' column is parsed once into a pymatgen
    Composition, from which the following columns are written onto the
    dataframe that was passed in:
        - 'weight'         : Composition.weight
        - 'total_e'        : Composition.total_electrons
        - 'avg_electroneg' : Composition.average_electroneg
        - one 0/1 integer column per element category listed below, holding
          Composition.contains_element_type(<category>)

    args:
        (1) df (pandas.DataFrame) - table with a 'formula' column; modified in place

    return:
        (1) the same dataframe object, with the new columns added
    """

    # Parse every formula a single time and reuse the objects for all features,
    # instead of rebuilding a Composition for each of the 20 derived columns.
    compositions = df['formula'].map(lambda formula: Composition(str(formula)))

    # Generate total molecular weight of Composition
    df['weight'] = compositions.map(lambda comp: comp.weight)

    # Generate total electrons
    df['total_e'] = compositions.map(lambda comp: comp.total_electrons)

    # Generate average electronegativity of the composition
    df['avg_electroneg'] = compositions.map(lambda comp: comp.average_electroneg)

    # Check if Composition contains any elements matching a given category
    category = [
                'noble_gas', 'transition_metal', 'post_transition_metal', 'rare_earth_metal', 'metal', 'metalloid',
                'alkali', 'alkaline', 'halogen', 'chalcogen', 'lanthanoid', 'actinoid', 'quadrupolar', 's-block', 'p-block',
                'd-block', 'f-block'
                ]

    for c in category:
        # Boolean membership flag stored as 0/1
        df[c] = compositions.map(lambda comp: comp.contains_element_type(c)).astype(int)

    return df
    

def movecol(dataframe, cols_to_move = None, ref_col = '', place = 'after'):
    """
    Return a view of the dataframe with some columns relocated next to a reference column

    args:
        (1) dataframe (pandas.DataFrame) - table whose columns are reordered (not modified)
        (2) cols_to_move (list) - column labels to relocate, kept in the given order;
            None (the default) means no columns
        (3) ref_col - label of the reference column; must be a column of dataframe
        (4) place (str) - 'after' puts cols_to_move directly after ref_col,
            'before' puts them directly before it

    return:
        (1) dataframe[...] selection with the new column order; all other
            columns keep their relative order

    raises:
        ValueError - if place is neither 'after' nor 'before', or ref_col is not a column
    """

    # Fail with a clear message instead of an UnboundLocalError further down
    if place not in ('after', 'before'):
        raise ValueError("place must be 'after' or 'before', got {!r}".format(place))

    moved = [] if cols_to_move is None else list(cols_to_move)
    cols = dataframe.columns.tolist()
    ref_pos = cols.index(ref_col)  # ValueError if ref_col is not a column

    if place == 'after':
        s1 = cols[:ref_pos + 1]
        s2 = moved
    else:
        s1 = cols[:ref_pos]
        s2 = moved + [ref_col]

    # Set lookups keep the two filters linear in the number of columns
    in_s2 = set(s2)
    s1 = [i for i in s1 if i not in in_s2]
    already_placed = in_s2.union(s1)
    s3 = [i for i in cols if i not in already_placed]

    return dataframe[s1 + s2 + s3]

In [3]:
# Load the featurized table written at the end of the first section.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files from a trusted source.
df = joblib.load('database_mag_order.pkl')

# Columns removed before modelling: identifiers, raw structure/composition objects,
# workflow metadata, magnetic-moment lists and the alternative magnetization columns.
columns = [
            'formula_pretty', 'parent_structure', 'task_id', 'structure', 'total_magnetization_per_formula_unit',
            'total_magnetization_per_unit_volume', 'symmetry', 'symmetry_changed', 'stable', 'decomposes_to', 
            'wf_meta.wf_uuid', 'wf_meta.wf_name', 'wf_meta.wf_version', 'magmoms.vasp', 'magmoms.bader',
            'input.structure', 'input.ordering', 'input.symmetry', 'input.index',
            'input.origin', 'input.input_index', 'ordering_changed', 'composition'
            ]

# drop raises KeyError if any of the listed columns is missing from df
df = df.drop(columns, axis=1)
# Every remaining NaN becomes 0 - presumably this includes features left empty where
# featurization failed under ignore_errors=True; TODO confirm 0 is a sensible fill for those.
df = df.fillna(0)
df

Unnamed: 0,formula,total_magnetization,ordering,energy_per_atom,energy_above_ground_state_per_atom,energy_diff_relax_static,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,...,jml_nn_91,jml_nn_92,jml_nn_93,jml_nn_94,jml_nn_95,jml_nn_96,jml_nn_97,jml_nn_98,jml_nn_99,jml_nn_100
0,La2 Fe2 O6,0.000000,AFM,-7.639507,0.0,-0.000030,8.0,57.0,49.0,21.400000,...,1.200000,7.200000,3.600000,4.800000,3.600000,10.800000,2.400000,31.200000,12.000000,10.800000
1,La1 Fe4 Cu3 O12,0.000023,AFM,-6.342495,0.0,-0.000002,8.0,57.0,49.0,17.200000,...,0.300000,0.100000,0.100000,0.000000,0.100000,0.000000,0.200000,0.100000,0.000000,0.200000
2,V2 O4,2.006047,FM,-7.782158,0.0,0.000058,8.0,23.0,15.0,13.000000,...,17.333333,12.666667,0.000000,18.666667,2.666667,14.666667,10.666667,5.333333,20.000000,10.666667
3,La2 Mn2 Se2 O3,0.000000,AFM,-7.443381,0.0,-0.000077,8.0,57.0,49.0,28.444444,...,7.111111,3.555556,1.777778,12.444444,0.000000,0.888889,12.444444,0.000000,8.888889,3.555556
4,Sr1 Fe4 Cu3 O12,8.996959,FiM,-6.010237,0.0,-0.000130,8.0,38.0,30.0,16.250000,...,0.000000,0.200000,0.100000,0.000000,0.100000,0.100000,0.200000,0.200000,0.000000,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4600,Li4 Cr2 Ni2 P4 O16,4.000000,FiM,-6.719955,0.0,-0.000099,3.0,28.0,25.0,10.857143,...,11.000000,9.285714,11.142857,10.428571,13.000000,10.285714,10.857143,11.857143,12.142857,9.428571
4601,Li4 Mn2 Fe2 P4 O16,0.000000,AFM,-7.016451,0.0,-0.000034,3.0,26.0,23.0,10.785714,...,7.500000,10.714286,9.071429,8.642857,10.642857,9.142857,12.214286,11.285714,11.285714,12.785714
4602,Li4 Ni8 O4 F12,15.999357,FM,-4.590463,0.0,-0.000099,3.0,28.0,25.0,13.428571,...,10.285714,12.500000,6.500000,11.000000,10.357143,9.714286,9.428571,9.285714,9.785714,12.857143
4603,Li4 Cr4 C4 O16,12.022615,FM,-7.292449,0.0,-0.000050,3.0,24.0,21.0,9.285714,...,12.428571,10.357143,7.428571,14.428571,11.428571,12.285714,10.142857,13.857143,15.357143,12.642857


In [4]:
# Features by type
# NOTE(review): neither 'compositional_features.pkl' nor 'structural_features.pkl' is
# written by any cell shown in this notebook - confirm where they come from.
compositional_features = joblib.load('compositional_features.pkl')
print('No. of compositional features:', len(compositional_features))

# NOTE(review): this name is also used for the structural featurizer function defined in
# the first section; running both sections in one kernel would rebind it to the loaded object.
structural_features = joblib.load('structural_features.pkl')
print('No. of structural features:', len(structural_features))

# Categorical = every object-dtype column except the 'formula' identifier
categorical_features = [col for col, dt in df.dtypes.items() if dt == object and col != 'formula']
print('No. of categorical features:', len(categorical_features))

categorical_features

No. of compositional features: 469
No. of structural features: 1567
No. of categorical features: 2


['ordering', 'crystal_system']

In [5]:
# One-hot-encoding of categorical features
# df2 is a new frame; ohe_cols lists the indicator columns that were created
df2, ohe_cols = OHE(df=df, categorical_cols=categorical_features)
ohe_cols

['ordering_ohe_AFM',
 'ordering_ohe_FM',
 'ordering_ohe_FiM',
 'ordering_ohe_NM',
 'crystal_system_ohe_cubic',
 'crystal_system_ohe_hexagonal',
 'crystal_system_ohe_monoclinic',
 'crystal_system_ohe_orthorhombic',
 'crystal_system_ohe_tetragonal',
 'crystal_system_ohe_triclinic',
 'crystal_system_ohe_trigonal']

In [6]:
df2.head()

Unnamed: 0,formula,total_magnetization,energy_per_atom,energy_above_ground_state_per_atom,energy_diff_relax_static,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,...,ordering_ohe_FM,ordering_ohe_FiM,ordering_ohe_NM,crystal_system_ohe_cubic,crystal_system_ohe_hexagonal,crystal_system_ohe_monoclinic,crystal_system_ohe_orthorhombic,crystal_system_ohe_tetragonal,crystal_system_ohe_triclinic,crystal_system_ohe_trigonal
0,La2 Fe2 O6,0.0,-7.639507,0.0,-3e-05,8.0,57.0,49.0,21.4,16.08,...,0,0,0,0,0,0,0,0,0,1
1,La1 Fe4 Cu3 O12,2.3e-05,-6.342495,0.0,-2e-06,8.0,57.0,49.0,17.2,11.04,...,0,0,0,1,0,0,0,0,0,0
2,V2 O4,2.006047,-7.782158,0.0,5.8e-05,8.0,23.0,15.0,13.0,6.666667,...,1,0,0,0,0,0,1,0,0,0
3,La2 Mn2 Se2 O3,0.0,-7.443381,0.0,-7.7e-05,8.0,57.0,49.0,28.444444,15.160494,...,0,0,0,0,0,0,0,1,0,0
4,Sr1 Fe4 Cu3 O12,8.996959,-6.010237,0.0,-0.00013,8.0,38.0,30.0,16.25,9.9,...,0,1,0,0,0,0,0,0,0,1


In [7]:
# Check if object type exists
[col for col, dt in df2.dtypes.items() if dt == object and col != 'formula']

[]

In [8]:
# Add further features 
# custom_features writes the new columns onto df2 and returns that same object,
# so df3 and df2 refer to one dataframe after this cell.
df3 = custom_features(df2)
df3

Unnamed: 0,formula,total_magnetization,energy_per_atom,energy_above_ground_state_per_atom,energy_diff_relax_static,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,...,alkaline,halogen,chalcogen,lanthanoid,actinoid,quadrupolar,s-block,p-block,d-block,f-block
0,La2 Fe2 O6,0.000000,-7.639507,0.0,-0.000030,8.0,57.0,49.0,21.400000,16.080000,...,0,0,1,1,0,1,0,1,1,1
1,La1 Fe4 Cu3 O12,0.000023,-6.342495,0.0,-0.000002,8.0,57.0,49.0,17.200000,11.040000,...,0,0,1,1,0,1,0,1,1,1
2,V2 O4,2.006047,-7.782158,0.0,0.000058,8.0,23.0,15.0,13.000000,6.666667,...,0,0,1,0,0,1,0,1,1,0
3,La2 Mn2 Se2 O3,0.000000,-7.443381,0.0,-0.000077,8.0,57.0,49.0,28.444444,15.160494,...,0,0,1,1,0,1,0,1,1,1
4,Sr1 Fe4 Cu3 O12,8.996959,-6.010237,0.0,-0.000130,8.0,38.0,30.0,16.250000,9.900000,...,1,0,1,0,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4600,Li4 Cr2 Ni2 P4 O16,4.000000,-6.719955,0.0,-0.000099,3.0,28.0,25.0,10.857143,5.510204,...,0,0,1,0,0,1,1,1,1,0
4601,Li4 Mn2 Fe2 P4 O16,0.000000,-7.016451,0.0,-0.000034,3.0,26.0,23.0,10.785714,5.408163,...,0,0,1,0,0,1,1,1,1,0
4602,Li4 Ni8 O4 F12,15.999357,-4.590463,0.0,-0.000099,3.0,28.0,25.0,13.428571,8.326531,...,0,1,1,0,0,1,1,1,1,0
4603,Li4 Cr4 C4 O16,12.022615,-7.292449,0.0,-0.000050,3.0,24.0,21.0,9.285714,4.204082,...,0,0,1,0,0,1,1,1,1,0


In [9]:
# Spot check of a single column label by position; 2049 is a hard-coded index that
# only points at the same column while the column count/order stays as it is now.
df3.columns[2049]

'crystal_system_ohe_triclinic'

In [10]:
# Move target variable to the end
# Rearrange columns
# The order of this list is the final column order, so 'total_magnetization' ends up last
cols_to_move = ['energy_per_atom', 'energy_above_ground_state_per_atom', 'energy_diff_relax_static', 'total_magnetization']

# 'f-block' is the last column added by custom_features, so "after f-block" means "at the end"
df3 = movecol(dataframe=df3, cols_to_move = cols_to_move, ref_col = 'f-block', place = 'after')
df3.head()

Unnamed: 0,formula,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,MagpieData range MendeleevNumber,...,actinoid,quadrupolar,s-block,p-block,d-block,f-block,energy_per_atom,energy_above_ground_state_per_atom,energy_diff_relax_static,total_magnetization
0,La2 Fe2 O6,8.0,57.0,49.0,21.4,16.08,8.0,13.0,87.0,74.0,...,0,1,0,1,1,1,-7.639507,0.0,-3e-05,0.0
1,La1 Fe4 Cu3 O12,8.0,57.0,49.0,17.2,11.04,8.0,13.0,87.0,74.0,...,0,1,0,1,1,1,-6.342495,0.0,-2e-06,2.3e-05
2,V2 O4,8.0,23.0,15.0,13.0,6.666667,8.0,46.0,87.0,41.0,...,0,1,0,1,1,0,-7.782158,0.0,5.8e-05,2.006047
3,La2 Mn2 Se2 O3,8.0,57.0,49.0,28.444444,15.160494,8.0,13.0,89.0,76.0,...,0,1,0,1,1,1,-7.443381,0.0,-7.7e-05,0.0
4,Sr1 Fe4 Cu3 O12,8.0,38.0,30.0,16.25,9.9,8.0,8.0,87.0,79.0,...,0,1,1,1,1,0,-6.010237,0.0,-0.00013,8.996959


In [13]:
df3.columns.values[1:-1].tolist()

['MagpieData minimum Number',
 'MagpieData maximum Number',
 'MagpieData range Number',
 'MagpieData mean Number',
 'MagpieData avg_dev Number',
 'MagpieData mode Number',
 'MagpieData minimum MendeleevNumber',
 'MagpieData maximum MendeleevNumber',
 'MagpieData range MendeleevNumber',
 'MagpieData mean MendeleevNumber',
 'MagpieData avg_dev MendeleevNumber',
 'MagpieData mode MendeleevNumber',
 'MagpieData minimum AtomicWeight',
 'MagpieData maximum AtomicWeight',
 'MagpieData range AtomicWeight',
 'MagpieData mean AtomicWeight',
 'MagpieData avg_dev AtomicWeight',
 'MagpieData mode AtomicWeight',
 'MagpieData minimum MeltingT',
 'MagpieData maximum MeltingT',
 'MagpieData range MeltingT',
 'MagpieData mean MeltingT',
 'MagpieData avg_dev MeltingT',
 'MagpieData mode MeltingT',
 'MagpieData minimum Column',
 'MagpieData maximum Column',
 'MagpieData range Column',
 'MagpieData mean Column',
 'MagpieData avg_dev Column',
 'MagpieData mode Column',
 'MagpieData minimum Row',
 'MagpieDat

In [15]:
df3.columns.values[-1]

'total_magnetization'

In [16]:
# Regression example
# Define final input and target features
# Column 0 is 'formula' (skipped); the last column, 'total_magnetization', is the target.
# NOTE(review): the three energy columns moved to the end sit just before the target and
# are therefore part of `features` here - confirm that is intended.
features = df3.columns.values[1:-1].tolist()
target = [df3.columns.values[-1]]

len(features)

2069

In [17]:
# Keep the feature columns followed by the target; this drops the 'formula' column
df4 = df3[features + target]
df4

Unnamed: 0,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,MagpieData range MendeleevNumber,MagpieData mean MendeleevNumber,...,actinoid,quadrupolar,s-block,p-block,d-block,f-block,energy_per_atom,energy_above_ground_state_per_atom,energy_diff_relax_static,total_magnetization
0,8.0,57.0,49.0,21.400000,16.080000,8.0,13.0,87.0,74.0,65.800000,...,0,1,0,1,1,1,-7.639507,0.0,-0.000030,0.000000
1,8.0,57.0,49.0,17.200000,11.040000,8.0,13.0,87.0,74.0,73.450000,...,0,1,0,1,1,1,-6.342495,0.0,-0.000002,0.000023
2,8.0,23.0,15.0,13.000000,6.666667,8.0,46.0,87.0,41.0,73.333333,...,0,1,0,1,1,0,-7.782158,0.0,0.000058,2.006047
3,8.0,57.0,49.0,28.444444,15.160494,8.0,13.0,89.0,76.0,63.222222,...,0,1,0,1,1,1,-7.443381,0.0,-0.000077,0.000000
4,8.0,38.0,30.0,16.250000,9.900000,8.0,8.0,87.0,79.0,73.200000,...,0,1,1,1,1,0,-6.010237,0.0,-0.000130,8.996959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4600,3.0,28.0,25.0,10.857143,5.510204,8.0,1.0,87.0,86.0,69.571429,...,0,1,1,1,1,0,-6.719955,0.0,-0.000099,4.000000
4601,3.0,26.0,23.0,10.785714,5.408163,8.0,1.0,87.0,86.0,69.357143,...,0,1,1,1,1,0,-7.016451,0.0,-0.000034,0.000000
4602,3.0,28.0,25.0,13.428571,8.326531,9.0,1.0,93.0,92.0,69.857143,...,0,1,1,1,1,0,-4.590463,0.0,-0.000099,15.999357
4603,3.0,24.0,21.0,9.285714,4.204082,8.0,1.0,87.0,86.0,67.857143,...,0,1,1,1,1,0,-7.292449,0.0,-0.000050,12.022615


In [18]:
# Persist the numeric table; the "Further cleaning" section starts by loading this file
joblib.dump(df4, 'database_mag_order_cleaned.pkl')

['database_mag_order_cleaned.pkl']

#### Further cleaning

In [1]:
import pandas as pd
import joblib
import re

In [2]:
# Load data
# NOTE(review): joblib.load unpickles arbitrary objects - only load files from a trusted source.
df1 = joblib.load('database_mag_order_cleaned.pkl')

# NOTE(review): the first section loads 'mag_order_data_stable.pkl' from the working
# directory, this cell loads it from 'backup/' - confirm both are the same file.
df2 = joblib.load('backup/mag_order_data_stable.pkl')
df2 = df2[['task_id', 'ordering']]
# reset_index() (without drop=True) renumbers the rows and keeps the old index as an 'index' column
df2 = df2.reset_index()

# Join on the row index of both frames to re-attach the original 'ordering' label.
# Assumes df1 and df2 have the same rows in the same order - TODO confirm.
df = pd.merge(df1, df2, left_index=True, right_index=True)
# 'task_id' and the leftover 'index' column were only carried along by the merge
df = df.drop(['task_id', 'index'], axis=1)
df.head()

Unnamed: 0,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,MagpieData range MendeleevNumber,MagpieData mean MendeleevNumber,...,quadrupolar,s-block,p-block,d-block,f-block,energy_per_atom,energy_above_ground_state_per_atom,energy_diff_relax_static,total_magnetization,ordering
0,8.0,57.0,49.0,21.4,16.08,8.0,13.0,87.0,74.0,65.8,...,1,0,1,1,1,-7.639507,0.0,-3e-05,0.0,AFM
1,8.0,57.0,49.0,17.2,11.04,8.0,13.0,87.0,74.0,73.45,...,1,0,1,1,1,-6.342495,0.0,-2e-06,2.3e-05,AFM
2,8.0,23.0,15.0,13.0,6.666667,8.0,46.0,87.0,41.0,73.333333,...,1,0,1,1,0,-7.782158,0.0,5.8e-05,2.006047,FM
3,8.0,57.0,49.0,28.444444,15.160494,8.0,13.0,89.0,76.0,63.222222,...,1,0,1,1,1,-7.443381,0.0,-7.7e-05,0.0,AFM
4,8.0,38.0,30.0,16.25,9.9,8.0,8.0,87.0,79.0,73.2,...,1,1,1,1,0,-6.010237,0.0,-0.00013,8.996959,FiM


In [3]:
# Remove duplicated column labels, keeping the first occurrence of each
df  = df.loc[:,~df.columns.duplicated()]

# Make the column names plain identifiers: spaces become underscores, then every
# character outside [A-Za-z0-9_] is removed (e.g. 's-block' becomes 'sblock').
# NOTE(review): de-duplication runs before this renaming, so two labels that only
# collide after sanitization would both survive - verify none do.
df.columns = df.columns.str.replace(' ','_')
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Nan to zero
df = df.fillna(0)

In [4]:
df.columns[-5:].tolist()

['energy_per_atom',
 'energy_above_ground_state_per_atom',
 'energy_diff_relax_static',
 'total_magnetization',
 'ordering']

In [5]:
# Everything except the last five columns (three energies, total_magnetization, ordering)
# is treated as an input feature.
# NOTE(review): the one-hot 'ordering_ohe_*' columns created earlier are still among these
# features while 'ordering' is kept as a label - if ordering is ever the prediction target
# this leaks the answer; confirm and exclude them if so.
feature = df.columns[:-5].tolist()
print(len(feature))

2066


In [6]:
# Persist the merged table and the list of feature column names
joblib.dump(df, 'database_mag_order_merged.pkl')
joblib.dump(feature, 'features.pkl')

['features.pkl']