### Load data

In [1]:
# Import packages
import pandas as pd
import joblib

from pymatgen import core

In [2]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import create_material_objects as cmo

In [3]:
# Load dataset
df_original = joblib.load('castelli_perovskites.pkl') 
df_original

Unnamed: 0,fermi level,fermi width,e_form,gap is direct,structure,mu_b,formula,vbm,cbm,gap gllbsc
0,0.312138,0.001837,2.16,True,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...",1.974478e-02,RhTeN3,6.187694,6.187694,0.0
1,0.297083,0.001837,1.52,True,"[[2.54041798 0. 0. ] Hf, [1.020...",-2.253054e-05,HfTeO3,6.033125,6.033125,0.0
2,0.191139,0.003675,1.48,True,"[[0.60790913 0. 0. ] Re, [2.186...",4.982109e+00,ReAsO2F,6.602253,6.602253,0.0
3,0.316346,0.001837,1.24,True,"[[2.83091357 0. 0. ] W, [2.6573...",-8.684496e-01,WReO2S,5.738462,5.738462,0.0
4,0.312658,0.003675,0.62,True,"[[0.00518937 0. 0. ] Bi, [2.172...",2.164069e-15,BiHfO2F,6.074736,6.074736,0.0
...,...,...,...,...,...,...,...,...,...,...
18923,0.123028,0.001837,1.66,True,"[[4.44077598 0. 0. ] Rb, [2.652...",1.566419e+00,RbPdO2S,5.164060,5.164060,0.0
18924,0.081229,0.001837,2.12,True,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,1.993566e+00,LiBaO2N,5.062501,5.062501,0.0
18925,0.189810,0.003675,1.50,True,"[[0.0040044 0. 0. ] Zn, [1.821570...",1.599586e+00,ZnBeOFN,6.589724,6.589724,0.0
18926,0.209947,0.001837,2.48,True,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...",3.280734e+00,CaTlN3,5.411525,5.411525,0.0


In [4]:
df = df_original[['formula', 'structure']]
df

Unnamed: 0,formula,structure
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726..."
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020..."
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186..."
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573..."
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172..."
...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652..."
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570..."
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744..."


In [5]:
# Initialize to create/convert material object (pymatgen)
converter = cmo.material_object(df=df)

# Possible functions
# Create Composition object from chemical formula
# df = converter.formula2composition(formula_col='formula')

# Create Composition object from Structure object
# df = converter.structure2composition(structure_col='structure')

# Create Oxidation Structure object from Structure object
# df = converter.structure2oxidstructure(structure_col='structure')

# Create Oxidation Composition object from Composition object
# df = converter.composition2oxidcomposition(composition_col='composition')

# Source Structure Object from MP database using Composition object 
# df = converter.composition2structure_fromMP(composition_col='composition', mapi_key='lZCh9ke4qRxMQO16')

In [6]:
# Create Composition object from chemical formula
df = converter.formula2composition(formula_col='formula')
df

Unnamed: 0,formula,structure,composition
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...","(Rh, Te, N)"
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020...","(Hf, Te, O)"
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186...","(Re, As, O, F)"
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573...","(W, Re, O, S)"
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172...","(Bi, Hf, O, F)"
...,...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652...","(Rb, Pd, O, S)"
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li, Ba, O, N)"
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570...","(Zn, Be, O, F, N)"
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...","(Ca, Tl, N)"


In [7]:
# Create Oxidation Structure object from Structure object
df = converter.structure2oxidstructure(structure_col='structure')
df


StructureToOxidStructure:   0%|          | 0/18928 [00:00<?, ?it/s]

Unnamed: 0,formula,structure,composition,oxidation_structure
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...","(Rh, Te, N)","[[0. 0. 0.] Rh3+, [1.97726555 1.97726555 1.977..."
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020...","(Hf, Te, O)","[[2.54041798 0. 0. ] Hf0+, [1.0..."
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186...","(Re, As, O, F)","[[0.60790913 0. 0. ] Re7+, [2.1..."
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573...","(W, Re, O, S)","[[2.83091357 0. 0. ] W3+, [2.65..."
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172...","(Bi, Hf, O, F)","[[0.00518937 0. 0. ] Bi+, [2.17..."
...,...,...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652...","(Rb, Pd, O, S)","[[4.44077598 0. 0. ] Rb+, [2.65..."
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li, Ba, O, N)",[[4.56913824e-03 7.21569024e-19 0.00000000e+00...
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570...","(Zn, Be, O, F, N)","[[0.0040044 0. 0. ] Zn2+, [1.8215..."
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...","(Ca, Tl, N)","[[0. 0. 0.] Ca2+, [2.16744896 2.16744896 2.167..."


In [8]:
# Create Oxidation Composition object from Composition object
df = converter.composition2oxidcomposition(composition_col='composition')
df

CompositionToOxidComposition:   0%|          | 0/18928 [00:00<?, ?it/s]

Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...","(Rh, Te, N)","[[0. 0. 0.] Rh3+, [1.97726555 1.97726555 1.977...","(Rh3+, Te6+, N3-)"
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020...","(Hf, Te, O)","[[2.54041798 0. 0. ] Hf0+, [1.0...","(Hf0+, Te0+, O0+)"
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186...","(Re, As, O, F)","[[0.60790913 0. 0. ] Re7+, [2.1...","(Re7+, As2-, O2-, F-)"
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573...","(W, Re, O, S)","[[2.83091357 0. 0. ] W3+, [2.65...","(W3+, Re3+, O2-, S2-)"
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172...","(Bi, Hf, O, F)","[[0.00518937 0. 0. ] Bi+, [2.17...","(Bi+, Hf4+, O2-, F-)"
...,...,...,...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652...","(Rb, Pd, O, S)","[[4.44077598 0. 0. ] Rb+, [2.65...","(Rb+, Pd4+, O2-, S-)"
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li, Ba, O, N)",[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li+, Ba2+, O2-, N+)"
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570...","(Zn, Be, O, F, N)","[[0.0040044 0. 0. ] Zn2+, [1.8215...","(Zn2+, Be2+, O2-, F-, N-)"
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...","(Ca, Tl, N)","[[0. 0. 0.] Ca2+, [2.16744896 2.16744896 2.167...","(Ca2+, Tl+, N3+, N3-)"


In [9]:
joblib.dump(df, 'castelli_perovskites_material_objects.pkl')

['castelli_perovskites_material_objects.pkl']

#### Generate Features

##### (1) Composition

In [1]:
# Import packages
import pandas as pd
import joblib

from pymatgen import core

In [2]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [3]:
# Load Materials object file
df_obj = joblib.load('castelli_perovskites_material_objects.pkl')
df_obj

Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...","(Rh, Te, N)","[[0. 0. 0.] Rh3+, [1.97726555 1.97726555 1.977...","(Rh3+, Te6+, N3-)"
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020...","(Hf, Te, O)","[[2.54041798 0. 0. ] Hf0+, [1.0...","(Hf0+, Te0+, O0+)"
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186...","(Re, As, O, F)","[[0.60790913 0. 0. ] Re7+, [2.1...","(Re7+, As2-, O2-, F-)"
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573...","(W, Re, O, S)","[[2.83091357 0. 0. ] W3+, [2.65...","(W3+, Re3+, O2-, S2-)"
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172...","(Bi, Hf, O, F)","[[0.00518937 0. 0. ] Bi+, [2.17...","(Bi+, Hf4+, O2-, F-)"
...,...,...,...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652...","(Rb, Pd, O, S)","[[4.44077598 0. 0. ] Rb+, [2.65...","(Rb+, Pd4+, O2-, S-)"
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li, Ba, O, N)",[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li+, Ba2+, O2-, N+)"
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570...","(Zn, Be, O, F, N)","[[0.0040044 0. 0. ] Zn2+, [1.8215...","(Zn2+, Be2+, O2-, F-, N-)"
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...","(Ca, Tl, N)","[[0. 0. 0.] Ca2+, [2.16744896 2.16744896 2.167...","(Ca2+, Tl+, N3+, N3-)"


In [4]:
# Initialise featurizers script
descriptors = feat.prepare_to_featurize(df=df_obj)

In [5]:
# Features based on oxidation composition
df_com = descriptors.generate_oxid_composition_features()
df_com

MultipleFeaturizer:   0%|          | 0/18928 [00:00<?, ?it/s]

Total no. of features generated: 13


Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,max ionic char,avg ionic char,avg anion electron affinity,minimum EN difference,maximum EN difference,range EN difference,mean EN difference,std_dev EN difference
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...","(Rh, Te, N)","[[0. 0. 0.] Rh3+, [1.97726555 1.97726555 1.977...","(Rh3+, Te6+, N3-)",-3,6,9,5.070926,True,0.198203,0.040242,-21000.000000,0.760000,0.940000,0.18,0.850000,0.127279
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020...","(Hf, Te, O)","[[2.54041798 0. 0. ] Hf0+, [1.0...","(Hf0+, Te0+, O0+)",0,0,0,0.000000,True,0.681744,0.131124,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186...","(Re, As, O, F)","[[0.60790913 0. 0. ] Re7+, [2.1...","(Re7+, As2-, O2-, F-)",-2,7,9,4.149967,True,0.660947,0.117035,-262000.000000,1.360000,1.360000,0.00,1.360000,0.000000
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573...","(W, Re, O, S)","[[2.83091357 0. 0. ] W3+, [2.65...","(W3+, Re3+, O2-, S2-)",-2,3,5,2.886751,True,0.447278,0.076431,-321333.333333,0.793333,1.253333,0.46,1.023333,0.325269
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172...","(Bi, Hf, O, F)","[[0.00518937 0. 0. ] Bi+, [2.17...","(Bi+, Hf4+, O2-, F-)",-2,4,6,2.687419,True,0.833972,0.154752,-297333.333333,1.600000,2.320000,0.72,1.960000,0.509117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652...","(Rb, Pd, O, S)","[[4.44077598 0. 0. ] Rb+, [2.65...","(Rb+, Pd4+, O2-, S-)",-2,4,6,2.687419,True,0.820234,0.142786,-254666.666667,0.953333,2.333333,1.38,1.643333,0.975807
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li, Ba, O, N)",[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li+, Ba2+, O2-, N+)",-2,2,4,1.972027,True,0.803211,0.183413,-282000.000000,0.400000,2.550000,2.15,1.803333,1.216155
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570...","(Zn, Be, O, F, N)","[[0.0040044 0. 0. ] Zn2+, [1.8215...","(Zn2+, Be2+, O2-, F-, N-)",-2,2,4,1.870829,True,0.765904,0.150091,-205666.666667,1.836667,1.916667,0.08,1.876667,0.056569
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...","(Ca, Tl, N)","[[0. 0. 0.] Ca2+, [2.16744896 2.16744896 2.167...","(Ca2+, Tl+, N3+, N3-)",-3,3,6,2.981424,True,0.646687,0.045371,-21000.000000,0.000000,2.040000,2.04,1.153333,1.045817


In [6]:
# Features based on composition
df_com = descriptors.generate_composition_features()
df_com

MultipleFeaturizer:   0%|          | 0/18928 [00:00<?, ?it/s]

Total no. of features generated: 610


Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,...,frac p valence electrons,frac d valence electrons,frac d valence electrons.1,frac f valence electrons,frac f valence electrons.1,mean simul. packing efficiency,mean abs simul. packing efficiency,dist from 1 clusters |APE| < 0.010,dist from 3 clusters |APE| < 0.010,dist from 5 clusters |APE| < 0.010
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...","(Rh, Te, N)","[[0. 0. 0.] Rh3+, [1.97726555 1.97726555 1.977...","(Rh3+, Te6+, N3-)",-3,6,9,5.070926,True,...,0.325000,0.450000,0.450000,0.000000,0.000000,0.019853,0.025843,0.000000,0.014845,0.028332
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020...","(Hf, Te, O)","[[2.54041798 0. 0. ] Hf0+, [1.0...","(Hf0+, Te0+, O0+)",0,0,0,0.000000,True,...,0.307692,0.230769,0.230769,0.269231,0.269231,0.033979,0.035127,0.000000,0.035044,0.049139
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186...","(Re, As, O, F)","[[0.60790913 0. 0. ] Re7+, [2.1...","(Re7+, As2-, O2-, F-)",-2,7,9,4.149967,True,...,0.290909,0.272727,0.272727,0.254545,0.254545,0.000000,0.000000,0.233673,0.235030,0.239016
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573...","(W, Re, O, S)","[[2.83091357 0. 0. ] W3+, [2.65...","(W3+, Re3+, O2-, S2-)",-2,3,5,2.886751,True,...,0.203390,0.152542,0.152542,0.474576,0.474576,0.010760,0.023126,0.000000,0.047140,0.056569
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172...","(Bi, Hf, O, F)","[[0.00518937 0. 0. ] Bi+, [2.17...","(Bi+, Hf4+, O2-, F-)",-2,4,6,2.687419,True,...,0.242424,0.181818,0.181818,0.424242,0.424242,0.000000,0.000000,0.233673,0.234238,0.235228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652...","(Rb, Pd, O, S)","[[4.44077598 0. 0. ] Rb+, [2.65...","(Rb+, Pd4+, O2-, S-)",-2,4,6,2.687419,True,...,0.413793,0.344828,0.344828,0.000000,0.000000,0.035494,0.046774,0.049913,0.070896,0.081190
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li, Ba, O, N)",[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li+, Ba2+, O2-, N+)",-2,2,4,1.972027,True,...,0.550000,0.000000,0.000000,0.000000,0.000000,0.061107,0.061107,0.036464,0.053609,0.065163
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570...","(Zn, Be, O, F, N)","[[0.0040044 0. 0. ] Zn2+, [1.8215...","(Zn2+, Be2+, O2-, F-, N-)",-2,2,4,1.870829,True,...,0.375000,0.312500,0.312500,0.000000,0.000000,0.000000,0.000000,0.227378,0.227913,0.228482
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...","(Ca, Tl, N)","[[0. 0. 0.] Ca2+, [2.16744896 2.16744896 2.167...","(Ca2+, Tl+, N3+, N3-)",-3,3,6,2.981424,True,...,0.227273,0.227273,0.227273,0.318182,0.318182,0.004068,0.014496,0.034015,0.054742,0.069994


In [7]:
joblib.dump(df_com, 'castelli_perovskites_composition.pkl')

['castelli_perovskites_composition.pkl']

##### (2) Structural 

In [8]:
# Import packages
import pandas as pd
import joblib

from pymatgen import core

In [9]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [10]:
# Load Materials object file
df_obj = joblib.load('castelli_perovskites_material_objects.pkl')
df_obj

Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...","(Rh, Te, N)","[[0. 0. 0.] Rh3+, [1.97726555 1.97726555 1.977...","(Rh3+, Te6+, N3-)"
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020...","(Hf, Te, O)","[[2.54041798 0. 0. ] Hf0+, [1.0...","(Hf0+, Te0+, O0+)"
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186...","(Re, As, O, F)","[[0.60790913 0. 0. ] Re7+, [2.1...","(Re7+, As2-, O2-, F-)"
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573...","(W, Re, O, S)","[[2.83091357 0. 0. ] W3+, [2.65...","(W3+, Re3+, O2-, S2-)"
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172...","(Bi, Hf, O, F)","[[0.00518937 0. 0. ] Bi+, [2.17...","(Bi+, Hf4+, O2-, F-)"
...,...,...,...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652...","(Rb, Pd, O, S)","[[4.44077598 0. 0. ] Rb+, [2.65...","(Rb+, Pd4+, O2-, S-)"
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li, Ba, O, N)",[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li+, Ba2+, O2-, N+)"
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570...","(Zn, Be, O, F, N)","[[0.0040044 0. 0. ] Zn2+, [1.8215...","(Zn2+, Be2+, O2-, F-, N-)"
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...","(Ca, Tl, N)","[[0. 0. 0.] Ca2+, [2.16744896 2.16744896 2.167...","(Ca2+, Tl+, N3+, N3-)"


In [11]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_obj)

In [12]:
# Features based on oxidation structure
df_struc = descriptors.generate_oxid_structural_features()
df_struc

MultipleFeaturizer:   0%|          | 0/18928 [00:00<?, ?it/s]

Total no. of features generated: 401


Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition,ReDF [0.00000 - 0.05000]A,ReDF [0.05000 - 0.10000]A,ReDF [0.10000 - 0.15000]A,ReDF [0.15000 - 0.20000]A,ReDF [0.20000 - 0.25000]A,...,ReDF [19.55000 - 19.60000]A,ReDF [19.60000 - 19.65000]A,ReDF [19.65000 - 19.70000]A,ReDF [19.70000 - 19.75000]A,ReDF [19.75000 - 19.80000]A,ReDF [19.80000 - 19.85000]A,ReDF [19.85000 - 19.90000]A,ReDF [19.90000 - 19.95000]A,ReDF [19.95000 - 20.00000]A,ReDF [20.00000 - 20.00000]A
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...","(Rh, Te, N)","[[0. 0. 0.] Rh3+, [1.97726555 1.97726555 1.977...","(Rh3+, Te6+, N3-)",0.0,0.0,0.0,0.0,0.0,...,5.551115e-16,0.000000,26.350108,0.000000,21.848355,0.000000,-60.871793,0.000000,1.665335e-16,0.0
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020...","(Hf, Te, O)","[[2.54041798 0. 0. ] Hf0+, [1.0...","(Hf0+, Te0+, O0+)",0.0,0.0,0.0,0.0,0.0,...,-2.044370e+00,-0.001655,13.675522,-0.000131,-2.587895,-1.291521,4.504139,-0.642974,-2.561728e+00,0.0
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186...","(Re, As, O, F)","[[0.60790913 0. 0. ] Re7+, [2.1...","(Re7+, As2-, O2-, F-)",0.0,0.0,0.0,0.0,0.0,...,-2.450084e+00,-5.702162,29.931409,-3.160062,-0.727025,-2.666654,-1.771814,-1.925709,-5.687203e+00,0.0
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573...","(W, Re, O, S)","[[2.83091357 0. 0. ] W3+, [2.65...","(W3+, Re3+, O2-, S2-)",0.0,0.0,0.0,0.0,0.0,...,-1.961542e+00,5.707632,-2.442015,-0.972779,-0.161696,-1.293665,-3.383842,0.804155,-2.563194e+00,0.0
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172...","(Bi, Hf, O, F)","[[0.00518937 0. 0. ] Bi+, [2.17...","(Bi+, Hf4+, O2-, F-)",0.0,0.0,0.0,0.0,0.0,...,0.000000e+00,12.722577,0.000000,-4.704008,-1.781896,1.531957,0.886075,0.000000,0.000000e+00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652...","(Rb, Pd, O, S)","[[4.44077598 0. 0. ] Rb+, [2.65...","(Rb+, Pd4+, O2-, S-)",0.0,0.0,0.0,0.0,0.0,...,1.185222e+00,-1.468881,-0.324901,5.672811,-1.942451,-0.322597,0.804400,-0.642368,-1.281091e+00,0.0
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li, Ba, O, N)",[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li+, Ba2+, O2-, N+)",0.0,0.0,0.0,0.0,0.0,...,0.000000e+00,0.000000,-1.951281,0.000000,0.000000,-2.422565,0.000000,1.444152,8.017847e-01,0.0
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570...","(Zn, Be, O, F, N)","[[0.0040044 0. 0. ] Zn2+, [1.8215...","(Zn2+, Be2+, O2-, F-, N-)",0.0,0.0,0.0,0.0,0.0,...,-1.061332e+00,-0.408118,0.000000,0.000000,6.800044,-2.499721,-1.692399,-2.408598,3.200496e-01,0.0
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...","(Ca, Tl, N)","[[0. 0. 0.] Ca2+, [2.16744896 2.16744896 2.167...","(Ca2+, Tl+, N3+, N3-)",0.0,0.0,0.0,0.0,0.0,...,0.000000e+00,-0.652160,0.000000,8.750962,0.000000,0.000000,10.631766,0.000000,-4.804109e+00,0.0


In [13]:
# Features based on structure
df_struc = descriptors.generate_structural_features()
df_struc

MultipleFeaturizer:   0%|          | 0/18928 [00:00<?, ?it/s]

Total no. of features generated: 1282


Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition,ReDF [0.00000 - 0.05000]A,ReDF [0.05000 - 0.10000]A,ReDF [0.10000 - 0.15000]A,ReDF [0.15000 - 0.20000]A,ReDF [0.20000 - 0.25000]A,...,maximum local difference in SpaceGroupNumber,range local difference in SpaceGroupNumber,mean local difference in SpaceGroupNumber,avg_dev local difference in SpaceGroupNumber,spacegroup_num,crystal_system,crystal_system_int,is_centrosymmetric,n_symmetry_ops,dimensionality
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...","(Rh, Te, N)","[[0. 0. 0.] Rh3+, [1.97726555 1.97726555 1.977...","(Rh3+, Te6+, N3-)",0.0,0.0,0.0,0.0,0.0,...,42.000000,19.578097,28.053142,6.757487,221,cubic,1,True,48,3
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020...","(Hf, Te, O)","[[2.54041798 0. 0. ] Hf0+, [1.0...","(Hf0+, Te0+, O0+)",0.0,0.0,0.0,0.0,0.0,...,156.477655,73.899337,119.474465,20.731398,99,tetragonal,4,False,8,3
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186...","(Re, As, O, F)","[[0.60790913 0. 0. ] Re7+, [2.1...","(Re7+, As2-, O2-, F-)",0.0,0.0,0.0,0.0,0.0,...,179.713046,86.531748,130.440327,28.668811,25,orthorhombic,5,False,4,3
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573...","(W, Re, O, S)","[[2.83091357 0. 0. ] W3+, [2.65...","(W3+, Re3+, O2-, S2-)",0.0,0.0,0.0,0.0,0.0,...,168.302658,82.154825,129.731567,23.166228,25,orthorhombic,5,False,4,3
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172...","(Bi, Hf, O, F)","[[0.00518937 0. 0. ] Bi+, [2.17...","(Bi+, Hf4+, O2-, F-)",0.0,0.0,0.0,0.0,0.0,...,181.000252,179.978221,65.436948,46.225321,25,orthorhombic,5,False,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652...","(Rb, Pd, O, S)","[[4.44077598 0. 0. ] Rb+, [2.65...","(Rb+, Pd4+, O2-, S-)",0.0,0.0,0.0,0.0,0.0,...,193.368560,73.928707,158.406764,25.957834,25,orthorhombic,5,False,4,3
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li, Ba, O, N)",[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li+, Ba2+, O2-, N+)",0.0,0.0,0.0,0.0,0.0,...,170.454143,81.152840,148.574414,23.709244,25,orthorhombic,5,False,4,3
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570...","(Zn, Be, O, F, N)","[[0.0040044 0. 0. ] Zn2+, [1.8215...","(Zn2+, Be2+, O2-, F-, N-)",0.0,0.0,0.0,0.0,0.0,...,148.938477,82.259355,120.553310,21.729901,25,orthorhombic,5,False,4,3
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...","(Ca, Tl, N)","[[0. 0. 0.] Ca2+, [2.16744896 2.16744896 2.167...","(Ca2+, Tl+, N3+, N3-)",0.0,0.0,0.0,0.0,0.0,...,31.000000,31.000000,13.070804,7.171678,221,cubic,1,True,48,3


In [14]:
joblib.dump(df_struc, 'castelli_perovskites_structural.pkl')

['castelli_perovskites_structural.pkl']

In [15]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_obj)

# Features based on JarvisCFID
df_Jarvis = descriptors.generate_JarvisCFID_features()

In [16]:
joblib.dump(df_struc, 'castelli_perovskites_structural_jarviscfid.pkl')

##### (3) Extra Features

In [17]:
# Load Materials object file
df_obj = joblib.load('castelli_perovskites_material_objects.pkl')
df_obj

Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...","(Rh, Te, N)","[[0. 0. 0.] Rh3+, [1.97726555 1.97726555 1.977...","(Rh3+, Te6+, N3-)"
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020...","(Hf, Te, O)","[[2.54041798 0. 0. ] Hf0+, [1.0...","(Hf0+, Te0+, O0+)"
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186...","(Re, As, O, F)","[[0.60790913 0. 0. ] Re7+, [2.1...","(Re7+, As2-, O2-, F-)"
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573...","(W, Re, O, S)","[[2.83091357 0. 0. ] W3+, [2.65...","(W3+, Re3+, O2-, S2-)"
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172...","(Bi, Hf, O, F)","[[0.00518937 0. 0. ] Bi+, [2.17...","(Bi+, Hf4+, O2-, F-)"
...,...,...,...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652...","(Rb, Pd, O, S)","[[4.44077598 0. 0. ] Rb+, [2.65...","(Rb+, Pd4+, O2-, S-)"
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li, Ba, O, N)",[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li+, Ba2+, O2-, N+)"
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570...","(Zn, Be, O, F, N)","[[0.0040044 0. 0. ] Zn2+, [1.8215...","(Zn2+, Be2+, O2-, F-, N-)"
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...","(Ca, Tl, N)","[[0. 0. 0.] Ca2+, [2.16744896 2.16744896 2.167...","(Ca2+, Tl+, N3+, N3-)"


In [18]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_obj)

In [19]:
df_custom = descriptors.custom_features(formula_col_exist=True)
df_custom

Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition,weight,total_e,avg_electroneg,noble_gas,transition_metal,...,alkaline,halogen,chalcogen,lanthanoid,actinoid,quadrupolar,s-block,p-block,d-block,f-block
0,RhTeN3,"[[0. 0. 0.] Rh, [1.97726555 1.97726555 1.97726...","(Rh, Te, N)","[[0. 0. 0.] Rh3+, [1.97726555 1.97726555 1.977...","(Rh3+, Te6+, N3-)",272.525600,118.0,2.700,0,1,...,0,0,1,0,0,1,0,1,1,0
1,HfTeO3,"[[2.54041798 0. 0. ] Hf, [1.020...","(Hf, Te, O)","[[2.54041798 0. 0. ] Hf0+, [1.0...","(Hf0+, Te0+, O0+)",354.088200,148.0,2.744,0,1,...,0,0,1,0,0,1,0,1,1,0
2,ReAsO2F,"[[0.60790913 0. 0. ] Re, [2.186...","(Re, As, O, F)","[[0.60790913 0. 0. ] Re7+, [2.1...","(Re7+, As2-, O2-, F-)",312.125803,133.0,2.988,0,1,...,0,1,1,0,0,1,0,1,1,0
3,WReO2S,"[[2.83091357 0. 0. ] W, [2.6573...","(W, Re, O, S)","[[2.83091357 0. 0. ] W3+, [2.65...","(W3+, Re3+, O2-, S2-)",434.110800,181.0,2.744,0,1,...,0,0,1,0,0,1,0,1,1,0
4,BiHfO2F,"[[0.00518937 0. 0. ] Bi, [2.172...","(Bi, Hf, O, F)","[[0.00518937 0. 0. ] Bi+, [2.17...","(Bi+, Hf4+, O2-, F-)",438.467603,180.0,2.836,0,1,...,0,1,1,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18923,RbPdO2S,"[[4.44077598 0. 0. ] Rb, [2.652...","(Rb, Pd, O, S)","[[4.44077598 0. 0. ] Rb+, [2.65...","(Rb+, Pd4+, O2-, S-)",255.951600,115.0,2.496,0,1,...,0,0,1,0,0,1,1,1,1,0
18924,LiBaO2N,[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li, Ba, O, N)",[[4.56913824e-03 7.21569024e-19 0.00000000e+00...,"(Li+, Ba2+, O2-, N+)",190.273500,82.0,2.358,0,0,...,1,0,1,0,0,1,1,1,0,0
18925,ZnBeOFN,"[[0.0040044 0. 0. ] Zn, [1.821570...","(Zn, Be, O, F, N)","[[0.0040044 0. 0. ] Zn2+, [1.8215...","(Zn2+, Be2+, O2-, F-, N-)",123.425685,58.0,2.736,0,1,...,1,1,1,0,0,1,1,1,1,0
18926,CaTlN3,"[[0. 0. 0.] Ca, [2.16744896 2.16744896 2.16744...","(Ca, Tl, N)","[[0. 0. 0.] Ca2+, [2.16744896 2.16744896 2.167...","(Ca2+, Tl+, N3+, N3-)",286.481400,122.0,2.348,0,0,...,1,0,0,0,0,1,1,1,0,0


In [20]:
joblib.dump(df_custom, 'castelli_perovskites_additional_features.pkl')

['castelli_perovskites_additional_features.pkl']

#### Merge results

In [1]:
# Import packages
import pandas as pd
import joblib

from pymatgen import core

In [2]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [3]:
# Load Materials object file
df1 = joblib.load('castelli_perovskites_composition.pkl')
df2 = joblib.load('castelli_perovskites_structural.pkl')
df3 = joblib.load('castelli_perovskites_structural_jarviscfid.pkl')
df4 = joblib.load('castelli_perovskites_additional_features.pkl')

In [4]:
# Concat dataframes
frames = [df1, df2, df3, df4]

df_merged = pd.concat(frames, axis=1)

# Col to drop
cols = ['formula', 'structure', 'composition', 'oxidation_structure', 'oxidation_composition']

df_merged = df_merged.drop(cols, axis=1)

# False and True -> 0 and 1
df_merged.replace({False: 0, True: 1}, inplace=True)
df_merged

Unnamed: 0,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,max ionic char,avg ionic char,avg anion electron affinity,minimum EN difference,maximum EN difference,...,alkaline,halogen,chalcogen,lanthanoid,actinoid,quadrupolar,s-block,p-block,d-block,f-block
0,-3,6,9,5.070926,1,0.198203,0.040242,-21000.000000,0.760000,0.940000,...,0,0,1,0,0,1,0,1,1,0
1,0,0,0,0.000000,1,0.681744,0.131124,0.000000,0.000000,0.000000,...,0,0,1,0,0,1,0,1,1,0
2,-2,7,9,4.149967,1,0.660947,0.117035,-262000.000000,1.360000,1.360000,...,0,1,1,0,0,1,0,1,1,0
3,-2,3,5,2.886751,1,0.447278,0.076431,-321333.333333,0.793333,1.253333,...,0,0,1,0,0,1,0,1,1,0
4,-2,4,6,2.687419,1,0.833972,0.154752,-297333.333333,1.600000,2.320000,...,0,1,1,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18923,-2,4,6,2.687419,1,0.820234,0.142786,-254666.666667,0.953333,2.333333,...,0,0,1,0,0,1,1,1,1,0
18924,-2,2,4,1.972027,1,0.803211,0.183413,-282000.000000,0.400000,2.550000,...,1,0,1,0,0,1,1,1,0,0
18925,-2,2,4,1.870829,1,0.765904,0.150091,-205666.666667,1.836667,1.916667,...,1,1,1,0,0,1,1,1,1,0
18926,-3,3,6,2.981424,1,0.646687,0.045371,-21000.000000,0.000000,2.040000,...,1,0,0,0,0,1,1,1,0,0


In [5]:
df_merged['HOMO_element']

0        Rh
1        Te
2        Re
3        Re
4        Bi
         ..
18923    Pd
18924     N
18925     N
18926     N
18927     O
Name: HOMO_element, Length: 18928, dtype: object

In [6]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_merged)

In [7]:
# One hot encoding of categorical features
df_merged, categorical_feature = descriptors.OHE()

categorical_feature

No. of categorical features: 5


['HOMO_character_ohe_d',
 'HOMO_character_ohe_p',
 'HOMO_character_ohe_s',
 'HOMO_element_ohe_Ag',
 'HOMO_element_ohe_Al',
 'HOMO_element_ohe_As',
 'HOMO_element_ohe_Au',
 'HOMO_element_ohe_Ba',
 'HOMO_element_ohe_Be',
 'HOMO_element_ohe_Bi',
 'HOMO_element_ohe_Ca',
 'HOMO_element_ohe_Cd',
 'HOMO_element_ohe_Co',
 'HOMO_element_ohe_Cr',
 'HOMO_element_ohe_Cu',
 'HOMO_element_ohe_Fe',
 'HOMO_element_ohe_Ga',
 'HOMO_element_ohe_Ge',
 'HOMO_element_ohe_Hf',
 'HOMO_element_ohe_Hg',
 'HOMO_element_ohe_In',
 'HOMO_element_ohe_Ir',
 'HOMO_element_ohe_La',
 'HOMO_element_ohe_Mg',
 'HOMO_element_ohe_Mn',
 'HOMO_element_ohe_Mo',
 'HOMO_element_ohe_N',
 'HOMO_element_ohe_Nb',
 'HOMO_element_ohe_Ni',
 'HOMO_element_ohe_O',
 'HOMO_element_ohe_Os',
 'HOMO_element_ohe_Pb',
 'HOMO_element_ohe_Pd',
 'HOMO_element_ohe_Pt',
 'HOMO_element_ohe_Re',
 'HOMO_element_ohe_Rh',
 'HOMO_element_ohe_Ru',
 'HOMO_element_ohe_S',
 'HOMO_element_ohe_Sb',
 'HOMO_element_ohe_Sc',
 'HOMO_element_ohe_Si',
 'HOMO_element_o

In [8]:
# Load Materials object file
df_original = joblib.load('castelli_perovskites.pkl')
df_original = df_original[['fermi level', 'fermi width', 'e_form', 'gap is direct', 'mu_b', 'vbm', 'cbm', 'gap gllbsc']]

# Concat dataframes
frames = [df_merged, df_original]

df_merged = pd.concat(frames, axis=1)
df_merged

Unnamed: 0,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,max ionic char,avg ionic char,avg anion electron affinity,minimum EN difference,maximum EN difference,...,crystal_system_ohe_orthorhombic,crystal_system_ohe_tetragonal,fermi level,fermi width,e_form,gap is direct,mu_b,vbm,cbm,gap gllbsc
0,-3,6,9,5.070926,1,0.198203,0.040242,-21000.000000,0.760000,0.940000,...,0,0,0.312138,0.001837,2.16,True,1.974478e-02,6.187694,6.187694,0.0
1,0,0,0,0.000000,1,0.681744,0.131124,0.000000,0.000000,0.000000,...,0,1,0.297083,0.001837,1.52,True,-2.253054e-05,6.033125,6.033125,0.0
2,-2,7,9,4.149967,1,0.660947,0.117035,-262000.000000,1.360000,1.360000,...,1,0,0.191139,0.003675,1.48,True,4.982109e+00,6.602253,6.602253,0.0
3,-2,3,5,2.886751,1,0.447278,0.076431,-321333.333333,0.793333,1.253333,...,1,0,0.316346,0.001837,1.24,True,-8.684496e-01,5.738462,5.738462,0.0
4,-2,4,6,2.687419,1,0.833972,0.154752,-297333.333333,1.600000,2.320000,...,1,0,0.312658,0.003675,0.62,True,2.164069e-15,6.074736,6.074736,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18923,-2,4,6,2.687419,1,0.820234,0.142786,-254666.666667,0.953333,2.333333,...,1,0,0.123028,0.001837,1.66,True,1.566419e+00,5.164060,5.164060,0.0
18924,-2,2,4,1.972027,1,0.803211,0.183413,-282000.000000,0.400000,2.550000,...,1,0,0.081229,0.001837,2.12,True,1.993566e+00,5.062501,5.062501,0.0
18925,-2,2,4,1.870829,1,0.765904,0.150091,-205666.666667,1.836667,1.916667,...,1,0,0.189810,0.003675,1.50,True,1.599586e+00,6.589724,6.589724,0.0
18926,-3,3,6,2.981424,1,0.646687,0.045371,-21000.000000,0.000000,2.040000,...,0,0,0.209947,0.001837,2.48,True,3.280734e+00,5.411525,5.411525,0.0


In [9]:
# Save dataset 
joblib.dump(df_merged, 'castelli_perovskites_merged.pkl')

### Update

In [1]:
# Import packages
import pandas as pd
import joblib
import re

from pymatgen import core

In [2]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [3]:
# Load Materials object file
df = joblib.load('castelli_perovskites_merged.pkl')
df

Unnamed: 0,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,max ionic char,avg ionic char,avg anion electron affinity,minimum EN difference,maximum EN difference,...,crystal_system_ohe_orthorhombic,crystal_system_ohe_tetragonal,fermi level,fermi width,e_form,gap is direct,mu_b,vbm,cbm,gap gllbsc
0,-3,6,9,5.070926,1,0.198203,0.040242,-21000.000000,0.760000,0.940000,...,0,0,0.312138,0.001837,2.16,True,1.974478e-02,6.187694,6.187694,0.0
1,0,0,0,0.000000,1,0.681744,0.131124,0.000000,0.000000,0.000000,...,0,1,0.297083,0.001837,1.52,True,-2.253054e-05,6.033125,6.033125,0.0
2,-2,7,9,4.149967,1,0.660947,0.117035,-262000.000000,1.360000,1.360000,...,1,0,0.191139,0.003675,1.48,True,4.982109e+00,6.602253,6.602253,0.0
3,-2,3,5,2.886751,1,0.447278,0.076431,-321333.333333,0.793333,1.253333,...,1,0,0.316346,0.001837,1.24,True,-8.684496e-01,5.738462,5.738462,0.0
4,-2,4,6,2.687419,1,0.833972,0.154752,-297333.333333,1.600000,2.320000,...,1,0,0.312658,0.003675,0.62,True,2.164069e-15,6.074736,6.074736,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18923,-2,4,6,2.687419,1,0.820234,0.142786,-254666.666667,0.953333,2.333333,...,1,0,0.123028,0.001837,1.66,True,1.566419e+00,5.164060,5.164060,0.0
18924,-2,2,4,1.972027,1,0.803211,0.183413,-282000.000000,0.400000,2.550000,...,1,0,0.081229,0.001837,2.12,True,1.993566e+00,5.062501,5.062501,0.0
18925,-2,2,4,1.870829,1,0.765904,0.150091,-205666.666667,1.836667,1.916667,...,1,0,0.189810,0.003675,1.50,True,1.599586e+00,6.589724,6.589724,0.0
18926,-3,3,6,2.981424,1,0.646687,0.045371,-21000.000000,0.000000,2.040000,...,0,0,0.209947,0.001837,2.48,True,3.280734e+00,5.411525,5.411525,0.0


In [4]:
# Remove duplications
df  = df.loc[:,~df.columns.duplicated()]

# Remove space and JSON character in the column
df.columns = df.columns.str.replace(' ','_')
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

df

Unnamed: 0,minimum_oxidation_state,maximum_oxidation_state,range_oxidation_state,std_dev_oxidation_state,compound_possible,max_ionic_char,avg_ionic_char,avg_anion_electron_affinity,minimum_EN_difference,maximum_EN_difference,...,crystal_system_ohe_orthorhombic,crystal_system_ohe_tetragonal,fermi_level,fermi_width,e_form,gap_is_direct,mu_b,vbm,cbm,gap_gllbsc
0,-3,6,9,5.070926,1,0.198203,0.040242,-21000.000000,0.760000,0.940000,...,0,0,0.312138,0.001837,2.16,True,1.974478e-02,6.187694,6.187694,0.0
1,0,0,0,0.000000,1,0.681744,0.131124,0.000000,0.000000,0.000000,...,0,1,0.297083,0.001837,1.52,True,-2.253054e-05,6.033125,6.033125,0.0
2,-2,7,9,4.149967,1,0.660947,0.117035,-262000.000000,1.360000,1.360000,...,1,0,0.191139,0.003675,1.48,True,4.982109e+00,6.602253,6.602253,0.0
3,-2,3,5,2.886751,1,0.447278,0.076431,-321333.333333,0.793333,1.253333,...,1,0,0.316346,0.001837,1.24,True,-8.684496e-01,5.738462,5.738462,0.0
4,-2,4,6,2.687419,1,0.833972,0.154752,-297333.333333,1.600000,2.320000,...,1,0,0.312658,0.003675,0.62,True,2.164069e-15,6.074736,6.074736,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18923,-2,4,6,2.687419,1,0.820234,0.142786,-254666.666667,0.953333,2.333333,...,1,0,0.123028,0.001837,1.66,True,1.566419e+00,5.164060,5.164060,0.0
18924,-2,2,4,1.972027,1,0.803211,0.183413,-282000.000000,0.400000,2.550000,...,1,0,0.081229,0.001837,2.12,True,1.993566e+00,5.062501,5.062501,0.0
18925,-2,2,4,1.870829,1,0.765904,0.150091,-205666.666667,1.836667,1.916667,...,1,0,0.189810,0.003675,1.50,True,1.599586e+00,6.589724,6.589724,0.0
18926,-3,3,6,2.981424,1,0.646687,0.045371,-21000.000000,0.000000,2.040000,...,0,0,0.209947,0.001837,2.48,True,3.280734e+00,5.411525,5.411525,0.0


In [5]:
feature = df.columns[:-8].tolist()
print(len(feature))

3978


In [6]:
df.columns[-8:].tolist()

['fermi_level',
 'fermi_width',
 'e_form',
 'gap_is_direct',
 'mu_b',
 'vbm',
 'cbm',
 'gap_gllbsc']

In [7]:
joblib.dump(df, 'castelli_perovskites_merged_v2.pkl')

['castelli_perovskites_merged_v2.pkl']

In [8]:
joblib.dump(feature, 'features.pkl')

['features.pkl']