### Load data

In [1]:
# Import packages
import pandas as pd
import joblib

from pymatgen import core

In [2]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import create_material_objects as cmo

In [3]:
# Load dataset
df_original_1 = joblib.load('database_log_gvrh.pkl') 
df_original_2 = joblib.load('database_log_kvrh.pkl') 
df_original_2 = df_original_2.drop(['structure'], axis=1)

df_original_1

Unnamed: 0,structure,log10(G_VRH)
0,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...",1.447158
1,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...",1.518514
2,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....",1.740363
3,"[[2.06428082 0. 2.06428082] Pd, [0. ...",1.707570
4,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...",1.602060
...,...,...
10982,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...",1.414973
10983,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....",1.431364
10984,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...",1.000000
10985,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...",1.579784


In [4]:
df_original = pd.merge(df_original_1, df_original_2, left_index=True, right_index=True)
df_original

Unnamed: 0,structure,log10(G_VRH),log10(K_VRH)
0,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...",1.447158,1.707570
1,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...",1.518514,1.633468
2,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....",1.740363,1.908485
3,"[[2.06428082 0. 2.06428082] Pd, [0. ...",1.707570,2.117271
4,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...",1.602060,1.690196
...,...,...,...
10982,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...",1.414973,1.778151
10983,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....",1.431364,1.724276
10984,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...",1.000000,1.342423
10985,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...",1.579784,1.770852


In [5]:
df = df_original[['log10(G_VRH)', 'log10(K_VRH)', 'structure']]
df

Unnamed: 0,log10(G_VRH),log10(K_VRH),structure
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949..."
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625..."
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1...."
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ..."
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593..."
...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459..."
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3...."
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573..."
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55..."


In [6]:
# Initialize to create/convert material object (pymatgen)
converter = cmo.material_object(df=df)

# Possible functions
# Create Composition object from chemical formula
# df = converter.formula2composition(formula_col='formula')

# Create Composition object from Structure object
# df = converter.structure2composition(structure_col='structure')

# Create Oxidation Structure object from Structure object
# df = converter.structure2oxidstructure(structure_col='structure')

# Create Oxidation Composition object from Composition object
# df = converter.composition2oxidcomposition(composition_col='composition')

# Source Structure Object from MP database using Composition object 
# df = converter.composition2structure_fromMP(composition_col='composition', mapi_key='lZCh9ke4qRxMQO16')

In [7]:
# Create Composition object from chemical formula
df = converter.structure2composition(structure_col='structure')
df

StructureToComposition:   0%|          | 0/10987 [00:00<?, ?it/s]

Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)"
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)"
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)"
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)"
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)"
...,...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...","(Rh, I)"
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....","(Mg, Co, Sn)"
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...","(H, N, O)"
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...","(Si, Sn)"


In [8]:
# Create Oxidation Structure object from Structure object
df = converter.structure2oxidstructure(structure_col='structure')
df

StructureToOxidStructure:   0%|          | 0/10987 [00:00<?, ?it/s]

Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition,oxidation_structure
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)","[[0. 0. 0.] Ca0+, [1.37728887 1.57871271 3.739..."
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)","[[3.14048493 1.09300401 1.64101398] Mg0+, [0.6..."
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)","[[ 2.06884519 2.40627241 -0.45891585] Si4-, [..."
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)","[[2.06428082 0. 2.06428082] Pd0+, [0. ..."
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)","[[3.09635262 1.0689416 1.53602403] Mg0+, [0.5..."
...,...,...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...","(Rh, I)","[[0. 0. 0.] Rh0+, [3.2029368 3.2029368 2.094..."
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....","(Mg, Co, Sn)","[[-1.51157821 4.4173925 1.21553922] Mg0+, [..."
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...","(H, N, O)","[[4.37546772 4.51128393 6.81784473] H+, [0.457..."
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...","(Si, Sn)","[[0. 0. 0.] Si4-, [ 4.55195829 4.55195829 -4...."


In [9]:
# Create Oxidation Composition object from Composition object
df = converter.composition2oxidcomposition(composition_col='composition')
df

CompositionToOxidComposition:   0%|          | 0/10987 [00:00<?, ?it/s]

Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition,oxidation_structure,oxidation_composition
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)","[[0. 0. 0.] Ca0+, [1.37728887 1.57871271 3.739...","(Ca0+, Ge0+, Ag0+)"
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)","[[3.14048493 1.09300401 1.64101398] Mg0+, [0.6...","(Mg0+, Ge0+, Ba0+)"
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)","[[ 2.06884519 2.40627241 -0.45891585] Si4-, [...","(Si4-, Cu3+, Sr2+)"
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)","[[2.06428082 0. 2.06428082] Pd0+, [0. ...","(Pd0+, Dy0+)"
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)","[[3.09635262 1.0689416 1.53602403] Mg0+, [0.5...","(Mg0+, Si0+, Ba0+)"
...,...,...,...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...","(Rh, I)","[[0. 0. 0.] Rh0+, [3.2029368 3.2029368 2.094...","(Rh0+, I0+)"
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....","(Mg, Co, Sn)","[[-1.51157821 4.4173925 1.21553922] Mg0+, [...","(Mg0+, Co0+, Sn0+)"
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...","(H, N, O)","[[4.37546772 4.51128393 6.81784473] H+, [0.457...","(H+, N5+, N3-, O2-)"
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...","(Si, Sn)","[[0. 0. 0.] Si4-, [ 4.55195829 4.55195829 -4....","(Si4-, Sn4+)"


In [10]:
joblib.dump(df, 'database_moduli_material_objects.pkl')

['database_moduli_material_objects.pkl']

#### Generate Features

##### (1) Compositional

In [11]:
# Import packages
import pandas as pd
import joblib

from pymatgen import core

In [12]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [13]:
# Load Materials object file
df_obj = joblib.load('database_moduli_material_objects.pkl')
df_obj

Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition,oxidation_structure,oxidation_composition
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)","[[0. 0. 0.] Ca0+, [1.37728887 1.57871271 3.739...","(Ca0+, Ge0+, Ag0+)"
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)","[[3.14048493 1.09300401 1.64101398] Mg0+, [0.6...","(Mg0+, Ge0+, Ba0+)"
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)","[[ 2.06884519 2.40627241 -0.45891585] Si4-, [...","(Si4-, Cu3+, Sr2+)"
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)","[[2.06428082 0. 2.06428082] Pd0+, [0. ...","(Pd0+, Dy0+)"
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)","[[3.09635262 1.0689416 1.53602403] Mg0+, [0.5...","(Mg0+, Si0+, Ba0+)"
...,...,...,...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...","(Rh, I)","[[0. 0. 0.] Rh0+, [3.2029368 3.2029368 2.094...","(Rh0+, I0+)"
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....","(Mg, Co, Sn)","[[-1.51157821 4.4173925 1.21553922] Mg0+, [...","(Mg0+, Co0+, Sn0+)"
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...","(H, N, O)","[[4.37546772 4.51128393 6.81784473] H+, [0.457...","(H+, N5+, N3-, O2-)"
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...","(Si, Sn)","[[0. 0. 0.] Si4-, [ 4.55195829 4.55195829 -4....","(Si4-, Sn4+)"


In [14]:
# Initialise featurizers script
descriptors = feat.prepare_to_featurize(df=df_obj)

In [15]:
# Features based on oxidation composition
df_com = descriptors.generate_oxid_composition_features()
df_com

MultipleFeaturizer:   0%|          | 0/10987 [00:00<?, ?it/s]

Total no. of features generated: 13


Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition,oxidation_structure,oxidation_composition,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,max ionic char,avg ionic char,avg anion electron affinity,minimum EN difference,maximum EN difference,range EN difference,mean EN difference,std_dev EN difference
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)","[[0. 0. 0.] Ca0+, [1.37728887 1.57871271 3.739...","(Ca0+, Ge0+, Ag0+)",0,0,0,0.000000,True,0.225103,0.033820,0.0,0.00,0.00,0.00,0.000000,0.000000
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)","[[3.14048493 1.09300401 1.64101398] Mg0+, [0.6...","(Mg0+, Ge0+, Ba0+)",0,0,0,0.000000,True,0.269189,0.043433,0.0,0.00,0.00,0.00,0.000000,0.000000
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)","[[ 2.06884519 2.40627241 -0.45891585] Si4-, [...","(Si4-, Cu3+, Sr2+)",-4,3,7,4.107919,True,0.201983,0.032317,-534400.0,0.00,0.95,0.95,0.316667,0.671751
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)","[[2.06428082 0. 2.06428082] Pd0+, [0. ...","(Pd0+, Dy0+)",0,0,0,0.000000,True,0.213451,0.040022,0.0,0.00,0.00,0.00,0.000000,0.000000
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)","[[3.09635262 1.0689416 1.53602403] Mg0+, [0.5...","(Mg0+, Si0+, Ba0+)",0,0,0,0.000000,True,0.225103,0.034795,0.0,0.00,0.00,0.00,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...","(Rh, I)","[[0. 0. 0.] Rh0+, [3.2029368 3.2029368 2.094...","(Rh0+, I0+)",0,0,0,0.000000,True,0.035456,0.007879,0.0,0.00,0.00,0.00,0.000000,0.000000
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....","(Mg, Co, Sn)","[[-1.51157821 4.4173925 1.21553922] Mg0+, [...","(Mg0+, Co0+, Sn0+)",0,0,0,0.000000,True,0.100238,0.016736,0.0,0.00,0.00,0.00,0.000000,0.000000
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...","(H, N, O)","[[4.37546772 4.51128393 6.81784473] H+, [0.457...","(H+, N5+, N3-, O2-)",-3,5,8,2.886751,True,0.319141,0.024230,-216750.0,0.30,1.14,0.84,0.972000,0.593970
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...","(Si, Sn)","[[0. 0. 0.] Si4-, [ 4.55195829 4.55195829 -4....","(Si4-, Sn4+)",-4,4,8,5.656854,True,0.000900,0.000225,-534400.0,-0.06,-0.06,0.00,-0.060000,0.000000


In [16]:
# Features based on composition
df_com = descriptors.generate_composition_features()
df_com

MultipleFeaturizer:   0%|          | 0/10987 [00:00<?, ?it/s]

Total no. of features generated: 610


Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition,oxidation_structure,oxidation_composition,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,...,frac p valence electrons,frac d valence electrons,frac d valence electrons.1,frac f valence electrons,frac f valence electrons.1,mean simul. packing efficiency,mean abs simul. packing efficiency,dist from 1 clusters |APE| < 0.010,dist from 3 clusters |APE| < 0.010,dist from 5 clusters |APE| < 0.010
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)","[[0. 0. 0.] Ca0+, [1.37728887 1.57871271 3.739...","(Ca0+, Ge0+, Ag0+)",0,0,0,0.000000,...,0.076923,0.769231,0.769231,0.000000,0.000000,0.013381,0.017333,0.035635,0.046433,0.063577
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)","[[3.14048493 1.09300401 1.64101398] Mg0+, [0.6...","(Mg0+, Ge0+, Ba0+)",0,0,0,0.000000,...,0.117647,0.588235,0.588235,0.000000,0.000000,0.014021,0.021512,0.000000,0.022975,0.047210
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)","[[ 2.06884519 2.40627241 -0.45891585] Si4-, [...","(Si4-, Cu3+, Sr2+)",-4,3,7,4.107919,...,0.125000,0.625000,0.625000,0.000000,0.000000,0.010974,0.010974,0.037684,0.056929,0.062101
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)","[[2.06428082 0. 2.06428082] Pd0+, [0. ...","(Pd0+, Dy0+)",0,0,0,0.000000,...,0.000000,0.714286,0.714286,0.238095,0.238095,-0.028494,0.028494,0.020797,0.179500,0.238243
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)","[[3.09635262 1.0689416 1.53602403] Mg0+, [0.5...","(Mg0+, Si0+, Ba0+)",0,0,0,0.000000,...,0.285714,0.000000,0.000000,0.000000,0.000000,0.010429,0.010429,0.000000,0.038128,0.056302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...","(Rh, I)","[[0. 0. 0.] Rh0+, [3.2029368 3.2029368 2.094...","(Rh0+, I0+)",0,0,0,0.000000,...,0.232558,0.651163,0.651163,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,1.000000
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....","(Mg, Co, Sn)","[[-1.51157821 4.4173925 1.21553922] Mg0+, [...","(Mg0+, Co0+, Sn0+)",0,0,0,0.000000,...,0.057143,0.485714,0.485714,0.000000,0.000000,-0.006782,0.015119,0.020412,0.062309,0.082355
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...","(H, N, O)","[[4.37546772 4.51128393 6.81784473] H+, [0.457...","(H+, N5+, N3-, O2-)",-3,5,8,2.886751,...,0.562500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...","(Si, Sn)","[[0. 0. 0.] Si4-, [ 4.55195829 4.55195829 -4....","(Si4-, Sn4+)",-4,4,8,5.656854,...,0.222222,0.555556,0.555556,0.000000,0.000000,0.013126,0.024554,0.064282,0.188383,0.271055


In [17]:
joblib.dump(df_com, 'database_moduli_composition.pkl')

['database_moduli_composition.pkl']

##### (2) Structural 

In [18]:
# Import packages
import pandas as pd
import joblib

from pymatgen import core

In [19]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [20]:
# Load Materials object file
df_obj = joblib.load('database_moduli_material_objects.pkl')
df_obj

Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition,oxidation_structure,oxidation_composition
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)","[[0. 0. 0.] Ca0+, [1.37728887 1.57871271 3.739...","(Ca0+, Ge0+, Ag0+)"
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)","[[3.14048493 1.09300401 1.64101398] Mg0+, [0.6...","(Mg0+, Ge0+, Ba0+)"
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)","[[ 2.06884519 2.40627241 -0.45891585] Si4-, [...","(Si4-, Cu3+, Sr2+)"
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)","[[2.06428082 0. 2.06428082] Pd0+, [0. ...","(Pd0+, Dy0+)"
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)","[[3.09635262 1.0689416 1.53602403] Mg0+, [0.5...","(Mg0+, Si0+, Ba0+)"
...,...,...,...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...","(Rh, I)","[[0. 0. 0.] Rh0+, [3.2029368 3.2029368 2.094...","(Rh0+, I0+)"
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....","(Mg, Co, Sn)","[[-1.51157821 4.4173925 1.21553922] Mg0+, [...","(Mg0+, Co0+, Sn0+)"
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...","(H, N, O)","[[4.37546772 4.51128393 6.81784473] H+, [0.457...","(H+, N5+, N3-, O2-)"
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...","(Si, Sn)","[[0. 0. 0.] Si4-, [ 4.55195829 4.55195829 -4....","(Si4-, Sn4+)"


In [21]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_obj)

In [22]:
# Features based on oxidation structure
df_struc = descriptors.generate_oxid_structural_features()
df_struc

MultipleFeaturizer:   0%|          | 0/10987 [00:00<?, ?it/s]

Total no. of features generated: 401


Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition,oxidation_structure,oxidation_composition,ReDF [0.00000 - 0.05000]A,ReDF [0.05000 - 0.10000]A,ReDF [0.10000 - 0.15000]A,ReDF [0.15000 - 0.20000]A,...,ReDF [19.55000 - 19.60000]A,ReDF [19.60000 - 19.65000]A,ReDF [19.65000 - 19.70000]A,ReDF [19.70000 - 19.75000]A,ReDF [19.75000 - 19.80000]A,ReDF [19.80000 - 19.85000]A,ReDF [19.85000 - 19.90000]A,ReDF [19.90000 - 19.95000]A,ReDF [19.95000 - 20.00000]A,ReDF [20.00000 - 20.00000]A
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)","[[0. 0. 0.] Ca0+, [1.37728887 1.57871271 3.739...","(Ca0+, Ge0+, Ag0+)",0.0,0.0,0.0,0.0,...,1.226022,2.935835,-1.300554,-1.946570,-1.294714,0.161465,-0.323344,2.248339,0.320419,0.0
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)","[[3.14048493 1.09300401 1.64101398] Mg0+, [0.6...","(Mg0+, Ge0+, Ba0+)",0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)","[[ 2.06884519 2.40627241 -0.45891585] Si4-, [...","(Si4-, Cu3+, Sr2+)",0.0,0.0,0.0,0.0,...,0.000000,1.631280,-0.976541,-1.945684,0.000000,-1.937177,-0.804554,1.885386,0.320646,0.0
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)","[[2.06428082 0. 2.06428082] Pd0+, [0. ...","(Pd0+, Dy0+)",0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)","[[3.09635262 1.0689416 1.53602403] Mg0+, [0.5...","(Mg0+, Si0+, Ba0+)",0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...","(Rh, I)","[[0. 0. 0.] Rh0+, [3.2029368 3.2029368 2.094...","(Rh0+, I0+)",0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....","(Mg, Co, Sn)","[[-1.51157821 4.4173925 1.21553922] Mg0+, [...","(Mg0+, Co0+, Sn0+)",0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...","(H, N, O)","[[4.37546772 4.51128393 6.81784473] H+, [0.457...","(H+, N5+, N3-, O2-)",0.0,0.0,0.0,0.0,...,-0.113328,-0.101733,-0.452127,0.146300,-0.258852,-0.016920,0.385824,-0.345269,0.250308,0.0
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...","(Si, Sn)","[[0. 0. 0.] Si4-, [ 4.55195829 4.55195829 -4....","(Si4-, Sn4+)",0.0,0.0,0.0,0.0,...,0.000000,0.000000,39.050766,0.000000,0.000000,-48.383375,0.000000,0.000000,0.000000,0.0


In [23]:
# Features based on structure
df_struc = descriptors.generate_structural_features()
df_struc

MultipleFeaturizer:   0%|          | 0/10987 [00:00<?, ?it/s]

Total no. of features generated: 1282


Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition,oxidation_structure,oxidation_composition,ReDF [0.00000 - 0.05000]A,ReDF [0.05000 - 0.10000]A,ReDF [0.10000 - 0.15000]A,ReDF [0.15000 - 0.20000]A,...,maximum local difference in SpaceGroupNumber,range local difference in SpaceGroupNumber,mean local difference in SpaceGroupNumber,avg_dev local difference in SpaceGroupNumber,spacegroup_num,crystal_system,crystal_system_int,is_centrosymmetric,n_symmetry_ops,dimensionality
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)","[[0. 0. 0.] Ca0+, [1.37728887 1.57871271 3.739...","(Ca0+, Ge0+, Ag0+)",0.0,0.0,0.0,0.0,...,0.000000,0.000000e+00,0.000000,0.000000e+00,139,tetragonal,4,True,16,3
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)","[[3.14048493 1.09300401 1.64101398] Mg0+, [0.6...","(Mg0+, Ge0+, Ba0+)",0.0,0.0,0.0,0.0,...,24.775786,7.719126e+00,20.199378,3.661127e+00,139,tetragonal,4,True,16,3
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)","[[ 2.06884519 2.40627241 -0.45891585] Si4-, [...","(Si4-, Cu3+, Sr2+)",0.0,0.0,0.0,0.0,...,1.663047,6.213164e-01,1.300934,2.896897e-01,139,tetragonal,4,True,16,3
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)","[[2.06428082 0. 2.06428082] Pd0+, [0. ...","(Pd0+, Dy0+)",0.0,0.0,0.0,0.0,...,31.000000,2.066667e+01,15.500000,7.750000e+00,221,cubic,1,True,48,3
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)","[[3.09635262 1.0689416 1.53602403] Mg0+, [0.5...","(Mg0+, Si0+, Ba0+)",0.0,0.0,0.0,0.0,...,26.049871,9.231468e+00,20.806035,4.195069e+00,139,tetragonal,4,True,16,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...","(Rh, I)","[[0. 0. 0.] Rh0+, [3.2029368 3.2029368 2.094...","(Rh0+, I0+)",0.0,0.0,0.0,0.0,...,161.000000,9.501058e+01,97.659643,4.222690e+01,136,tetragonal,4,True,16,3
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....","(Mg, Co, Sn)","[[-1.51157821 4.4173925 1.21553922] Mg0+, [...","(Mg0+, Co0+, Sn0+)",0.0,0.0,0.0,0.0,...,53.000000,5.299352e+01,13.283257,9.929186e+00,38,orthorhombic,5,False,4,3
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...","(H, N, O)","[[4.37546772 4.51128393 6.81784473] H+, [0.457...","(H+, N5+, N3-, O2-)",0.0,0.0,0.0,0.0,...,175.036553,1.750366e+02,109.238749,4.333938e+01,56,orthorhombic,5,True,8,0
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...","(Si, Sn)","[[0. 0. 0.] Si4-, [ 4.55195829 4.55195829 -4....","(Si4-, Sn4+)",0.0,0.0,0.0,0.0,...,71.421207,1.421085e-14,71.421207,7.105427e-15,216,cubic,1,False,24,3


In [24]:
joblib.dump(df_struc, 'database_moduli_structural.pkl')

['database_moduli_structural.pkl']

In [25]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_obj)

# Features based on JarvisCFID
df_Jarvis = descriptors.generate_JarvisCFID_features()

In [26]:
joblib.dump(df_struc, 'database_moduli_structural_jarviscfid.pkl')

##### (3) Extra Features

In [27]:
# Import packages
import pandas as pd
import joblib

from pymatgen import core

In [28]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [29]:
# Load Materials object file
df_obj = joblib.load('database_moduli_material_objects.pkl')
df_obj

Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition,oxidation_structure,oxidation_composition
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)","[[0. 0. 0.] Ca0+, [1.37728887 1.57871271 3.739...","(Ca0+, Ge0+, Ag0+)"
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)","[[3.14048493 1.09300401 1.64101398] Mg0+, [0.6...","(Mg0+, Ge0+, Ba0+)"
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)","[[ 2.06884519 2.40627241 -0.45891585] Si4-, [...","(Si4-, Cu3+, Sr2+)"
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)","[[2.06428082 0. 2.06428082] Pd0+, [0. ...","(Pd0+, Dy0+)"
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)","[[3.09635262 1.0689416 1.53602403] Mg0+, [0.5...","(Mg0+, Si0+, Ba0+)"
...,...,...,...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...","(Rh, I)","[[0. 0. 0.] Rh0+, [3.2029368 3.2029368 2.094...","(Rh0+, I0+)"
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....","(Mg, Co, Sn)","[[-1.51157821 4.4173925 1.21553922] Mg0+, [...","(Mg0+, Co0+, Sn0+)"
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...","(H, N, O)","[[4.37546772 4.51128393 6.81784473] H+, [0.457...","(H+, N5+, N3-, O2-)"
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...","(Si, Sn)","[[0. 0. 0.] Si4-, [ 4.55195829 4.55195829 -4....","(Si4-, Sn4+)"


In [30]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_obj)

df_custom = descriptors.custom_features(formula_col_exist=False, composition_col_exist=True)
df_custom

Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition,oxidation_structure,oxidation_composition,weight,total_e,avg_electroneg,noble_gas,...,alkaline,halogen,chalcogen,lanthanoid,actinoid,quadrupolar,s-block,p-block,d-block,f-block
0,1.447158,1.707570,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)","[[0. 0. 0.] Ca0+, [1.37728887 1.57871271 3.739...","(Ca0+, Ge0+, Ag0+)",401.094400,178.0,1.776000,0,...,1,0,0,0,0,1,1,1,1,0
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)","[[3.14048493 1.09300401 1.64101398] Mg0+, [0.6...","(Mg0+, Ge0+, Ba0+)",331.217000,144.0,1.506000,0,...,1,0,0,0,0,1,1,1,0,0
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)","[[ 2.06884519 2.40627241 -0.45891585] Si4-, [...","(Si4-, Cu3+, Sr2+)",270.883000,124.0,1.710000,0,...,1,0,0,0,0,1,1,1,1,0
3,1.707570,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)","[[2.06428082 0. 2.06428082] Pd0+, [0. ...","(Pd0+, Dy0+)",481.760000,204.0,1.955000,0,...,0,0,0,1,0,0,0,0,1,1
4,1.602060,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)","[[3.09635262 1.0689416 1.53602403] Mg0+, [0.5...","(Mg0+, Si0+, Ba0+)",242.108000,108.0,1.462000,0,...,1,0,0,0,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10982,1.414973,1.778151,"[[0. 0. 0.] Rh, [3.2029368 3.2029368 2.09459...","(Rh, I)","[[0. 0. 0.] Rh0+, [3.2029368 3.2029368 2.094...","(Rh0+, I0+)",713.428880,302.0,2.533333,0,...,0,1,0,0,0,1,0,1,1,0
10983,1.431364,1.724276,"[[-1.51157821 4.4173925 1.21553922] Mg, [3....","(Mg, Co, Sn)","[[-1.51157821 4.4173925 1.21553922] Mg0+, [...","(Mg0+, Co0+, Sn0+)",323.473195,149.0,1.462500,0,...,1,0,0,0,0,1,1,1,1,0
10984,1.000000,1.342423,"[[4.37546772 4.51128393 6.81784473] H, [0.4573...","(H, N, O)","[[4.37546772 4.51128393 6.81784473] H+, [0.457...","(H+, N5+, N3-, O2-)",640.346880,336.0,2.800000,0,...,0,0,1,0,0,1,1,1,0,0
10985,1.579784,1.770852,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...","(Si, Sn)","[[0. 0. 0.] Si4-, [ 4.55195829 4.55195829 -4....","(Si4-, Sn4+)",146.795500,64.0,1.930000,0,...,0,0,0,0,0,1,0,1,0,0


In [31]:
joblib.dump(df_custom, 'database_moduli_additional_features.pkl')

['database_moduli_additional_features.pkl']

#### Merge Results

In [32]:
# Import packages
import pandas as pd
import joblib
import re

from pymatgen import core

In [33]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [34]:
# Load Materials object file
df1 = joblib.load('database_moduli_composition.pkl')
df2 = joblib.load('database_moduli_structural.pkl')
df3 = joblib.load('database_moduli_structural_jarviscfid.pkl')
df4 = joblib.load('database_moduli_additional_features.pkl')

# Concat dataframes
frames = [df1, df2, df3, df4]

df_merged = pd.concat(frames, axis=1)
df_merged.head()

Unnamed: 0,log10(G_VRH),log10(K_VRH),structure,composition,oxidation_structure,oxidation_composition,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,...,alkaline,halogen,chalcogen,lanthanoid,actinoid,quadrupolar,s-block,p-block,d-block,f-block
0,1.447158,1.70757,"[[0. 0. 0.] Ca, [1.37728887 1.57871271 3.73949...","(Ca, Ge, Ag)","[[0. 0. 0.] Ca0+, [1.37728887 1.57871271 3.739...","(Ca0+, Ge0+, Ag0+)",0,0,0,0.0,...,1,0,0,0,0,1,1,1,1,0
1,1.518514,1.633468,"[[3.14048493 1.09300401 1.64101398] Mg, [0.625...","(Mg, Ge, Ba)","[[3.14048493 1.09300401 1.64101398] Mg0+, [0.6...","(Mg0+, Ge0+, Ba0+)",0,0,0,0.0,...,1,0,0,0,0,1,1,1,0,0
2,1.740363,1.908485,"[[ 2.06884519 2.40627241 -0.45891585] Si, [1....","(Si, Cu, Sr)","[[ 2.06884519 2.40627241 -0.45891585] Si4-, [...","(Si4-, Cu3+, Sr2+)",-4,3,7,4.107919,...,1,0,0,0,0,1,1,1,1,0
3,1.70757,2.117271,"[[2.06428082 0. 2.06428082] Pd, [0. ...","(Pd, Dy)","[[2.06428082 0. 2.06428082] Pd0+, [0. ...","(Pd0+, Dy0+)",0,0,0,0.0,...,0,0,0,1,0,0,0,0,1,1
4,1.60206,1.690196,"[[3.09635262 1.0689416 1.53602403] Mg, [0.593...","(Mg, Si, Ba)","[[3.09635262 1.0689416 1.53602403] Mg0+, [0.5...","(Mg0+, Si0+, Ba0+)",0,0,0,0.0,...,1,0,0,0,0,1,1,1,0,0


In [35]:
# Remove duplications
df_merged  = df_merged.loc[:,~df_merged.columns.duplicated()]

# Remove space and JSON character in the column
df_merged.columns = df_merged.columns.str.replace(' ','_')
df_merged = df_merged.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Drop chemical & structural objects
# full_list = ['formula', 'structure', 'composition', 'oxidation_structure', 'oxidation_composition']
cols = ['structure', 'composition', 'oxidation_structure', 'oxidation_composition']

df_merged = df_merged.drop(cols, axis=1)

# False and True -> 0 and 1
df_merged.replace({False: 0, True: 1}, inplace=True)
df_merged

Unnamed: 0,log10G_VRH,log10K_VRH,minimum_oxidation_state,maximum_oxidation_state,range_oxidation_state,std_dev_oxidation_state,compound_possible,max_ionic_char,avg_ionic_char,avg_anion_electron_affinity,...,alkaline,halogen,chalcogen,lanthanoid,actinoid,quadrupolar,sblock,pblock,dblock,fblock
0,1.447158,1.707570,0,0,0,0.000000,1,0.225103,0.033820,0.0,...,1,0,0,0,0,1,1,1,1,0
1,1.518514,1.633468,0,0,0,0.000000,1,0.269189,0.043433,0.0,...,1,0,0,0,0,1,1,1,0,0
2,1.740363,1.908485,-4,3,7,4.107919,1,0.201983,0.032317,-534400.0,...,1,0,0,0,0,1,1,1,1,0
3,1.707570,2.117271,0,0,0,0.000000,1,0.213451,0.040022,0.0,...,0,0,0,1,0,0,0,0,1,1
4,1.602060,1.690196,0,0,0,0.000000,1,0.225103,0.034795,0.0,...,1,0,0,0,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10982,1.414973,1.778151,0,0,0,0.000000,1,0.035456,0.007879,0.0,...,0,1,0,0,0,1,0,1,1,0
10983,1.431364,1.724276,0,0,0,0.000000,1,0.100238,0.016736,0.0,...,1,0,0,0,0,1,1,1,1,0
10984,1.000000,1.342423,-3,5,8,2.886751,1,0.319141,0.024230,-216750.0,...,0,0,1,0,0,1,1,1,0,0
10985,1.579784,1.770852,-4,4,8,5.656854,1,0.000900,0.000225,-534400.0,...,0,0,0,0,0,1,0,1,0,0


One-Hot-Encoding Categorical Features

In [36]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_merged)

# One hot encoding of categorical features
df_merged, categorical_feature = descriptors.OHE()

categorical_feature

No. of categorical features: 5


['HOMO_character_ohe_0',
 'HOMO_character_ohe_d',
 'HOMO_character_ohe_f',
 'HOMO_character_ohe_p',
 'HOMO_character_ohe_s',
 'HOMO_element_ohe_0',
 'HOMO_element_ohe_Ac',
 'HOMO_element_ohe_Ag',
 'HOMO_element_ohe_Al',
 'HOMO_element_ohe_As',
 'HOMO_element_ohe_Au',
 'HOMO_element_ohe_B',
 'HOMO_element_ohe_Ba',
 'HOMO_element_ohe_Be',
 'HOMO_element_ohe_Bi',
 'HOMO_element_ohe_Br',
 'HOMO_element_ohe_C',
 'HOMO_element_ohe_Ca',
 'HOMO_element_ohe_Cd',
 'HOMO_element_ohe_Ce',
 'HOMO_element_ohe_Cl',
 'HOMO_element_ohe_Co',
 'HOMO_element_ohe_Cr',
 'HOMO_element_ohe_Cs',
 'HOMO_element_ohe_Cu',
 'HOMO_element_ohe_Dy',
 'HOMO_element_ohe_Er',
 'HOMO_element_ohe_Eu',
 'HOMO_element_ohe_F',
 'HOMO_element_ohe_Fe',
 'HOMO_element_ohe_Ga',
 'HOMO_element_ohe_Gd',
 'HOMO_element_ohe_Ge',
 'HOMO_element_ohe_H',
 'HOMO_element_ohe_Hf',
 'HOMO_element_ohe_Hg',
 'HOMO_element_ohe_Ho',
 'HOMO_element_ohe_I',
 'HOMO_element_ohe_In',
 'HOMO_element_ohe_Ir',
 'HOMO_element_ohe_K',
 'HOMO_element_ohe

In [37]:
df_merged = descriptors.movecol(['log10G_VRH', 'log10K_VRH'], ref_col='crystal_system_ohe_trigonal')
df_merged.head()

Unnamed: 0,minimum_oxidation_state,maximum_oxidation_state,range_oxidation_state,std_dev_oxidation_state,compound_possible,max_ionic_char,avg_ionic_char,avg_anion_electron_affinity,minimum_EN_difference,maximum_EN_difference,...,LUMO_element_ohe_Zr,crystal_system_ohe_cubic,crystal_system_ohe_hexagonal,crystal_system_ohe_monoclinic,crystal_system_ohe_orthorhombic,crystal_system_ohe_tetragonal,crystal_system_ohe_triclinic,crystal_system_ohe_trigonal,log10G_VRH,log10K_VRH
0,0,0,0,0.0,1,0.225103,0.03382,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,1.447158,1.70757
1,0,0,0,0.0,1,0.269189,0.043433,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,1.518514,1.633468
2,-4,3,7,4.107919,1,0.201983,0.032317,-534400.0,0.0,0.95,...,0,0,0,0,0,1,0,0,1.740363,1.908485
3,0,0,0,0.0,1,0.213451,0.040022,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1.70757,2.117271
4,0,0,0,0.0,1,0.225103,0.034795,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,1.60206,1.690196


In [38]:
feature = df_merged.columns[:-2].tolist()
print(len(feature))

4053


In [39]:
df_merged.columns[-2:].tolist()

['log10G_VRH', 'log10K_VRH']

Save Data

In [40]:
joblib.dump(df_merged, 'database_moduli_merged.pkl')
joblib.dump(feature, 'features.pkl')

['features.pkl']