### Load data

In [1]:
# Import packages
import pandas as pd
import joblib

In [2]:
# Import custom packages
import sys
# NOTE(review): machine-specific absolute path; this cell only works on a machine where
# this directory exists. Consider deriving it from a configurable project root instead.
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

# Project-local module; presumably resolved through the sys.path entry added above
import create_material_objects as cmo

In [3]:
# Load the pickled dataset and discard the columns listed in the drop call
df_original = (
    joblib.load('dielectric_constant.pkl')
    .drop(columns=['material_id', 'e_electronic', 'e_total', 'cif', 'meta', 'poscar'])
)

df_original

Unnamed: 0,formula,nsites,space_group,volume,structure,band_gap,n,poly_electronic,poly_total,pot_ferroelectric
0,Rb2Te,3,225,159.501208,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...",1.88,1.86,3.44,6.23,False
1,CdCl2,3,166,84.298097,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...",3.52,1.78,3.16,6.73,False
2,MnI2,3,164,108.335875,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...",1.17,2.23,4.97,10.64,False
3,LaN,4,186,88.162562,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,1.12,2.65,7.04,17.99,False
4,MnF2,6,136,82.826401,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...",2.87,1.53,2.35,7.12,False
...,...,...,...,...,...,...,...,...,...,...
1051,Cd(InSe2)2,7,111,212.493121,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...",0.87,2.77,7.67,11.76,True
1052,LaHBr2,8,194,220.041363,"[[2.068917 3.58317965 3.70992025] La, [4.400...",3.60,2.00,3.99,7.08,True
1053,Li2AgSb,4,216,73.882306,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...",0.14,14.58,212.61,232.60,True
1054,Rb3AuO,5,221,177.269065,"[[0. 2.808758 2.808758] Rb, [2.808758 2....",0.21,2.53,6.41,22.44,True


In [4]:
# Keep only the formula and structure columns.
# .copy() makes this an independent frame, so columns added to it later are never
# written into a slice of df_original (avoids pandas' view-vs-copy ambiguity).
df = df_original[['formula', 'structure']].copy()
df

Unnamed: 0,formula,structure
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271..."
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13..."
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+..."
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M..."
...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ..."
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400..."
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719..."
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2...."


In [5]:
# Initialize to create/convert material object (pymatgen)
converter = cmo.material_object(df=df)

# Possible functions
# Create Composition object from chemical formula
# df = converter.formula2composition(formula_col='formula')

# Create Composition object from Structure object
# df = converter.structure2composition(structure_col='structure')

# Create Oxidation Structure object from Structure object
# df = converter.structure2oxidstructure(structure_col='structure')

# Create Oxidation Composition object from Composition object
# df = converter.composition2oxidcomposition(composition_col='composition')

# Source Structure Object from MP database using Composition object
# NOTE(review): never commit a literal Materials Project API key in a notebook; read it
# from the environment. Any key that was previously embedded here should be revoked.
# df = converter.composition2structure_fromMP(composition_col='composition', mapi_key=os.environ['MP_API_KEY'])

In [6]:
# Create Composition object from Structure object
# (reads the 'structure' column; the displayed result gains a 'composition' column)
df = converter.structure2composition(structure_col='structure')
df

StructureToComposition:   0%|          | 0/1056 [00:00<?, ?it/s]

Unnamed: 0,formula,structure,composition
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)"
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)"
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)"
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)"
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)"
...,...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...","(Cd, In, Se)"
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400...","(La, H, Br)"
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...","(Li, Ag, Sb)"
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2....","(Rb, Au, O)"


In [7]:
# Create Oxidation Structure object from Structure object
# (the displayed result keeps the earlier columns and gains 'oxidation_structure')
df = converter.structure2oxidstructure(structure_col='structure')
df

StructureToOxidStructure:   0%|          | 0/1056 [00:00<?, ?it/s]

Unnamed: 0,formula,structure,composition,oxidation_structure
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27..."
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13...."
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320..."
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]..."
...,...,...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...","(Cd, In, Se)","[[0. 0. 0.] Cd2+, [2.9560375 0. 3.03973..."
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400...","(La, H, Br)","[[2.068917 3.58317965 3.70992025] La3+, [4.4..."
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...","(Li, Ag, Sb)","[[1.35965225 0.96141925 2.354987 ] Li+, [2.71..."
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2....","(Rb, Au, O)","[[0. 2.808758 2.808758] Rb+, [2.808758 2..."


In [8]:
# Create Oxidation Composition object from Composition object
# (reads the 'composition' column created two cells earlier; the displayed result
# gains 'oxidation_composition')
df = converter.composition2oxidcomposition(composition_col='composition')
df

CompositionToOxidComposition:   0%|          | 0/1056 [00:00<?, ?it/s]

Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27...","(Rb+, Te2-)"
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13....","(Cd2+, Cl-)"
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320...","(Mn2+, I-)"
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La3+, N3-)"
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]...","(Mn2+, F-)"
...,...,...,...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...","(Cd, In, Se)","[[0. 0. 0.] Cd2+, [2.9560375 0. 3.03973...","(Cd2+, In3+, Se2-)"
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400...","(La, H, Br)","[[2.068917 3.58317965 3.70992025] La3+, [4.4...","(La3+, H-, Br-)"
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...","(Li, Ag, Sb)","[[1.35965225 0.96141925 2.354987 ] Li+, [2.71...","(Li+, Ag+, Sb3-)"
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2....","(Rb, Au, O)","[[0. 2.808758 2.808758] Rb+, [2.808758 2...","(Rb+, Au-, O2-)"


In [9]:
# Persist the frame with all material-object columns; the featurization sections reload this file
joblib.dump(df, 'dielectric_constant_material_objects.pkl')

['dielectric_constant_material_objects.pkl']

#### Generate Features

##### (1) Composition

In [10]:
# Import packages
import pandas as pd
import joblib

from pymatgen import core

In [11]:
# Import custom packages
import sys
# NOTE(review): machine-specific absolute path; this cell only works on a machine where
# this directory exists. Consider deriving it from a configurable project root instead.
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

# Project-local module; presumably resolved through the sys.path entry added above
import featurizers as feat

In [12]:
# Load the material-object frame written by the earlier
# joblib.dump(df, 'dielectric_constant_material_objects.pkl') cell
df_obj = joblib.load('dielectric_constant_material_objects.pkl')
df_obj

Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27...","(Rb+, Te2-)"
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13....","(Cd2+, Cl-)"
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320...","(Mn2+, I-)"
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La3+, N3-)"
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]...","(Mn2+, F-)"
...,...,...,...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...","(Cd, In, Se)","[[0. 0. 0.] Cd2+, [2.9560375 0. 3.03973...","(Cd2+, In3+, Se2-)"
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400...","(La, H, Br)","[[2.068917 3.58317965 3.70992025] La3+, [4.4...","(La3+, H-, Br-)"
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...","(Li, Ag, Sb)","[[1.35965225 0.96141925 2.354987 ] Li+, [2.71...","(Li+, Ag+, Sb3-)"
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2....","(Rb, Au, O)","[[0. 2.808758 2.808758] Rb+, [2.808758 2...","(Rb+, Au-, O2-)"


In [13]:
# Initialise featurizers script: wrap the material-object frame in the project featurizer helper
descriptors = feat.prepare_to_featurize(df=df_obj)

In [14]:
# Features based on oxidation composition
# (the helper reports 13 generated features in the recorded run)
df_com = descriptors.generate_oxid_composition_features()
df_com

MultipleFeaturizer:   0%|          | 0/1056 [00:00<?, ?it/s]

Total no. of features generated: 13


Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,max ionic char,avg ionic char,avg anion electron affinity,minimum EN difference,maximum EN difference,range EN difference,mean EN difference,std_dev EN difference
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27...","(Rb+, Te2-)",-2,1,3,2.121320,True,0.336084,0.074685,-380400.000000,1.280000,1.280000,0.00,1.280000,0.000000
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13....","(Cd2+, Cl-)",-1,2,3,2.121320,True,0.417383,0.092752,-349000.000000,1.470000,1.470000,0.00,1.470000,0.000000
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320...","(Mn2+, I-)",-1,2,3,2.121320,True,0.265103,0.058912,-295200.000000,1.110000,1.110000,0.00,1.110000,0.000000
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La3+, N3-)",-3,3,6,4.242641,True,0.609724,0.152431,-21000.000000,1.940000,1.940000,0.00,1.940000,0.000000
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]...","(Mn2+, F-)",-1,2,3,2.121320,True,0.771501,0.171445,-328000.000000,2.430000,2.430000,0.00,2.430000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...","(Cd, In, Se)","[[0. 0. 0.] Cd2+, [2.9560375 0. 3.03973...","(Cd2+, In3+, Se2-)",-2,3,5,3.082207,True,0.168813,0.036355,-390000.000000,0.770000,0.860000,0.09,0.800000,0.063640
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400...","(La, H, Br)","[[2.068917 3.58317965 3.70992025] La3+, [4.4...","(La3+, H-, Br-)",-1,3,4,2.190890,True,0.578906,0.105485,-240666.666667,1.606667,1.606667,0.00,1.606667,0.000000
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...","(Li, Ag, Sb)","[[1.35965225 0.96141925 2.354987 ] Li+, [2.71...","(Li+, Ag+, Sb3-)",-3,1,4,2.190890,True,0.248906,0.056586,-309600.000000,0.120000,1.070000,0.95,0.753333,0.671751
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2....","(Rb, Au, O)","[[0. 2.808758 2.808758] Rb+, [2.808758 2...","(Rb+, Au-, O2-)",-2,1,3,1.690309,True,0.820234,0.168484,-252400.000000,2.170000,2.170000,0.00,2.170000,0.000000


In [15]:
# Features based on composition
# NOTE(review): df_com is overwritten here. The displayed result still contains the
# oxidation-state columns from the previous cell, so `descriptors` appears to accumulate
# features across calls — confirm in featurizers.py before reordering these cells.
df_com = descriptors.generate_composition_features()
df_com

MultipleFeaturizer:   0%|          | 0/1056 [00:00<?, ?it/s]

Total no. of features generated: 610


Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,...,frac p valence electrons,frac d valence electrons,frac d valence electrons.1,frac f valence electrons,frac f valence electrons.1,mean simul. packing efficiency,mean abs simul. packing efficiency,dist from 1 clusters |APE| < 0.010,dist from 3 clusters |APE| < 0.010,dist from 5 clusters |APE| < 0.010
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27...","(Rb+, Te2-)",-2,1,3,2.121320,True,...,0.222222,0.555556,0.555556,0.000000,0.000000,-0.030549,0.030549,0.027730,0.140189,0.214122
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13....","(Cd2+, Cl-)",-1,2,3,2.121320,True,...,0.384615,0.384615,0.384615,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,1.000000
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320...","(Mn2+, I-)",-1,2,3,2.121320,True,...,0.243902,0.609756,0.609756,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,1.000000
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La3+, N3-)",-3,3,6,4.242641,True,...,0.375000,0.125000,0.125000,0.000000,0.000000,0.037958,0.037958,0.000000,0.050508,0.142391
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]...","(Mn2+, F-)",-1,2,3,2.121320,True,...,0.476190,0.238095,0.238095,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...","(Cd, In, Se)","[[0. 0. 0.] Cd2+, [2.9560375 0. 3.03973...","(Cd2+, In3+, Se2-)",-2,3,5,3.082207,True,...,0.176471,0.686275,0.686275,0.000000,0.000000,-0.011231,0.011231,0.031442,0.041923,0.056596
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400...","(La, H, Br)","[[2.068917 3.58317965 3.70992025] La3+, [4.4...","(La3+, H-, Br-)",-1,3,4,2.190890,True,...,0.263158,0.552632,0.552632,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,1.000000
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...","(Li, Ag, Sb)","[[1.35965225 0.96141925 2.354987 ] Li+, [2.71...","(Li+, Ag+, Sb3-)",-3,1,4,2.190890,True,...,0.107143,0.714286,0.714286,0.000000,0.000000,0.005918,0.026102,0.147196,0.175736,0.195380
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2....","(Rb, Au, O)","[[0. 2.808758 2.808758] Rb+, [2.808758 2...","(Rb+, Au-, O2-)",-2,1,3,1.690309,True,...,0.117647,0.294118,0.294118,0.411765,0.411765,-0.027615,0.027615,0.053674,0.067719,0.075503


In [16]:
# Persist the composition feature table; the "Merge results" section reloads this file
joblib.dump(df_com, 'dielectric_constant_composition.pkl')

['dielectric_constant_composition.pkl']

##### (2) Structural 

In [1]:
# Import packages
import pandas as pd
import joblib

from pymatgen import core

In [2]:
# Import custom packages
import sys
# NOTE(review): machine-specific absolute path; this cell only works on a machine where
# this directory exists. Consider deriving it from a configurable project root instead.
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

# Project-local module; presumably resolved through the sys.path entry added above
import featurizers as feat

In [3]:
# Load the material-object frame written by the earlier
# joblib.dump(df, 'dielectric_constant_material_objects.pkl') cell
df_obj = joblib.load('dielectric_constant_material_objects.pkl')
df_obj

Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27...","(Rb+, Te2-)"
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13....","(Cd2+, Cl-)"
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320...","(Mn2+, I-)"
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La3+, N3-)"
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]...","(Mn2+, F-)"
...,...,...,...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...","(Cd, In, Se)","[[0. 0. 0.] Cd2+, [2.9560375 0. 3.03973...","(Cd2+, In3+, Se2-)"
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400...","(La, H, Br)","[[2.068917 3.58317965 3.70992025] La3+, [4.4...","(La3+, H-, Br-)"
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...","(Li, Ag, Sb)","[[1.35965225 0.96141925 2.354987 ] Li+, [2.71...","(Li+, Ag+, Sb3-)"
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2....","(Rb, Au, O)","[[0. 2.808758 2.808758] Rb+, [2.808758 2...","(Rb+, Au-, O2-)"


In [4]:
# Initialization of featurizer: wrap the material-object frame in the project featurizer helper
descriptors = feat.prepare_to_featurize(df=df_obj)

In [5]:
# Features based on oxidation structure
# (the helper reports 401 generated features in the recorded run; the displayed columns are ReDF bins)
df_struc = descriptors.generate_oxid_structural_features()
df_struc

MultipleFeaturizer:   0%|          | 0/1056 [00:00<?, ?it/s]

Total no. of features generated: 401


Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition,ReDF [0.00000 - 0.05000]A,ReDF [0.05000 - 0.10000]A,ReDF [0.10000 - 0.15000]A,ReDF [0.15000 - 0.20000]A,ReDF [0.20000 - 0.25000]A,...,ReDF [19.55000 - 19.60000]A,ReDF [19.60000 - 19.65000]A,ReDF [19.65000 - 19.70000]A,ReDF [19.70000 - 19.75000]A,ReDF [19.75000 - 19.80000]A,ReDF [19.80000 - 19.85000]A,ReDF [19.85000 - 19.90000]A,ReDF [19.90000 - 19.95000]A,ReDF [19.95000 - 20.00000]A,ReDF [20.00000 - 20.00000]A
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27...","(Rb+, Te2-)",0.0,0.0,0.0,0.0,0.0,...,0.000000,-4.896106,0.000000,1.622292,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.0
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13....","(Cd2+, Cl-)",0.0,0.0,0.0,0.0,0.0,...,0.815201,0.611329,0.305005,-0.405318,0.000000,-1.615153,0.000000e+00,0.000000,-1.203220,0.0
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320...","(Mn2+, I-)",0.0,0.0,0.0,0.0,0.0,...,-0.817948,0.102039,0.000000,-0.811345,0.202176,0.302345,0.000000e+00,0.200714,-0.400036,0.0
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La3+, N3-)",0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,5.482701,-2.736803,2.737384,-2.723270,5.447680e+00,0.000000,2.708751,0.0
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]...","(Mn2+, F-)",0.0,0.0,0.0,0.0,0.0,...,0.341416,-0.611922,0.000000,0.947294,-1.887994,0.538734,-1.346421e-01,-0.134090,-0.267314,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...","(Cd, In, Se)","[[0. 0. 0.] Cd2+, [2.9560375 0. 3.03973...","(Cd2+, In3+, Se2-)",0.0,0.0,0.0,0.0,0.0,...,-1.636214,-2.095994,4.185305,-2.548964,-1.617008,-1.383274,-6.897888e-01,-0.917385,-1.374122,0.0
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400...","(La, H, Br)","[[2.068917 3.58317965 3.70992025] La3+, [4.4...","(La3+, H-, Br-)",0.0,0.0,0.0,0.0,0.0,...,-0.460248,-0.918006,-0.610653,0.608003,-0.607538,0.151161,-3.017240e-01,-0.903234,0.000000,0.0
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...","(Li, Ag, Sb)","[[1.35965225 0.96141925 2.354987 ] Li+, [2.71...","(Li+, Ag+, Sb3-)",0.0,0.0,0.0,0.0,0.0,...,0.000000,-3.667344,0.000000,-2.436148,0.000000,0.000000,0.000000e+00,0.000000,4.503890,0.0
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2....","(Rb, Au, O)","[[0. 2.808758 2.808758] Rb+, [2.808758 2...","(Rb+, Au-, O2-)",0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,-2.197209,0.000000,0.000000,0.000000,-2.775558e-17,0.000000,0.000000,0.0


In [6]:
# Features based on structure
# NOTE(review): df_struc is overwritten here. The displayed result still contains the
# ReDF columns from the previous cell, so `descriptors` appears to accumulate features
# across calls — confirm in featurizers.py before reordering these cells.
df_struc = descriptors.generate_structural_features()
df_struc

MultipleFeaturizer:   0%|          | 0/1056 [00:00<?, ?it/s]

Total no. of features generated: 1282


Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition,ReDF [0.00000 - 0.05000]A,ReDF [0.05000 - 0.10000]A,ReDF [0.10000 - 0.15000]A,ReDF [0.15000 - 0.20000]A,ReDF [0.20000 - 0.25000]A,...,maximum local difference in SpaceGroupNumber,range local difference in SpaceGroupNumber,mean local difference in SpaceGroupNumber,avg_dev local difference in SpaceGroupNumber,spacegroup_num,crystal_system,crystal_system_int,is_centrosymmetric,n_symmetry_ops,dimensionality
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27...","(Rb+, Te2-)",0.0,0.0,0.0,0.0,0.0,...,77.000000,44.194265,47.537157,19.641896,225,cubic,1,True,48,3
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13....","(Cd2+, Cl-)",0.0,0.0,0.0,0.0,0.0,...,130.000000,79.338011,77.107993,35.261338,166,trigonal,3,True,12,2
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320...","(Mn2+, I-)",0.0,0.0,0.0,0.0,0.0,...,153.000000,93.740856,90.506096,41.662603,164,trigonal,3,True,12,2
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La3+, N3-)",0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,186,hexagonal,2,False,12,3
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]...","(Mn2+, F-)",0.0,0.0,0.0,0.0,0.0,...,202.000000,118.441000,123.039333,52.640445,136,tetragonal,4,True,16,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...","(Cd, In, Se)","[[0. 0. 0.] Cd2+, [2.9560375 0. 3.03973...","(Cd2+, In3+, Se2-)",0.0,0.0,0.0,0.0,0.0,...,159.038156,72.830009,102.304974,18.396373,111,tetragonal,4,False,8,3
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400...","(La, H, Br)","[[2.068917 3.58317965 3.70992025] La3+, [4.4...","(La3+, H-, Br-)",0.0,0.0,0.0,0.0,0.0,...,75.312723,25.297122,62.664162,12.648561,194,hexagonal,2,True,24,2
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...","(Li, Ag, Sb)","[[1.35965225 0.96141925 2.354987 ] Li+, [2.71...","(Li+, Ag+, Sb3-)",0.0,0.0,0.0,0.0,0.0,...,61.448018,45.783455,31.948018,14.750000,216,cubic,1,False,24,3
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2....","(Rb, Au, O)","[[0. 2.808758 2.808758] Rb+, [2.808758 2...","(Rb+, Au-, O2-)",0.0,0.0,0.0,0.0,0.0,...,217.000000,213.000000,79.095300,55.161880,221,cubic,1,True,48,3


In [7]:
# Persist the structural feature table; the "Merge results" section reloads this file
joblib.dump(df_struc, 'dielectric_constant_structural.pkl')

['dielectric_constant_structural.pkl']

In [8]:
# Initialization of featurizer
# (a fresh helper is built from df_obj, presumably so the JarvisCFID table does not
# carry the structural features generated above — TODO confirm)
descriptors = feat.prepare_to_featurize(df=df_obj)

# Features based on JarvisCFID
# NOTE(review): the result is stored in df_Jarvis and is never displayed in this notebook
df_Jarvis = descriptors.generate_JarvisCFID_features()

In [9]:
# Save the JarvisCFID features (df_Jarvis), not df_struc: df_struc was already written to
# 'dielectric_constant_structural.pkl', and the merge section loads this file as a
# separate feature table.
joblib.dump(df_Jarvis, 'dielectric_constant_structural_jarviscfid.pkl')

##### (3) Extra Features

In [1]:
# Import packages
import pandas as pd
import joblib

from pymatgen import core

In [2]:
# Import custom packages
import sys
# NOTE(review): machine-specific absolute path; this cell only works on a machine where
# this directory exists. Consider deriving it from a configurable project root instead.
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

# Project-local module; presumably resolved through the sys.path entry added above
import featurizers as feat

In [3]:
# Load the material-object frame written by the earlier
# joblib.dump(df, 'dielectric_constant_material_objects.pkl') cell
df_obj = joblib.load('dielectric_constant_material_objects.pkl')
df_obj

Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27...","(Rb+, Te2-)"
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13....","(Cd2+, Cl-)"
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320...","(Mn2+, I-)"
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La3+, N3-)"
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]...","(Mn2+, F-)"
...,...,...,...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...","(Cd, In, Se)","[[0. 0. 0.] Cd2+, [2.9560375 0. 3.03973...","(Cd2+, In3+, Se2-)"
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400...","(La, H, Br)","[[2.068917 3.58317965 3.70992025] La3+, [4.4...","(La3+, H-, Br-)"
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...","(Li, Ag, Sb)","[[1.35965225 0.96141925 2.354987 ] Li+, [2.71...","(Li+, Ag+, Sb3-)"
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2....","(Rb, Au, O)","[[0. 2.808758 2.808758] Rb+, [2.808758 2...","(Rb+, Au-, O2-)"


In [4]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_obj)

# Generate the project's custom ("extra") features.
# NOTE(review): the displayed result has exactly the same columns as df_obj, so no new
# feature columns are visible here — verify that custom_features() actually adds any.
# NOTE(review): composition_col_exist=False is passed although df_obj already has a
# 'composition' column — confirm the intended flag value against featurizers.py.
df_custom = descriptors.custom_features(formula_col_exist=True, composition_col_exist=False)
df_custom

Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27...","(Rb+, Te2-)"
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13....","(Cd2+, Cl-)"
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320...","(Mn2+, I-)"
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La3+, N3-)"
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]...","(Mn2+, F-)"
...,...,...,...,...,...
1051,Cd(InSe2)2,"[[0. 0. 0.] Cd, [2.9560375 0. 3.03973 ...","(Cd, In, Se)","[[0. 0. 0.] Cd2+, [2.9560375 0. 3.03973...","(Cd2+, In3+, Se2-)"
1052,LaHBr2,"[[2.068917 3.58317965 3.70992025] La, [4.400...","(La, H, Br)","[[2.068917 3.58317965 3.70992025] La3+, [4.4...","(La3+, H-, Br-)"
1053,Li2AgSb,"[[1.35965225 0.96141925 2.354987 ] Li, [2.719...","(Li, Ag, Sb)","[[1.35965225 0.96141925 2.354987 ] Li+, [2.71...","(Li+, Ag+, Sb3-)"
1054,Rb3AuO,"[[0. 2.808758 2.808758] Rb, [2.808758 2....","(Rb, Au, O)","[[0. 2.808758 2.808758] Rb+, [2.808758 2...","(Rb+, Au-, O2-)"


In [5]:
# Persist the custom feature table; the "Merge results" section reloads this file
joblib.dump(df_custom, 'dielectric_constant_additional_features.pkl')

['dielectric_constant_additional_features.pkl']

#### Merge results

In [1]:
# Import packages
import pandas as pd
import joblib
import re

from pymatgen import core

In [2]:
# Import custom packages
import sys
# NOTE(review): machine-specific absolute path; this cell only works on a machine where
# this directory exists. Consider deriving it from a configurable project root instead.
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

# Project-local module; presumably resolved through the sys.path entry added above
import featurizers as feat

In [3]:
# Reload the four per-stage feature tables saved by the sections above
df1, df2, df3, df4 = (
    joblib.load(pickle_name)
    for pickle_name in (
        'dielectric_constant_composition.pkl',
        'dielectric_constant_structural.pkl',
        'dielectric_constant_structural_jarviscfid.pkl',
        'dielectric_constant_additional_features.pkl',
    )
)

# Place the tables side by side (column-wise); repeated columns are removed in the next cell
frames = [df1, df2, df3, df4]
df_merged = pd.concat(frames, axis=1)
df_merged.head()

Unnamed: 0,formula,structure,composition,oxidation_structure,oxidation_composition,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,...,jml_nn_96,jml_nn_97,jml_nn_98,jml_nn_99,jml_nn_100,formula.1,structure.1,composition.1,oxidation_structure.1,oxidation_composition.1
0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27...","(Rb+, Te2-)",-2,1,3,2.12132,True,...,0.0,16.0,0.0,0.0,0.0,Rb2Te,"[[1.75725875 1.2425695 3.04366125] Rb, [5.271...","(Rb, Te)","[[1.75725875 1.2425695 3.04366125] Rb+, [5.27...","(Rb+, Te2-)"
1,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13....","(Cd2+, Cl-)",-1,2,3,2.12132,True,...,2.0,0.0,10.666667,0.0,12.0,CdCl2,"[[0. 0. 0.] Cd, [ 4.27210959 2.64061969 13.13...","(Cd, Cl)","[[0. 0. 0.] Cd2+, [ 4.27210959 2.64061969 13....","(Cd2+, Cl-)"
2,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320...","(Mn2+, I-)",-1,2,3,2.12132,True,...,0.0,0.0,4.0,0.0,0.0,MnI2,"[[0. 0. 0.] Mn, [-2.07904300e-06 2.40067320e+...","(Mn, I)","[[0. 0. 0.] Mn2+, [-2.07904300e-06 2.40067320...","(Mn2+, I-)"
3,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La3+, N3-)",-3,3,6,4.242641,True,...,3.0,0.0,0.0,0.0,6.0,LaN,[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La, N)",[[-1.73309900e-06 2.38611186e+00 5.95256328e...,"(La3+, N3-)"
4,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]...","(Mn2+, F-)",-1,2,3,2.12132,True,...,0.0,0.0,19.333333,13.333333,14.666667,MnF2,"[[1.677294 2.484476 2.484476] Mn, [0. 0. 0.] M...","(Mn, F)","[[1.677294 2.484476 2.484476] Mn2+, [0. 0. 0.]...","(Mn2+, F-)"


In [4]:
# Columns to drop once the names have been cleaned
cols = ['formula', 'structure', 'composition', 'oxidation_structure', 'oxidation_composition']


def _sanitise(name):
    """Turn spaces into underscores, then strip every character outside [A-Za-z0-9_]."""
    return re.sub('[^A-Za-z0-9_]+', '', name.replace(' ', '_'))


df_merged = (
    df_merged
    .loc[:, ~df_merged.columns.duplicated()]   # keep the first occurrence of each repeated name
    .rename(columns=_sanitise)                 # remove spaces and JSON-unfriendly characters
    .drop(columns=cols)
    .replace({False: 0, True: 1})              # False and True -> 0 and 1
)
df_merged

Unnamed: 0,minimum_oxidation_state,maximum_oxidation_state,range_oxidation_state,std_dev_oxidation_state,compound_possible,max_ionic_char,avg_ionic_char,avg_anion_electron_affinity,minimum_EN_difference,maximum_EN_difference,...,jml_nn_91,jml_nn_92,jml_nn_93,jml_nn_94,jml_nn_95,jml_nn_96,jml_nn_97,jml_nn_98,jml_nn_99,jml_nn_100
0,-2,1,3,2.121320,1,0.336084,0.074685,-380400.000000,1.280000,1.280000,...,0.000000,0.000000,0.0,16.000000,0.000000,0.000000,16.000000,0.000000,0.000000,0.000000
1,-1,2,3,2.121320,1,0.417383,0.092752,-349000.000000,1.470000,1.470000,...,0.000000,4.666667,8.0,0.000000,0.000000,2.000000,0.000000,10.666667,0.000000,12.000000
2,-1,2,3,2.121320,1,0.265103,0.058912,-295200.000000,1.110000,1.110000,...,0.000000,0.000000,8.0,0.000000,4.000000,0.000000,0.000000,4.000000,0.000000,0.000000
3,-3,3,6,4.242641,1,0.609724,0.152431,-21000.000000,1.940000,1.940000,...,6.000000,12.000000,6.0,18.000000,1.000000,3.000000,0.000000,0.000000,0.000000,6.000000
4,-1,2,3,2.121320,1,0.771501,0.171445,-328000.000000,2.430000,2.430000,...,8.000000,17.333333,0.0,10.666667,10.666667,0.000000,0.000000,19.333333,13.333333,14.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,-2,3,5,3.082207,1,0.168813,0.036355,-390000.000000,0.770000,0.860000,...,4.571429,2.285714,0.0,5.714286,1.142857,6.857143,4.571429,0.000000,5.714286,2.285714
1052,-1,3,4,2.190890,1,0.578906,0.105485,-240666.666667,1.606667,1.606667,...,6.000000,3.000000,0.0,4.000000,0.000000,1.500000,3.000000,21.000000,0.000000,6.000000
1053,-3,1,4,2.190890,1,0.248906,0.056586,-309600.000000,0.120000,1.070000,...,0.000000,0.000000,0.0,0.000000,12.000000,0.000000,0.000000,0.000000,48.000000,30.000000
1054,-2,1,3,1.690309,1,0.820234,0.168484,-252400.000000,2.170000,2.170000,...,0.000000,0.000000,0.0,9.600000,0.000000,0.000000,0.000000,8.000000,0.000000,0.000000


In [5]:
# Initialization of featurizer (this time on the merged, cleaned feature frame)
descriptors = feat.prepare_to_featurize(df=df_merged)

# One hot encoding of categorical features
# OHE() returns the updated frame together with a list of column names; per the output
# below these are the generated one-hot columns (e.g. 'HOMO_character_ohe_d').
df_merged, categorical_feature = descriptors.OHE()

categorical_feature

No. of categorical features: 5


['HOMO_character_ohe_d',
 'HOMO_character_ohe_p',
 'HOMO_character_ohe_s',
 'HOMO_element_ohe_Ag',
 'HOMO_element_ohe_Al',
 'HOMO_element_ohe_As',
 'HOMO_element_ohe_Au',
 'HOMO_element_ohe_B',
 'HOMO_element_ohe_Bi',
 'HOMO_element_ohe_Br',
 'HOMO_element_ohe_C',
 'HOMO_element_ohe_Cd',
 'HOMO_element_ohe_Cl',
 'HOMO_element_ohe_Co',
 'HOMO_element_ohe_Cr',
 'HOMO_element_ohe_Cu',
 'HOMO_element_ohe_F',
 'HOMO_element_ohe_Fe',
 'HOMO_element_ohe_Ge',
 'HOMO_element_ohe_H',
 'HOMO_element_ohe_Hf',
 'HOMO_element_ohe_Hg',
 'HOMO_element_ohe_I',
 'HOMO_element_ohe_Ir',
 'HOMO_element_ohe_Mg',
 'HOMO_element_ohe_Mn',
 'HOMO_element_ohe_Mo',
 'HOMO_element_ohe_N',
 'HOMO_element_ohe_Nb',
 'HOMO_element_ohe_Ni',
 'HOMO_element_ohe_O',
 'HOMO_element_ohe_Os',
 'HOMO_element_ohe_P',
 'HOMO_element_ohe_Pb',
 'HOMO_element_ohe_Pd',
 'HOMO_element_ohe_Pt',
 'HOMO_element_ohe_Re',
 'HOMO_element_ohe_Rh',
 'HOMO_element_ohe_Ru',
 'HOMO_element_ohe_S',
 'HOMO_element_ohe_Sb',
 'HOMO_element_ohe_Se'

In [8]:
# Reload the original dataset to recover the property columns that are appended to the features
# NOTE(review): this reads 'database_dielectric_constant.pkl', whereas the first cell of the
# notebook reads 'dielectric_constant.pkl' — confirm both names refer to the same data.
df_original = joblib.load('database_dielectric_constant.pkl')
df_original = df_original[['nsites', 'space_group', 'volume', 'band_gap', 'n', 'poly_electronic', 'poly_total', 'pot_ferroelectric']]

# pd.concat(axis=1) aligns rows on the index and silently outer-joins on a mismatch,
# which would introduce NaN rows; fail loudly instead.
assert df_merged.index.equals(df_original.index), \
    "feature table and property table are not row-aligned"

# Concat dataframes
frames = [df_merged, df_original]

df_merged = pd.concat(frames, axis=1)
df_merged

Unnamed: 0,minimum_oxidation_state,maximum_oxidation_state,range_oxidation_state,std_dev_oxidation_state,compound_possible,max_ionic_char,avg_ionic_char,avg_anion_electron_affinity,minimum_EN_difference,maximum_EN_difference,...,crystal_system_ohe_triclinic,crystal_system_ohe_trigonal,nsites,space_group,volume,band_gap,n,poly_electronic,poly_total,pot_ferroelectric
0,-2,1,3,2.121320,1,0.336084,0.074685,-380400.000000,1.280000,1.280000,...,0,0,3,225,159.501208,1.88,1.86,3.44,6.23,False
1,-1,2,3,2.121320,1,0.417383,0.092752,-349000.000000,1.470000,1.470000,...,0,1,3,166,84.298097,3.52,1.78,3.16,6.73,False
2,-1,2,3,2.121320,1,0.265103,0.058912,-295200.000000,1.110000,1.110000,...,0,1,3,164,108.335875,1.17,2.23,4.97,10.64,False
3,-3,3,6,4.242641,1,0.609724,0.152431,-21000.000000,1.940000,1.940000,...,0,0,4,186,88.162562,1.12,2.65,7.04,17.99,False
4,-1,2,3,2.121320,1,0.771501,0.171445,-328000.000000,2.430000,2.430000,...,0,0,6,136,82.826401,2.87,1.53,2.35,7.12,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,-2,3,5,3.082207,1,0.168813,0.036355,-390000.000000,0.770000,0.860000,...,0,0,7,111,212.493121,0.87,2.77,7.67,11.76,True
1052,-1,3,4,2.190890,1,0.578906,0.105485,-240666.666667,1.606667,1.606667,...,0,0,8,194,220.041363,3.60,2.00,3.99,7.08,True
1053,-3,1,4,2.190890,1,0.248906,0.056586,-309600.000000,0.120000,1.070000,...,0,0,4,216,73.882306,0.14,14.58,212.61,232.60,True
1054,-2,1,3,1.690309,1,0.820234,0.168484,-252400.000000,2.170000,2.170000,...,0,0,5,221,177.269065,0.21,2.53,6.41,22.44,True


In [9]:
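# NOTE: inactive leftover kept for reference only. 'log10G_VRH' and 'log10K_VRH' do not
# appear among the property columns of this dataset (see the column list in the next cell).
# df_merged = descriptors.movecol(['log10G_VRH', 'log10K_VRH'], ref_col='crystal_system_ohe_trigonal')
# df_merged.head()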

In [10]:
# Show the trailing 8 columns, i.e. the property columns appended from the original dataset
list(df_merged.columns[-8:])

['nsites',
 'space_group',
 'volume',
 'band_gap',
 'n',
 'poly_electronic',
 'poly_total',
 'pot_ferroelectric']

In [11]:
# Every column except the trailing 8 property columns is treated as a feature
feature = list(df_merged.columns[:-8])
print(len(feature))

3974


In [12]:
# Persist the final table (features + property columns) and the list of feature column names
joblib.dump(df_merged, 'database_dielectric_constant_merged.pkl')
joblib.dump(feature, 'features.pkl')

['features.pkl']