### Load data

In [1]:
# Import packages
import pandas as pd
import joblib

In [2]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import create_material_objects as cmo

In [3]:
# Load dataset
df = joblib.load('refractive_index.pkl') 
df

Unnamed: 0,structure,n
0,"[[4.29304147 2.4785886 1.07248561] S, [4.2930...",1.752064
1,"[[3.95051434 4.51121437 0.28035002] K, [4.3099...",1.652859
2,"[[-1.78688104 4.79604117 1.53044621] Rb, [-1...",1.867858
3,"[[4.51438064 4.51438064 0. ] Mn, [0.133...",2.676887
4,"[[-4.36731958 6.8886097 0.50929706] Li, [-2...",1.793232
...,...,...
4759,"[[ 2.79280881 0.12499663 -1.84045389] Ca, [-2...",2.136837
4760,"[[0. 5.50363806 3.84192106] O, [4.7662...",2.690619
4761,"[[0. 0. 0.] Ba, [ 0.23821924 4.32393487 -0.35...",2.811494
4762,"[[0. 0.18884638 0. ] K, [0. ...",1.832887


In [4]:
# Initialize to create/convert material object (pymatgen)
converter = cmo.material_object(df=df)

# Possible functions
# Create Composition object from chemical formula
# df = converter.formula2composition(formula_col='formula')

# Create Composition object from Structure object
# df = converter.structure2composition(structure_col='structure')

# Create Oxidation Structure object from Structure object
# df = converter.structure2oxidstructure(structure_col='structure')

# Create Oxidation Composition object from Composition object
# df = converter.composition2oxidcomposition(composition_col='composition')

# Source Structure Object from MP database using Composition object 
# df = converter.composition2structure_fromMP(composition_col='composition', mapi_key='lZCh9ke4qRxMQO16')

In [5]:
# Create Composition object from chemical formula
df = converter.structure2composition(structure_col='structure')
df

StructureToComposition:   0%|          | 0/4764 [00:00<?, ?it/s]

Unnamed: 0,structure,n,composition
0,"[[4.29304147 2.4785886 1.07248561] S, [4.2930...",1.752064,"(S, K)"
1,"[[3.95051434 4.51121437 0.28035002] K, [4.3099...",1.652859,"(K, V, O)"
2,"[[-1.78688104 4.79604117 1.53044621] Rb, [-1...",1.867858,"(Rb, Zr, O)"
3,"[[4.51438064 4.51438064 0. ] Mn, [0.133...",2.676887,"(Mn, O, F)"
4,"[[-4.36731958 6.8886097 0.50929706] Li, [-2...",1.793232,"(Li, Co, Si, O)"
...,...,...,...
4759,"[[ 2.79280881 0.12499663 -1.84045389] Ca, [-2...",2.136837,"(Ca, Fe, W, O)"
4760,"[[0. 5.50363806 3.84192106] O, [4.7662...",2.690619,"(O, S, Mn, La)"
4761,"[[0. 0. 0.] Ba, [ 0.23821924 4.32393487 -0.35...",2.811494,"(Ba, Ag, Ge, Se)"
4762,"[[0. 0.18884638 0. ] K, [0. ...",1.832887,"(K, Zn, H, I, O)"


In [6]:
# # Optional - Create Oxidation Structure object from Structure object
# # Needed for generate_oxid_structural_features function, which has the ElectronicRadialDistributionFunction

# df = converter.structure2oxidstructure(structure_col='structure') 
# df

In [7]:
# # Optional - Create Oxidation Composition object from Composition object
# # Create Oxidation Composition object from Composition object
# df = converter.composition2oxidcomposition(composition_col='composition')
# df

In [8]:
joblib.dump(df, 'refractive_index_material_objects.pkl')

['refractive_index_material_objects.pkl']

#### Generate Features

##### (1) Composition

In [1]:
# Import packages
import pandas as pd
import joblib

In [2]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [3]:
# Load Materials object file
df_obj = joblib.load('refractive_index_material_objects.pkl')
df_obj

Unnamed: 0,structure,n,composition
0,"[[4.29304147 2.4785886 1.07248561] S, [4.2930...",1.752064,"(S, K)"
1,"[[3.95051434 4.51121437 0.28035002] K, [4.3099...",1.652859,"(K, V, O)"
2,"[[-1.78688104 4.79604117 1.53044621] Rb, [-1...",1.867858,"(Rb, Zr, O)"
3,"[[4.51438064 4.51438064 0. ] Mn, [0.133...",2.676887,"(Mn, O, F)"
4,"[[-4.36731958 6.8886097 0.50929706] Li, [-2...",1.793232,"(Li, Co, Si, O)"
...,...,...,...
4759,"[[ 2.79280881 0.12499663 -1.84045389] Ca, [-2...",2.136837,"(Ca, Fe, W, O)"
4760,"[[0. 5.50363806 3.84192106] O, [4.7662...",2.690619,"(O, S, Mn, La)"
4761,"[[0. 0. 0.] Ba, [ 0.23821924 4.32393487 -0.35...",2.811494,"(Ba, Ag, Ge, Se)"
4762,"[[0. 0.18884638 0. ] K, [0. ...",1.832887,"(K, Zn, H, I, O)"


In [4]:
# Initialise featurizers script
descriptors = feat.prepare_to_featurize(df=df_obj)

In [5]:
# Features based on oxidation composition
df_com = descriptors.generate_oxid_composition_features_with_composition()
df_com

# # Features based on oxidation composition
# df_com = descriptors.generate_oxid_composition_features()
# df_com

MultipleFeaturizer:   0%|          | 0/4764 [00:00<?, ?it/s]

Total no. of features generated: 13


Unnamed: 0,structure,n,composition,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,max ionic char,avg ionic char,avg anion electron affinity,minimum EN difference,maximum EN difference,range EN difference,mean EN difference,std_dev EN difference
0,"[[4.29304147 2.4785886 1.07248561] S, [4.2930...",1.752064,"(S, K)",0.0,0.0,0.0,0.0,True,0.539020,0.134755,0.0,0.0,0.0,0.0,0.0,0.0
1,"[[3.95051434 4.51121437 0.28035002] K, [4.3099...",1.652859,"(K, V, O)",0.0,0.0,0.0,0.0,True,0.820234,0.195831,0.0,0.0,0.0,0.0,0.0,0.0
2,"[[-1.78688104 4.79604117 1.53044621] Rb, [-1...",1.867858,"(Rb, Zr, O)",0.0,0.0,0.0,0.0,True,0.820234,0.196156,0.0,0.0,0.0,0.0,0.0,0.0
3,"[[4.51438064 4.51438064 0. ] Mn, [0.133...",2.676887,"(Mn, O, F)",0.0,0.0,0.0,0.0,True,0.771501,0.159155,0.0,0.0,0.0,0.0,0.0,0.0
4,"[[-4.36731958 6.8886097 0.50929706] Li, [-2...",1.793232,"(Li, Co, Si, O)",0.0,0.0,0.0,0.0,True,0.779730,0.165597,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4759,"[[ 2.79280881 0.12499663 -1.84045389] Ca, [-2...",2.136837,"(Ca, Fe, W, O)",0.0,0.0,0.0,0.0,True,0.774266,0.147950,0.0,0.0,0.0,0.0,0.0,0.0
4760,"[[0. 5.50363806 3.84192106] O, [4.7662...",2.690619,"(O, S, Mn, La)",0.0,0.0,0.0,0.0,True,0.745613,0.113202,0.0,0.0,0.0,0.0,0.0,0.0
4761,"[[0. 0. 0.] Ba, [ 0.23821924 4.32393487 -0.35...",2.811494,"(Ba, Ag, Ge, Se)",0.0,0.0,0.0,0.0,True,0.497872,0.058624,0.0,0.0,0.0,0.0,0.0,0.0
4762,"[[0. 0.18884638 0. ] K, [0. ...",1.832887,"(K, Zn, H, I, O)",0.0,0.0,0.0,0.0,True,0.820234,0.106219,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Features based on composition
df_com = descriptors.generate_composition_features()
df_com

MultipleFeaturizer:   0%|          | 0/4764 [00:00<?, ?it/s]

Total no. of features generated: 610


Unnamed: 0,structure,n,composition,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,max ionic char,avg ionic char,...,frac p valence electrons,frac d valence electrons,frac d valence electrons.1,frac f valence electrons,frac f valence electrons.1,mean simul. packing efficiency,mean abs simul. packing efficiency,dist from 1 clusters |APE| < 0.010,dist from 3 clusters |APE| < 0.010,dist from 5 clusters |APE| < 0.010
0,"[[4.29304147 2.4785886 1.07248561] S, [4.2930...",1.752064,"(S, K)",0.0,0.0,0.0,0.0,True,0.539020,0.134755,...,0.571429,0.000000,0.000000,0.000000,0.000000,0.004837,0.004837,0.033672,0.088653,0.156901
1,"[[3.95051434 4.51121437 0.28035002] K, [4.3099...",1.652859,"(K, V, O)",0.0,0.0,0.0,0.0,True,0.820234,0.195831,...,0.500000,0.093750,0.093750,0.000000,0.000000,-0.034586,0.058377,0.054006,0.069793,0.085433
2,"[[-1.78688104 4.79604117 1.53044621] Rb, [-1...",1.867858,"(Rb, Zr, O)",0.0,0.0,0.0,0.0,True,0.820234,0.196156,...,0.500000,0.083333,0.083333,0.000000,0.000000,0.066999,0.066999,0.058926,0.085031,0.099957
3,"[[4.51438064 4.51438064 0. ] Mn, [0.133...",2.676887,"(Mn, O, F)",0.0,0.0,0.0,0.0,True,0.771501,0.159155,...,0.450000,0.250000,0.250000,0.000000,0.000000,0.000000,0.000000,0.408248,0.457062,0.482942
4,"[[-4.36731958 6.8886097 0.50929706] Li, [-2...",1.793232,"(Li, Co, Si, O)",0.0,0.0,0.0,0.0,True,0.779730,0.165597,...,0.461538,0.179487,0.179487,0.000000,0.000000,0.018185,0.026468,0.000000,0.028139,0.040454
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4759,"[[ 2.79280881 0.12499663 -1.84045389] Ca, [-2...",2.136837,"(Ca, Fe, W, O)",0.0,0.0,0.0,0.0,True,0.774266,0.147950,...,0.352941,0.147059,0.147059,0.205882,0.205882,0.022140,0.034476,0.038490,0.049613,0.058052
4760,"[[0. 5.50363806 3.84192106] O, [4.7662...",2.690619,"(O, S, Mn, La)",0.0,0.0,0.0,0.0,True,0.745613,0.113202,...,0.459016,0.147541,0.147541,0.000000,0.000000,-0.011112,0.012945,0.018557,0.048436,0.057746
4761,"[[0. 0. 0.] Ba, [ 0.23821924 4.32393487 -0.35...",2.811494,"(Ba, Ag, Ge, Se)",0.0,0.0,0.0,0.0,True,0.497872,0.058624,...,0.176471,0.686275,0.686275,0.000000,0.000000,-0.006409,0.020870,0.036742,0.044569,0.050312
4762,"[[0. 0.18884638 0. ] K, [0. ...",1.832887,"(K, Zn, H, I, O)",0.0,0.0,0.0,0.0,True,0.820234,0.106219,...,0.447059,0.294118,0.294118,0.000000,0.000000,0.000000,0.000000,0.297993,0.299267,0.302284


In [7]:
joblib.dump(df_com, 'refractive_index_composition.pkl')

['refractive_index_composition.pkl']

##### (2) Structural 

In [8]:
# Import packages
import pandas as pd
import joblib

In [9]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [10]:
# Load Materials object file
df_obj = joblib.load('refractive_index_material_objects.pkl')
df_obj

Unnamed: 0,structure,n,composition
0,"[[4.29304147 2.4785886 1.07248561] S, [4.2930...",1.752064,"(S, K)"
1,"[[3.95051434 4.51121437 0.28035002] K, [4.3099...",1.652859,"(K, V, O)"
2,"[[-1.78688104 4.79604117 1.53044621] Rb, [-1...",1.867858,"(Rb, Zr, O)"
3,"[[4.51438064 4.51438064 0. ] Mn, [0.133...",2.676887,"(Mn, O, F)"
4,"[[-4.36731958 6.8886097 0.50929706] Li, [-2...",1.793232,"(Li, Co, Si, O)"
...,...,...,...
4759,"[[ 2.79280881 0.12499663 -1.84045389] Ca, [-2...",2.136837,"(Ca, Fe, W, O)"
4760,"[[0. 5.50363806 3.84192106] O, [4.7662...",2.690619,"(O, S, Mn, La)"
4761,"[[0. 0. 0.] Ba, [ 0.23821924 4.32393487 -0.35...",2.811494,"(Ba, Ag, Ge, Se)"
4762,"[[0. 0.18884638 0. ] K, [0. ...",1.832887,"(K, Zn, H, I, O)"


In [11]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_obj)

In [12]:
# # Features based on oxidation structure
# df_struc = descriptors.generate_oxid_structural_features()
# df_struc

In [13]:
# Features based on structure
df_struc = descriptors.generate_structural_features()
df_struc

MultipleFeaturizer:   0%|          | 0/4764 [00:00<?, ?it/s]

Total no. of features generated: 1282


Unnamed: 0,structure,n,composition,OFM: s^1 - s^1,OFM: s^1 - s^2,OFM: s^1 - p^1,OFM: s^1 - p^2,OFM: s^1 - p^3,OFM: s^1 - p^4,OFM: s^1 - p^5,...,maximum local difference in SpaceGroupNumber,range local difference in SpaceGroupNumber,mean local difference in SpaceGroupNumber,avg_dev local difference in SpaceGroupNumber,spacegroup_num,crystal_system,crystal_system_int,is_centrosymmetric,n_symmetry_ops,dimensionality
0,"[[4.29304147 2.4785886 1.07248561] S, [4.2930...",1.752064,"(S, K)",0.131072,0.264416,0.0,0.000000,0.0,0.264416,0.000000,...,123.863273,17.298038,115.403982,8.416219,189,hexagonal,2,False,12,3
1,"[[3.95051434 4.51121437 0.28035002] K, [4.3099...",1.652859,"(K, V, O)",0.063095,0.230404,0.0,0.000000,0.0,0.228522,0.000000,...,207.991802,50.605819,180.243358,10.819786,198,cubic,1,False,12,3
2,"[[-1.78688104 4.79604117 1.53044621] Rb, [-1...",1.867858,"(Rb, Zr, O)",0.067217,0.223552,0.0,0.000000,0.0,0.220774,0.000000,...,177.930446,13.839708,169.535819,4.653116,36,orthorhombic,5,False,4,3
3,"[[4.51438064 4.51438064 0. ] Mn, [0.133...",2.676887,"(Mn, O, F)",0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,203.624127,125.863849,125.488061,52.090711,92,tetragonal,4,False,4,3
4,"[[-4.36731958 6.8886097 0.50929706] Li, [-2...",1.793232,"(Li, Co, Si, O)",0.014272,0.253695,0.0,0.002138,0.0,0.235609,0.000000,...,199.607238,50.671573,172.426886,8.234036,5,monoclinic,6,False,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4759,"[[ 2.79280881 0.12499663 -1.84045389] Ca, [-2...",2.136837,"(Ca, Fe, W, O)",0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,216.943731,118.865526,161.997460,41.646915,1,triclinic,7,False,1,3
4760,"[[0. 5.50363806 3.84192106] O, [4.7662...",2.690619,"(O, S, Mn, La)",0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,...,146.295768,71.066670,109.283663,24.252291,186,hexagonal,2,False,1,3
4761,"[[0. 0. 0.] Ba, [ 0.23821924 4.32393487 -0.35...",2.811494,"(Ba, Ag, Ge, Se)",0.019294,0.189824,0.0,0.001798,0.0,0.169136,0.000000,...,204.386277,36.905592,179.671609,6.981151,23,orthorhombic,5,False,4,3
4762,"[[0. 0.18884638 0. ] K, [0. ...",1.832887,"(K, Zn, H, I, O)",0.021919,0.225346,0.0,0.000000,0.0,0.221278,0.004068,...,209.984563,154.122943,108.903067,38.762922,5,monoclinic,6,False,2,3


In [14]:
joblib.dump(df_struc, 'refractive_index_structural.pkl')

['refractive_index_structural.pkl']

In [15]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_obj)

# Features based on JarvisCFID
df_Jarvis = descriptors.generate_JarvisCFID_features()

In [16]:
joblib.dump(df_struc, 'refractive_index_structural_jarviscfid.pkl')

##### (3) Extra Features

In [1]:
# Import packages
import pandas as pd
import joblib

In [2]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [3]:
# Load Materials object file
df_obj = joblib.load('refractive_index_material_objects.pkl')
df_obj

Unnamed: 0,structure,n,composition
0,"[[4.29304147 2.4785886 1.07248561] S, [4.2930...",1.752064,"(S, K)"
1,"[[3.95051434 4.51121437 0.28035002] K, [4.3099...",1.652859,"(K, V, O)"
2,"[[-1.78688104 4.79604117 1.53044621] Rb, [-1...",1.867858,"(Rb, Zr, O)"
3,"[[4.51438064 4.51438064 0. ] Mn, [0.133...",2.676887,"(Mn, O, F)"
4,"[[-4.36731958 6.8886097 0.50929706] Li, [-2...",1.793232,"(Li, Co, Si, O)"
...,...,...,...
4759,"[[ 2.79280881 0.12499663 -1.84045389] Ca, [-2...",2.136837,"(Ca, Fe, W, O)"
4760,"[[0. 5.50363806 3.84192106] O, [4.7662...",2.690619,"(O, S, Mn, La)"
4761,"[[0. 0. 0.] Ba, [ 0.23821924 4.32393487 -0.35...",2.811494,"(Ba, Ag, Ge, Se)"
4762,"[[0. 0.18884638 0. ] K, [0. ...",1.832887,"(K, Zn, H, I, O)"


In [5]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_obj)

df_custom = descriptors.custom_features(composition_col_exist=True)
df_custom

Unnamed: 0,structure,n,composition,weight,total_e,avg_electroneg,noble_gas,transition_metal,post_transition_metal,rare_earth_metal,...,alkaline,halogen,chalcogen,lanthanoid,actinoid,quadrupolar,s-block,p-block,d-block,f-block
0,"[[4.29304147 2.4785886 1.07248561] S, [4.2930...",1.752064,"(S, K)",426.979800,210.0,1.70000,0,0,0,0,...,0,0,1,0,0,1,1,1,0,0
1,"[[3.95051434 4.51121437 0.28035002] K, [4.3099...",1.652859,"(K, V, O)",928.936000,448.0,2.23125,0,1,0,0,...,0,0,1,0,0,1,1,1,1,0
2,"[[-1.78688104 4.79604117 1.53044621] Rb, [-1...",1.867858,"(Rb, Zr, O)",620.315600,276.0,2.21500,0,1,0,0,...,0,0,1,0,0,1,1,1,1,0
3,"[[4.51438064 4.51438064 0. ] Mn, [0.133...",2.676887,"(Mn, O, F)",359.743393,168.0,2.99000,0,1,0,0,...,0,1,1,0,0,1,0,1,1,0
4,"[[-4.36731958 6.8886097 0.50929706] Li, [-2...",1.793232,"(Li, Co, Si, O)",164.898295,79.0,2.43750,0,1,0,0,...,0,0,1,0,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4759,"[[ 2.79280881 0.12499663 -1.84045389] Ca, [-2...",2.136837,"(Ca, Fe, W, O)",415.837400,188.0,2.68300,0,1,0,0,...,1,0,1,0,0,1,1,1,1,0
4760,"[[0. 5.50363806 3.84192106] O, [4.7662...",2.690619,"(O, S, Mn, La)",1637.898650,714.0,2.07250,0,1,0,1,...,0,0,1,1,0,1,0,1,1,1
4761,"[[0. 0. 0.] Ba, [ 0.23821924 4.32393487 -0.35...",2.811494,"(Ba, Ag, Ge, Se)",741.543400,318.0,2.12000,0,1,0,0,...,1,0,1,0,0,0,1,1,1,0
4762,"[[0. 0.18884638 0. ] K, [0. ...",1.832887,"(K, Zn, H, I, O)",879.246840,396.0,2.83560,0,1,0,0,...,0,1,1,0,0,1,1,1,1,0


In [6]:
joblib.dump(df_custom, 'refractive_index_additional_features.pkl')

['refractive_index_additional_features.pkl']

#### Merge results

In [1]:
# Import packages
import pandas as pd
import joblib
import re

In [2]:
# Import custom packages
import sys
sys.path.insert(1, 'C:/Users/Songy/OneDrive/Documents/GBSFS4MPP_local/new_properties')

import featurizers as feat

In [3]:
# Load Materials object file
df1 = joblib.load('refractive_index_composition.pkl')
df2 = joblib.load('refractive_index_structural.pkl')
df3 = joblib.load('refractive_index_structural_jarviscfid.pkl')
df4 = joblib.load('refractive_index_additional_features.pkl')

# Concat dataframes
frames = [df1, df2, df3, df4]

df_merged = pd.concat(frames, axis=1)
df_merged.head()

Unnamed: 0,structure,n,composition,minimum oxidation state,maximum oxidation state,range oxidation state,std_dev oxidation state,compound possible,max ionic char,avg ionic char,...,alkaline,halogen,chalcogen,lanthanoid,actinoid,quadrupolar,s-block,p-block,d-block,f-block
0,"[[4.29304147 2.4785886 1.07248561] S, [4.2930...",1.752064,"(S, K)",0.0,0.0,0.0,0.0,True,0.53902,0.134755,...,0,0,1,0,0,1,1,1,0,0
1,"[[3.95051434 4.51121437 0.28035002] K, [4.3099...",1.652859,"(K, V, O)",0.0,0.0,0.0,0.0,True,0.820234,0.195831,...,0,0,1,0,0,1,1,1,1,0
2,"[[-1.78688104 4.79604117 1.53044621] Rb, [-1...",1.867858,"(Rb, Zr, O)",0.0,0.0,0.0,0.0,True,0.820234,0.196156,...,0,0,1,0,0,1,1,1,1,0
3,"[[4.51438064 4.51438064 0. ] Mn, [0.133...",2.676887,"(Mn, O, F)",0.0,0.0,0.0,0.0,True,0.771501,0.159155,...,0,1,1,0,0,1,0,1,1,0
4,"[[-4.36731958 6.8886097 0.50929706] Li, [-2...",1.793232,"(Li, Co, Si, O)",0.0,0.0,0.0,0.0,True,0.77973,0.165597,...,0,0,1,0,0,1,1,1,1,0


In [5]:
# Remove duplications
df_merged  = df_merged.loc[:,~df_merged.columns.duplicated()]

# Remove space and JSON character in the column
df_merged.columns = df_merged.columns.str.replace(' ','_')
df_merged = df_merged.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Col to drop
# cols = ['formula', 'structure', 'composition', 'oxidation_structure', 'oxidation_composition']
cols = ['structure', 'composition']

df_merged = df_merged.drop(cols, axis=1)

# False and True -> 0 and 1
df_merged.replace({False: 0, True: 1}, inplace=True)
df_merged

Unnamed: 0,n,minimum_oxidation_state,maximum_oxidation_state,range_oxidation_state,std_dev_oxidation_state,compound_possible,max_ionic_char,avg_ionic_char,avg_anion_electron_affinity,minimum_EN_difference,...,alkaline,halogen,chalcogen,lanthanoid,actinoid,quadrupolar,sblock,pblock,dblock,fblock
0,1.752064,0.0,0.0,0.0,0.0,1,0.539020,0.134755,0.0,0.0,...,0,0,1,0,0,1,1,1,0,0
1,1.652859,0.0,0.0,0.0,0.0,1,0.820234,0.195831,0.0,0.0,...,0,0,1,0,0,1,1,1,1,0
2,1.867858,0.0,0.0,0.0,0.0,1,0.820234,0.196156,0.0,0.0,...,0,0,1,0,0,1,1,1,1,0
3,2.676887,0.0,0.0,0.0,0.0,1,0.771501,0.159155,0.0,0.0,...,0,1,1,0,0,1,0,1,1,0
4,1.793232,0.0,0.0,0.0,0.0,1,0.779730,0.165597,0.0,0.0,...,0,0,1,0,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4759,2.136837,0.0,0.0,0.0,0.0,1,0.774266,0.147950,0.0,0.0,...,1,0,1,0,0,1,1,1,1,0
4760,2.690619,0.0,0.0,0.0,0.0,1,0.745613,0.113202,0.0,0.0,...,0,0,1,1,0,1,0,1,1,1
4761,2.811494,0.0,0.0,0.0,0.0,1,0.497872,0.058624,0.0,0.0,...,1,0,1,0,0,0,1,1,1,0
4762,1.832887,0.0,0.0,0.0,0.0,1,0.820234,0.106219,0.0,0.0,...,0,1,1,0,0,1,1,1,1,0


In [6]:
# Initialization of featurizer
descriptors = feat.prepare_to_featurize(df=df_merged)

# One hot encoding of categorical features
df_merged, categorical_feature = descriptors.OHE()

categorical_feature

No. of categorical features: 5


['HOMO_character_ohe_0',
 'HOMO_character_ohe_d',
 'HOMO_character_ohe_f',
 'HOMO_character_ohe_p',
 'HOMO_character_ohe_s',
 'HOMO_element_ohe_0',
 'HOMO_element_ohe_Ag',
 'HOMO_element_ohe_Al',
 'HOMO_element_ohe_As',
 'HOMO_element_ohe_Au',
 'HOMO_element_ohe_B',
 'HOMO_element_ohe_Ba',
 'HOMO_element_ohe_Be',
 'HOMO_element_ohe_Bi',
 'HOMO_element_ohe_Br',
 'HOMO_element_ohe_C',
 'HOMO_element_ohe_Ca',
 'HOMO_element_ohe_Cd',
 'HOMO_element_ohe_Ce',
 'HOMO_element_ohe_Cl',
 'HOMO_element_ohe_Co',
 'HOMO_element_ohe_Cr',
 'HOMO_element_ohe_Cu',
 'HOMO_element_ohe_Dy',
 'HOMO_element_ohe_Er',
 'HOMO_element_ohe_Eu',
 'HOMO_element_ohe_F',
 'HOMO_element_ohe_Fe',
 'HOMO_element_ohe_Ga',
 'HOMO_element_ohe_Ge',
 'HOMO_element_ohe_H',
 'HOMO_element_ohe_Hf',
 'HOMO_element_ohe_Hg',
 'HOMO_element_ohe_Ho',
 'HOMO_element_ohe_I',
 'HOMO_element_ohe_In',
 'HOMO_element_ohe_Ir',
 'HOMO_element_ohe_La',
 'HOMO_element_ohe_Mg',
 'HOMO_element_ohe_Mn',
 'HOMO_element_ohe_Mo',
 'HOMO_element_oh

In [7]:
df_merged = descriptors.movecol(['n'], ref_col='crystal_system_ohe_trigonal')
df_merged.head()

Unnamed: 0,minimum_oxidation_state,maximum_oxidation_state,range_oxidation_state,std_dev_oxidation_state,compound_possible,max_ionic_char,avg_ionic_char,avg_anion_electron_affinity,minimum_EN_difference,maximum_EN_difference,...,LUMO_element_ohe_Zn,LUMO_element_ohe_Zr,crystal_system_ohe_cubic,crystal_system_ohe_hexagonal,crystal_system_ohe_monoclinic,crystal_system_ohe_orthorhombic,crystal_system_ohe_tetragonal,crystal_system_ohe_triclinic,crystal_system_ohe_trigonal,n
0,0.0,0.0,0.0,0.0,1,0.53902,0.134755,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,1.752064
1,0.0,0.0,0.0,0.0,1,0.820234,0.195831,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1.652859
2,0.0,0.0,0.0,0.0,1,0.820234,0.196156,0.0,0.0,0.0,...,0,1,0,0,0,1,0,0,0,1.867858
3,0.0,0.0,0.0,0.0,1,0.771501,0.159155,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,2.676887
4,0.0,0.0,0.0,0.0,1,0.77973,0.165597,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,1.793232


In [8]:
feature = df_merged.columns[:-1].tolist()
print(len(feature))

3638


In [9]:
df_merged.columns[-1:].tolist()

['n']

In [10]:
joblib.dump(df_merged, 'refractive_index_merged.pkl')
joblib.dump(feature, 'features.pkl')

['features.pkl']