# Compute Features for the Data
Use the basic feature set of [Ward et al.](https://www.nature.com/articles/npjcompumats201628) to build a dataset for the O$_{2p}$ band center.

In [4]:
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer
from matminer.utils.data import MagpieData
from pymatgen.core import Composition, Element
from pathlib import Path
import pandas as pd

  from tqdm.autonotebook import tqdm


Configuration

In [5]:
# Names of elemental properties, kept in their original order.
# NOTE(review): presumably lookup keys for the imported MagpieData table; none of
# the visible cells read this list -- confirm it is still needed.
elem_feature_names = [
    "Number",
    "MendeleevNumber",
    "AtomicWeight",
    "MeltingT",
    "Column",
    "Row",
    "CovalentRadius",
    "Electronegativity",
    # N*Valence entries
    "NsValence",
    "NpValence",
    "NdValence",
    "NfValence",
    "NValence",
    # N*Unfilled entries
    "NsUnfilled",
    "NpUnfilled",
    "NdUnfilled",
    "NfUnfilled",
    "NUnfilled",
    # GS* entries and the space group number
    "GSvolume_pa",
    "GSbandgap",
    "GSmagmom",
    "SpaceGroupNumber",
]

## Load the Dataset and Featurize the Formula
Load the dataset, compute the features, and save the result as a new file.

In [6]:
# Bundle the composition-based featurizers into one object so that a single
# featurize call produces every feature column.
featurizer = MultipleFeaturizer([
    cf.Stoichiometry(),  # source of the "*-norm" columns
    cf.ElementProperty.from_preset('magpie'),  # source of the "MagpieData <stat> <property>" columns
    cf.ValenceOrbital(props=['frac']),  # source of the "frac <s|p|d|f> valence electrons" columns
    cf.IonProperty(fast=True),  # source of "compound possible" and the ionic-character columns
])

Load the band center dataset

In [7]:
# Read the band-center table from the current working directory
data = pd.read_csv('band-centers.csv')
# Report how many rows were loaded
print(f'Loaded a total of {len(data)} O2p energies')
# Preview the first rows of the table
data.head()

Loaded a total of 130 O2p energies


Unnamed: 0,name,comp,o2p_center,d_center,gap
0,Ce(Al7Tb1)O3,Al7Ce8TbO24,-7.660965,-13.407655,0.162368
1,(Sr7Pb1)MnO3,Mn8PbSr7O24,-6.699999,-7.38947,0.697309
2,Eu(Cr7Zr1)O3,Cr7Eu8ZrO24,-9.038988,-9.870905,0.361716
3,Cu(Ta7Sb1)O3,Cu8Ta7O24Sb,-6.51952,-6.1564,0.156627
4,CdZrO3,CdZrO3,-5.233573,-6.918262,1.296169


Compute features for the bulk material

In [8]:
# Parse each formula string into a pymatgen Composition, which is the column the
# featurizer is run on.
data['comp_obj'] = data['comp'].apply(Composition)

# Ask for a new frame explicitly rather than relying on the library default:
# the default for `inplace` has differed between matminer releases, and with
# inplace=True the feature columns would be added to `data` itself, so the
# positional slice below would come back empty.
bulk_features = featurizer.featurize_dataframe(data, 'comp_obj', inplace=False)
bulk_features = bulk_features.iloc[:, len(data.columns):]  # Only get the new columns

MultipleFeaturizer: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 130/130 [00:00<00:00, 759.48it/s]


Rename features to have the word "bulk" in front of them

In [9]:
# Prefix every feature column with "bulk_". The result is reassigned instead of
# renamed in place, and labels that already carry the prefix are left alone, so
# re-running this cell does not produce "bulk_bulk_..." columns.
bulk_features = bulk_features.rename(
    columns=lambda x: x if str(x).startswith('bulk_') else f'bulk_{x}'
)
bulk_features.head()

Unnamed: 0,bulk_0-norm,bulk_2-norm,bulk_3-norm,bulk_5-norm,bulk_7-norm,bulk_10-norm,bulk_MagpieData minimum Number,bulk_MagpieData maximum Number,bulk_MagpieData range Number,bulk_MagpieData mean Number,...,bulk_MagpieData mean SpaceGroupNumber,bulk_MagpieData avg_dev SpaceGroupNumber,bulk_MagpieData mode SpaceGroupNumber,bulk_frac s valence electrons,bulk_frac p valence electrons,bulk_frac d valence electrons,bulk_frac f valence electrons,bulk_compound possible,bulk_max ionic char,bulk_avg ionic char
0,4,0.656696,0.612137,0.600745,0.600055,0.600001,8.0,65.0,57.0,20.3,...,90.225,93.87,12.0,0.384615,0.495192,0.038462,0.081731,True,0.745613,0.161798
1,4,0.656696,0.612137,0.600745,0.600055,0.600001,8.0,82.0,74.0,18.5,...,95.6,100.32,12.0,0.330579,0.404959,0.206612,0.057851,True,0.787757,0.162936
2,4,0.656696,0.612137,0.600745,0.600055,0.600001,8.0,63.0,55.0,22.6,...,97.925,103.11,12.0,0.278626,0.366412,0.141221,0.21374,False,0.714753,0.15523
3,4,0.656696,0.612137,0.600745,0.600055,0.600001,8.0,73.0,65.0,24.65,...,96.425,101.31,12.0,0.189474,0.260526,0.292105,0.257895,True,0.609724,0.12516
4,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,48.0,40.0,22.4,...,84.8,87.36,12.0,0.294118,0.352941,0.352941,0.0,True,0.671436,0.146042


Concatenate and save

In [10]:
# Join the original table and the prefixed feature columns side by side, then
# write the combined table to the datasets directory without the index.
(
    pd.concat(
        [
            data.drop(columns=['comp_obj']),  # leave out the helper column of Composition objects
            bulk_features,
        ],
        axis=1,
    )
    .to_csv('../datasets/band-centers.csv', index=False)
)