# Compute Features for the Data
Use the basic feature set of [Ward et al.](https://www.nature.com/articles/npjcompumats201628)

In [1]:
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer
from matminer.utils.data import MagpieData
from pymatgen.core import Composition, Element
from pathlib import Path
import pandas as pd

  from tqdm.autonotebook import tqdm


Configuration

In [2]:
elem_feature_names = [
    "Number", "MendeleevNumber", "AtomicWeight", "MeltingT", "Column", "Row", "CovalentRadius", "Electronegativity", 
    "NsValence", "NpValence", "NdValence", "NfValence", "NValence", "NsUnfilled", "NpUnfilled", "NdUnfilled", "NfUnfilled", "NUnfilled",
    "GSvolume_pa", "GSbandgap", "GSmagmom", "SpaceGroupNumber"
]

## Load the Dataset and Featurize the Formula
Load a dataset, compute the features, save it as a new file

In [3]:
featurizer = MultipleFeaturizer([
      cf.Stoichiometry(),
      cf.ElementProperty.from_preset('magpie'),
      cf.ValenceOrbital(props=['frac']),
      cf.IonProperty(fast=True)
])

Load the adsorption dataset

In [4]:
data = pd.read_csv('adsorbates.csv')
data.query('site_charge > 0', inplace=True)  # Ignore unblaanced charges
print(f'Loaded {len(data)} adsorption energies for charge-balanced materials')
data.head()

Loaded 305 adsorption energies for charge-balanced materials


Unnamed: 0,comp,term,site,surface_comp,site_atom,site_charge,adsorption_energy
2,LaFeO3,0,mono_metal,LaO,La,3,0.398794
3,LaFeO3,0,pi_metal,LaO,La,3,0.285397
4,TbAlO3,0,mono_metal,TbO,Tb,3,16.352424
5,BiPdO3,0,mono_oxygen,BiO,O,4,0.464751
6,YbWO3,1,mono_metal,WO2,W,4,0.147383


Compute features for the bulk material

In [5]:
data['comp_obj'] = data['comp'].apply(Composition)
bulk_features = featurizer.featurize_dataframe(data, 'comp_obj')
bulk_features = bulk_features.iloc[:, len(data.columns):]  # Only get the new columns

MultipleFeaturizer: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 305/305 [00:00<00:00, 1529.71it/s]


Rename features to have the word "bulk" in front of them

In [6]:
bulk_features.rename(columns=lambda x: f'bulk_{x}', inplace=True)
bulk_features.head()

Unnamed: 0,bulk_0-norm,bulk_2-norm,bulk_3-norm,bulk_5-norm,bulk_7-norm,bulk_10-norm,bulk_MagpieData minimum Number,bulk_MagpieData maximum Number,bulk_MagpieData range Number,bulk_MagpieData mean Number,...,bulk_MagpieData mean SpaceGroupNumber,bulk_MagpieData avg_dev SpaceGroupNumber,bulk_MagpieData mode SpaceGroupNumber,bulk_frac s valence electrons,bulk_frac p valence electrons,bulk_frac d valence electrons,bulk_frac f valence electrons,bulk_compound possible,bulk_max ionic char,bulk_avg ionic char
2,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,57.0,49.0,21.4,...,91.8,95.76,12.0,0.344828,0.413793,0.241379,0.0,True,0.745613,0.151694
3,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,57.0,49.0,21.4,...,91.8,95.76,12.0,0.344828,0.413793,0.241379,0.0,True,0.745613,0.151694
4,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,65.0,57.0,20.4,...,91.0,94.8,12.0,0.3125,0.40625,0.0,0.28125,True,0.745613,0.160043
5,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,83.0,75.0,30.6,...,54.6,68.16,12.0,0.140351,0.263158,0.350877,0.245614,False,0.395951,0.086134
6,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,74.0,66.0,33.6,...,98.0,103.2,12.0,0.185185,0.222222,0.074074,0.518519,False,0.745613,0.132929


Compute features for the element on the surface

In [7]:
elem_features = pd.DataFrame()
elems = data['site_atom'].apply(Element)
magpie_data = MagpieData()
for e in elem_feature_names:
    elem_features[f'site_{e}'] = magpie_data.get_elemental_properties(elems, e)

In [8]:
elem_features.head()

Unnamed: 0,site_Number,site_MendeleevNumber,site_AtomicWeight,site_MeltingT,site_Column,site_Row,site_CovalentRadius,site_Electronegativity,site_NsValence,site_NpValence,...,site_NValence,site_NsUnfilled,site_NpUnfilled,site_NdUnfilled,site_NfUnfilled,site_NUnfilled,site_GSvolume_pa,site_GSbandgap,site_GSmagmom,site_SpaceGroupNumber
0,57.0,13.0,138.90547,1193.0,3.0,6.0,207.0,1.1,2.0,0.0,...,3.0,0.0,0.0,9.0,0.0,9.0,36.8975,0.0,0.0,194.0
1,57.0,13.0,138.90547,1193.0,3.0,6.0,207.0,1.1,2.0,0.0,...,3.0,0.0,0.0,9.0,0.0,9.0,36.8975,0.0,0.0,194.0
2,65.0,29.0,158.92535,1629.0,3.0,6.0,194.0,1.21,2.0,0.0,...,11.0,0.0,0.0,0.0,5.0,5.0,31.736667,0.0,0.0,194.0
3,8.0,87.0,15.9994,54.8,16.0,2.0,66.0,3.44,2.0,4.0,...,6.0,0.0,2.0,0.0,0.0,2.0,9.105,0.0,0.0,12.0
4,74.0,51.0,183.84,3695.0,6.0,6.0,162.0,2.36,2.0,0.0,...,20.0,0.0,0.0,6.0,0.0,6.0,16.05,0.0,0.0,229.0


Concatentate and save

In [9]:
pd.concat([data.drop(columns=['comp_obj']), bulk_features, elem_features], axis=1).to_csv('../datasets/adsorption-dataset.csv', index=False)