# Compute Features for the Data
Use the basic feature set of [Ward et al.](https://www.nature.com/articles/npjcompumats201628)

In [14]:
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer
from matminer.utils.data import MagpieData
from pymatgen.core import Composition, Element
from pathlib import Path
import pandas as pd

Configuration

In [15]:
# Names of the per-element properties looked up for the surface-site atom.
# Each name is later used as a property key and as the suffix of a `site_*` column.
elem_feature_names = [
    # General elemental properties
    "Number",
    "MendeleevNumber",
    "AtomicWeight",
    "MeltingT",
    "Column",
    "Row",
    "CovalentRadius",
    "Electronegativity",
    # `N*Valence` properties
    "NsValence",
    "NpValence",
    "NdValence",
    "NfValence",
    "NValence",
    # `N*Unfilled` properties
    "NsUnfilled",
    "NpUnfilled",
    "NdUnfilled",
    "NfUnfilled",
    "NUnfilled",
    # `GS*` properties and space group number
    "GSvolume_pa",
    "GSbandgap",
    "GSmagmom",
    "SpaceGroupNumber",
]

## Load the Dataset and Featurize the Formula
Load a dataset, compute the features, save it as a new file

In [16]:
# Composition featurizers applied to the bulk formula, in the order their
# columns will appear in the featurized table.
bulk_featurizers = [
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset('magpie'),
    cf.ValenceOrbital(props=['frac']),
    cf.IonProperty(fast=True),
]

# Wrap them so a single call computes every feature set at once.
featurizer = MultipleFeaturizer(bulk_featurizers)

Load the adsorption dataset

In [17]:
# Read every adsorption-energy record from the CSV in the working directory.
data = pd.read_csv('adsorbates.csv')
print(f'Loaded a total of {len(data)} adsorption energies')

# Ignore unbalanced charges: keep only rows with a positive site charge.
# `.copy()` gives an independent frame so later column assignments are safe.
data = data.query('site_charge > 0').copy()
print(f'Loaded {len(data)} adsorption energies for charge-balanced materials')

# Preview the first rows of the filtered table.
data.head()

Loaded a total of 1825 adsorption energies
Loaded 1599 adsorption energies for charge-balanced materials


Unnamed: 0,comp,term,site,surface_comp,site_atom,site_charge,a,b,a_val,b_val,supercell,adsorption_energy,gap,o2p_center,d_center,A_charge,B_charge,O_charge
0,NdRhO3,1,mono_metal,RhO2,Rh,4,Nd,Rh,2,4,2,0.689499,,,,,,
1,NdRhO3,1,pi_oxygen,RhO2,Rh,4,Nd,Rh,2,4,2,0.187609,,,,,,
2,NdRhO3,0,mono_metal,NdO,Nd,2,Nd,Rh,2,4,2,-0.506608,,,,,,
3,NdRhO3,0,pi_metal,NdO,Nd,2,Nd,Rh,2,4,2,0.149078,,,,,,
4,CaGeO3,1,pi_metal,GeO2,Ge,4,Ca,Ge,2,4,2,0.07724,,,,,,


Compute features for the bulk material

In [18]:
# Parse each formula string in `comp` into a pymatgen Composition object.
data['comp_obj'] = data['comp'].apply(Composition)

# Featurize the compositions; the result holds the input columns followed by
# the newly computed feature columns.
featurized = featurizer.featurize_dataframe(data, 'comp_obj')

# Only get the new columns: skip as many leading columns as `data` has.
first_new_column = len(data.columns)
bulk_features = featurized.iloc[:, first_new_column:]

MultipleFeaturizer: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1599/1599 [00:00<00:00, 4557.26it/s]


Rename features to have the word "bulk" in front of them

In [19]:
# Prefix every bulk-composition feature with "bulk_" so these columns can be
# told apart from the site-element features.
# Columns that already carry the prefix are left untouched, so re-running this
# cell does not produce "bulk_bulk_..." names. Reassigning (rather than
# renaming in place) keeps the step free of hidden in-place mutation.
bulk_features = bulk_features.rename(
    columns=lambda x: x if str(x).startswith('bulk_') else f'bulk_{x}'
)
bulk_features.head()

Unnamed: 0,bulk_0-norm,bulk_2-norm,bulk_3-norm,bulk_5-norm,bulk_7-norm,bulk_10-norm,bulk_MagpieData minimum Number,bulk_MagpieData maximum Number,bulk_MagpieData range Number,bulk_MagpieData mean Number,...,bulk_MagpieData mean SpaceGroupNumber,bulk_MagpieData avg_dev SpaceGroupNumber,bulk_MagpieData mode SpaceGroupNumber,bulk_frac s valence electrons,bulk_frac p valence electrons,bulk_frac d valence electrons,bulk_frac f valence electrons,bulk_compound possible,bulk_max ionic char,bulk_avg ionic char
0,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,60.0,52.0,25.8,...,91.0,94.8,12.0,0.272727,0.363636,0.242424,0.121212,True,0.733532,0.133399
1,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,60.0,52.0,25.8,...,91.0,94.8,12.0,0.272727,0.363636,0.242424,0.121212,True,0.733532,0.133399
2,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,60.0,52.0,25.8,...,91.0,94.8,12.0,0.272727,0.363636,0.242424,0.121212,True,0.733532,0.133399
3,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,60.0,52.0,25.8,...,91.0,94.8,12.0,0.272727,0.363636,0.242424,0.121212,True,0.733532,0.133399
4,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,32.0,24.0,15.2,...,97.2,102.24,12.0,0.294118,0.411765,0.294118,0.0,True,0.774266,0.149945


Compute features for the element on the surface

In [20]:
# Convert the symbol of the atom at the adsorption site into an Element object.
elems = data['site_atom'].apply(Element)
magpie_data = MagpieData()

# Look up every configured elemental property for the site atom of each row,
# storing it under a `site_<property>` column.
# The frame is pinned to `data`'s index explicitly: assigning un-indexed values
# into an empty DataFrame would create a fresh 0..n-1 index, which no longer
# matches `data` after the `site_charge` filter and would misalign rows (and
# introduce NaN rows) in the later `pd.concat(..., axis=1)`.
elem_features = pd.DataFrame(
    {
        f'site_{name}': magpie_data.get_elemental_properties(elems, name)
        for name in elem_feature_names
    },
    index=elems.index,
)

In [21]:
# Preview the first five rows of the site-element features.
elem_features.head(n=5)

Unnamed: 0,site_Number,site_MendeleevNumber,site_AtomicWeight,site_MeltingT,site_Column,site_Row,site_CovalentRadius,site_Electronegativity,site_NsValence,site_NpValence,...,site_NValence,site_NsUnfilled,site_NpUnfilled,site_NdUnfilled,site_NfUnfilled,site_NUnfilled,site_GSvolume_pa,site_GSbandgap,site_GSmagmom,site_SpaceGroupNumber
0,45.0,59.0,102.9055,2237.0,9.0,5.0,142.0,2.28,1.0,0.0,...,9.0,1.0,0.0,2.0,0.0,3.0,13.64,0.0,0.0,225.0
1,45.0,59.0,102.9055,2237.0,9.0,5.0,142.0,2.28,1.0,0.0,...,9.0,1.0,0.0,2.0,0.0,3.0,13.64,0.0,0.0,225.0
2,60.0,19.0,144.242,1294.0,3.0,6.0,201.0,1.14,2.0,0.0,...,6.0,0.0,0.0,0.0,10.0,10.0,34.81,0.0,0.0,194.0
3,60.0,19.0,144.242,1294.0,3.0,6.0,201.0,1.14,2.0,0.0,...,6.0,0.0,0.0,0.0,10.0,10.0,34.81,0.0,0.0,194.0
4,32.0,79.0,72.64,1211.4,14.0,4.0,120.0,2.01,2.0,2.0,...,14.0,0.0,4.0,0.0,0.0,4.0,23.005,0.383,0.0,225.0


Concatenate and save

In [22]:
# Join the original records (without the helper Composition column), the bulk
# features and the site-element features side by side, then write them out.
regression_table = pd.concat(
    [
        data.drop(columns=['comp_obj']),
        bulk_features,
        elem_features,
    ],
    axis=1,
)
regression_table.to_csv('../datasets/adsorption-regression.csv', index=False)

## Make a classification version
Get the most-stable surface site for each perovskite and its energy

In [23]:
# Map each composition string to the number of rows recorded for it.
by_comp = data['comp'].value_counts().to_dict()

In [24]:
# Fraction of the 8 records per composition that are present.
# NOTE(review): 8 is presumably the number of termination/site combinations
# expected for each composition; confirm against the dataset definition.
data['fraction_complete'] = data['comp'].map(lambda comp: by_comp[comp] / 8)

# Class label combining the termination and the site, e.g. "t1-mono_metal".
class_labels = []
for term, site in zip(data['term'], data['site']):
    class_labels.append(f't{term}-{site}')
data['class'] = class_labels

In [25]:
# Combine the records (minus the helper Composition column) with the bulk
# features, then keep, for each composition, only the row with the lowest
# adsorption energy.
top_surface = (
    pd.concat([data.drop('comp_obj', axis=1), bulk_features], axis=1)
    .sort_values(by='adsorption_energy', ascending=True)
    .drop_duplicates(subset='comp', keep='first')
)

In [26]:
# Write the one-row-per-composition table to disk without the index column.
classification_path = '../datasets/adsorption-classification.csv'
top_surface.to_csv(classification_path, index=False)