# Compute Features for the Data
Use the basic feature set of [Ward et al.](https://www.nature.com/articles/npjcompumats201628)

In [1]:
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer
from matminer.utils.data import MagpieData
from pymatgen.core import Composition, Element
from pathlib import Path
import pandas as pd

  from tqdm.autonotebook import tqdm


Configuration

In [2]:
elem_feature_names = [
    "Number", "MendeleevNumber", "AtomicWeight", "MeltingT", "Column", "Row", "CovalentRadius", "Electronegativity", 
    "NsValence", "NpValence", "NdValence", "NfValence", "NValence", "NsUnfilled", "NpUnfilled", "NdUnfilled", "NfUnfilled", "NUnfilled",
    "GSvolume_pa", "GSbandgap", "GSmagmom", "SpaceGroupNumber"
]

## Load the Dataset and Featurize the Formula
Load a dataset, compute the features, save it as a new file

In [3]:
featurizer = MultipleFeaturizer([
      cf.Stoichiometry(),
      cf.ElementProperty.from_preset('magpie'),
      cf.ValenceOrbital(props=['frac']),
      cf.IonProperty(fast=True)
])

Load the adsorption dataset

In [4]:
data = pd.read_csv('adsorbates.csv')
print(f'Loaded a total of {len(data)} adsorption energies')
data.query('site_charge > 0', inplace=True)  # Ignore unblaanced charges
print(f'Loaded {len(data)} adsorption energies for charge-balanced materials')
data.head()

Loaded a total of 796 adsorption energies
Loaded 703 adsorption energies for charge-balanced materials


Unnamed: 0,comp,term,site,surface_comp,site_atom,site_charge,adsorption_energy
0,NdRhO3,1,pi_oxygen,RhO2,Rh,4,0.187609
1,LaLuO3,0,pi_oxygen,LaO,La,3,0.150659
5,CeCrO3,0,mono_metal,CeO,Ce,3,0.47693
6,YCoO3,1,pi_metal,CoO2,Co,3,-0.355691
7,YCoO3,1,pi_oxygen,CoO2,Co,3,-0.340291


Compute features for the bulk material

In [5]:
data['comp_obj'] = data['comp'].apply(Composition)
bulk_features = featurizer.featurize_dataframe(data, 'comp_obj')
bulk_features = bulk_features.iloc[:, len(data.columns):]  # Only get the new columns

MultipleFeaturizer: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 703/703 [00:00<00:00, 3076.31it/s]


Rename features to have the word "bulk" in front of them

In [6]:
bulk_features.rename(columns=lambda x: f'bulk_{x}', inplace=True)
bulk_features.head()

Unnamed: 0,bulk_0-norm,bulk_2-norm,bulk_3-norm,bulk_5-norm,bulk_7-norm,bulk_10-norm,bulk_MagpieData minimum Number,bulk_MagpieData maximum Number,bulk_MagpieData range Number,bulk_MagpieData mean Number,...,bulk_MagpieData mean SpaceGroupNumber,bulk_MagpieData avg_dev SpaceGroupNumber,bulk_MagpieData mode SpaceGroupNumber,bulk_frac s valence electrons,bulk_frac p valence electrons,bulk_frac d valence electrons,bulk_frac f valence electrons,bulk_compound possible,bulk_max ionic char,bulk_avg ionic char
0,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,60.0,52.0,25.8,...,91.0,94.8,12.0,0.272727,0.363636,0.242424,0.121212,True,0.733532,0.133399
1,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,71.0,63.0,30.4,...,84.8,87.36,12.0,0.263158,0.315789,0.052632,0.368421,True,0.745613,0.172786
5,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,58.0,50.0,21.2,...,91.8,95.76,12.0,0.321429,0.428571,0.214286,0.035714,True,0.739617,0.157219
6,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,39.0,31.0,18.0,...,84.8,87.36,12.0,0.333333,0.4,0.266667,0.0,True,0.708321,0.143819
7,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,39.0,31.0,18.0,...,84.8,87.36,12.0,0.333333,0.4,0.266667,0.0,True,0.708321,0.143819


Compute features for the element on the surface

In [7]:
elem_features = pd.DataFrame()
elems = data['site_atom'].apply(Element)
magpie_data = MagpieData()
for e in elem_feature_names:
    elem_features[f'site_{e}'] = magpie_data.get_elemental_properties(elems, e)

In [8]:
elem_features.head()

Unnamed: 0,site_Number,site_MendeleevNumber,site_AtomicWeight,site_MeltingT,site_Column,site_Row,site_CovalentRadius,site_Electronegativity,site_NsValence,site_NpValence,...,site_NValence,site_NsUnfilled,site_NpUnfilled,site_NdUnfilled,site_NfUnfilled,site_NUnfilled,site_GSvolume_pa,site_GSbandgap,site_GSmagmom,site_SpaceGroupNumber
0,45.0,59.0,102.9055,2237.0,9.0,5.0,142.0,2.28,1.0,0.0,...,9.0,1.0,0.0,2.0,0.0,3.0,13.64,0.0,0.0,225.0
1,57.0,13.0,138.90547,1193.0,3.0,6.0,207.0,1.1,2.0,0.0,...,3.0,0.0,0.0,9.0,0.0,9.0,36.8975,0.0,0.0,194.0
2,58.0,15.0,140.116,1071.0,3.0,6.0,204.0,1.12,2.0,0.0,...,4.0,0.0,0.0,9.0,13.0,22.0,37.24,0.0,0.0,194.0
3,27.0,58.0,58.933195,1768.0,9.0,4.0,126.0,1.88,2.0,0.0,...,9.0,0.0,0.0,3.0,0.0,3.0,10.245,0.0,1.548471,194.0
4,27.0,58.0,58.933195,1768.0,9.0,4.0,126.0,1.88,2.0,0.0,...,9.0,0.0,0.0,3.0,0.0,3.0,10.245,0.0,1.548471,194.0


Concatentate and save

In [9]:
pd.concat([data.drop(columns=['comp_obj']), bulk_features, elem_features], axis=1).to_csv('../datasets/adsorption-regression.csv', index=False)

## Make a classification version
Get the most-stable surface site for each perovskite and its energy

In [10]:
by_comp = data.value_counts('comp').to_dict()

In [11]:
data['fraction_complete'] = data['comp'].apply(lambda x: by_comp[x] / 8)
data['class'] = [f't{t}-{s}' for t, s in zip(data['term'], data['site'])]

In [12]:
top_surface = pd.concat([data.drop(columns=['comp_obj']), bulk_features], axis=1).sort_values('adsorption_energy', ascending=True).drop_duplicates('comp', keep='first')

In [13]:
top_surface.to_csv('../datasets/adsorption-classification.csv', index=False)