# Compute Features for the Data
Use the basic feature set of [Ward et al.](https://www.nature.com/articles/npjcompumats201628). Build a dataset for the O$_{2p}$ band center.

In [1]:
from matminer.featurizers import composition as cf
from matminer.featurizers.base import MultipleFeaturizer
from matminer.utils.data import MagpieData
from pymatgen.core import Composition, Element
from pathlib import Path
import pandas as pd
import re

  from tqdm.autonotebook import tqdm


Configuration

In [2]:
# Elemental property names, one per line.
# NOTE(review): presumably keys of the Magpie elemental-property tables — confirm;
# this list is not referenced by any other cell visible in this notebook.
elem_feature_names = [
    "Number",
    "MendeleevNumber",
    "AtomicWeight",
    "MeltingT",
    "Column",
    "Row",
    "CovalentRadius",
    "Electronegativity",
    "NsValence",
    "NpValence",
    "NdValence",
    "NfValence",
    "NValence",
    "NsUnfilled",
    "NpUnfilled",
    "NdUnfilled",
    "NfUnfilled",
    "NUnfilled",
    "GSvolume_pa",
    "GSbandgap",
    "GSmagmom",
    "SpaceGroupNumber",
]

## Load the Dataset and Featurize the Formula
Load a dataset, compute the features, and save the result as a new file.

In [3]:
# Bundle the composition-based featurizers so they run in a single pass
featurizer = MultipleFeaturizer(
    featurizers=[
        # Produces the "N-norm" stoichiometry columns
        cf.Stoichiometry(),
        # Produces the "MagpieData <statistic> <property>" columns
        cf.ElementProperty.from_preset(preset_name='magpie'),
        # Produces the "frac <orbital> valence electrons" columns
        cf.ValenceOrbital(props=['frac']),
        # Produces "compound possible" and the ionic-character columns
        cf.IonProperty(fast=True),
    ]
)

Load the band center dataset

In [4]:
# Read the band-center table, then discard the rows flagged as cubic
data = pd.read_csv('band-centers.csv')
data = data.query('not cubic')
print(f'Loaded a total of {len(data)} O2p energies')
data.head()

Loaded a total of 1652 O2p energies


Unnamed: 0,name,comp,cubic,o2p_center,d_center,gap
0,Yb(Ta7Sn1)O3,SnTa7Yb8O24,False,-9.685946,-38.76868,0.110994
1,Eu(Ir7Zr1)O3,Eu8Ir7ZrO24,False,-10.427894,-10.842672,0.130613
2,Ba(Fe7Sn1)O3,Ba8Fe7SnO24,False,-6.299251,-7.051795,0.548002
3,Eu(Re7Rh1)O3,Eu8Re7RhO24,False,-10.66624,-11.172753,0.043238
4,EuRuO3,EuRuO3,False,-10.101252,-10.578915,0.009796


Add in the oxidation states

In [5]:
# Read the oxidation-state table, then discard the rows flagged as cubic
ox_data = pd.read_csv('oxidation-states.csv')
ox_data = ox_data.query('not cubic')
ox_data.head()

Unnamed: 0,name,cubic,ox_A,ox_B,ox_O
0,Yb(Ta7Sn1)O3,False,1.687196,2.707607,-1.464946
1,Eu(Ir7Zr1)O3,False,1.898437,1.362979,-1.087066
2,Ba(Fe7Sn1)O3,False,1.615809,1.923458,-1.179853
3,Eu(Re7Rh1)O3,False,1.934932,1.573754,-1.169506
4,EuRuO3,False,1.814971,1.774318,-1.196428


In [6]:
# Attach the oxidation-state columns to each band-center row, matching on the
# compound name. `cubic` is removed from the right-hand table so the merged
# frame does not end up with two copies of it.
data = pd.merge(
    data,
    ox_data.drop(columns='cubic'),
    how='inner',
    on='name',
)
data.head()

Unnamed: 0,name,comp,cubic,o2p_center,d_center,gap,ox_A,ox_B,ox_O
0,Yb(Ta7Sn1)O3,SnTa7Yb8O24,False,-9.685946,-38.76868,0.110994,1.687196,2.707607,-1.464946
1,Eu(Ir7Zr1)O3,Eu8Ir7ZrO24,False,-10.427894,-10.842672,0.130613,1.898437,1.362979,-1.087066
2,Ba(Fe7Sn1)O3,Ba8Fe7SnO24,False,-6.299251,-7.051795,0.548002,1.615809,1.923458,-1.179853
3,Eu(Re7Rh1)O3,Eu8Re7RhO24,False,-10.66624,-11.172753,0.043238,1.934932,1.573754,-1.169506
4,EuRuO3,EuRuO3,False,-10.101252,-10.578915,0.009796,1.814971,1.774318,-1.196428


Compute the mixing-rule estimate of each property for each compound: the fraction-weighted average of the values of its parent compounds

In [7]:
# Matches an ABO3 name in which each of the A and B sites is either a single
# element symbol or a parenthesized mixture such as "(Ta7Sn1)"
_perov_re = re.compile(r'(?P<A>[A-Z][a-z]?|\(\w+\))(?P<B>[A-Z][a-z]?|\(\w+\))O3')
# Matches one element symbol inside a mixture and its integer amount.
# The amount is optional; a missing amount is treated as 1.
_comp_re = re.compile(r'([A-Z][a-z]?)(\d*)')


def decompose_name(name: str) -> list[tuple[str, float]]:
    """Determine the parent compounds and their fractions
    given the name of a perovskite

    A name with a mixed site, such as ``Yb(Ta7Sn1)O3``, is split into one
    parent per element on that site (``YbTaO3``, ``YbSnO3``) weighted by the
    element's share of the site. A name without a mixed site is its own
    and only parent.

    Args:
        name: Name to parse
    Returns:
        - List of (parent, fraction) tuples. Fractions sum to 1
    Raises:
        ValueError: If the name does not look like an ABO3 perovskite
            or if both the A and B sites are mixtures
    """

    match = _perov_re.search(name)
    if match is None:
        raise ValueError(f'Could not parse a perovskite name from {name!r}')
    a, b = match.group('A', 'B')
    if a.startswith("(") and b.startswith("("):
        raise ValueError(f'Mixing on both the A and B sites is not supported: {name!r}')

    if a.startswith("("):
        # Mixed A site: one parent per A-site element, sharing the B-site element
        output = [(f'{el}{b}O3', float(am or 1)) for el, am in _comp_re.findall(a)]
    elif b.startswith("("):
        # Mixed B site: one parent per B-site element, sharing the A-site element
        output = [(f'{a}{el}O3', float(am or 1)) for el, am in _comp_re.findall(b)]
    else:
        output = [(name, 1.)]

    # Normalize the raw amounts so that the fractions sum to 1
    total = sum(amount for _, amount in output)
    return [(parent, amount / total) for parent, amount in output]


decompose_name('Yb(Ta7Sn1)O3')

[('YbTaO3', 0.875), ('YbSnO3', 0.125)]

In [8]:
def compute_mixing_rules(name: str, data: pd.DataFrame, columns: list[str]) -> dict[str, float]:
    """Compute the property values expected from a linear mixing rule

    Each mixed value is the sum over the parent compounds of the parent's
    value for that property times the parent's fraction, where the parents
    and fractions come from ``decompose_name``.

    Args:
        name: Name of the compound
        data: Dataset holding data for the end members. Must have a ``name``
            column and every column listed in ``columns``
        columns: Which columns to mix
    Returns:
        Mixed values keyed by ``{column}_mix``. Empty dict for pure compounds
        and for compounds with a parent that is not present in ``data``
    """

    parents = decompose_name(name)
    if len(parents) == 1:
        return {}

    # Get the first matching row for each parent
    parent_rows = {}
    for parent_name, _ in parents:
        matches = data[data['name'] == parent_name]
        if matches.empty:
            # A parent is missing from the dataset, so there is nothing to mix
            return {}
        parent_rows[parent_name] = matches.iloc[0]

    # Perform the mixing
    return {
        f'{col}_mix': sum(parent_rows[p][col] * frac for p, frac in parents)
        for col in columns
    }
# Spot-check the mixing rule on one mixed compound, for the band gap only
compute_mixing_rules(name='Yb(Ta7Sn1)O3', data=data, columns=['gap'])

{'gap_mix': 0.1944191936250008}

In [9]:
# Every column after the first three (name, comp, cubic) is a property to mix
prop_cols = data.iloc[:, 3:].columns

In [10]:
# One dict of mixed property values per compound name
mixed_data = data['name'].apply(compute_mixing_rules, args=(data, prop_cols))

In [11]:
# Expand the per-compound dicts into "*_mix" columns and append them to the data.
# Entries without mixed values (empty dicts) become rows of NaN.
# The new frame is given the index of `mixed_data` explicitly: a column-wise
# concat aligns on index labels, so relying on the default 0..n-1 index would
# silently misalign rows whenever `data` carries any other index.
data = pd.concat(
    [data, pd.DataFrame(mixed_data.tolist(), index=mixed_data.index)],
    axis=1,
)

Compute features for the bulk material

In [12]:
# Parse each formula string into a Composition object for the featurizer
data['comp_obj'] = data['comp'].apply(Composition)

# Featurize, then slice off the leading columns that `data` already has
# so that only the newly computed feature columns remain
bulk_features = featurizer.featurize_dataframe(data, 'comp_obj').iloc[:, data.shape[1]:]

MultipleFeaturizer: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1652/1652 [00:00<00:00, 4217.99it/s]


Rename features to have the word "bulk" in front of them

In [13]:
# Prefix every feature column with "bulk_".
# The frame is rebound rather than mutated with inplace=True, and columns that
# already carry the prefix are left alone, so re-running this cell does not
# produce "bulk_bulk_..." names.
bulk_features = bulk_features.rename(
    columns=lambda x: x if str(x).startswith('bulk_') else f'bulk_{x}'
)
bulk_features.head()

Unnamed: 0,bulk_0-norm,bulk_2-norm,bulk_3-norm,bulk_5-norm,bulk_7-norm,bulk_10-norm,bulk_MagpieData minimum Number,bulk_MagpieData maximum Number,bulk_MagpieData range Number,bulk_MagpieData mean Number,...,bulk_MagpieData mean SpaceGroupNumber,bulk_MagpieData avg_dev SpaceGroupNumber,bulk_MagpieData mode SpaceGroupNumber,bulk_frac s valence electrons,bulk_frac p valence electrons,bulk_frac d valence electrons,bulk_frac f valence electrons,bulk_compound possible,bulk_max ionic char,bulk_avg ionic char
0,4,0.656696,0.612137,0.600745,0.600055,0.600001,8.0,73.0,65.0,32.825,...,95.8,100.56,12.0,0.190931,0.23389,0.073986,0.501193,False,0.745613,0.162261
1,4,0.656696,0.612137,0.600745,0.600055,0.600001,8.0,77.0,69.0,31.875,...,97.225,102.27,12.0,0.209974,0.251969,0.133858,0.404199,True,0.714753,0.137869
2,4,0.656696,0.612137,0.600745,0.600055,0.600001,8.0,56.0,48.0,21.8,...,96.6,101.52,12.0,0.347826,0.426087,0.226087,0.0,False,0.803211,0.160987
3,4,0.656696,0.612137,0.600745,0.600055,0.600001,8.0,75.0,67.0,31.65,...,92.575,96.69,12.0,0.212366,0.258065,0.115591,0.413978,False,0.714753,0.142475
4,3,0.663325,0.614463,0.600984,0.600078,0.600002,8.0,63.0,55.0,26.2,...,91.8,95.76,12.0,0.257143,0.342857,0.2,0.2,True,0.714753,0.132915


Concatenate and save

In [14]:
# Drop the helper Composition column, place the bulk features next to the
# remaining data column-wise, and write the result without the index
pd.concat(
    [
        data.drop(columns='comp_obj'),
        bulk_features,
    ],
    axis='columns',
).to_csv('../datasets/band-centers.csv', index=False)