This notebook shows how we generated the example dataset as a reference: <br>
1. Load experimental bandgap dataset from Matminer
2. For each experimental composition, obtain the band gap corresponding to the most phase-stable (i.e. lowest computed energy per atom) crystal structure from the Materials Project
3. Include the levels of fidelity with one-hot encoding
5. Featurize the composition with the matminer ElementProperty featurizer (Magpie preset)
5. Include ICSD year-of-discovery of each composition

##### Load packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from matminer.datasets.dataset_retrieval import load_dataset
from matminer.featurizers.composition import ElementProperty
from pymatgen import Composition
from pymatgen import MPRester, Composition
# Materials Project client; the band-gap lookup further down calls mpr.get_data(...) on it.
mpr = MPRester() # provide your API key here or add it to pymatgen


Bad key "text.kerning_factor" on line 4 in
/home/ubuntu/miniconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


##### Load matminer bandgap data and add label ('expt_calculated')

In [4]:
# Experimental band gaps from matminer's 'expt_gap' dataset, reduced to one row
# per formula and labelled as experimental data.
exp_data = (
    load_dataset('expt_gap')
    # the dataset contains one unusual formula entry; leave it out
    .loc[lambda df: df.formula != 'GaAs0.1P0.9G1128']
    .rename(columns={'gap expt': 'bandgap'})
    # a formula can appear more than once; keep its smallest reported gap
    .groupby('formula')
    .agg({'bandgap': 'min'})
    .reset_index()
    # fidelity label: 1 marks experimental rows
    .assign(expt_calculated=1)
)
exp_data.shape

(6353, 2)


(4933, 3)

##### Load MP theory bandgap data

In [5]:
def get_MP_bandgap(formula):
    """Look up the band gap of the lowest-energy Materials Project entry for a formula.

    The formula is converted to its integer formula before querying, because
    the MPRester query requires integer formulas as input. Among the entries
    returned for that formula, the one with the lowest 'energy_per_atom' is
    taken as the ground state and its 'band_gap' is reported.

    Args:
        formula (str): chemical formula to look up
    Returns:
        (tuple) ``(reduced_formula, band_gap)`` where ``reduced_formula`` is
        the integer formula that was queried and ``band_gap`` is the band gap
        energy of the ground-state entry, or ``np.nan`` when the query
        returns no entries for that formula
    """
    # The MPRester requires integer formulas as input
    reduced_formula = Composition(formula).get_integer_formula_and_factor()[0]
    struct_lst = mpr.get_data(reduced_formula)

    # No entry at this composition: report a missing band gap
    if not struct_lst:
        return reduced_formula, np.nan

    # min() selects the same entry as sorting and taking the first element
    # (the first one with the lowest energy per atom), without a full sort
    ground_state = min(struct_lst, key=lambda e: e['energy_per_atom'])
    return reduced_formula, ground_state['band_gap']

# Query the Materials Project once per distinct experimental formula.
# bandgap_info maps formula -> [reduced_formula, band gap returned by get_MP_bandgap]
formula_set = set(exp_data['formula'])
bandgap_info = {}
for formula in tqdm(list(formula_set)):
    bandgap_info[formula] = list(get_MP_bandgap(formula))

100%|██████████| 4933/4933 [17:26<00:00,  4.71it/s]  


In [6]:
# One row per experimental formula: the integer formula that was queried and
# the band gap that was looked up for it (missing where the lookup gave NaN).
theory_df = pd.DataFrame(
    [[formula, *info] for formula, info in bandgap_info.items()],
    columns=['formula', 'reduced_formula', 'gap theory'],
)

# Calculated rows: same values with the gap stored under 'bandgap', rows with
# a missing value removed, and the label set to 0.
theory_data = (
    theory_df
    .rename(columns={'gap theory': 'bandgap'})
    .drop(columns=['formula'])
    .dropna()
    .assign(expt_calculated=0)
)

# Experimental rows, keeping only formulas that have corresponding theory
# data; the theory gap is used for filtering only and is dropped afterwards.
modified_expt_data = (
    exp_data
    .merge(theory_df[['formula', 'gap theory', 'reduced_formula']], on='formula')
    .dropna()
    .drop(columns=['gap theory', 'formula'])
)

##### Combine theory and experiment data

In [7]:
# Stack the experimental and calculated rows into one table, then order the
# rows by reduced formula and renumber the index from zero.
grouped_data = pd.concat([modified_expt_data, theory_data], ignore_index=True)
grouped_final_data = (
    grouped_data
    .sort_values(by='reduced_formula')
    .reset_index(drop=True)
)

In [12]:
# One-hot encode the fidelity label: 'theory_data' is 1 where expt_calculated
# is 0, 'expt_data' is 1 where it is 1. The column selection at the end fixes
# the column order and leaves the original 'expt_calculated' label out.
grouped_final_data = grouped_final_data.assign(
    theory_data=lambda df: (df['expt_calculated'] == 0).astype('int64'),
    expt_data=lambda df: (df['expt_calculated'] == 1).astype('int64'),
)[['bandgap', 'theory_data', 'expt_data', 'reduced_formula']]

##### Featurization 
We use composition-only descriptors, following "Predicting the Band Gaps of Inorganic Solids by Machine Learning". The authors note that the descriptor set is limited to composition descriptors in their machine-learning model because most of the band gaps obtained from the literature are not accompanied by sufficient crystallographic data, only the necessary composition information.

In [13]:
# Composition featurizer built from the "magpie" ElementProperty preset.
featurizer = ElementProperty.from_preset("magpie")

In [14]:
# magpie features
grouped_final_data['composition'] = grouped_final_data['reduced_formula'].apply(Composition)
featurizer = ElementProperty.from_preset("magpie")
featurized_data = featurizer.featurize_dataframe(grouped_final_data, 'composition')

HBox(children=(FloatProgress(value=0.0, description='ElementProperty', max=7828.0, style=ProgressStyle(descrip…




In [16]:
# Save the featurized table as CSV, using a path relative to the working directory.
# to_csv is called with its defaults, so the DataFrame index is written as well.
featurized_data.to_csv('brgoch_featurized_data.csv')