# Creating Toy Datasets from Materials Project Systematically

## Download Element Data (to save time)

In [1]:
import json
import os
from pathlib import Path

from mp_api.client import MPRester
# Local directory holding data cached from the Materials Project.
path_data = Path("./data")
path_data.mkdir(exist_ok=True)
# JSON cache file for the material id -> element list mapping built by the download cell.
path_elements = path_data/"mp_elements_dict.json"
# Prefer the MP_API_KEY environment variable so a real key never has to be
# saved inside the notebook; the placeholder remains as a fallback.
API_KEY = os.environ.get("MP_API_KEY", "YOUR_KEY_HERE")

In [2]:
# Load the material id -> element numbers mapping from the local JSON cache,
# querying the Materials Project only when no cache file exists yet.
if path_elements.exists():
    with open(path_elements, 'r') as cache_file:
        mp_elements = json.load(cache_file)
else:
    # Unfiltered summary search, restricted to the two fields needed here.
    with MPRester(API_KEY) as mpr:
        docs = mpr.materials.summary.search(fields=["material_id", "elements"])

    # Key by the id's string form and keep only each element's number so the
    # mapping can be written out as JSON.
    mp_elements = {
        doc.material_id.string: [element.number for element in doc.elements]
        for doc in docs
    }

    with open(path_elements, 'w') as cache_file:
        json.dump(mp_elements, cache_file)

## Element Distribution Analysis

In [3]:
# Highest element number included in the tally.
cutoff_element = 18  # Argon, the last element of the third period

# For every element number from 1 up to the cutoff, collect the ids of all
# cached materials whose element list contains it.
materials_by_element = {
    z: [mp_id for mp_id, present in mp_elements.items() if z in present]
    for z in range(1, cutoff_element + 1)
}

# Number of materials per element number; displayed as the cell output.
counts = {z: len(ids) for z, ids in materials_by_element.items()}
counts

{1: 10449,
 2: 8,
 3: 21761,
 4: 1189,
 5: 6370,
 6: 9083,
 7: 11442,
 8: 82406,
 9: 12136,
 10: 1,
 11: 12873,
 12: 19084,
 13: 7805,
 14: 12758,
 15: 16913,
 16: 15397,
 17: 6425,
 18: 2}

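According to the Materials Project website, 10,394 materials contain H, 8 contain He, and 21,686 contain Li — slightly fewer than the cached counts above for H (10,449) and Li (21,761).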

In [5]:
# Ask the Materials Project for every material containing hydrogen and map
# each returned material id to its pretty formula.
with MPRester(API_KEY) as mpr:
    docs = mpr.materials.summary.search(
        elements=["H"],
        fields=["material_id", "formula_pretty"],
    )
    mpid_formula_dict = dict(
        (doc.material_id, doc.formula_pretty) for doc in docs
    )

Retrieving SummaryDoc documents:   0%|          | 0/10394 [00:00<?, ?it/s]

In [6]:
# Sanity check on the per-element index: every material listed under
# `element` must actually contain that element in the cached mapping.
# Anything printed here indicates an inconsistency.
element = 9  # fluorine
for mp_id in materials_by_element[element]:
    if element not in mp_elements[mp_id]:
        # mp_elements values are plain lists of element numbers (no
        # `.elements` attribute), so print the id together with the list.
        print(mp_id, mp_elements[mp_id])

In [9]:
# String form of every material id returned by the hydrogen query.
h_ids = [mpid.string for mpid in mpid_formula_dict]

In [17]:
# Materials the local cache lists under element 1 (hydrogen) whose ids are
# not in h_ids. A set makes each membership test O(1) instead of scanning
# the whole h_ids list for every cached id; result order is unchanged.
h_id_set = set(h_ids)
[mp_id for mp_id in materials_by_element[1] if mp_id not in h_id_set]

['mp-697915',
 'mp-1187975',
 'mp-632667',
 'mp-634930',
 'mp-634751',
 'mp-864603',
 'mp-625103',
 'mp-626421',
 'mp-632348',
 'mp-1070852',
 'mp-2646948',
 'mp-1025273',
 'mp-1103732',
 'mp-626413',
 'mp-643108',
 'mp-1207586',
 'mp-740759',
 'mp-1206323',
 'mp-1018646',
 'mp-1018647',
 'mp-1187892',
 'mp-1207571',
 'mp-1207559',
 'mp-979964',
 'mp-1195507',
 'mp-1198634',
 'mp-1105386',
 'mp-1216487',
 'mp-643246',
 'mp-1195012',
 'mp-1195544',
 'mp-643071',
 'mp-1203501',
 'mp-1202946',
 'mp-1200022',
 'mp-705525',
 'mp-555985',
 'mp-1202633',
 'mp-1202882',
 'mp-1198247',
 'mp-1238179',
 'mp-1200794',
 'mp-1191250',
 'mp-697925',
 'mp-699393',
 'mp-1212344',
 'mp-722346',
 'mp-1203140',
 'mp-1200555',
 'mp-1202119',
 'mp-1193866',
 'mp-1190437',
 'mp-1198865',
 'mp-1200481',
 'mp-1200272']

In [18]:
"mp-697915"

'mp-697915'

In [15]:
# NOTE(review): scratch cell, apparently unfinished. `mat` is not defined
# anywhere in the visible notebook, so the search call would raise NameError
# if this branch ran; it is only reached when the cache file is missing.
# An `if` block displays no value, so the recorded output under this cell
# likely came from an earlier version of it — TODO: complete or delete.
if not path_elements.exists():  
    with MPRester(API_KEY) as mpr:
        docs = mpr.materials.summary.search(mat)

10394