### Load data

In [1]:
import numpy as np
import pandas as pd
import pymatgen
import matplotlib.pyplot as plt
import joblib

from pymatgen.ext.matproj import MPRester
from pymatgen import core

from matminer.datasets import get_available_datasets, load_dataset

#https://www.youtube.com/watch?v=dU123Uc7HlI&ab_channel=Binge-on-atomswithVidushi
from matminer.featurizers.composition import ElementProperty, ElementFraction, Stoichiometry, IonProperty
#from matminer.featurizers.site import AverageBondLength, AverageBondAngle
from matminer.featurizers.structure import JarvisCFID, DensityFeatures, StructuralComplexity, GlobalSymmetryFeatures

from matminer.featurizers.conversions import StructureToComposition


In [2]:
# Available datasets
# Reference table of every dataset bundled with matminer:
# https://hackingmaterials.lbl.gov/matminer/dataset_summary.html
# NOTE(review): the cell output shows one description paragraph per dataset,
# which suggests get_available_datasets() prints a summary by itself and the
# surrounding print() then also emits its return value — confirm against the
# matminer docs and drop the redundant print if so.
print(get_available_datasets())

boltztrap_mp: Effective mass and thermoelectric properties of 8924 compounds in The  Materials Project database that are calculated by the BoltzTraP software package run on the GGA-PBE or GGA+U density functional theory calculation results. The properties are reported at the temperature of 300 Kelvin and the carrier concentration of 1e18 1/cm3.

brgoch_superhard_training: 2574 materials used for training regressors that predict shear and bulk modulus.

castelli_perovskites: 18,928 perovskites generated with ABX combinatorics, calculating gllbsc band gap and pbe structure, and also reporting absolute band edge positions and heat of formation.

citrine_thermal_conductivity: Thermal conductivity of 872 compounds measured experimentally and retrieved from Citrine database from various references. The reported values are measured at various temperatures of which 295 are at room temperature.

dielectric_constant: 1,056 structures with dielectric properties, calculated with DFPT-PBE.

double_

In [3]:
def element_property_features(df, formula_col_name='formula'):
    """Append element-property descriptors from four ``ElementProperty`` presets.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is left untouched; a copy is featurized and returned.
    formula_col_name : str, default 'formula'
        Column holding the chemical formulas. Each value is passed to
        ``core.Composition`` and the result is stored in a ``'composition'``
        column. If this argument is ``'composition'`` itself, the column is
        reused and only its string entries are converted.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a ``'composition'`` column plus whatever columns
        the ``'magpie'``, ``'matminer'``, ``'deml'`` and ``'megnet_el'``
        presets add, in that order.

    Notes
    -----
    Featurization runs with ``ignore_errors=True``, so per-row failures do not
    raise here (presumably they surface as missing values — TODO confirm).
    """
    # Work on a copy: assigning the 'composition' column below would otherwise
    # silently modify the frame the caller (an earlier notebook cell) still holds.
    df = df.copy()

    if formula_col_name != 'composition':
        df['composition'] = df[formula_col_name].apply(lambda formula: core.Composition(formula))
    else:
        # The column already carries the target name; make sure it holds
        # Composition objects rather than raw formula strings.
        df['composition'] = df['composition'].apply(
            lambda value: core.Composition(value) if isinstance(value, str) else value
        )

    for preset in ('magpie', 'matminer', 'deml', 'megnet_el'):
        ep = ElementProperty.from_preset(preset)
        print('Generating element property using ' + preset)
        print('No. of features:', len(ep.feature_labels()))

        df = ep.featurize_dataframe(df, 'composition', ignore_errors=True)

    return df



def additional_compositional_features(df, comp_col_name='composition', formula_col_name='formula'):
    """Append ``ElementFraction``, ``Stoichiometry`` and ``IonProperty`` descriptors.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is left untouched; a copy is featurized and returned.
    comp_col_name : str, default 'composition'
        Column that holds (or will hold) the composition objects. If it is
        missing it is created from ``formula_col_name``.
    formula_col_name : str, default 'formula'
        Column of chemical formulas; only read when ``comp_col_name`` is not
        already present.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``comp_col_name`` present plus whatever columns the
        three featurizers add, in the order listed above.

    Notes
    -----
    Featurization runs with ``ignore_errors=True``, so per-row failures do not
    raise here (presumably they surface as missing values — TODO confirm).
    """
    # Work on a copy so the caller's frame never gains a column behind its back.
    df = df.copy()

    if comp_col_name not in df.columns:
        # Build the composition objects under the name the caller asked for,
        # not under a hard-coded 'composition'.
        df[comp_col_name] = df[formula_col_name].apply(lambda formula: core.Composition(formula))

    descriptors = (
        ('ElementFraction', ElementFraction()),
        ('Stoichiometry', Stoichiometry()),
        ('IonProperty', IonProperty()),
    )

    for label, featurizer in descriptors:
        print('Generating element property using ' + label)
        print('No. of features:', len(featurizer.feature_labels()))

        # Featurize the same column that was checked/created above.
        df = featurizer.featurize_dataframe(df, comp_col_name, ignore_errors=True)

    return df


def structural_features(df, struc_col_name='structure'):
    """Run four structure featurizers over one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the structures to featurize.
    struc_col_name : str, default 'structure'
        Name of the column handed to every featurizer.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last ``featurize_dataframe`` call, after
        ``DensityFeatures``, ``StructuralComplexity``,
        ``GlobalSymmetryFeatures`` and ``JarvisCFID`` were applied in turn.
    """
    # All featurizers are instantiated up front, keyed by the label that is
    # printed for them; dict order fixes the order in which they are applied.
    featurizers = {
        'DensityFeatures': DensityFeatures(),
        'StructuralComplexity': StructuralComplexity(),
        'GlobalSymmetryFeatures': GlobalSymmetryFeatures(),
        'JarvisCFID': JarvisCFID(),
    }

    featurized = df
    for label, featurizer in featurizers.items():
        print('Generating structural features using ' + label)
        n_features = len(featurizer.feature_labels())
        print('No. of features:', n_features)

        # ignore_errors=True: a failing row must not abort the whole pass.
        featurized = featurizer.featurize_dataframe(featurized, struc_col_name, ignore_errors=True)

    return featurized


In [4]:
# Experimental metal / non-metal labels ('matbench_expt_is_metal').
# The raw 'composition' column holds formula strings, so it is renamed to
# 'formula'; the featurizer helpers create their own 'composition' column.
df = load_dataset('matbench_expt_is_metal').rename(columns={'composition': 'formula'})
df

Unnamed: 0,formula,is_metal
0,Ag(AuS)2,True
1,Ag(W3Br7)2,True
2,Ag0.5Ge1Pb1.75S4,False
3,Ag0.5Ge1Pb1.75Se4,False
4,Ag2BBr,True
...,...,...
4916,ZrTaN3,False
4917,ZrTe,True
4918,ZrTi2O,True
4919,ZrTiF6,True


In [5]:
# Add the element-property descriptors (magpie, matminer, deml, megnet_el)
# and preview the first three rows.
df2 = element_property_features(df=df, formula_col_name='formula')
df2.head(3)

Generating element property using magpie
No. of features: 132


ElementProperty:   0%|          | 0/4921 [00:00<?, ?it/s]

Generating element property using matminer
No. of features: 65


ElementProperty:   0%|          | 0/4921 [00:00<?, ?it/s]

Generating element property using deml
No. of features: 80


ElementProperty:   0%|          | 0/4921 [00:00<?, ?it/s]

Generating element property using megnet_el
No. of features: 80


ElementProperty:   0%|          | 0/4921 [00:00<?, ?it/s]

Unnamed: 0,formula,is_metal,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,MEGNetElementData minimum embedding 15,MEGNetElementData maximum embedding 15,MEGNetElementData range embedding 15,MEGNetElementData mean embedding 15,MEGNetElementData std_dev embedding 15,MEGNetElementData minimum embedding 16,MEGNetElementData maximum embedding 16,MEGNetElementData range embedding 16,MEGNetElementData mean embedding 16,MEGNetElementData std_dev embedding 16
0,Ag(AuS)2,True,"(Ag, Au, S)",16.0,79.0,63.0,47.4,25.28,16.0,65.0,...,-0.502267,-0.01415,0.488117,-0.19952,0.225479,-0.343382,-0.040199,0.303184,-0.203404,0.167932
1,Ag(W3Br7)2,True,"(Ag, W, Br)",35.0,74.0,39.0,46.714286,15.619048,35.0,51.0,...,-0.502267,0.233502,0.735769,0.010484,0.459812,-0.343382,-0.098234,0.245148,-0.140536,0.096099
2,Ag0.5Ge1Pb1.75S4,False,"(Ag, Ge, Pb, S)",16.0,82.0,66.0,36.275862,23.552913,16.0,65.0,...,-0.502267,0.057007,0.559274,-0.034119,0.168126,-0.343382,-0.040199,0.303184,-0.150163,0.157439


In [6]:
# Extend the element-property table with ElementFraction, Stoichiometry and
# IonProperty descriptors, reusing the existing 'composition' column, then
# preview the first three rows.
df3 = additional_compositional_features(df2, comp_col_name='composition')
df3.head(3)

Generating element property using ElementFraction
No. of features: 103


ElementFraction:   0%|          | 0/4921 [00:00<?, ?it/s]

Generating element property using Stoichiometry
No. of features: 6


Stoichiometry:   0%|          | 0/4921 [00:00<?, ?it/s]

Generating element property using IonProperty
No. of features: 3


IonProperty:   0%|          | 0/4921 [00:00<?, ?it/s]

Unnamed: 0,formula,is_metal,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,Lr,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,compound possible,max ionic char,avg ionic char
0,Ag(AuS)2,True,"(Ag, Au, S)",16.0,79.0,63.0,47.4,25.28,16.0,65.0,...,0,3,0.6,0.514256,0.460906,0.441882,0.42873,False,0.100238,0.015189
1,Ag(W3Br7)2,True,"(Ag, W, Br)",35.0,74.0,39.0,46.714286,15.619048,35.0,51.0,...,0,3,0.726873,0.683796,0.668584,0.666919,0.666681,False,0.232967,0.024404
2,Ag0.5Ge1Pb1.75S4,False,"(Ag, Ge, Pb, S)",16.0,82.0,66.0,36.275862,23.552913,16.0,65.0,...,0,4,0.621647,0.569761,0.553591,0.55197,0.551738,,,


In [7]:
# Show the complete featurized table (the rendered view is truncated by pandas).
df3

Unnamed: 0,formula,is_metal,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,Lr,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,compound possible,max ionic char,avg ionic char
0,Ag(AuS)2,True,"(Ag, Au, S)",16.0,79.0,63.0,47.400000,25.280000,16.0,65.0,...,0,3,0.600000,0.514256,0.460906,0.441882,0.428730,False,0.100238,0.015189
1,Ag(W3Br7)2,True,"(Ag, W, Br)",35.0,74.0,39.0,46.714286,15.619048,35.0,51.0,...,0,3,0.726873,0.683796,0.668584,0.666919,0.666681,False,0.232967,0.024404
2,Ag0.5Ge1Pb1.75S4,False,"(Ag, Ge, Pb, S)",16.0,82.0,66.0,36.275862,23.552913,16.0,65.0,...,0,4,0.621647,0.569761,0.553591,0.551970,0.551738,,,
3,Ag0.5Ge1Pb1.75Se4,False,"(Ag, Ge, Pb, Se)",32.0,82.0,50.0,46.206897,17.388823,34.0,65.0,...,0,4,0.621647,0.569761,0.553591,0.551970,0.551738,,,
4,Ag2BBr,True,"(Ag, B, Br)",5.0,47.0,42.0,33.500000,14.250000,47.0,65.0,...,0,3,0.612372,0.538609,0.506099,0.501109,0.500098,False,0.232967,0.041418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4916,ZrTaN3,False,"(Zr, Ta, N)",7.0,73.0,66.0,26.800000,23.760000,7.0,44.0,...,0,3,0.663325,0.614463,0.600984,0.600078,0.600002,True,0.518585,0.116191
4917,ZrTe,True,"(Zr, Te)",40.0,52.0,12.0,46.000000,6.000000,40.0,44.0,...,0,2,0.707107,0.629961,0.574349,0.552045,0.535887,False,0.137763,0.034441
4918,ZrTi2O,True,"(Zr, Ti, O)",8.0,40.0,32.0,23.000000,8.500000,22.0,43.0,...,0,3,0.612372,0.538609,0.506099,0.501109,0.500098,False,0.671436,0.117641
4919,ZrTiF6,True,"(Zr, Ti, F)",9.0,40.0,31.0,14.500000,8.250000,9.0,43.0,...,0,3,0.770552,0.752308,0.750039,0.750001,0.750000,False,0.827201,0.150309


In [8]:
# Persist the featurized table; the path is relative to the current working
# directory. The frame contains composition objects, so the file is a pickle —
# NOTE(review): reloading presumably needs a compatible pymatgen install; confirm.
joblib.dump(df3, 'database_metal.pkl')

['database_metal.pkl']

In [9]:
# Round-trip check: reload the file written by the joblib.dump cell and display it.
# joblib.load unpickles arbitrary objects — only load files you created or trust.
df_temp = joblib.load('database_metal.pkl')
df_temp

Unnamed: 0,formula,is_metal,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,Lr,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,compound possible,max ionic char,avg ionic char
0,Ag(AuS)2,True,"(Ag, Au, S)",16.0,79.0,63.0,47.400000,25.280000,16.0,65.0,...,0,3,0.600000,0.514256,0.460906,0.441882,0.428730,False,0.100238,0.015189
1,Ag(W3Br7)2,True,"(Ag, W, Br)",35.0,74.0,39.0,46.714286,15.619048,35.0,51.0,...,0,3,0.726873,0.683796,0.668584,0.666919,0.666681,False,0.232967,0.024404
2,Ag0.5Ge1Pb1.75S4,False,"(Ag, Ge, Pb, S)",16.0,82.0,66.0,36.275862,23.552913,16.0,65.0,...,0,4,0.621647,0.569761,0.553591,0.551970,0.551738,,,
3,Ag0.5Ge1Pb1.75Se4,False,"(Ag, Ge, Pb, Se)",32.0,82.0,50.0,46.206897,17.388823,34.0,65.0,...,0,4,0.621647,0.569761,0.553591,0.551970,0.551738,,,
4,Ag2BBr,True,"(Ag, B, Br)",5.0,47.0,42.0,33.500000,14.250000,47.0,65.0,...,0,3,0.612372,0.538609,0.506099,0.501109,0.500098,False,0.232967,0.041418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4916,ZrTaN3,False,"(Zr, Ta, N)",7.0,73.0,66.0,26.800000,23.760000,7.0,44.0,...,0,3,0.663325,0.614463,0.600984,0.600078,0.600002,True,0.518585,0.116191
4917,ZrTe,True,"(Zr, Te)",40.0,52.0,12.0,46.000000,6.000000,40.0,44.0,...,0,2,0.707107,0.629961,0.574349,0.552045,0.535887,False,0.137763,0.034441
4918,ZrTi2O,True,"(Zr, Ti, O)",8.0,40.0,32.0,23.000000,8.500000,22.0,43.0,...,0,3,0.612372,0.538609,0.506099,0.501109,0.500098,False,0.671436,0.117641
4919,ZrTiF6,True,"(Zr, Ti, F)",9.0,40.0,31.0,14.500000,8.250000,9.0,43.0,...,0,3,0.770552,0.752308,0.750039,0.750001,0.750000,False,0.827201,0.150309
