In [1]:
import os

import matminer
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Materials Project API key, read from the MP_API_KEY environment variable so that
# the secret is never stored in the notebook or its version history.
# When the variable is unset this is None; pymatgen's MPRester documents a fallback
# to the PMG_MAPI_KEY setting in that case.
# NOTE(review): the key that used to be hard-coded on this line has been exposed and
# should be revoked/regenerated on the Materials Project dashboard.
mpkey = os.environ.get("MP_API_KEY")

In [3]:
from pymatgen.ext.matproj import MPRester  # pymatgen client class for the Materials Project
# Create the client with the API key held in `mpkey`.
# NOTE(review): `mpr` is not referenced again in the cells visible here.
mpr = MPRester(mpkey)

In [4]:
# Load the elasticity dataset from a local JSON file; the relative path is resolved
# against the notebook's working directory.
df = pd.read_json("weichih_elastic.json")

In [5]:
# Each entry of the 'elasticity' column is a dict; turn those dicts into their own
# columns and swap them in for the original column.
run_info = df['elasticity'].tolist()

# One column per dict key. Missing entries become 0 and every column is cast to int,
# so any fractional part is discarded.
# NOTE(review): zero-filling and integer truncation can produce moduli of exactly 0,
# which later leads to division by zero in the hardness formula - confirm intended.
df_runinfo = pd.DataFrame(run_info).fillna(0).astype(int)

# Column-wise concat matches rows by index label, then the dict column is removed.
df = pd.concat([df, df_runinfo], axis=1).drop(columns='elasticity')

In [6]:
# Display the frame after the 'elasticity' dicts were expanded into separate columns.
df

Unnamed: 0,material_id,full_formula,formation_energy_per_atom,G_Reuss,G_VRH,G_Voigt,G_Voigt_Reuss_Hill,K_Reuss,K_VRH,K_Voigt,K_Voigt_Reuss_Hill
0,mp-10006,Tl1Ag1Te2,-0.041182,17,13,8,13,27,28,29,28
1,mp-10018,Ac1,0.033002,12,14,15,14,24,24,24,24
2,mp-1008601,Zr1Ag2,-0.047356,-13,17,48,17,115,116,116,116
3,mp-1008617,Yb1Ag2,-0.338535,14,18,22,18,54,54,54,54
4,mp-1008653,Ag1C1,2.235059,9,11,12,11,107,107,107,107
...,...,...,...,...,...,...,...,...,...,...,...
13111,mp-980014,Tm1Th3,0.046190,34,36,39,36,51,51,51,51
13112,mp-981210,Tl3V1,0.512790,26,27,27,27,101,102,102,102
13113,mp-9924,Te2,0.144380,1,8,15,8,-15,4,22,4
13114,mp-9948,V5Te4,-0.301741,58,30,3,30,50,50,51,50


In [7]:
# Flag samples whose shear-modulus bounds are inconsistent with the VRH value:
# G_Voigt or G_Reuss lying more than 50 % below or more than 50 % above G_VRH.
# The comparison is done on whole columns at once instead of row by row.
g_vrh = df['G_VRH']
shear_outlier = (
    (df['G_Voigt'] < 0.50 * g_vrh) | (df['G_Reuss'] < 0.50 * g_vrh)
    | (df['G_Voigt'] > 1.50 * g_vrh) | (df['G_Reuss'] > 1.50 * g_vrh)
)

# bad_data holds the flagged material ids in row order. The bulk-modulus filter in
# the next cell appends to this same list, so it must remain a plain Python list.
bad_data = df.loc[shear_outlier, 'material_id'].tolist()

# Remove every row whose id was flagged in a single pass, rather than re-filtering
# (and copying) the whole frame once per flagged id.
df = df[~df['material_id'].isin(bad_data)]

In [8]:
# Flag samples whose bulk-modulus bounds are inconsistent with the VRH value:
# K_Voigt or K_Reuss lying more than 50 % below or more than 50 % above K_VRH.
#
# bad_data is NOT re-initialised here: it is the list created by the shear-modulus
# filter in the previous cell, so that cell has to run before this one.
k_vrh = df['K_VRH']
bulk_outlier = (
    (df['K_Voigt'] < 0.50 * k_vrh) | (df['K_Reuss'] < 0.50 * k_vrh)
    | (df['K_Voigt'] > 1.50 * k_vrh) | (df['K_Reuss'] > 1.50 * k_vrh)
)

# Append the newly flagged ids (row order) to the ids flagged for shear modulus.
bad_data.extend(df.loc[bulk_outlier, 'material_id'].tolist())

# Remove every flagged row in a single pass, rather than re-filtering (and copying)
# the whole frame once per flagged id.
df = df[~df['material_id'].isin(bad_data)]

In [9]:
# Display the frame after the rows listed in bad_data were removed.
df

Unnamed: 0,material_id,full_formula,formation_energy_per_atom,G_Reuss,G_VRH,G_Voigt,G_Voigt_Reuss_Hill,K_Reuss,K_VRH,K_Voigt,K_Voigt_Reuss_Hill
0,mp-10006,Tl1Ag1Te2,-0.041182,17,13,8,13,27,28,29,28
1,mp-10018,Ac1,0.033002,12,14,15,14,24,24,24,24
3,mp-1008617,Yb1Ag2,-0.338535,14,18,22,18,54,54,54,54
4,mp-1008653,Ag1C1,2.235059,9,11,12,11,107,107,107,107
6,mp-1008866,Na1Ag1O1,-0.759677,18,21,25,21,59,59,59,59
...,...,...,...,...,...,...,...,...,...,...,...
13109,mp-979951,Yb3U1,0.642829,27,20,14,20,10,10,11,10
13110,mp-979963,Yb3Ti1,0.391097,13,14,16,14,21,21,21,21
13111,mp-980014,Tm1Th3,0.046190,34,36,39,36,51,51,51,51
13112,mp-981210,Tl3V1,0.512790,26,27,27,27,101,102,102,102


In [10]:
# Discard compounds whose formation energy exceeds 0.2 eV/atom; the original author
# notes that such compounds are generally not favorable.
high_energy_rows = df.index[df['formation_energy_per_atom'] > 0.2]
df = df.drop(index=high_energy_rows)

In [11]:
# Hardness estimate per row: 0.92 * (G_VRH / K_VRH)**1.137 * G_VRH**0.708.
# NOTE(review): this has the form of the Tian et al. empirical hardness model - confirm.
# Rows with K_VRH == 0 produce a non-finite result here.
calc_hard = (
    df['G_VRH'].div(df['K_VRH'])      # ratio G/K
    .pow(1.137)
    .mul(0.92)
    .mul(df['G_VRH'].pow(0.708))
)

In [12]:
# Display the frame after the formation-energy filter; calc_hard has been computed
# but is not yet a column of the frame.
df

Unnamed: 0,material_id,full_formula,formation_energy_per_atom,G_Reuss,G_VRH,G_Voigt,G_Voigt_Reuss_Hill,K_Reuss,K_VRH,K_Voigt,K_Voigt_Reuss_Hill
0,mp-10006,Tl1Ag1Te2,-0.041182,17,13,8,13,27,28,29,28
1,mp-10018,Ac1,0.033002,12,14,15,14,24,24,24,24
3,mp-1008617,Yb1Ag2,-0.338535,14,18,22,18,54,54,54,54
6,mp-1008866,Na1Ag1O1,-0.759677,18,21,25,21,59,59,59,59
7,mp-1008903,Mg1Ag1Sb1,-0.228935,9,16,24,16,76,76,76,76
...,...,...,...,...,...,...,...,...,...,...,...
13102,mp-972088,Zr3Zn1,-0.121853,31,38,44,38,94,94,94,94
13103,mp-972364,Yb3,0.002530,10,10,11,10,15,15,15,15
13105,mp-977585,Zr3Tl1,-0.118315,57,60,63,60,121,121,121,121
13106,mp-979011,Tm1Zr1,0.064374,40,40,40,40,70,70,70,70


In [13]:
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import Meredig

# Step 1: matminer featurizers operate on pymatgen objects, so first parse each formula
# string in 'full_formula' into a pymatgen composition (stored in a new 'composition' column).
stc = StrToComposition()
df = stc.featurize_dataframe(df, col_id='full_formula')

# Step 2: compute the composition-based features defined in Meredig et al. from the
# 'composition' column and add them as new columns.
md = Meredig()
df = md.featurize_dataframe(df, col_id='composition')

StrToComposition:   0%|          | 0/10541 [00:00<?, ?it/s]

Meredig:   0%|          | 0/10541 [00:00<?, ?it/s]

In [14]:
# Display the frame with the added 'composition' column and Meredig feature columns.
df

Unnamed: 0,material_id,full_formula,formation_energy_per_atom,G_Reuss,G_VRH,G_Voigt,G_Voigt_Reuss_Hill,K_Reuss,K_VRH,K_Voigt,...,range Electronegativity,mean Electronegativity,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,frac s valence electrons,frac p valence electrons,frac d valence electrons,frac f valence electrons
0,mp-10006,Tl1Ag1Te2,-0.041182,17,13,8,13,27,28,29,...,0.48,1.937500,1.750000,2.250000,10.000000,3.500000,0.100000,0.128571,0.571429,0.200000
1,mp-10018,Ac1,0.033002,12,14,15,14,24,24,24,...,0.00,1.100000,2.000000,0.000000,1.000000,0.000000,0.666667,0.000000,0.333333,0.000000
3,mp-1008617,Yb1Ag2,-0.338535,14,18,22,18,54,54,54,...,0.67,1.706667,1.333333,0.000000,6.666667,4.666667,0.105263,0.000000,0.526316,0.368421
6,mp-1008866,Na1Ag1O1,-0.759677,18,21,25,21,59,59,59,...,2.51,2.100000,1.333333,1.333333,3.333333,0.000000,0.222222,0.222222,0.555556,0.000000
7,mp-1008903,Mg1Ag1Sb1,-0.228935,9,16,24,16,76,76,76,...,0.74,1.763333,1.666667,1.000000,6.666667,0.000000,0.178571,0.107143,0.714286,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13102,mp-972088,Zr3Zn1,-0.121853,31,38,44,38,94,94,94,...,0.32,1.410000,2.000000,0.000000,4.000000,0.000000,0.333333,0.000000,0.666667,0.000000
13103,mp-972364,Yb3,0.002530,10,10,11,10,15,15,15,...,0.00,1.260000,2.000000,0.000000,0.000000,14.000000,0.125000,0.000000,0.000000,0.875000
13105,mp-977585,Zr3Tl1,-0.118315,57,60,63,60,121,121,121,...,0.29,1.402500,2.000000,0.250000,4.000000,3.500000,0.205128,0.025641,0.410256,0.358974
13106,mp-979011,Tm1Zr1,0.064374,40,40,40,40,70,70,70,...,0.08,1.290000,2.000000,0.000000,1.000000,6.500000,0.210526,0.000000,0.105263,0.684211


In [15]:
# Remove the two extra VRH columns the original author marked as unnecessary.
# NOTE(review): in the rows displayed they carry the same values as G_VRH / K_VRH -
# confirm they are true duplicates for the whole frame.
df = df.drop(columns=["G_Voigt_Reuss_Hill", "K_Voigt_Reuss_Hill"])
df

Unnamed: 0,material_id,full_formula,formation_energy_per_atom,G_Reuss,G_VRH,G_Voigt,K_Reuss,K_VRH,K_Voigt,composition,...,range Electronegativity,mean Electronegativity,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,frac s valence electrons,frac p valence electrons,frac d valence electrons,frac f valence electrons
0,mp-10006,Tl1Ag1Te2,-0.041182,17,13,8,27,28,29,"(Tl, Ag, Te)",...,0.48,1.937500,1.750000,2.250000,10.000000,3.500000,0.100000,0.128571,0.571429,0.200000
1,mp-10018,Ac1,0.033002,12,14,15,24,24,24,(Ac),...,0.00,1.100000,2.000000,0.000000,1.000000,0.000000,0.666667,0.000000,0.333333,0.000000
3,mp-1008617,Yb1Ag2,-0.338535,14,18,22,54,54,54,"(Yb, Ag)",...,0.67,1.706667,1.333333,0.000000,6.666667,4.666667,0.105263,0.000000,0.526316,0.368421
6,mp-1008866,Na1Ag1O1,-0.759677,18,21,25,59,59,59,"(Na, Ag, O)",...,2.51,2.100000,1.333333,1.333333,3.333333,0.000000,0.222222,0.222222,0.555556,0.000000
7,mp-1008903,Mg1Ag1Sb1,-0.228935,9,16,24,76,76,76,"(Mg, Ag, Sb)",...,0.74,1.763333,1.666667,1.000000,6.666667,0.000000,0.178571,0.107143,0.714286,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13102,mp-972088,Zr3Zn1,-0.121853,31,38,44,94,94,94,"(Zr, Zn)",...,0.32,1.410000,2.000000,0.000000,4.000000,0.000000,0.333333,0.000000,0.666667,0.000000
13103,mp-972364,Yb3,0.002530,10,10,11,15,15,15,(Yb),...,0.00,1.260000,2.000000,0.000000,0.000000,14.000000,0.125000,0.000000,0.000000,0.875000
13105,mp-977585,Zr3Tl1,-0.118315,57,60,63,121,121,121,"(Zr, Tl)",...,0.29,1.402500,2.000000,0.250000,4.000000,3.500000,0.205128,0.025641,0.410256,0.358974
13106,mp-979011,Tm1Zr1,0.064374,40,40,40,70,70,70,"(Tm, Zr)",...,0.08,1.290000,2.000000,0.000000,1.000000,6.500000,0.210526,0.000000,0.105263,0.684211


In [16]:
# Attach the hardness Series computed earlier (calc_hard) as a new 'hardness' column.
# assign() matches values to rows by index label, not by position, and returns a new frame.
df = df.assign(hardness=calc_hard)

In [17]:
# Display the frame including the new 'hardness' column.
df

Unnamed: 0,material_id,full_formula,formation_energy_per_atom,G_Reuss,G_VRH,G_Voigt,K_Reuss,K_VRH,K_Voigt,composition,...,mean Electronegativity,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,frac s valence electrons,frac p valence electrons,frac d valence electrons,frac f valence electrons,hardness
0,mp-10006,Tl1Ag1Te2,-0.041182,17,13,8,27,28,29,"(Tl, Ag, Te)",...,1.937500,1.750000,2.250000,10.000000,3.500000,0.100000,0.128571,0.571429,0.200000,2.363712
1,mp-10018,Ac1,0.033002,12,14,15,24,24,24,(Ac),...,1.100000,2.000000,0.000000,1.000000,0.000000,0.666667,0.000000,0.333333,0.000000,3.229192
3,mp-1008617,Yb1Ag2,-0.338535,14,18,22,54,54,54,"(Yb, Ag)",...,1.706667,1.333333,0.000000,6.666667,4.666667,0.105263,0.000000,0.526316,0.368421,2.041897
6,mp-1008866,Na1Ag1O1,-0.759677,18,21,25,59,59,59,"(Na, Ag, O)",...,2.100000,1.333333,1.333333,3.333333,0.000000,0.222222,0.222222,0.555556,0.000000,2.453713
7,mp-1008903,Mg1Ag1Sb1,-0.228935,9,16,24,76,76,76,"(Mg, Ag, Sb)",...,1.763333,1.666667,1.000000,6.666667,0.000000,0.178571,0.107143,0.714286,0.000000,1.114049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13102,mp-972088,Zr3Zn1,-0.121853,31,38,44,94,94,94,"(Zr, Zn)",...,1.410000,2.000000,0.000000,4.000000,0.000000,0.333333,0.000000,0.666667,0.000000,4.315610
13103,mp-972364,Yb3,0.002530,10,10,11,15,15,15,(Yb),...,1.260000,2.000000,0.000000,0.000000,14.000000,0.125000,0.000000,0.000000,0.875000,2.961911
13105,mp-977585,Zr3Tl1,-0.118315,57,60,63,121,121,121,"(Zr, Tl)",...,1.402500,2.000000,0.250000,4.000000,3.500000,0.205128,0.025641,0.410256,0.358974,7.522287
13106,mp-979011,Tm1Zr1,0.064374,40,40,40,70,70,70,"(Tm, Zr)",...,1.290000,2.000000,0.000000,1.000000,6.500000,0.210526,0.000000,0.105263,0.684211,6.633031


In [18]:
# Show the rows whose computed hardness exceeds 200.
# NOTE(review): 200 is an unexplained threshold; presumably a sanity check for
# implausibly large values - confirm.
df[df['hardness'] > 200]

Unnamed: 0,material_id,full_formula,formation_energy_per_atom,G_Reuss,G_VRH,G_Voigt,K_Reuss,K_VRH,K_Voigt,composition,...,mean Electronegativity,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,frac s valence electrons,frac p valence electrons,frac d valence electrons,frac f valence electrons,hardness
3705,mp-23505,K2Te1Br6,-1.02308,7,7,7,0,0,0,"(K, Te, Br)",...,2.388889,1.777778,3.777778,7.777778,0.0,0.133333,0.283333,0.583333,0.0,
5599,mp-13925,Cs2Na1Y1F6,-3.59916,20,21,21,1,1,1,"(Cs, Na, Y, F)",...,2.761,1.7,3.0,0.1,0.0,0.354167,0.625,0.020833,0.0,253.094336
5971,mp-989536,Cs2Li1N1F6,-1.824214,33,33,33,2,2,2,"(Cs, Li, N, F)",...,2.948,1.7,3.3,0.0,0.0,0.34,0.66,0.0,0.0,264.956722
6820,mp-7615,Rb3Tl1F6,-2.63526,7,9,10,0,0,0,"(Rb, Tl, F)",...,2.796,1.7,3.1,1.0,1.4,0.236111,0.430556,0.138889,0.194444,
8583,mp-20457,In1P1,0.027433,48,49,51,2,2,2,"(In, P)",...,1.985,2.0,2.0,5.0,0.0,0.222222,0.222222,0.555556,0.0,549.450395


In [18]:
# Collect the rows whose hardness is null and print them for inspection.
# Nothing is removed from df in this cell.
missing_hardness = df['hardness'].isna()
nan_in_df = df.loc[missing_hardness]
print(nan_in_df)

     material_id full_formula  formation_energy_per_atom  G_Reuss  G_VRH  \
3705    mp-23505     K2Te1Br6              -1.023080e+00        7      7   
6820     mp-7615     Rb3Tl1F6              -2.635260e+00        7      9   
8156    mp-23907           H2               8.881784e-16        0      0   
8263   mp-570752           H2               8.881784e-16        0      0   
8285   mp-632250           H1               0.000000e+00        0      0   
8289   mp-634659           H1               8.881784e-16        0      0   

      G_Voigt  K_Reuss  K_VRH  K_Voigt  composition  ...  \
3705        7        0      0        0  (K, Te, Br)  ...   
6820       10        0      0        0  (Rb, Tl, F)  ...   
8156        0        0      0        0          (H)  ...   
8263        0        0      0        0          (H)  ...   
8285        0        0      0        0          (H)  ...   
8289        0        0      0        0          (H)  ...   

      mean Electronegativity  avg s valence el

In [19]:
# Remove rows whose hardness could not be computed, i.e. is not a finite number
# (NaN from 0/0, or an infinite value from K_VRH == 0). The mask is derived from the
# data rather than from a hand-maintained id list, so it stays correct if the input
# file changes. On the dataset recorded in this notebook it removes mp-23505, mp-7615,
# mp-23907, mp-570752, mp-632250 and mp-634659.
hardness_ok = np.isfinite(df['hardness'])

# Entries excluded by hand. Reason stated by the original author:
# energy_above_hull > 0.2 eV.
# NOTE(review): energy_above_hull is not a column of this frame, so the stated reason
# cannot be checked here - confirm against the source database.
manual_exclusions = ['mp-20457']

df = df[hardness_ok & ~df['material_id'].isin(manual_exclusions)]

In [20]:
# Display the cleaned frame that is exported in the next cell.
df

Unnamed: 0,material_id,full_formula,formation_energy_per_atom,G_Reuss,G_VRH,G_Voigt,K_Reuss,K_VRH,K_Voigt,composition,...,mean Electronegativity,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,frac s valence electrons,frac p valence electrons,frac d valence electrons,frac f valence electrons,hardness
0,mp-10006,Tl1Ag1Te2,-0.041182,17,13,8,27,28,29,"(Tl, Ag, Te)",...,1.937500,1.750000,2.250000,10.000000,3.500000,0.100000,0.128571,0.571429,0.200000,2.363712
1,mp-10018,Ac1,0.033002,12,14,15,24,24,24,(Ac),...,1.100000,2.000000,0.000000,1.000000,0.000000,0.666667,0.000000,0.333333,0.000000,3.229192
3,mp-1008617,Yb1Ag2,-0.338535,14,18,22,54,54,54,"(Yb, Ag)",...,1.706667,1.333333,0.000000,6.666667,4.666667,0.105263,0.000000,0.526316,0.368421,2.041897
6,mp-1008866,Na1Ag1O1,-0.759677,18,21,25,59,59,59,"(Na, Ag, O)",...,2.100000,1.333333,1.333333,3.333333,0.000000,0.222222,0.222222,0.555556,0.000000,2.453713
7,mp-1008903,Mg1Ag1Sb1,-0.228935,9,16,24,76,76,76,"(Mg, Ag, Sb)",...,1.763333,1.666667,1.000000,6.666667,0.000000,0.178571,0.107143,0.714286,0.000000,1.114049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13102,mp-972088,Zr3Zn1,-0.121853,31,38,44,94,94,94,"(Zr, Zn)",...,1.410000,2.000000,0.000000,4.000000,0.000000,0.333333,0.000000,0.666667,0.000000,4.315610
13103,mp-972364,Yb3,0.002530,10,10,11,15,15,15,(Yb),...,1.260000,2.000000,0.000000,0.000000,14.000000,0.125000,0.000000,0.000000,0.875000,2.961911
13105,mp-977585,Zr3Tl1,-0.118315,57,60,63,121,121,121,"(Zr, Tl)",...,1.402500,2.000000,0.250000,4.000000,3.500000,0.205128,0.025641,0.410256,0.358974,7.522287
13106,mp-979011,Tm1Zr1,0.064374,40,40,40,70,70,70,"(Tm, Zr)",...,1.290000,2.000000,0.000000,1.000000,6.500000,0.210526,0.000000,0.105263,0.684211,6.633031


In [21]:
# Write the cleaned, featurized frame to 'mp.json' in the working directory.
# default_handler=str makes pandas fall back to str(obj) for values it cannot
# serialise itself (presumably the pymatgen objects in 'composition' - confirm).
df.to_json('mp.json',default_handler = str)