In [1]:
import pandas as pd
import numpy as  np
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt

from functools import partial
from pymatgen.core.periodic_table import Element

from mendeleev import element
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


## Make fingerprint for ML

In [2]:
def get_spin(el):
    """
    Get number of unpaired electron
    """
    
    block = el.block
    group = el.group
    
    if el.symbol == 'Lu':
        return 0
    elif block == 's':
        return 0 if group == 2 else 1
    elif block == 'p':
        return (group - 12 if group < 16 else 19 - group)
    elif block == 'd':
        return (group - 2 if group < 8 else 12 - group if el.symbol != 'Pt' else 0)
    elif block == 'f':
        return (group - 3 if group < 12 else 17 - group)



df = pd.read_csv('../data/cubic_DP_data_with_cif.csv',index_col=0)
fingerprint_df = pd.DataFrame(columns = ['id'])
fingerprint_df['id'] = df['id']


# Get property
prop_list = []
for i,v in df.iterrows():
    prop_row = []
    for site in ['A_atom_1','A_atom_2','B_atom_1','B_atom_2']:
        el = Element(v[site])
        prop_row.extend([
            el.melting_point,        # Melting Point
            el.boiling_point,        # Boiling Point
            el.X,                    # Electronegativity 
            el.average_ionic_radius, # Ionic radius
            get_spin(el),            # # of Unpaired Electron
            el.common_oxidation_states[0],#int(v[site+'_oxi']),     # Oxidation state
            el.group,                # Group
            el.row,                  # Row
        ])
    prop_list.append(prop_row)

prop_arr = np.array(prop_list).T

for i, v in enumerate(prop_arr):
    fingerprint_df[f'prop_{i+1}'] = v

fingerprint_df

Unnamed: 0,id,prop_1,prop_2,prop_3,prop_4,prop_5,prop_6,prop_7,prop_8,prop_9,...,prop_23,prop_24,prop_25,prop_26,prop_27,prop_28,prop_29,prop_30,prop_31,prop_32
0,BK_c01d01d05f15,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,5.0,4.0,544.40,1837.0,2.02,1.035000,3.0,3.0,15.0,6.0
1,BK_c01d01d06h04,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,6.0,4.0,1068.00,3633.0,1.12,1.080000,0.0,3.0,3.0,6.0
2,BK_c01d01d06h05,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,6.0,4.0,1208.00,3563.0,1.13,1.060000,0.0,3.0,3.0,6.0
3,BK_c01d01d07e03,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,7.0,4.0,1799.00,3609.0,1.22,1.040000,1.0,3.0,3.0,5.0
4,BK_c01d01d07f11,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,7.0,4.0,1337.33,3129.0,2.54,1.070000,1.0,3.0,11.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6505,BK_f13f14h15e16,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,3.0,6.0,722.66,1261.0,2.10,1.293333,3.0,-2.0,16.0,5.0
6506,BK_f13f14h16e16,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,3.0,6.0,722.66,1261.0,2.10,1.293333,3.0,-2.0,16.0,5.0
6507,BK_f13f14h17e16,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,3.0,6.0,722.66,1261.0,2.10,1.293333,3.0,-2.0,16.0,5.0
6508,BK_f13f14e14f15,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,14.0,5.0,544.40,1837.0,2.02,1.035000,3.0,3.0,15.0,6.0


In [3]:
# Make one-hot encoded composition

alkali = ['Li','Na','K','Cs','Rb','Be','Mg','Ca','Sr','Ba']
transition = ['Sc','Ti','V','Cr','Mn','Fe','Co','Ni','Cu','Zn','Y','Zr','Nb','Mo','Ru','Rh','Pd',
             'Ag','Cd','Hf','Ta','W','Re','Os','Ir','Pt','Au','Hg'] # Tc removed: radioactive
main = ['Al','Ga','In','Sn','Tl','Pb','Bi','B','Si','Ge','As','Sb','Te']
rare = ['La','Ce','Pr','Nd','Pm','Sm','Eu','Gd','Tb','Dy','Ho','Er','Tm','Yb','Lu']

cn6, cn12 = {}, {}

for i in alkali+transition+rare+main:
    elem = element(i)
    ionic_radii = elem.ionic_radii
    for radii in ionic_radii:
        if radii.coordination == 'VI':
            for charge in range(1,9):
                if radii.charge == charge:
                    cn6['B'+'_'+i+'_'+str(radii.charge)] = radii.ionic_radius

        elif radii.coordination == 'XII':
            for charge in range(1,9):
                if radii.charge == charge:
                    cn12['A'+'_'+i+'_'+str(radii.charge)] = radii.ionic_radius


elem_candidate = [i for i in cn12.keys()] + [i for i in cn6.keys()] # +1 ~ +7
ohe_list = []

for i,v in df.iterrows():
    dummy = np.zeros(len(elem_candidate))
    elem_lst = ['A_'+v['A_atom_1']+'_'+str(v['A_atom_1_oxi']),'A_'+v['A_atom_2']+'_'+str(v['A_atom_2_oxi']),
               'B_'+v['B_atom_1']+'_'+str(v['B_atom_1_oxi']),'B_'+v['B_atom_2']+'_'+str(v['B_atom_2_oxi'])]
    
    for elem in elem_lst:
        dummy[elem_candidate.index(elem)] += 1
    ohe_list.append(dummy)
    
ohe_list = np.array(ohe_list).T

ohe_dict = {column: fp for column, fp in zip(elem_candidate, ohe_list)}
ohe_df = pd.DataFrame(ohe_dict)

fingerprint_df = pd.concat([fingerprint_df, ohe_df], axis=1)
fingerprint_df

Unnamed: 0,id,prop_1,prop_2,prop_3,prop_4,prop_5,prop_6,prop_7,prop_8,prop_9,...,B_B_3,B_Si_4,B_Ge_2,B_Ge_4,B_As_3,B_As_5,B_Sb_3,B_Sb_5,B_Te_4,B_Te_6
0,BK_c01d01d05f15,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BK_c01d01d06h04,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BK_c01d01d06h05,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BK_c01d01d07e03,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BK_c01d01d07f11,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6505,BK_f13f14h15e16,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6506,BK_f13f14h16e16,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6507,BK_f13f14h17e16,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6508,BK_f13f14e14f15,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Train and predict

In [4]:
energy_dict = pd.read_csv('../data/stability_data.csv',index_col=1).to_dict(orient='index')
fingerprint_df['pbx'] = [energy_dict[i]['Pourbaix stability OER'] for i in fingerprint_df['id']]
fingerprint_df['hull'] = [energy_dict[i]['Energy above hull'] for i in fingerprint_df['id']]
fingerprint_df

Unnamed: 0,id,prop_1,prop_2,prop_3,prop_4,prop_5,prop_6,prop_7,prop_8,prop_9,...,B_Ge_2,B_Ge_4,B_As_3,B_As_5,B_Sb_3,B_Sb_5,B_Te_4,B_Te_6,pbx,hull
0,BK_c01d01d05f15,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.532447,0.179041
1,BK_c01d01d06h04,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2,BK_c01d01d06h05,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.719401,0.296752
3,BK_c01d01d07e03,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
4,BK_c01d01d07f11,370.87,1156.0,0.93,1.1600,1.0,1.0,1.0,3.0,336.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.641381,0.352413
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6505,BK_f13f14h15e16,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.580596,0.220240
6506,BK_f13f14h16e16,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
6507,BK_f13f14h17e16,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.552295,0.218560
6508,BK_f13f14e14f15,577.00,1746.0,1.62,1.3325,1.0,1.0,13.0,6.0,600.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [17]:
# PBX stability

calculated_df = fingerprint_df.copy(deep=True).dropna()

X = calculated_df.iloc[:,1:-2].to_numpy()
y = calculated_df.iloc[:,-2].to_numpy()

reg = GradientBoostingRegressor(n_estimators=1321, learning_rate=0.15,subsample=0.6,
                                max_depth=5,random_state=42)

kf = KFold(n_splits=5,shuffle=True,random_state=42)

mae_lst = []
r2_lst = []
for i, (train_idx,test_idx) in enumerate(kf.split(X)):
    X_train,X_test,y_train,y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
    
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)
    
    mae_lst.append(mean_absolute_error(y_test,y_pred))
    r2_lst.append(r2_score(y_test,y_pred))

print(f'MAE : {np.mean(mae_lst)}')
print(f'R2 score : {np.mean(r2_lst)}')

MAE : 0.02847679836935807
R2 score : 0.9380072078258339


In [19]:
# Energy above hull

X = calculated_df.iloc[:,33:-2].to_numpy()
y = calculated_df.iloc[:,-1].to_numpy()

reg = GradientBoostingRegressor(n_estimators=131, learning_rate=0.28, subsample=0.52,
                               max_depth=19, random_state=42)

kf = KFold(n_splits=5,shuffle=True,random_state=42)

mae_lst = []
r2_lst = []
for i, (train_idx,test_idx) in enumerate(kf.split(X)):
    X_train,X_test,y_train,y_test = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
    
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)
    
    mae_lst.append(mean_absolute_error(y_test,y_pred))
    r2_lst.append(r2_score(y_test,y_pred))

print(f'MAE : {np.mean(mae_lst)}')
print(f'R2 score : {np.mean(r2_lst)}')

MAE : 0.031052591628245484
R2 score : 0.7872333883481402
