In [5]:
import sys
sys.path.append("../.")
import warnings
warnings.filterwarnings('ignore')

import pickle
import os
import pandas as pd
import numpy as np

from tqdm import tqdm
from itertools import product
from ast import literal_eval
from pymatgen.core import Structure
from pymatgen.io.ase import AseAtomsAdaptor

import sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

from dstar.atoms import active_motif, fingerprint, substitution

In [6]:
# Data preprocessing for training
#
# For every adsorbate: read the enumerated structures, tag each structure
# with its stored per-atom tags, extract the active motif, and write the
# motif together with the target energy to ./data/{ads}_motif.csv.

adsorbates = ['CO','H','OH']
N_SAMPLES = 10  # Rows kept per adsorbate for a quick run; set to None to process the full dataset

# The motif CSVs are written to ./data, so make sure it exists up front.
os.makedirs('./data', exist_ok=True)

for ads in adsorbates:
    # Load structure data
    df = pd.read_csv(f'../data/{ads}_enumerated_data.csv',index_col=0)
    if N_SAMPLES is not None:
        df = df.head(N_SAMPLES)
    ids = df['id'].to_list()
    energies = df['energy'].to_list()
    structures = [Structure.from_str(poscar,fmt='poscar') for poscar in df['poscar'].to_list()]

    # Load tag data for tagging adsorbate
    # NOTE: pickle.load can execute arbitrary code; only load tag files from a trusted source.
    with open(f'../data/{ads}_tag.pkl','rb') as fr:
        tag_dict = pickle.load(fr)

    # Convert surface structures to active motifs.
    motifs = []
    for structure, id_ in tqdm(zip(structures, ids), total=len(ids)):
        atoms = AseAtomsAdaptor.get_atoms(structure)
        atoms.set_tags(tag_dict[id_])
        # Only the first return value (the motif) is needed here.
        motif,_ = active_motif.get_active_motif(atoms)
        motifs.append(motif)

    # Save motif and energy as dataframe.
    # Column names must be exactly 'FNN'/'Same'/'Sub': declaring lowercase
    # variants and then assigning the capitalised ones leaves three empty
    # columns in the CSV.
    motif_df = pd.DataFrame({
        'name': ids,
        'FNN': [m[0] for m in motifs],
        'Same': [m[1] for m in motifs],
        'Sub': [m[2] for m in motifs],
        'target': energies,
    })

    motif_df.to_csv(f'./data/{ads}_motif.csv')

10it [00:01,  8.04it/s]
10it [00:01,  6.77it/s]
10it [00:01,  6.67it/s]


In [7]:
# Generate substituted dataframe
#
# Reads the CO motif table, reduces it to generalized binary-alloy motifs,
# and writes one substituted motif CSV per element pair into subs_path.

el_set =['Ag','Al','As','Au','Co','Cr','Cu','Fe','Ga','Ge','In','Ir','Mn','Mo','Ni',
         'Os','Pb','Pd','Pt','Re','Rh','Ru','Sb','Se','Si','Sn','Ti','V','W','Zn'] # Elements for substitution
subs_path = './energy/'
os.makedirs(subs_path, exist_ok=True)  # Output directory for the per-pair CSVs
motif_df = pd.read_csv('./data/CO_motif.csv',index_col=0)

# Elemental combinations: every unordered pair (same-element pairs included),
# each pair sorted alphabetically. Built directly in a deterministic order
# rather than de-duplicating through a set of strings.
elements = sorted(set(el_set))
atom_set = [[first, second] for i, first in enumerate(elements) for second in elements[i:]]

binary_df = substitution.get_binary(motif_df) # Get motifs of binary alloy
generalized_df = substitution.generalizer(binary_df) # Generalize motifs

# Substitution
# The loop variable is deliberately NOT called el_set, so the element list
# defined above is still intact after this cell has run.
for pair in tqdm(atom_set):
    el_name = '_'.join(pair)
    subs_fp = substitution.substitution(generalized_df, pair)
    subs_fp.to_csv(os.path.join(subs_path, el_name+'.csv'),index=False)

100%|██████████| 465/465 [00:01<00:00, 344.34it/s]


In [None]:
# Training and Prediction
#
# For every adsorbate: fit a gradient-boosting regressor on the motif
# descriptors, then predict a binding energy for every substituted motif file
# in subs_path and store it in a new '{ads}_energy' column of that file.
# Relies on `adsorbates` and `subs_path` defined in the earlier cells.

# The absolute-error loss was renamed from 'lad' to 'absolute_error' in
# scikit-learn 1.0 and the old name was removed in 1.2; pick whichever name
# the installed version understands.
abs_error_loss = 'absolute_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'lad'

for ads in adsorbates:
    motif_df = pd.read_csv(f'./data/{ads}_motif.csv',index_col=0)
    motif_df = motif_df[(motif_df['target']<5) & (motif_df['target'] > -5)] # Exclude potential outlier
    descriptor = fingerprint.motifs_to_df(motif_df) # Convert to atomic property descriptors

    # Train regressor
    # The first descriptor column is skipped — presumably an identifier column; confirm against motifs_to_df.
    X = descriptor.iloc[:,1:].to_numpy()
    y = motif_df['target'].to_numpy()

    scaler = StandardScaler()
    # Use the builtin float: the np.float alias was removed in NumPy 1.24.
    X = scaler.fit_transform(X.astype(float))

    reg = GradientBoostingRegressor(n_estimators=3938, learning_rate=0.14777,max_depth=17,
                                max_features='sqrt',min_samples_leaf=28, min_samples_split=24,
                                loss=abs_error_loss,random_state=42)
    reg.fit(X,y)

    # Predict binding energy of substituted active motifs
    # Only CSV files are processed; other directory entries
    # (e.g. .ipynb_checkpoints) would make read_csv fail.
    sub_lst = sorted(f for f in os.listdir(subs_path) if f.endswith('.csv'))

    for sub in tqdm(sub_lst):
        sub_file = os.path.join(subs_path, sub)
        # .copy() gives an independent frame so the column assignment below is unambiguous.
        sub_df = pd.read_csv(sub_file).drop_duplicates(['FNN','Same','Sub']).copy()
        sub_fp = fingerprint.motifs_to_df(sub_df)

        sub_X = sub_fp.iloc[:,1:]
        sub_X = scaler.transform(sub_X.astype(float))
        sub_y = reg.predict(sub_X)

        # The file is rewritten in place with the new prediction column added.
        sub_df[f'{ads}_energy'] = sub_y
        sub_df.to_csv(sub_file,index=False)