This notebook is a tutorial on generating the local steric and electronic information for the data2 dataset.

# Load Dependencies

In [2]:
import glob
import os
import warnings

import numpy as np
import pandas as pd

from script.SEMG import Calc_SPMS,Scaler,PackCub
warnings.filterwarnings("ignore")

# Load Data

In [3]:
# Input locations.
# NOTE(review): the CSV is read from './data2/' while the descriptor directories
# live under './Data/data2/' — confirm that both prefixes are intended.
data_file = './data2/data2.csv'
steric_dir = './Data/data2/xtb_sdf/'
ed_dir = './Data/data2/b3lyp_def2svp/'
tmp_df = pd.read_csv(data_file)

# One entry per CSV row, kept in row order.
cat_smiles = np.array(tmp_df['Catalyst'].to_list())
imine_smiles = np.array(tmp_df['Imine'].to_list())
thiol_smiles = np.array(tmp_df['Thiol'].to_list())
labels = np.array(tmp_df['Output'].to_list())

# Min-max scale the labels: (x - min) / (max - min).
label_std = (labels-np.min(labels))/(np.max(labels)-np.min(labels))

# Unique SMILES per reaction component. Sorted rather than list(set(...)) because
# set iteration order for strings changes between interpreter sessions (hash
# randomization), which would make any index-based use of these lists
# irreproducible across kernel restarts.
cat_smi_set = sorted(set(cat_smiles))
imine_smi_set = sorted(set(imine_smiles))
thiol_smi_set = sorted(set(thiol_smiles))

In [4]:
# Read the lookup table that sits next to the .sdf files. Each non-blank line is
# comma-separated; the first field is used as the key (a SMILES string from the
# CSV) and the second field as the value (the name used to locate descriptor files).
with open(steric_dir+'smiles_file_dict.csv','r') as fr:
    lines = fr.readlines()

# Split every line once, and skip blank lines (e.g. a trailing empty line) so they
# do not raise IndexError. A non-blank line with fewer than two fields still
# raises, as before.
smiles_name_dict = {
    fields[0]:fields[1]
    for fields in (tmp_line.strip().split(',') for tmp_line in lines if tmp_line.strip())
}

# Generate local steric and electronic information

In [5]:
def sort_func(x):
    """Sort key: the integer suffix of a file path.

    Takes the last '/'-separated component of `x`, keeps the text before its
    first '.', and converts whatever follows the last '_' to an int
    (e.g. './dir/cat_12.sdf' -> 12).

    `int` is used instead of `eval` so that text taken from a file name is never
    executed as Python code; it also accepts zero-padded indices such as '007',
    which `eval` rejects.

    Raises:
        ValueError: if the suffix is not an integer literal.
    """
    return int(x.split('/')[-1].split('.')[0].split('_')[-1])

# Collect the structure files of each component, ordered by numeric suffix
# (a plain string sort would put 'cat_10' before 'cat_2').
cat_files = sorted(glob.glob(steric_dir+'cat_*.sdf'),key=sort_func)
imine_files = sorted(glob.glob(steric_dir+'imine_*.sdf'),key=sort_func)
thiol_files = sorted(glob.glob(steric_dir+'thiol_*.sdf'),key=sort_func)

def _run_calc_spms(sdf_files):
    """Build a Calc_SPMS for `sdf_files` (sphere_radius = 7) and return the result of its calc_spms call.

    The result is unpacked by the callers below into a descriptor container and
    an id/name lookup dictionary.
    """
    calculator = Calc_SPMS(sdf_files,steric_dir,sphere_radius = 7)
    return calculator.calc_spms(sdf_files,steric_dir)

# Same computation for each of the three reaction components.
cat_spms,cat_id_name_dict = _run_calc_spms(cat_files)
imine_spms,imine_id_name_dict = _run_calc_spms(imine_files)
thiol_spms,thiol_id_name_dict = _run_calc_spms(thiol_files)

def _spms_per_row(smiles_per_row,spms,id_name_dict):
    """Return an array holding, for every SMILES in `smiles_per_row`, its entry of `spms`.

    Lookup chain per row: SMILES -> name (smiles_name_dict) -> index/key
    (id_name_dict) -> entry of `spms`.
    """
    picked = []
    for tmp_smi in smiles_per_row:
        mol_name = smiles_name_dict[tmp_smi]
        picked.append(spms[id_name_dict[mol_name]])
    return np.array(picked)

# Expand the per-molecule descriptors to one entry per CSV row.
total_cat_spms = _spms_per_row(cat_smiles,cat_spms,cat_id_name_dict)
total_imine_spms = _spms_per_row(imine_smiles,imine_spms,imine_id_name_dict)
total_thiol_spms = _spms_per_row(thiol_smiles,thiol_spms,thiol_id_name_dict)

def _packed_elec_desc(smiles_per_row):
    """Load the electronic descriptor of every row's molecule and pass the stacked array to PackCub.

    Many rows share the same molecule, so each '.npy' file is read from disk
    only once and reused for every row that references it. The stacked array
    handed to PackCub is the same as loading the file anew for each row.
    """
    loaded = {}
    for tmp_smi in smiles_per_row:
        if tmp_smi not in loaded:
            loaded[tmp_smi] = np.load(ed_dir+'def2-svp_b3lyp_'+smiles_name_dict[tmp_smi]+'.npy')
    return PackCub(np.array([loaded[tmp_smi] for tmp_smi in smiles_per_row]))

# One descriptor per CSV row for each reaction component.
total_cat_elec_desc = _packed_elec_desc(cat_smiles)
total_imine_elec_desc = _packed_elec_desc(imine_smiles)
total_thiol_elec_desc = _packed_elec_desc(thiol_smiles)

In [6]:
# Axis orders that move axis 1 to the last position (4-D steric, 5-D electronic).
_spms_axes = (0,2,3,1)
_elec_axes = (0,2,3,4,1)

cat_spms_trans,imine_spms_trans,thiol_spms_trans = (
    np.transpose(component_spms,_spms_axes)
    for component_spms in (total_cat_spms,total_imine_spms,total_thiol_spms)
)

cat_elec_trans,imine_elec_trans,thiol_elec_trans = (
    np.transpose(component_elec,_elec_axes)
    for component_elec in (total_cat_elec_desc,total_imine_elec_desc,total_thiol_elec_desc)
)

# Join catalyst, imine and thiol along the last axis to get one array per
# descriptor type for the whole reaction.
react_spms = np.concatenate((cat_spms_trans,imine_spms_trans,thiol_spms_trans),axis=3)
react_elec = np.concatenate((cat_elec_trans,imine_elec_trans,thiol_elec_trans),axis=4)

# Min-max scaled labels: (x - min) / (max - min).
_label_min = np.min(labels)
_label_max = np.max(labels)
label_std = (np.array(labels)-_label_min)/(_label_max-_label_min)

In [7]:
# Write the assembled arrays to disk for the training step.
run_data_dir = './data2/run_data/'
# np.save does not create missing parent directories, so make sure the target exists.
os.makedirs(run_data_dir,exist_ok=True)

np.save(run_data_dir+'data2_SEMG_MIGNN_react_spms.npy',react_spms)
np.save(run_data_dir+'data2_SEMG_MIGNN_react_elec.npy',react_elec)
np.save(run_data_dir+'data2_SEMG_MIGNN_labels.npy',labels)
np.save(run_data_dir+'data2_SEMG_MIGNN_label_std.npy',label_std)