This notebook is a tutorial on generating the local steric and electronic information for the data1 dataset.

# Load Dependencies

In [2]:
import numpy as np
import pandas as pd
import glob
from rdkit import Chem
from rdkit.Chem import AllChem
from script.SEMG import Calc_SPMS,Scaler,PackCub 
import warnings
warnings.filterwarnings("ignore")

# Load Data

In [3]:
# Input locations: the reaction table, the directory of SDF structures and the
# directory of .npy electronic-descriptor files (roles read off how the later
# cells use these paths).
data_file = './Data/data1/data1.csv'
steric_dir = './Data/data1/xtb_sdf/'
ed_dir = './Data/data1/b3lyp_def2svp/'

tmp_df = pd.read_csv(data_file)


def _column_as_array(column_name):
    """Return one column of tmp_df as a NumPy array built from a plain Python list."""
    return np.array(tmp_df[column_name].to_list())


# One SMILES array per reaction component, aligned row-by-row with the table.
lig_smiles = _column_as_array('Ligand')
add_smiles = _column_as_array('Additive')
base_smiles = _column_as_array('Base')
ar_ha_smiles = _column_as_array('Aryl halide')

# Raw target values and their min-max scaled version.
labels = _column_as_array('Output')
label_min = np.min(labels)
label_max = np.max(labels)
label_std = (labels - label_min) / (label_max - label_min)

# Distinct SMILES per component (order is whatever set iteration yields).
lig_smi_set, add_smi_set, base_smi_set, ar_ha_smi_set = (
    list(set(column)) for column in (lig_smiles, add_smiles, base_smiles, ar_ha_smiles)
)

# Generate local steric and electronic information

In [4]:
# Build the lookup from the first comma-separated field of each line of
# smiles_file_dict.csv to the second field. Later cells index it with SMILES
# strings and use the values as file-name stems.
with open(steric_dir+'smiles_file_dict.csv','r') as fr:
    lines = fr.readlines()

# Split every line once, and skip blank lines (e.g. a trailing newline at the
# end of the file) so they do not raise IndexError. A non-blank line with
# fewer than two fields still raises, so a malformed file is not hidden.
field_rows = (tmp_line.strip().split(',') for tmp_line in lines if tmp_line.strip())
smiles_name_dict = {fields[0]: fields[1] for fields in field_rows}

In [5]:
def sort_func(x):
    """Sort key: the integer that follows the last '_' in a file's base name.

    The base name is the last '/'-separated component of ``x`` up to its first
    '.', so './dir/lig_10.sdf' gives 10 and therefore sorts after
    './dir/lig_2.sdf' (numeric rather than lexicographic order).

    Raises ValueError if the text after the last '_' is not an integer.
    """
    stem = x.split('/')[-1].split('.')[0]
    # int() instead of eval(): the suffix is only ever meant to be a number,
    # eval() would execute arbitrary text taken from a file name, and it also
    # rejects zero-padded indices such as '007'.
    return int(stem.split('_')[-1])
def _sorted_sdf_paths(pattern):
    """Glob ``pattern`` inside steric_dir and return the matches ordered by sort_func."""
    found = glob.glob(steric_dir + pattern)
    found.sort(key=sort_func)
    return found


# One ordered list of SDF files per reaction component.
lig_files = _sorted_sdf_paths('lig_*.sdf')
add_files = _sorted_sdf_paths('add_*.sdf')
base_files = _sorted_sdf_paths('base_*.sdf')
ar_ha_files = _sorted_sdf_paths('ar_ha_*.sdf')

def _spms_for(sdf_files):
    """Run Calc_SPMS on one group of SDF files and return what calc_spms returns.

    The result is unpacked below as a pair (spms, id_name_dict).
    """
    calculator = Calc_SPMS(sdf_files, steric_dir)
    return calculator.calc_spms(sdf_files, steric_dir)


def _spms_per_row(spms, id_name_dict, smiles_column):
    """Stack, for every SMILES in ``smiles_column``, the matching entry of ``spms``.

    Each SMILES is translated through smiles_name_dict, then through
    ``id_name_dict``, and the result is used to index ``spms``.
    """
    picked = []
    for tmp_smi in smiles_column:
        entry_key = id_name_dict[smiles_name_dict[tmp_smi]]
        picked.append(spms[entry_key])
    return np.array(picked)


# Descriptors computed once per component file group.
lig_spms, lig_id_name_dict = _spms_for(lig_files)
add_spms, add_id_name_dict = _spms_for(add_files)
base_spms, base_id_name_dict = _spms_for(base_files)
ar_ha_spms, ar_ha_id_name_dict = _spms_for(ar_ha_files)

# Expanded to one entry per table row, in table order.
total_lig_spms = _spms_per_row(lig_spms, lig_id_name_dict, lig_smiles)
total_add_spms = _spms_per_row(add_spms, add_id_name_dict, add_smiles)
total_base_spms = _spms_per_row(base_spms, base_id_name_dict, base_smiles)
total_ar_ha_spms = _spms_per_row(ar_ha_spms, ar_ha_id_name_dict, ar_ha_smiles)

def _load_elec_desc(smiles_column):
    """Load the .npy descriptor of every SMILES in ``smiles_column`` and pack them.

    For each SMILES the file ``ed_dir + 'def2-svp_b3lyp_' + <name> + '.npy'`` is
    loaded, where <name> comes from smiles_name_dict. The per-row arrays are
    stacked with np.array (in the order of ``smiles_column``) and the stack is
    passed to PackCub, whose return value is returned unchanged.

    A SMILES that occurs in several rows is read from disk only once; the
    stacked array is the same as when every row triggers its own np.load.
    """
    loaded = {}
    for tmp_smi in smiles_column:
        if tmp_smi not in loaded:
            loaded[tmp_smi] = np.load(ed_dir+'def2-svp_b3lyp_'+smiles_name_dict[tmp_smi]+'.npy')
    # np.array copies each cached array into the stack, so repeated rows do not
    # share memory in the result.
    return PackCub(np.array([loaded[tmp_smi] for tmp_smi in smiles_column]))


total_lig_elec_desc = _load_elec_desc(lig_smiles)
total_add_elec_desc = _load_elec_desc(add_smiles)
total_base_elec_desc = _load_elec_desc(base_smiles)
total_ar_ha_elec_desc = _load_elec_desc(ar_ha_smiles)

In [6]:
# Axis orders that move axis 1 to the last position, for the 4-D steric arrays
# and the 5-D electronic arrays respectively.
# NOTE(review): axis 1 is presumably a channel axis; confirm against script.SEMG.
spms_axes = (0, 2, 3, 1)
elec_axes = (0, 2, 3, 4, 1)

lig_spms_trans, add_spms_trans, base_spms_trans, ar_ha_spms_trans = [
    np.transpose(component, spms_axes)
    for component in (total_lig_spms, total_add_spms, total_base_spms, total_ar_ha_spms)
]

lig_elec_trans, add_elec_trans, base_elec_trans, ar_ha_elec_trans = [
    np.transpose(component, elec_axes)
    for component in (
        total_lig_elec_desc,
        total_add_elec_desc,
        total_base_elec_desc,
        total_ar_ha_elec_desc,
    )
]

# Join the four components along the (now last) axis, in the fixed order
# ligand, additive, base, aryl halide.
react_spms = np.concatenate(
    (lig_spms_trans, add_spms_trans, base_spms_trans, ar_ha_spms_trans),
    axis=3,
)
react_elec = np.concatenate(
    (lig_elec_trans, add_elec_trans, base_elec_trans, ar_ha_elec_trans),
    axis=4,
)

# Min-max scale the labels.
label_floor = np.min(labels)
label_std = (np.array(labels) - label_floor) / (np.max(labels) - label_floor)

In [8]:
# Write the assembled arrays to disk, one .npy file each. The run_data
# directory is expected to exist already; this cell does not create it.
run_outputs = {
    './Data/data1/run_data/data1_SEMG_MIGNN_react_spms.npy': react_spms,
    './Data/data1/run_data/data1_SEMG_MIGNN_react_elec.npy': react_elec,
    './Data/data1/run_data/data1_SEMG_MIGNN_labels.npy': labels,
    './Data/data1/run_data/data1_SEMG_MIGNN_label_std.npy': label_std,
}
for out_path, out_array in run_outputs.items():
    np.save(out_path, out_array)