This notebook is a tutorial on generating the local steric and electronic information for the data1 dataset.

# Load Dependencies

In [1]:
import numpy as np
import pandas as pd
import glob
from rdkit import Chem
from rdkit.Chem import AllChem
from script.SEMG import Calc_SPMS_Elec,Scaler 
import warnings
warnings.filterwarnings("ignore")

Using backend: pytorch
[09:52:40] /opt/dgl/src/runtime/tensordispatch.cc:43: TensorDispatcher: dlopen failed: /root/anaconda3/lib/python3.8/site-packages/dgl/tensoradapter/pytorch/libtensoradapter_pytorch_1.11.0.so: cannot open shared object file: No such file or directory


# Load Data

In [2]:
# Locations of the reaction table and of the directory holding the SDF files.
data_file = './data1/data1.csv'
mol_dir = './data1/data1_sdf_files/'

# Read the reaction table and pull each reaction-component column out as an
# array with one entry per CSV row (the variable names suggest these are SMILES
# strings -- confirm against the CSV).
tmp_df = pd.read_csv(data_file)
lig_smiles, add_smiles, base_smiles, ar_ha_smiles = (
    np.array(tmp_df[column].to_list())
    for column in ('Ligand', 'Additive', 'Base', 'Aryl halide')
)

# Raw targets from the 'Output' column, plus a min-max rescaled copy in [0, 1].
# NOTE(review): the rescaling divides by (max - min), so a constant 'Output'
# column would produce NaN/inf values.
labels = np.array(tmp_df['Output'].to_list())
label_std = (labels - labels.min()) / (labels.max() - labels.min())

# De-duplicated entries of each component column. They pass through a set, so
# the order of these lists is arbitrary.
lig_smi_set, add_smi_set, base_smi_set, ar_ha_smi_set = (
    list(set(smiles))
    for smiles in (lig_smiles, add_smiles, base_smiles, ar_ha_smiles)
)

# Generate local steric and electronic information

In [3]:
# Read the comma-separated lookup table stored next to the SDF files. For every
# line the first field becomes the key and the second field the value
# (presumably SMILES -> structure name, going by the variable name -- confirm
# against the file).
with open(mol_dir+'smiles_file_dict.csv','r') as fr:
    lines = fr.readlines()

# Each line is stripped and split exactly once. Blank lines (for example a
# trailing empty line at the end of the file) are skipped; previously they
# raised IndexError when the second field was accessed. A non-blank line
# without a comma still raises IndexError so malformed input is not hidden.
smiles_name_dict = {
    fields[0]: fields[1]
    for fields in (tmp_line.strip().split(',') for tmp_line in lines)
    if fields != ['']
}

In [4]:
def sort_func(x):
    """Return the integer index at the end of an SDF file name.

    The file name is the text after the last '/', the extension is dropped at
    the first '.', and whatever follows the last '_' is parsed as an integer,
    e.g. './data1/data1_sdf_files/lig_12.sdf' -> 12.

    ``int`` is used rather than ``eval`` so that text taken from a file name is
    never executed as Python code, and so that zero-padded indices such as
    'lig_007.sdf' are accepted. Raises ValueError when the suffix is not an
    integer.
    """
    return int(x.split('/')[-1].split('.')[0].split('_')[-1])


# Collect the SDF files of every reaction component, ordered by their numeric
# index so that e.g. lig_10.sdf sorts after lig_2.sdf (a plain string sort
# would put it before).
lig_files = sorted(glob.glob('./data1/data1_sdf_files/lig_*.sdf'), key=sort_func)
add_files = sorted(glob.glob('./data1/data1_sdf_files/add_*.sdf'), key=sort_func)
base_files = sorted(glob.glob('./data1/data1_sdf_files/base_*.sdf'), key=sort_func)
ar_ha_files = sorted(glob.glob('./data1/data1_sdf_files/ar_ha_*.sdf'), key=sort_func)

def _spms_and_elec(sdf_files):
    """Run Calc_SPMS_Elec on one list of SDF files and return its three results.

    The calculator is built with align='Origin' and elec_acc=7, and
    ``calc_spms_elec`` is then called with the same file list and ``mol_dir``.
    """
    calculator = Calc_SPMS_Elec(sdf_files, mol_dir, align='Origin', elec_acc=7)
    return calculator.calc_spms_elec(sdf_files, mol_dir)


# One call per reaction component. Each call yields the spms descriptors, the
# electronic descriptors and a dictionary that the notebook later uses to turn
# a structure name into an index into those two descriptor collections.
lig_spms, lig_elec_desc, lig_id_name_dict = _spms_and_elec(lig_files)
add_spms, add_elec_desc, add_id_name_dict = _spms_and_elec(add_files)
base_spms, base_elec_desc, base_id_name_dict = _spms_and_elec(base_files)
ar_ha_spms, ar_ha_elec_desc, ar_ha_id_name_dict = _spms_and_elec(ar_ha_files)

# Pass every descriptor collection through the project's Scaler helper and
# overwrite the unscaled version. Each component is scaled on its own; what
# scaling is applied is defined by Scaler in script.SEMG.
# The *_spms collections are handled first, then the *_elec_desc ones, both in
# ligand / additive / base / aryl-halide order.
lig_spms, add_spms, base_spms, ar_ha_spms = [
    Scaler(raw_spms)
    for raw_spms in (lig_spms, add_spms, base_spms, ar_ha_spms)
]
lig_elec_desc, add_elec_desc, base_elec_desc, ar_ha_elec_desc = [
    Scaler(raw_elec)
    for raw_elec in (lig_elec_desc, add_elec_desc, base_elec_desc, ar_ha_elec_desc)
]

In [5]:
def _per_reaction(descriptors, name_to_index, smiles_column):
    """Stack one descriptor entry per reaction row into a single array.

    Every entry of ``smiles_column`` is translated to a structure name through
    ``smiles_name_dict``, then to an index through ``name_to_index``; that
    index selects the entry of ``descriptors`` for the row.
    """
    return np.array([
        descriptors[name_to_index[smiles_name_dict[smiles]]]
        for smiles in smiles_column
    ])


# Expand the per-molecule descriptors to one entry per reaction row.
total_lig_spms = _per_reaction(lig_spms, lig_id_name_dict, lig_smiles)
total_add_spms = _per_reaction(add_spms, add_id_name_dict, add_smiles)
total_base_spms = _per_reaction(base_spms, base_id_name_dict, base_smiles)
total_ar_ha_spms = _per_reaction(ar_ha_spms, ar_ha_id_name_dict, ar_ha_smiles)

total_lig_elec_desc = _per_reaction(lig_elec_desc, lig_id_name_dict, lig_smiles)
total_add_elec_desc = _per_reaction(add_elec_desc, add_id_name_dict, add_smiles)
total_base_elec_desc = _per_reaction(base_elec_desc, base_id_name_dict, base_smiles)
total_ar_ha_elec_desc = _per_reaction(ar_ha_elec_desc, ar_ha_id_name_dict, ar_ha_smiles)

# Move axis 1 of every 4-D spms array to the last position.
spms_axes = (0, 2, 3, 1)
lig_spms_trans = total_lig_spms.transpose(spms_axes)
add_spms_trans = total_add_spms.transpose(spms_axes)
base_spms_trans = total_base_spms.transpose(spms_axes)
ar_ha_spms_trans = total_ar_ha_spms.transpose(spms_axes)

# Same rearrangement for the 5-D electronic descriptor arrays.
elec_axes = (0, 2, 3, 4, 1)
lig_elec_trans = total_lig_elec_desc.transpose(elec_axes)
add_elec_trans = total_add_elec_desc.transpose(elec_axes)
base_elec_trans = total_base_elec_desc.transpose(elec_axes)
ar_ha_elec_trans = total_ar_ha_elec_desc.transpose(elec_axes)

# Join the four components of each reaction along the last axis (axis 3 of
# the 4-D spms arrays, axis 4 of the 5-D electronic arrays).
spms_parts = [lig_spms_trans, add_spms_trans, base_spms_trans, ar_ha_spms_trans]
elec_parts = [lig_elec_trans, add_elec_trans, base_elec_trans, ar_ha_elec_trans]
react_spms = np.concatenate(spms_parts, axis=3)
react_elec = np.concatenate(elec_parts, axis=4)

# Min-max rescaled targets; this repeats the computation already done in the
# data-loading cell and yields the same values.
label_std = (labels - labels.min()) / (labels.max() - labels.min())

In [6]:
# Write the assembled descriptor arrays and both label arrays to .npy files.
# The ./data1/run_data/ directory must already exist; np.save does not create
# missing directories.
for out_path, payload in (
    ('./data1/run_data/data1_SEMG_MIGNN_react_spms.npy', react_spms),
    ('./data1/run_data/data1_SEMG_MIGNN_react_elec.npy', react_elec),
    ('./data1/run_data/data1_SEMG_MIGNN_labels.npy', labels),
    ('./data1/run_data/data1_SEMG_MIGNN_label_std.npy', label_std),
):
    np.save(out_path, payload)