In [1]:
import os
import numpy as np
import pickle as pkl
import pandas as pd

## Load DrugBank processed data

In [2]:
# Read the processed DrugBank table and report its (rows, columns).
drugbank_df = pd.read_csv('D:/study_data/DrugTarget/DrugBank/Processed_drugbank_whole_info.csv')
print(drugbank_df.shape)

# Keep small-molecule entries only, reduced to id / groups / SMILES, then
# discard rows with a missing value in any of those columns as well as exact
# duplicate rows. (No known-action filtering happens at this stage.)
sm_drugbank_df = (
    drugbank_df
    .loc[drugbank_df['type'] == 'small molecule', ['drugbank_id', 'groups', 'smiles']]
    .dropna()
    .drop_duplicates()
)
print(sm_drugbank_df.shape)

sm_drugbank_df.head(3)

(15235, 10)
(11583, 3)


Unnamed: 0,drugbank_id,groups,smiles
5,DB00006,approved|investigational,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
6,DB00007,approved|investigational,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...
13,DB00014,approved,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...


In [3]:
# Read the drug-target interaction table, drop exact duplicate rows and
# keep only the records whose organism is 'Humans'.
target_df = (
    pd.read_csv('D:/study_data/DrugTarget/DrugBank/Processed_DTI_info.csv')
    .drop_duplicates()
    .loc[lambda frame: frame['organism'] == 'Humans']
)
print(target_df.shape)
target_df.head(3)

(24758, 6)


Unnamed: 0,drugbank_id,category,organism,known_action,actions,uniprot_id
0,DB00001,target,Humans,yes,inhibitor,P00734
1,DB00002,target,Humans,yes,binder,P00533
2,DB00002,target,Humans,unknown,binder,O75015


In [4]:
# Author's note: len(set(target_df.drugbank_id.tolist())) was 6596 when checked.

# Attach the groups / SMILES columns to every human drug-target record.
# The inner join keeps only drugs present in both tables.
merged_drugbank_df = target_df.merge(
    right=sm_drugbank_df,
    on='drugbank_id',
    how='inner',
)
merged_drugbank_df.shape

(22833, 8)

In [5]:
# Preview the first three merged drug-target rows.
merged_drugbank_df.iloc[:3]

Unnamed: 0,drugbank_id,category,organism,known_action,actions,uniprot_id,groups,smiles
0,DB00006,target,Humans,yes,inhibitor,P00734,approved|investigational,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,DB00006,enzyme,Humans,unknown,inhibitor,P05164,approved|investigational,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
2,DB00007,target,Humans,yes,agonist,P30968,approved|investigational,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...


In [6]:
# How many merged rows fall under each `groups` label combination.
merged_drugbank_df['groups'].value_counts()

approved|investigational                               6627
approved                                               5138
experimental                                           4704
investigational                                        1439
approved|vet_approved                                   891
approved|investigational|vet_approved                   713
approved|nutraceutical                                  511
approved|investigational|withdrawn                      336
approved|investigational|nutraceutical                  309
approved|withdrawn                                      294
approved|illicit                                        248
experimental|investigational                            248
approved|nutraceutical|vet_approved                     167
approved|experimental                                   152
investigational|nutraceutical                           132
approved|experimental|investigational                   126
experimental|illicit                    

In [7]:
# Among rows whose known_action is 'yes', count rows per interaction category.
merged_drugbank_df.loc[merged_drugbank_df['known_action'] == 'yes', 'category'].value_counts()

target    3248
enzyme      10
Name: category, dtype: int64

In [8]:
# Among rows whose known_action is 'yes', count rows per `groups` label combination.
merged_drugbank_df.loc[merged_drugbank_df['known_action'] == 'yes', 'groups'].value_counts()

approved                                               1053
approved|investigational                               1041
investigational                                         178
approved|vet_approved                                   177
experimental                                            158
approved|illicit                                         93
approved|investigational|vet_approved                    88
approved|investigational|withdrawn                       64
approved|withdrawn                                       61
experimental|illicit                                     61
withdrawn                                                35
approved|illicit|investigational                         26
approved|nutraceutical                                   23
approved|experimental                                    21
approved|investigational|nutraceutical                   20
investigational|withdrawn                                18
approved|illicit|withdrawn              

In [6]:
# Number of distinct DrugBank ids present after the merge.
len({drug_id for drug_id in merged_drugbank_df['drugbank_id'].tolist()})

5846

In [6]:
from utils import canonic_smiles
def my_canonic_smiles(smi):
    """Return the canonical form of ``smi``, or ``None`` if it cannot be produced.

    Thin wrapper around ``canonic_smiles`` (imported from the project's
    ``utils`` module) that converts an ordinary failure into ``None`` so the
    function can be applied to a whole column without aborting on one bad entry.

    Parameters
    ----------
    smi
        Value handed to ``canonic_smiles`` unchanged.

    Returns
    -------
    Whatever ``canonic_smiles`` returns, or ``None`` when it raises an
    ``Exception``.
    """
    try:
        return canonic_smiles(smi)
    except Exception:
        # Best-effort on purpose, but only for ordinary errors: a bare
        # ``except:`` would also swallow KeyboardInterrupt / SystemExit and
        # make a long column-wide run impossible to interrupt.
        return None

# Overwrite the SMILES column with its canonical form, one entry at a time;
# my_canonic_smiles yields None for entries it cannot convert.
merged_drugbank_df['smiles'] = [
    my_canonic_smiles(raw_smiles) for raw_smiles in merged_drugbank_df['smiles']
]

[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:18:49] Explicit valence for atom # 0 N, 4, is g

In [8]:
# Discard pairs lacking a target accession, a group label or a SMILES string,
# then write the cleaned drug-target pair table to disk without the index.
merged_drugbank_df = merged_drugbank_df.dropna(
    axis='index',
    how='any',
    subset=['uniprot_id', 'groups', 'smiles'],
)
merged_drugbank_df.to_csv(
    './data/Binding_data/DrugBank_small_molecules_pairs_data.csv',
    index=False,
)

In [8]:
# Shape of the SMILES column once missing entries are removed.
merged_drugbank_df.loc[:, 'smiles'].dropna().shape

(22760,)

### Load PS protein information

In [9]:
# Read the 'top5' PS list: the file is stripped of surrounding whitespace and
# split on newlines, giving one entry per line. The entries are later compared
# against uniprot_id values, so they are presumably UniProt accessions -- confirm.
with open('./data/PS_Pro/Homo_whole_PS_top5.txt', 'r') as file:
    ps_whole_list = [entry for entry in file.read().strip().split('\n')]
print(len(ps_whole_list))

1679
2860


In [10]:
from collections import defaultdict

# UniProt accessions that a manual UniProt lookup showed to be unreviewed;
# they are kept out of the non-PS "success" collections.
UNREVIEWED_UNIPROT_IDS = frozenset({'O43519', 'Q693P7', 'Q86V67', 'Q99870', 'Q9UE69'})


def _split_pairs_by_ps_target(pairs_df, ps_ids, excluded_ids=UNREVIEWED_UNIPROT_IDS):
    """Sort drug-target pairs into PS / non-PS SMILES collections.

    A row counts as a "success" when its ``groups`` string contains
    ``'approved'`` and its ``known_action`` equals ``'yes'``. A ``uniprot_id``
    cell may hold several accessions joined by ``'|'``; each accession is
    classified on its own.

    * accession in ``ps_ids`` and row is a success  -> partition collections
    * accession in ``ps_ids`` and row is no success -> unknown collections
    * accession not in ``ps_ids``, row is a success and the accession is not in
      ``excluded_ids``                              -> nps_success collections
    * anything else is ignored

    Parameters
    ----------
    pairs_df : pandas.DataFrame
        Must provide the columns ``uniprot_id``, ``known_action``, ``groups``
        and ``smiles``.
    ps_ids : iterable of str
        Accessions regarded as PS targets.
    excluded_ids : collection of str
        Accessions never recorded in the non-PS success collections.

    Returns
    -------
    tuple
        ``(partition_smis, unknown_smis, partition_target_smis,
        unknown_target_smis, nps_success_smis, nps_success_target_smis)``
        where the ``*_smis`` items are sets of SMILES and the ``*_target_smis``
        items are ``defaultdict(set)`` keyed by accession.
    """
    # Set membership is O(1); scanning the original list for every row is not.
    ps_lookup = set(ps_ids)

    partition_smis, unknown_smis = set(), set()
    partition_target_smis, unknown_target_smis = defaultdict(set), defaultdict(set)
    nps_success_smis = set()
    nps_success_target_smis = defaultdict(set)

    rows = zip(
        pairs_df['uniprot_id'],
        pairs_df['known_action'],
        pairs_df['groups'],
        pairs_df['smiles'],
    )
    for uni_ids, known_action, groups, smi in rows:
        # Skip rows without a target or without a usable SMILES (None or NaN).
        if pd.isna(uni_ids) or pd.isna(smi):
            continue

        # NOTE(review): substring test, so 'vet_approved' also matches
        # 'approved' -- confirm that this is intended.
        is_success = 'approved' in groups and known_action == 'yes'

        # A cell without '|' yields a single-element list, so single- and
        # multi-accession rows share one code path.
        for uni in uni_ids.split('|'):
            if uni in ps_lookup:
                if is_success:
                    partition_smis.add(smi)
                    partition_target_smis[uni].add(smi)
                else:
                    unknown_smis.add(smi)
                    unknown_target_smis[uni].add(smi)
            elif is_success and uni not in excluded_ids:
                # The exclusion is checked per accession. Testing the whole
                # '|'-joined cell instead would never match a multi-accession row.
                nps_success_smis.add(smi)
                nps_success_target_smis[uni].add(smi)

    return (
        partition_smis,
        unknown_smis,
        partition_target_smis,
        unknown_target_smis,
        nps_success_smis,
        nps_success_target_smis,
    )


(
    partition_smis,
    unknown_smis,
    partition_target_smis,
    unknown_target_smis,
    nps_success_smis,
    nps_success_target_smis,
) = _split_pairs_by_ps_target(merged_drugbank_df, ps_whole_list)

In [11]:
# One line per category: number of distinct SMILES, then number of distinct targets.
print(*map(len, (partition_smis, partition_target_smis)))
print(*map(len, (unknown_smis, unknown_target_smis)))
print(*map(len, (nps_success_smis, nps_success_target_smis)))

209 49
1179 261
1213 660


In [12]:
# Persist the PS-target results as one pickled 4-tuple:
# (partition SMILES, unknown SMILES, partition per-target map, unknown per-target map).
with open('./data/Binding_data/DrugBank_ps_smiles_top5.pkl', 'wb') as out_handle:
    pkl.dump(
        obj=(
            partition_smis,
            unknown_smis,
            partition_target_smis,
            unknown_target_smis,
        ),
        file=out_handle,
    )

In [13]:
# Persist the non-PS success results as one pickled 2-tuple:
# (SMILES set, per-target map).
with open('./data/Binding_data/DrugBank_nps_smiles_top5.pkl', 'wb') as out_handle:
    pkl.dump(
        (nps_success_smis, nps_success_target_smis),
        out_handle,
    )