In [1]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from tqdm import tqdm

## TTD Targets

In [2]:
target_raw_data = pd.read_table('D:/study_data/DrugTarget/TTD/P1-01-TTD_target_download.txt',
                                sep='\t', skiprows=39, header=None)
target_raw_data = target_raw_data.dropna(how='all')
target_raw_data.head()

Unnamed: 0,0,1,2,3,4
0,T47101,TARGETID,T47101,,
1,T47101,FORMERID,TTDC00024,,
2,T47101,UNIPROID,FGFR1_HUMAN,,
3,T47101,TARGNAME,Fibroblast growth factor receptor 1 (FGFR1),,
4,T47101,GENENAME,FGFR1,,


In [3]:
len(set(target_raw_data[0].tolist()))

4221

In [4]:
targets = sorted(list(set(target_raw_data[0].tolist())))
uniprot_ids, gene_names, pdb_ids, biochemical_classes, sequences, drug_infos = [], [], [], [], [], []
full_names, target_types = [], []
for target in tqdm(targets):
    abbs = ['UNIPROID', 'TARGNAME', 'GENENAME', 'TARGTYPE', 'PDBSTRUC', 'BIOCLASS', 'SEQUENCE']
    lists = [uniprot_ids, full_names, gene_names, target_types, pdb_ids, biochemical_classes, sequences]
    for abb, lst in zip(abbs, lists):
        tmp = target_raw_data[(target_raw_data[0] == target) & (target_raw_data[1] == abb)]
        try:
            value = tmp.iloc[0, 2]
            lst.append(value)
        except:
            lst.append(None)

    # drug info
    drug_df = target_raw_data[(target_raw_data[0] == target) & (target_raw_data[1] == 'DRUGINFO')]
    try:
        drug_infos.append(drug_df.iloc[:, 2:].values)
    except:
        drug_infos.append(None)

100%|██████████| 4221/4221 [05:43<00:00, 12.28it/s]


In [5]:
target_df = pd.DataFrame({
    'target_id': targets,
    'uniprot_id': uniprot_ids,
    'target_name': full_names,
    'gene_name': gene_names,
    'target_type': target_types,
    'PDB_id': pdb_ids,
    'biochemical_class': biochemical_classes,
    'sequence': sequences,
    'drug_info': drug_infos
})

target_df.head()

Unnamed: 0,target_id,uniprot_id,target_name,gene_name,target_type,PDB_id,biochemical_class,sequence,drug_info
0,T00032,OSTP_HUMAN,Osteopontin (SPP1),SPP1,Literature-reported target,3DSF; 3CXD,,MRIAVICFCLLGITCAIPVKQADSGSSEEKQLYNKYPDAVATWLNP...,[]
1,T00033,TGFA_HUMAN,Transforming growth factor alpha (TGFA),TGFA,Clinical trial target,5KN5; 4TGF; 3TGF; 3.00E+50; 2TGF,Growth factor,MVPSAGQLALFALGIVLAACQALENSTSPLSADPPVAAAVVSHFND...,"[[D08EIK, LY3016859, Phase 1/2]]"
2,T00037,ERG6_PNEC8,Fungal Sterol 24-C-methyltransferase (Fung erg6),Fung erg6,Literature-reported target,,Methyltransferase,MSFELERIDIEKDREFSEIMHGKDAAKERGLLSSFRKDKEAQKIAL...,"[[D0M7DG, 24-thiacycloartanol, Investigative]]"
3,T00039,CTGF_HUMAN,CTGF messenger RNA (CTGF mRNA),CTGF,Clinical trial target,,mRNA target,MTAASMGPVRVAFVVLLALCSRPAVGQNCSGPCRCPDEPAPRCPAG...,"[[D0R3LQ, EXC 001, Phase 2]]"
4,T00064,,microRNA hsa-miR-199a (MIR199a),,Literature-reported target,,Non-coding RNA target,,[]


In [6]:
target_df.to_csv('D:/study_data/DrugTarget/TTD/Processed_target_info.csv', index=False)

### Target ID mapping

In [2]:
target_mapping_data = pd.read_table('D:/study_data/DrugTarget/TTD/P2-01-TTD_uniprot_all.txt',
                                    sep='\t', skiprows=22, header=None)
target_mapping_data = target_mapping_data.dropna(how='all')
target_mapping_data.head(2)

Unnamed: 0,0,1,2
0,T00032,TARGETID,T00032
1,T00032,UNIPROID,OSTP_HUMAN


In [4]:
targets = sorted(list(set(target_mapping_data[0].tolist())))
print(len(targets))

abbs = ['UNIPROID', 'TARGNAME', 'TARGTYPE']
uniprot_ids, target_names, target_types = [], [], []
lsts = [uniprot_ids, target_names, target_types]
uniprot_ids_dict, target_names_dict, target_types_dict = {}, {}, {}
dcts = [uniprot_ids_dict, target_names_dict, target_types_dict]
for abb, dct in zip(abbs, dcts):
    mapping_df = target_mapping_data[target_mapping_data[1] == abb][[0, 2]]
    for target, value in mapping_df.values:
        dct[target] = value

for lst, dct in zip(lsts, dcts):
    lst += [dct[target] if target in dct else None for target in targets]

3606


In [5]:
target_mapping_df = pd.DataFrame({
    'target_id': targets,
    'uniprot_id': uniprot_ids,
    'target_name': target_names,
    'target_type': target_types
})

target_mapping_df.to_csv('D:/study_data/DrugTarget/TTD/Processed_target_mapping_info.csv', index=False)

In [6]:
target_mapping_df.head()

Unnamed: 0,target_id,uniprot_id,target_name,target_type
0,T00032,OSTP_HUMAN,Osteopontin (SPP1),Research target
1,T00033,TGFA_HUMAN,Transforming growth factor alpha (TGFA),Clinical Trial target
2,T00037,ERG6_PNEC8,Fungal Sterol 24-C-methyltransferase (Fung erg6),Research target
3,T00039,CTGF_HUMAN,CTGF messenger RNA (CTGF mRNA),Clinical Trial target
4,T00075,MRCKA_HUMAN,CDC42 binding protein kinase alpha (DMPK-like ...,Research target


## TTD Drugs

In [2]:
drug_raw_data = pd.read_table('D:/study_data/DrugTarget/TTD/P1-02-TTD_drug_download.txt',
                              sep='\t', skiprows=28, header=None)
drug_raw_data = drug_raw_data.dropna(how='all')
drug_raw_data.head()

Unnamed: 0,0,1,2
1,D00AAN,DRUG__ID,D00AAN
2,D00AAN,DRUGCLAS,Investigative Drug(s)
3,D00AAN,DRUGINCH,1S/C42H47ClN4O5S/c43-28-12-16-31(17-13-28)53(5...
4,D00AAN,DRUGINKE,MSUMHGMGRZWLMN-WXPZYUJUSA-N
5,D00AAN,DRUGSMIL,C1CCN2CCC3C(=CC(CCC=CC1)(C4C3(C2)CC5N4CCCC(=O)...


In [3]:
len(set(drug_raw_data[0].tolist()))

41818

In [10]:
drugs = sorted(list(set(drug_raw_data[0].tolist())))
therap_classes, drug_types, drug_inchis, drug_inchikeys, drug_smiles, status, drug_classes, comp_classes = \
    [], [], [], [], [], [], [], []
abbs = ['THERCLAS', 'DRUGTYPE', 'DRUGINCH', 'DRUGINKE', 'DRUGSMIL', 'HIGHSTAT', 'DRUGCLAS', 'COMPCLAS']
lsts = [therap_classes, drug_types, drug_inchis, drug_inchikeys, drug_smiles, status, drug_classes, comp_classes]
# This way is too time-consuming!!!
# for drug in tqdm(drugs):
#     for abb, lst in zip(abbs, lsts):
#         tmp = drug_raw_data[(drug_raw_data[0] == drug) & (drug_raw_data[1] == abb)]
#         try:
#             value = tmp.iloc[0, 2]
#             lst.append(value)
#         except:
#             lst.append(None)
therap_classes_dict, drug_types_dict, drug_inchis_dict, drug_inchikeys_dict, drug_smiles_dict, status_dict, drug_classes_dict, comp_classes_dict = \
    {}, {}, {}, {}, {}, {}, {}, {}
dcts = [therap_classes_dict, drug_types_dict, drug_inchis_dict, drug_inchikeys_dict, drug_smiles_dict, status_dict, drug_classes_dict, comp_classes_dict]
for abb, dct in zip(abbs, dcts):
    abb_drug_raw_df = drug_raw_data[drug_raw_data[1] == abb][[0, 2]]
    for drug, value in abb_drug_raw_df.values:
        dct[drug] = value

for lst, dct in zip(lsts,dcts):
    lst += [dct[drug] if drug in dct else None for drug in drugs]

In [11]:
drug_df = pd.DataFrame({
    'drug_id': drugs,
    'therap_class': therap_classes,
    'drug_type': drug_types,
    'InchI': drug_inchis,
    'InchIKey': drug_inchikeys,
    'smiles': drug_smiles,
    'status': status,
    'drug_class': drug_classes,
    'compound_class': comp_classes
})

drug_df.to_csv('D:/study_data/DrugTarget/TTD/Processed_drug_info.csv', index=False)

In [12]:
drug_df.head()

Unnamed: 0,drug_id,therap_class,drug_type,InchI,InchIKey,smiles,status,drug_class,compound_class
0,D00AAN,,Small molecular drug,1S/C42H47ClN4O5S/c43-28-12-16-31(17-13-28)53(5...,MSUMHGMGRZWLMN-WXPZYUJUSA-N,C1CCN2CCC3C(=CC(CCC=CC1)(C4C3(C2)CC5N4CCCC(=O)...,Investigative,Investigative Drug(s),
1,D00AAU,,Small molecular drug,1S/C18H22O2/c1-3-17(13-7-5-9-15(19)11-13)18(4-...,KUJAWCSIKNKXLL-UHFFFAOYSA-N,CCC(C1=CC(=CC=C1)O)C(CC)C2=CC(=CC=C2)O,Investigative,Investigative Drug(s),
2,D00ABE,,,,,,Phase 2,Clinical Trial Drug(s),
3,D00ABO,,Small molecular drug,1S/C20H20N4O/c25-20(24-13-11-21-12-14-24)16-8-...,YYLKKYCXAOBSRM-JXMROGBWSA-N,C1CN(CCN1)C(=O)C2=CC=C(C=C2)C=CC3=NNC4=CC=CC=C43,Phase 1,Clinical Trial Drug(s),
4,D00ABW,,,,,,Discontinued in Phase 1,Discontinued Drug(s),


### Drug ID mappings

In [13]:
mapping_raw_data = pd.read_table('D:/study_data/DrugTarget/TTD/P1-03-TTD_crossmatching.txt',
                                 sep='\t', skiprows=27, header=None)
mapping_raw_data = mapping_raw_data.dropna(how='all')
mapping_raw_data.head(3)

Unnamed: 0,0,1,2
0,D00AAN,TTDDRUID,D00AAN
1,D00AAN,DRUGNAME,8-O-(4-chlorobenzenesulfonyl)manzamine F
2,D00AAN,D_FOMULA,C42H47ClN4O5S


In [15]:
mapping_raw_data.shape, len(set(mapping_raw_data[0].tolist()))

((129222, 3), 25258)

In [17]:
m_drugs = sorted(list(set(mapping_raw_data[0].tolist())))
drug_names, drug_cas_numbers, drug_formulars, drug_cids, drug_sids, drug_chebi_ids = [], [], [], [], [], []
lsts = [drug_names, drug_cas_numbers, drug_formulars, drug_cids, drug_sids, drug_chebi_ids]
drug_names_dict, drug_cas_numbers_dict, drug_formulars_dict, drug_cids_dict, drug_sids_dict, drug_chebi_ids_dict = \
    {}, {}, {}, {}, {}, {}
abbs = ['DRUGNAME', 'CASNUMBE', 'D_FOMULA', 'PUBCHCID', 'PUBCHSID', 'CHEBI_ID']
dcts = [drug_names_dict, drug_cas_numbers_dict, drug_formulars_dict, drug_cids_dict, drug_sids_dict, drug_chebi_ids_dict]
for abb, dct in zip(abbs, dcts):
    abb_df = mapping_raw_data[(mapping_raw_data[1] == abb)][[0, 2]]
    for drug, value in abb_df.values:
        dct[drug] = value

for lst, dct in zip(lsts, dcts):
    lst += [dct[drug] if drug in dct else None for drug in m_drugs]

In [18]:
mapping_df = pd.DataFrame({
    'drug_id': m_drugs,
    'drug_name': drug_names,
    'CAS_number': drug_cas_numbers,
    'formular': drug_formulars,
    'pubchem_cid': drug_cids,
    'pubchem_sid': drug_sids,
    'ChEBI_id': drug_chebi_ids
})
mapping_df.to_csv('D:/study_data/DrugTarget/TTD/Processed_drug_mapping_info.csv', index=False)
mapping_df.head()

Unnamed: 0,drug_id,drug_name,CAS_number,formular,pubchem_cid,pubchem_sid,ChEBI_id
0,D00AAN,8-O-(4-chlorobenzenesulfonyl)manzamine F,,C42H47ClN4O5S,23643731,,
1,D00AAU,3-[1-ethyl-2-(3-hydroxyphenyl)butyl]phenol,CAS 68266-24-0,C18H22O2,100424,,
2,D00ABO,KW-2449,CAS 841258-76-2,C20H20N4O,11427553,16524833; 23572482; 42506629; 78872837; 104253...,
3,D00ACC,ND1251,CAS 280783-56-4,C26H31N3O,9844019,14805894; 15676330; 24141633; 44942090; 496842...,
4,D00ACL,PMID28092474-Compound-33d,,C16H13FN6O2,117703486,,


## Drug & Target

In [7]:
activity_df = pd.read_table('D:/study_data/DrugTarget/TTD/P1-09-Target_compound_activity.txt', sep='\t', header=0)
activity_df.head(2)

Unnamed: 0,TTD Target ID,TTD Drug/Compound ID,Pubchem CID,Activity
0,T34562,CANH78,17280,IC50 = 3270000 nM
1,T34562,C27IRS,197934,IC50 = 1280000 nM


In [10]:
activity_df['Affinity'] = activity_df['Activity'].apply(lambda item: float(item.split(' ')[2]))

In [14]:
# keep affinity <= 1000 nM
keep_activity_df = activity_df[activity_df['Affinity'] <= 1000.]
len(keep_activity_df)

468872

In [15]:
keep_activity_df.iloc[:, :3].to_csv('D:/study_data/DrugTarget/TTD/Drug_target_pairs_1000.csv', index=False)

### Clinical data

In [16]:
clinical_data = pd.read_excel('D:/study_data/DrugTarget/TTD/P1-07-Drug-TargetMapping.xlsx')
clinical_data.head(2)

Unnamed: 0,TargetID,DrugID,Highest_status,MOA
0,T71390,D07OAC,Investigative,Inhibitor
1,T70309,D07OAC,Investigative,Inhibitor


In [19]:
clinical_data.shape

(44663, 4)

In [20]:
clinical_data.value_counts(subset=['Highest_status'])

Highest_status                 
Investigative                      24859
Patented                            6017
Phase 2                             2624
Approved                            2617
Phase 1                             2497
Terminated                          1449
Phase 3                             1088
Discontinued in Phase 2              922
Phase 1/2                            734
Discontinued in Phase 1              576
Preclinical                          520
Discontinued in Phase 3              218
Phase 2/3                            128
Clinical trial                        70
Withdrawn from market                 56
Phase 4                               54
Phase 3 Trial                         48
Discontinued in Phase 1/2             33
Phase 2 Trial                         27
Discontinued in Preregistration       26
Phase 2/3 Trial                       12
Preregistration                       12
Phase 2a                              11
Phase 1 Trial            