## **Import**

In [None]:
import re
import time
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
from google.colab import files

## **Drug-Target Information**

### **Load Files**

Load the information from Drug Central and Monarch: 

In [None]:
# Drug-target interaction table; its first column becomes the row index.
# NOTE(review): the file has a .tsv extension but is read with the default comma
# separator - confirm the delimiter (pass sep='\t' if it is tab-separated).
df = pd.read_csv(
    'drug.target.interaction.tsv',
    header=0,
    index_col=0,
)
# Monarch node table. An earlier run used 'graph_nodes_v2022-01-11.csv' instead.
nodes = pd.read_csv(
    'monarch_nodes_v2022-05-19.csv',
    header=0,
)

### **Uniprot to Monarch IDs**

Use the UniProt ID-mapping API to convert each UniProt accession into the organism-specific target ID used by Monarch:

In [None]:
url = 'https://www.uniprot.org/uploadlists/'
# NOTE(review): this is UniProt's legacy "uploadlists" mapping endpoint. Confirm it
# is still served before starting a long run; the current ID-mapping service uses
# a different request flow.

# Organism -> (value sent as the `to` database, prefix prepended to the returned ID).
# Any organism not listed here is mapped to Ensembl.
ORGANISM_TARGETS = {
    'Homo sapiens': ('HGNC_ID', ''),
    'Rattus norvegicus': ('RGD_ID', 'RGD:'),
    'Mus musculus': ('MGI_ID', ''),
    'Drosophila melanogaster': ('FLYBASE_ID', 'FlyBase:'),
    'Caenorhabditis elegans': ('WORMBASE_ID', 'WormBase:'),
    'Danio rerio': ('ZFIN_ID', 'ZFIN:'),
    'Saccharomyces cerevisiae': ('SGD_ID', 'SGD:'),
}
DEFAULT_TARGET = ('ENSEMBL_ID', 'ENSEMBL:')

MAX_ATTEMPTS = 5              # tries per accession before giving up
RETRY_DELAY_SECONDS = 2       # multiplied by the attempt number between tries
REQUEST_TIMEOUT_SECONDS = 30  # a stalled connection must not hang the notebook


def map_uniprot_accession(accession, organism, max_attempts=MAX_ATTEMPTS):
    """Map one UniProt accession to the organism-specific target ID.

    Parameters
    ----------
    accession : str
        Value of the ``ACCESSION`` column. Only the part before the first ``|``
        is sent to the service.
    organism : str
        Value of the ``ORGANISM`` column; selects the target database and prefix.
    max_attempts : int, optional
        How many times the HTTP request is tried before giving up.

    Returns
    -------
    str
        Prefix plus mapped ID, or ``'NA'`` when the accession is not a string or
        the response does not contain a mapping.

    Raises
    ------
    RuntimeError
        If every attempt to reach the service fails. The previous version
        retried forever on any error, including permanent ones.
    """
    if not isinstance(accession, str):
        return 'NA'

    to, pre = ORGANISM_TARGETS.get(organism, DEFAULT_TARGET)
    params = {
        'from': 'ACC+ID',
        'to': to,
        'format': 'tab',
        'query': accession.split('|')[0],
    }
    data = urllib.parse.urlencode(params).encode('utf-8')
    req = urllib.request.Request(url, data)

    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT_SECONDS) as f:
                response = f.read()
            break
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
            last_error = err
            if attempt < max_attempts:
                time.sleep(RETRY_DELAY_SECONDS * attempt)
    else:
        raise RuntimeError(
            'UniProt request for %r failed after %d attempts' % (params['query'], max_attempts)
        ) from last_error

    try:
        fields = response.decode('utf-8').split()
    except UnicodeDecodeError:
        return 'NA'
    # The fourth whitespace-separated token is taken as the mapped ID.
    # Presumably tokens 0-1 are the header row and 2-3 the first From/To pair -
    # confirm against a live response.
    if len(fields) < 4:
        return 'NA'
    return pre + fields[3]


# Many rows share the same target, so each (accession, organism) pair is only
# requested once.
mapped_cache = {}
new_ids = []
for i, (accession, organism) in enumerate(zip(df['ACCESSION'], df['ORGANISM'])):
    print('Going through row', i)
    key = (accession, organism)
    if key not in mapped_cache:
        mapped_cache[key] = map_uniprot_accession(accession, organism)
    print(mapped_cache[key])
    new_ids.append(mapped_cache[key])

# Create the column in one step: no chained assignment, and no dependency on a
# pre-existing NEW_ID column or on the index being 0..n-1.
df['NEW_ID'] = new_ids

Drop rows that don't have a new ID: 

In [None]:
# Keep only rows whose accession was mapped. A boolean mask is used instead of
# dropping by index label, because dropping by label would also remove every
# other row that happens to share a label with an unmapped row.
df = df[df['NEW_ID'] != 'NA']

Keep only rows whose target is in the Monarch nodes:

In [None]:
# Restrict the interactions to targets whose new ID is a Monarch node ID.
target_in_monarch = df['NEW_ID'].isin(nodes['id'].tolist())
df = df[target_in_monarch]

Save the Drug-Target dataframe as .csv: 

In [None]:
# Write the filtered drug-target table to disk (UTF-8 with BOM).
df.to_csv(
    'drug.target.final.csv',
    encoding='utf-8-sig',
)

Create and save a dataframe containing the drug nodes: 

In [None]:
# One row per distinct (drug name, structure ID) pair, saved as the drug node list.
df_drugs = df[['DRUG_NAME', 'STRUCT_ID']].drop_duplicates()
df_drugs.to_csv(
    'drug_nodes_v2.csv',
    encoding='utf-8-sig',
)

## **Drug-Disease Information**

### **Text to CSV**

Create a Dataframe that will contain the Drug-Disease information:

In [None]:
# Column layout of the drug-disease table that the TTD parser fills in.
column_names = [
    "DRUG_ID",
    "DRUG_NAME",
    "DISEASES",
    'PHASE',
]

# Start from an empty frame that has just these columns.
df = pd.DataFrame(columns=column_names)

Read the .txt file downloaded from TTD and convert it into a pandas dataframe:

In [None]:
# Load the whole TTD export into memory, one list element per line
# (line endings are kept).
with open('drug_to_disease.txt') as f:
  lines = list(f)

In [None]:
def _require_match(match, line_no, line):
  """Return `match`, or raise a readable error if the regex did not match."""
  if match is None:
    raise ValueError('Unexpected TTD line %d: %r' % (line_no, line))
  return match


def parse_ttd_indications(lines):
  """Convert the raw TTD lines into one record per drug indication.

  The file is read as blocks of ``KEY<TAB>value`` lines separated by blank lines:
  ``TTDDRUID`` sets the current drug ID, ``DRUGNAME`` the current drug name, and
  every ``INDICATI`` line emits one record using the current ID and name.

  Parameters
  ----------
  lines : list of str
      Lines of the TTD text file, as returned by ``readlines()``.

  Returns
  -------
  list of list
      ``[drug_id, drug_name, disease, phase]`` for each ``INDICATI`` line, where
      ``disease`` is the text between the first tab and the last ``[`` and
      ``phase`` is everything after the first ``]``.

  Raises
  ------
  ValueError
      If a recognised line does not have the expected shape.
  """
  records = []
  drug_id = ''
  drug_name = ''
  for line_no, line in enumerate(lines, start=1):
    if line.startswith('DRUGNAME'):
      # `.` never matches the newline, so this captures up to the end of the
      # line and also works when the last line has no trailing newline.
      drug_name = _require_match(re.search(r'\t(.*)', line), line_no, line).group(1)
    elif line.startswith('TTDDRUID'):
      drug_id = _require_match(re.search(r'\t(.*)', line), line_no, line).group(1)
    elif line.startswith('INDICATI'):
      disease_match = _require_match(re.search(r'\t(.*)\[', line), line_no, line)
      phase_match = _require_match(re.search(r'\](.*)', line), line_no, line)
      records.append([drug_id, drug_name, disease_match.group(1), phase_match.group(1)])
    elif line.startswith('\n'):
      # A blank line ends the current drug block.
      drug_id = ''
      drug_name = ''
  return records


# Build the frame in one go; appending with df.loc[i] re-allocates on every row.
df = pd.DataFrame(parse_ttd_indications(lines), columns=column_names)


In [None]:
# Preview the parsed drug-disease table.
df

Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE
0,D00ABE,ALD-301,Ischemia,Phase 2
1,D00ABE,ALD-301,Peripheral arterial disease,Phase 2
2,D00ABO,KW-2449,Acute myeloid leukaemia,Phase 1
3,D00ABW,Opterone,Hormone deficiency,Discontinued in Phase 1
4,D00ACC,ND1251,Depression,Discontinued in Phase 1
...,...,...,...,...
28973,DZTX12,ASC-J9,End-stage renal disease,Phase 2
28974,DZU72C,OKI 179,Solid tumour/cancer,Phase 1
28975,DZVF15,TRN-110,Attention deficit hyperactivity disorder,Clinical Trial
28976,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2


In [None]:
# Save the parsed drug-disease table (the row index is written as the first column).
df.to_csv(
    'drug_to_disease.csv',
    encoding='utf-8-sig',
)

### **Load CSVs**

Load the drug nodes (obtained from Drug Central) and the Monarch nodes: 

In [None]:
# Drug nodes written earlier in this notebook.
graph_drugs = pd.read_csv(
    'drug_nodes_v2.csv',
    header=0,
)
# Monarch node table.
nodes = pd.read_csv(
    'monarch_nodes_v2022-05-19.csv',
    header=0,
)

Get list of unique drugs and diseases:

In [None]:
# Disease nodes are the Monarch nodes whose semantic group is 'DISO'.
graph_diseases = nodes[nodes['semantic_groups'] == 'DISO']
# Node IDs stay as a NumPy array; later cells test membership against it.
unique_dis_id = graph_diseases['id'].unique()
# Lower-cased disease names for case-insensitive comparison.
unique_diseases = [name.lower() for name in graph_diseases['name'].unique()]
# Show the count, as the drug cell does; the recorded output needs this line.
len(unique_diseases)

25636

In [None]:
# Lower-cased distinct drug names from the drug node list.
unique_drugs = [name.lower() for name in graph_drugs['DRUG_NAME'].unique()]
len(unique_drugs)

1556

At this point, use the SORTA tool (https://sorta.molgeniscloud.org/menu/main/sorta?) to match the TTD disease names to Human Phenotype Ontology (HPO) terms. The resulting file (`matched.csv`) is also available in the project's GitHub repository:

In [None]:
# SORTA export: semicolon-separated, first row holds the column names.
matched = pd.read_csv(
    'matched.csv',
    sep=';',
    header=0,
)

In [None]:
# Preview the SORTA matches.
matched

Unnamed: 0,Name,ontologyTermName,ontologyTermIRI,score,validated,review
0,respiratory failure,Respiratory failure,http://purl.obolibrary.org/obo/HP_0002878,100.00,False,False
1,sexual dysfunction,Male sexual dysfunction,http://purl.obolibrary.org/obo/HP_0040307,86.49,False,False
2,achondroplasia,Bronchodysplasia,http://purl.obolibrary.org/obo/HP_0006533,62.50,False,False
3,glabellar frown line,Prominent glabella,http://purl.obolibrary.org/obo/HP_0002057,59.46,False,False
4,testicular germ cell tumour,Testicular neoplasm,http://purl.obolibrary.org/obo/HP_0010788,71.11,False,False
...,...,...,...,...,...,...
1796,systemic mastocytosis,Mastocytosis,http://purl.obolibrary.org/obo/HP_0100495,77.42,False,False
1797,acute iron or aluminum toxicity,Abnormal total iron binding capacity,http://purl.obolibrary.org/obo/HP_0033212,48.15,False,False
1798,chronic inflammatory demyelinating polyneuropathy,Acute demyelinating polyneuropathy,http://purl.obolibrary.org/obo/HP_0007131,64.86,False,False
1799,aortic aneurysm,Aortic aneurysm,http://purl.obolibrary.org/obo/HP_0004942,100.00,False,False


Select only those IDs with a score greater than 80:

In [None]:
# Minimum matching score (exclusive) for a match to be kept.
MIN_MATCH_SCORE = 80

# `.copy()` makes the filtered table independent of the original, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
matched = matched[matched['score'] > MIN_MATCH_SCORE].copy()

Create the final ID:

In [None]:
# The ontology term ID is the part of the IRI that follows '/obo/'.
iri_tail = matched['ontologyTermIRI'].str.split('/obo/').str.get(1)
matched['ID'] = iri_tail

In [None]:
# Replace every run of non-alphanumeric characters with ':' so that IDs such as
# 'HP_0002878' become 'HP:0002878'. Done as one vectorised call; the earlier
# loop also overwrote the builtin `id` and the `new_id` variable.
matched['ID'] = matched['ID'].str.replace("[^0-9a-zA-Z]+", ":", regex=True)


In [None]:
# Count the TTD rows whose lower-cased drug name is in `unique_drugs` and whose
# lower-cased disease name is in `unique_diseases`.
i = sum(
  1
  for _, record in df.iterrows()
  if record['DRUG_NAME'].lower() in unique_drugs
  and record['DISEASES'].lower() in unique_diseases
)

0


### **Merging**

In [None]:
# Normalised disease names: runs of non-alphanumeric characters collapse to a
# single space, then everything is lower-cased.
modified = [
  re.sub("[^0-9a-zA-Z]+", " ", disease).lower()
  for disease in df['DISEASES']
]

In [None]:
# Store the normalised disease names in 'Name', the column later used as the merge key.
df['Name'] = modified

In [None]:
# Trim surrounding whitespace from the merge key on both tables.
for frame in (df, matched):
  frame['Name'] = frame['Name'].str.strip()

In [None]:
# Left-join the HPO match onto every drug-disease row. Fully identical rows in
# `matched` are removed first: each repeat would otherwise duplicate the
# corresponding drug-disease row in the result. Different matches that share
# the same name are all kept.
final = pd.merge(df, matched.drop_duplicates(), on='Name', how='left')

In [None]:
# Preview the merged drug-disease-HPO table.
final

Unnamed: 0.1,Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,score,validated,review,ID
0,0,D00ABE,ALD-301,Ischemia,Phase 2,ischemia,Cerebral ischemia,http://purl.obolibrary.org/obo/HP_0002637,75.0,False,False,HP:0002637
1,1,D00ABE,ALD-301,Peripheral arterial disease,Phase 2,peripheral arterial disease,Peripheral arterial stenosis,http://purl.obolibrary.org/obo/HP_0004950,100.0,False,False,HP:0004950
2,2,D00ABO,KW-2449,Acute myeloid leukaemia,Phase 1,acute myeloid leukaemia,Acute myeloid leukemia,http://purl.obolibrary.org/obo/HP_0004808,100.0,False,False,HP:0004808
3,3,D00ABW,Opterone,Hormone deficiency,Discontinued in Phase 1,hormone deficiency,Decreased response to growth hormone stimuatio...,http://purl.obolibrary.org/obo/HP_0000824,80.0,False,False,HP:0000824
4,4,D00ACC,ND1251,Depression,Discontinued in Phase 1,depression,Depressivity,http://purl.obolibrary.org/obo/HP_0000716,100.0,False,False,HP:0000716
...,...,...,...,...,...,...,...,...,...,...,...,...
29562,28974,DZU72C,OKI 179,Solid tumour/cancer,Phase 1,solid tumour cancer,Benign gastrointestinal tract tumors,http://purl.obolibrary.org/obo/HP_0006719,60.0,False,False,HP:0006719
29563,28975,DZVF15,TRN-110,Attention deficit hyperactivity disorder,Clinical Trial,attention deficit hyperactivity disorder,Attention deficit hyperactivity disorder,http://purl.obolibrary.org/obo/HP_0007018,100.0,False,False,HP:0007018
29564,28976,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358
29565,28976,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358


In [None]:
# Save the merged table (the row index is written as the first column).
final.to_csv(
    'drug_to_disease_HP.csv',
    encoding='utf-8-sig',
)

### **Comparison**

In [None]:
# Reload the merged table. The saved row index comes back as an ordinary
# 'Unnamed: 0' column because no index column is specified.
final = pd.read_csv(
    'drug_to_disease_HP.csv',
    header=0,
)

In [None]:
# Keep only the rows whose match score is exactly 100.
is_perfect_match = final['score'].eq(100)
final_filtered = final.loc[is_perfect_match]

In [None]:
# Number of drug-disease rows with a score of 100.
len(final_filtered)

12151

In [None]:
# Preview the rows with a score of 100.
final_filtered

Unnamed: 0.1,Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,score,validated,review,ID
1,1,D00ABE,ALD-301,Peripheral arterial disease,Phase 2,peripheral arterial disease,Peripheral arterial stenosis,http://purl.obolibrary.org/obo/HP_0004950,100.0,False,False,HP:0004950
2,2,D00ABO,KW-2449,Acute myeloid leukaemia,Phase 1,acute myeloid leukaemia,Acute myeloid leukemia,http://purl.obolibrary.org/obo/HP_0004808,100.0,False,False,HP:0004808
4,4,D00ACC,ND1251,Depression,Discontinued in Phase 1,depression,Depressivity,http://purl.obolibrary.org/obo/HP_0000716,100.0,False,False,HP:0000716
11,11,D00AHT,PRAME antigen-specific cancer immunotherapeutic,Non-small-cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358
12,11,D00AHT,PRAME antigen-specific cancer immunotherapeutic,Non-small-cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358
...,...,...,...,...,...,...,...,...,...,...,...,...
29556,28968,DZNH43,MT-3724,B-cell lymphoma,Phase 1,b cell lymphoma,B-cell lymphoma,http://purl.obolibrary.org/obo/HP_0012191,100.0,False,False,HP:0012191
29561,28973,DZTX12,ASC-J9,End-stage renal disease,Phase 2,end stage renal disease,Stage 5 chronic kidney disease,http://purl.obolibrary.org/obo/HP_0003774,100.0,False,False,HP:0003774
29563,28975,DZVF15,TRN-110,Attention deficit hyperactivity disorder,Clinical Trial,attention deficit hyperactivity disorder,Attention deficit hyperactivity disorder,http://purl.obolibrary.org/obo/HP_0007018,100.0,False,False,HP:0007018
29564,28976,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358


In [None]:
# NOTE(review): this replaces `df` with an empty frame, and nothing further down
# in the visible notebook reads `df` again - it looks like a leftover copy of the
# earlier set-up cell. Confirm before removing.
column_names = [
    "DRUG_ID",
    "DRUG_NAME",
    "DISEASES",
    'PHASE',
]

df = pd.DataFrame(columns=column_names)

In [None]:
# Display the lower-cased drug names (this prints the entire list).
unique_drugs

['levobupivacaine',
 '(s)-nicardipine',
 '(s)-nitrendipine',
 'levdobutamine',
 'aminopterin',
 'phenylbutanoic acid',
 'azacitidine',
 'methoxsalen',
 'acamprosate',
 'acarbose',
 'acebutolol',
 'aceclofenac',
 'paracetamol',
 'acetohexamide',
 'acetophenazine',
 'racecadotril',
 'acetylcholine',
 'acetylcysteine',
 'acetyldigitoxin',
 'acepromazine',
 'acyclovir',
 'adapalene',
 'adefovir dipivoxil',
 'adenine',
 'adenosine',
 'adenosine triphosphate',
 'adenosine phosphate',
 'adiphenine',
 'ajmaline',
 'alacepril',
 'salbutamol',
 'alcuronium',
 'alfentanil',
 'aliskiren',
 'alizapride',
 'allopurinol',
 'alosetron',
 'alfacalcidol',
 'alfaxalone',
 'alprazolam',
 'alprenolol',
 'alprostadil',
 'alvimopan',
 'amantadine',
 'ambroxol',
 'amcinonide',
 'amifostine',
 'amiloride',
 'amineptine',
 'aminoglutethimide',
 'aminoquinuride',
 'amiodarone',
 'amisulpride',
 'amitriptyline',
 'amitriptylinoxide',
 'amlexanox',
 'amlodipine',
 'amodiaquine',
 'amoxapine',
 'amperozide',
 'amfe

In [None]:
# Keep only pairs whose drug name (lower-cased) is in `unique_drugs` and whose
# term ID is in `unique_dis_id`. Two vectorised masks replace the row-by-row
# `drop`, which rebuilt the frame once per removed row.
drug_in_graph = final_filtered['DRUG_NAME'].str.lower().isin(unique_drugs)
disease_in_graph = final_filtered['ID'].isin(unique_dis_id)
# `.copy()` so that later cells can modify the result without pandas warnings.
final_filtered = final_filtered[drug_in_graph & disease_in_graph].copy()
len(final_filtered)

599

In [None]:
# Replace the TTD drug ID with the STRUCT_ID of the matching drug node.
#
# Both sides are lower-cased before matching. The earlier loop lower-cased only
# the TTD name and compared it with the unmodified `graph_drugs` names, so a
# graph name containing an upper-case letter made `.values[0]` raise IndexError
# part-way through and left a mix of TTD IDs and STRUCT_IDs in DRUG_ID.
struct_id_by_name = (
    graph_drugs
    .assign(DRUG_NAME=graph_drugs['DRUG_NAME'].str.lower())
    .drop_duplicates(subset='DRUG_NAME')  # first match wins, like `.values[0]` did
    .set_index('DRUG_NAME')['STRUCT_ID']
)
struct_ids = final_filtered['DRUG_NAME'].str.lower().map(struct_id_by_name)

# Fail loudly instead of silently writing missing IDs.
unmatched = struct_ids.isna()
if unmatched.any():
    raise ValueError(
        'No STRUCT_ID found for: %s'
        % sorted(final_filtered.loc[unmatched, 'DRUG_NAME'].astype(str).unique())
    )

# `assign` returns a new frame, so no slice is modified in place.
final_filtered = final_filtered.assign(DRUG_ID=struct_ids)
len(final_filtered)

In [None]:
# Preview the table after the DRUG_ID replacement.
final_filtered

Unnamed: 0.1,Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,score,validated,review,ID
58,54,622,Chlorpropamide,Non-insulin dependent diabetes,Approved,non insulin dependent diabetes,Type II diabetes mellitus,http://purl.obolibrary.org/obo/HP_0005978,100.0,False,False,HP:0005978
154,148,4135,Lisdexamfetamine,Attention deficit hyperactivity disorder,Approved,attention deficit hyperactivity disorder,Attention deficit hyperactivity disorder,http://purl.obolibrary.org/obo/HP_0007018,100.0,False,False,HP:0007018
220,213,1275,Gallopamil,Asthma,Phase 2,asthma,Asthma,http://purl.obolibrary.org/obo/HP_0002099,100.0,False,False,HP:0002099
221,214,1275,Gallopamil,Hypertension,Discontinued in Phase 1,hypertension,Hypertension,http://purl.obolibrary.org/obo/HP_0000822,100.0,False,False,HP:0000822
355,341,1827,Moexipril,Hypertension,Approved,hypertension,Hypertension,http://purl.obolibrary.org/obo/HP_0000822,100.0,False,False,HP:0000822
...,...,...,...,...,...,...,...,...,...,...,...,...
27189,26704,D0Z6UC,Sumatriptan,Migraine,Approved,migraine,Migraine,http://purl.obolibrary.org/obo/HP_0002076,100.0,False,False,HP:0002076
27209,26724,D0Z7KE,Isradipine,Hypertension,Approved,hypertension,Hypertension,http://purl.obolibrary.org/obo/HP_0000822,100.0,False,False,HP:0000822
27445,26953,D0ZS8P,Clomipramine,Depression,Approved,depression,Depressivity,http://purl.obolibrary.org/obo/HP_0000716,100.0,False,False,HP:0000716
27491,26996,D0ZX1P,FADROZOLE,Breast cancer,Approved,breast cancer,Breast carcinoma,http://purl.obolibrary.org/obo/HP_0003002,100.0,False,False,HP:0003002


In [None]:
# Remove the reloaded index column and the score, which is 100 for every remaining row.
final_filtered = final_filtered.drop(columns=['Unnamed: 0', 'score'])

In [None]:
# Preview the table that is about to be saved.
final_filtered

Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,validated,review,ID
58,622,Chlorpropamide,Non-insulin dependent diabetes,Approved,non insulin dependent diabetes,Type II diabetes mellitus,http://purl.obolibrary.org/obo/HP_0005978,False,False,HP:0005978
154,4135,Lisdexamfetamine,Attention deficit hyperactivity disorder,Approved,attention deficit hyperactivity disorder,Attention deficit hyperactivity disorder,http://purl.obolibrary.org/obo/HP_0007018,False,False,HP:0007018
220,1275,Gallopamil,Asthma,Phase 2,asthma,Asthma,http://purl.obolibrary.org/obo/HP_0002099,False,False,HP:0002099
221,1275,Gallopamil,Hypertension,Discontinued in Phase 1,hypertension,Hypertension,http://purl.obolibrary.org/obo/HP_0000822,False,False,HP:0000822
355,1827,Moexipril,Hypertension,Approved,hypertension,Hypertension,http://purl.obolibrary.org/obo/HP_0000822,False,False,HP:0000822
...,...,...,...,...,...,...,...,...,...,...
27189,D0Z6UC,Sumatriptan,Migraine,Approved,migraine,Migraine,http://purl.obolibrary.org/obo/HP_0002076,False,False,HP:0002076
27209,D0Z7KE,Isradipine,Hypertension,Approved,hypertension,Hypertension,http://purl.obolibrary.org/obo/HP_0000822,False,False,HP:0000822
27445,D0ZS8P,Clomipramine,Depression,Approved,depression,Depressivity,http://purl.obolibrary.org/obo/HP_0000716,False,False,HP:0000716
27491,D0ZX1P,FADROZOLE,Breast cancer,Approved,breast cancer,Breast carcinoma,http://purl.obolibrary.org/obo/HP_0003002,False,False,HP:0003002


In [None]:
# Save the final drug-disease table.
final_filtered.to_csv(
    'drug_to_disease_final_v2.csv',
    encoding='utf-8-sig',
)