## **Import**

In [None]:
import re
import numpy as np
import pandas as pd
from google.colab import files
import urllib.parse
import urllib.request
import requests

## **Drug-Target Information**

### **Load Files**

Load the information from Drug Central and Monarch: 

In [None]:
# Drug-target interaction table (tab-separated); the first column becomes the
# row index.
df = pd.read_csv('drug.target.interaction.tsv', header=0, index_col=0, sep='\t')
# Monarch node table. The commented-out line is apparently an older snapshot
# of the same table (judging by the date in its file name).
#nodes = pd.read_csv('graph_nodes_v2022-01-11.csv', header=0)
nodes = pd.read_csv('monarch_nodes_v2022-12-29.csv', header=0)

In [None]:
df['NEW_ID'] = ''

### **Uniprot to Monarch IDs**

Use Uniprot API to obtain new target IDs: 

In [None]:
import re
import time
import json
import zlib
from xml.etree import ElementTree
from urllib.parse import urlparse, parse_qs, urlencode
import requests
from requests.adapters import HTTPAdapter, Retry


# Seconds to sleep between two polls of a job that is still running.
POLLING_INTERVAL = 3
# Base URL prepended to every UniProt REST endpoint used below.
API_URL = "https://rest.uniprot.org"


# Shared HTTPS session whose adapter retries up to 5 times (backoff factor
# 0.25) when the server answers 500, 502, 503 or 504.
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))


def check_response(response):
    """Raise for an unsuccessful HTTP response after printing its body.

    Args:
        response: object exposing ``raise_for_status()``, ``json()`` and
            ``text`` (a ``requests.Response`` in this notebook).

    Raises:
        requests.HTTPError: re-raised unchanged when the status is an error.
    """
    try:
        response.raise_for_status()
    except requests.HTTPError:
        # Error bodies are not always JSON (e.g. an HTML gateway page); fall
        # back to the raw text so the decoding failure never hides the
        # original HTTP error.
        try:
            print(response.json())
        except ValueError:
            print(response.text)
        raise


def submit_id_mapping(from_db, to_db, ids):
    """Submit an ID-mapping job and return its job id.

    Args:
        from_db: name of the source database (e.g. "UniProtKB_AC-ID").
        to_db: name of the target database (e.g. "HGNC").
        ids: iterable of identifiers; a single identifier given as a plain
            string is accepted too.

    Returns:
        The value of the "jobId" field in the JSON answer.

    Raises:
        requests.HTTPError: propagated from ``check_response``.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(ids, str):
        ids = [ids]
    # Go through the shared session like every other helper so the mounted
    # adapter (connection pooling, retry policy) is used here as well.
    response = session.post(
        f"{API_URL}/idmapping/run",
        data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
    )
    check_response(response)
    return response.json()["jobId"]


def get_next_link(headers):
    """Return the URL tagged rel="next" in the ``Link`` header, if any.

    Args:
        headers: mapping of response headers.

    Returns:
        The URL string, or None when there is no ``Link`` header or it has
        no rel="next" entry.
    """
    if "Link" not in headers:
        return None
    # Search (rather than anchor at the start) and forbid angle brackets
    # inside the URL so a header listing several relations, e.g.
    # '<a>; rel="prev", <b>; rel="next"', still yields only the next URL.
    match = re.search(r'<([^<>]+)>;\s*rel="next"', headers["Link"])
    return match.group(1) if match else None


def check_id_mapping_results_ready(job_id):
    """Poll the status endpoint until the job has left the NEW/RUNNING states.

    Args:
        job_id: identifier returned by ``submit_id_mapping``.

    Returns:
        True when the final payload holds at least one result or failed id,
        False when both are empty or absent.

    Raises:
        Exception: carrying the reported status when the job ends in any
            state other than NEW or RUNNING.
        requests.HTTPError: propagated from ``check_response``.
    """
    while True:
        response = session.get(f"{API_URL}/idmapping/status/{job_id}")
        check_response(response)
        status = response.json()
        if "jobStatus" in status:
            # A job that has not started yet ("NEW") is just as unfinished as
            # a "RUNNING" one; keep polling instead of treating it as failed.
            if status["jobStatus"] in ("NEW", "RUNNING"):
                print(f"Retrying in {POLLING_INTERVAL}s")
                time.sleep(POLLING_INTERVAL)
            else:
                raise Exception(status["jobStatus"])
        else:
            # Either key may be missing from the payload, so read both
            # defensively rather than indexing.
            return bool(status.get("results") or status.get("failedIds"))


def get_batch(batch_response, file_format, compressed):
    """Yield the decoded content of every page that follows ``batch_response``.

    The page passed in is not yielded itself; only the pages reached through
    successive "next" links are fetched, checked and decoded.
    """
    current = batch_response
    while True:
        next_url = get_next_link(current.headers)
        if not next_url:
            return
        current = session.get(next_url)
        current.raise_for_status()
        yield decode_results(current, file_format, compressed)


def combine_batches(all_results, batch_results, file_format):
    """Append one decoded batch to the results accumulated so far.

    Args:
        all_results: accumulator (dict for "json", list otherwise).
        batch_results: decoded batch of the same shape.
        file_format: "json", "tsv", or any other list-based format.

    Returns:
        For "json", the same ``all_results`` dict, extended in place; for
        "tsv", a new list without the batch's first (header) line; otherwise
        a new list holding both inputs.
    """
    if file_format == "json":
        for key in ("results", "failedIds"):
            if batch_results.get(key):
                # The accumulator may not have this key yet (for instance the
                # first page had no failed ids), so create it before extending.
                all_results.setdefault(key, [])
                all_results[key] += batch_results[key]
        return all_results
    if file_format == "tsv":
        return all_results + batch_results[1:]
    return all_results + batch_results


def get_id_mapping_results_link(job_id):
    """Return the "redirectURL" reported by the job's details endpoint."""
    details = session.get(f"{API_URL}/idmapping/details/{job_id}")
    check_response(details)
    payload = details.json()
    return payload["redirectURL"]


def decode_results(response, file_format, compressed):
    """Turn a response body into the Python shape used for ``file_format``.

    Returns:
        "json" -> parsed object; "tsv" -> list of non-empty lines;
        "xlsx" -> one-element list holding the raw bytes; "xml" ->
        one-element list holding the text; anything else -> the text.
        When ``compressed`` is true the body is gzip-decompressed first.
    """
    if not compressed:
        if file_format == "json":
            return response.json()
        if file_format == "xlsx":
            return [response.content]
        text = response.text
    else:
        raw = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
        if file_format == "xlsx":
            # Binary payload: hand back the bytes without decoding them.
            return [raw]
        text = raw.decode("utf-8")
        if file_format == "json":
            return json.loads(text)

    if file_format == "tsv":
        return [row for row in text.split("\n") if row]
    if file_format == "xml":
        return [text]
    return text


def get_xml_namespace(element):
    """Return the namespace URI wrapped in braces at the start of
    ``element.tag``, or "" when the tag has no such prefix."""
    found = re.match(r"\{(.*)\}", element.tag)
    if found is None:
        return ""
    return found.group(1)


def merge_xml_results(xml_results):
    """Merge several XML result documents into a single serialized document.

    Args:
        xml_results: list of XML strings; the first one provides the root
            that the ``entry`` elements of the others are inserted into.

    Returns:
        UTF-8 encoded bytes of the merged document, with an XML declaration.
    """
    merged_root = ElementTree.fromstring(xml_results[0])
    # Take the namespace from the document itself instead of hard-coding it,
    # so entries are still found if the documents use a different namespace.
    namespace = get_xml_namespace(merged_root)
    entry_tag = f"{{{namespace}}}entry" if namespace else "entry"
    for result in xml_results[1:]:
        root = ElementTree.fromstring(result)
        for child in root.findall(entry_tag):
            # Position -1 places the entry just before the root's last child.
            merged_root.insert(-1, child)
    ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
    return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)


def print_progress_batches(batch_index, size, total):
    """Print how many results have been fetched after batch ``batch_index``
    (0-based), never reporting more than ``total``."""
    fetched_so_far = (batch_index + 1) * size
    if fetched_so_far > total:
        fetched_so_far = total
    print(f"Fetched: {fetched_so_far} / {total}")


def get_id_mapping_results_search(url):
    """Download every page of a paginated results URL and combine them.

    The format, page size and compression flag are read from the URL's query
    string ("json", 500 and uncompressed when absent); a missing page size is
    added to the URL before the first request. Progress is printed per page.

    Returns:
        The combined results; for "xml" the merged, serialized document.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)
    file_format = params.get("format", ["json"])[0]
    if "size" not in params:
        size = 500
        params["size"] = size
    else:
        size = int(params["size"][0])
    compressed = "compressed" in params and params["compressed"][0].lower() == "true"

    first_page_url = parts._replace(query=urlencode(params, doseq=True)).geturl()
    first_page = session.get(first_page_url)
    check_response(first_page)
    combined = decode_results(first_page, file_format, compressed)
    total = int(first_page.headers["x-total-results"])
    print_progress_batches(0, size, total)

    batch_number = 0
    for batch in get_batch(first_page, file_format, compressed):
        batch_number += 1
        combined = combine_batches(combined, batch, file_format)
        print_progress_batches(batch_number, size, total)

    if file_format == "xml":
        return merge_xml_results(combined)
    return combined


def get_id_mapping_results_stream(url):
    """Fetch all results in one request through the streaming endpoint.

    A URL without "/stream/" has "/results/" rewritten to "/results/stream/".
    The format and compression flag come from the query string ("json" and
    uncompressed when absent).
    """
    stream_url = url if "/stream/" in url else url.replace("/results/", "/results/stream/")
    response = session.get(stream_url)
    check_response(response)
    params = parse_qs(urlparse(stream_url).query)
    file_format = params.get("format", ["json"])[0]
    compressed = "compressed" in params and params["compressed"][0].lower() == "true"
    return decode_results(response, file_format, compressed)


# Smoke test: map one UniProt accession to its HGNC identifier.
# Start from None so the print below never hits an unbound (or stale) name
# when the job comes back with nothing to fetch.
results = None
job_id = submit_id_mapping(
    from_db="UniProtKB_AC-ID", to_db="HGNC", ids=["Q12809"]
)
if check_id_mapping_results_ready(job_id):
    link = get_id_mapping_results_link(job_id)
    results = get_id_mapping_results_search(link)
    # Equivalently using the stream endpoint which is more demanding
    # on the API and so is less stable:
    # results = get_id_mapping_results_stream(link)

print(results)
# Expected shape: {'results': [{'from': 'Q12809', 'to': 'HGNC:6251'}]}

Fetched: 1 / 1
{'results': [{'from': 'Q12809', 'to': 'HGNC:6251'}]}


In [None]:
# UniProt target database and identifier prefix for each supported organism;
# any other organism falls back to Ensembl.
organism_to_db = {
  'Homo sapiens': ('HGNC', ''),
  'Rattus norvegicus': ('RGD', 'RGD:'),
  'Mus musculus': ('MGI', ''),
  # The prefix needs its colon like every other prefixed database here.
  'Drosophila melanogaster': ('FlyBase', 'FlyBase:'),
  'Caenorhabditis elegans': ('WormBase', 'WormBase:'),
  'Danio rerio': ('ZFIN', 'ZFIN:'),
  'Saccharomyces cerevisiae': ('SGD', 'SGD:'),
}

# Rows are addressed by position (iloc) so the loop does not depend on what
# the frame's index happens to be, and writes go straight into `df` instead
# of through a chained df[col][i] assignment.
new_id_col = df.columns.get_loc('NEW_ID')

for i in range(0, df.shape[0]):
  print('Going through row', i, ' out of', df.shape[0])

  # Only the first accession of a '|'-separated list is looked up.
  accession = df['ACCESSION'].iloc[i].split('|')[0]
  print(accession)
  org = df['ORGANISM'].iloc[i]
  print(org)

  to, pre = organism_to_db.get(org, ('Ensembl', 'ENSEMBL:'))

  job_id = submit_id_mapping(
    from_db="UniProtKB_AC-ID", to_db=to, ids=[accession])

  # Reset per row: without this, a job that reports nothing to fetch would
  # silently reuse the previous row's answer.
  results = None
  try:
    if check_id_mapping_results_ready(job_id):
      link = get_id_mapping_results_link(job_id)
      results = get_id_mapping_results_search(link)
      # Equivalently using the stream endpoint which is more demanding
      # on the API and so is less stable:
      # results = get_id_mapping_results_stream(link)
  except Exception as err:
    # Best effort: leave NEW_ID untouched for this row and carry on, but say
    # so, and let KeyboardInterrupt through so the loop can be stopped.
    print('Lookup failed for', accession, '-', err)
    continue

  if results and results.get('results'):
    df.iloc[i, new_id_col] = pre + results['results'][0]['to']
  else:
    df.iloc[i, new_id_col] = 'NA'

Going through row 0  out of 19378
Q12809
Homo sapiens
Fetched: 1 / 1
Going through row 1  out of 19378
P35498
Homo sapiens


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Fetched: 1 / 1
Going through row 2  out of 19378
P35499
Homo sapiens
Fetched: 1 / 1
Going through row 3  out of 19378
P34995
Homo sapiens
Fetched: 1 / 1
Going through row 4  out of 19378
P10635
Homo sapiens
Fetched: 1 / 1
Going through row 5  out of 19378
P46098
Homo sapiens
Fetched: 1 / 1
Going through row 6  out of 19378
Q9UK17
Homo sapiens
Fetched: 1 / 1
Going through row 7  out of 19378
P22460
Homo sapiens
Fetched: 1 / 1
Going through row 8  out of 19378
Q01668
Homo sapiens
Fetched: 1 / 1
Going through row 9  out of 19378
O15554
Homo sapiens
Fetched: 1 / 1
Going through row 10  out of 19378
Q01668
Homo sapiens
Fetched: 1 / 1
Going through row 11  out of 19378
O60840
Homo sapiens
Fetched: 1 / 1
Going through row 12  out of 19378
P22002
Rattus norvegicus
Fetched: 1 / 1
Going through row 13  out of 19378
Q02485
Rattus norvegicus
Fetched: 1 / 1
Going through row 14  out of 19378
Q13936
Homo sapiens
Fetched: 1 / 1
Going through row 15  out of 19378
P08588
Homo sapiens
Fetched: 1 / 1
Goi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Fetched: 1 / 1
Going through row 12043  out of 19378
P51451
Homo sapiens
Fetched: 1 / 1
Going through row 12044  out of 19378
Q13873
Homo sapiens
Fetched: 1 / 1
Going through row 12045  out of 19378
Q8TDC3
Homo sapiens
Fetched: 1 / 1
Going through row 12046  out of 19378
O95819
Homo sapiens
Fetched: 1 / 1
Going through row 12047  out of 19378
Q9P0L2
Homo sapiens
Fetched: 1 / 1
Going through row 12048  out of 19378
Q96L34
Homo sapiens
Fetched: 1 / 1
Going through row 12049  out of 19378
P45985
Homo sapiens
Fetched: 1 / 1
Going through row 12050  out of 19378
P52564
Homo sapiens
Fetched: 1 / 1
Going through row 12051  out of 19378
Q12866
Homo sapiens
Fetched: 1 / 1
Going through row 12052  out of 19378
Q16566
Homo sapiens
Fetched: 1 / 1
Going through row 12053  out of 19378
P11799
Gallus gallus
Fetched: 0 / 0
Going through row 12054  out of 19378
P80192
Homo sapiens
Fetched: 1 / 1
Going through row 12055  out of 19378
Q1658

In [None]:
# NOTE(review): no visible cell writes 'drug-target-all.csv'; presumably the
# frame filled by the mapping loop above was saved under this name outside
# the notebook - confirm, otherwise a fresh run cannot reproduce this step.
df = pd.read_csv('drug-target-all.csv', header=0)

In [None]:
df

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM,NEW_ID
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.890,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:6251
1,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,SCN1A_HUMAN,5.790,,IC50,...,WOMBAT-PK,=,,,,,,Tclin,Homo sapiens,HGNC:10585
2,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,...,WOMBAT-PK,,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens,HGNC:10591
3,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,PE2R1_HUMAN,,,,...,WOMBAT-PK,,,,,,,Tclin,Homo sapiens,HGNC:9593
4,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,CP2D6_HUMAN,6.707,,IC50,...,DRUG MATRIX,=,,,,,,Tclin,Homo sapiens,HGNC:2625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19373,samidorphan,5460,Delta-type opioid receptor,GPCR,P41143,OPRD1,OPRD_HUMAN,8.590,,Ki,...,DRUG LABEL,=,,,https://www.accessdata.fda.gov/drugsatfda_docs...,,PARTIAL AGONIST,Tclin,Homo sapiens,HGNC:8153
19374,sotorasib,5461,GTPase KRas,Enzyme,P01116,KRAS,RASK_HUMAN,7.030,,IC50,...,DRUG LABEL,=,1.0,DRUG LABEL,https://www.accessdata.fda.gov/drugsatfda_docs...,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,Tchem,Homo sapiens,HGNC:6407
19375,ibrexafungerp,5462,"Beta-1,3-glucan synthase catalytic subunit 1",Enzyme,O13428,GSC1,O13428_CANAX,8.350,,IC50,...,SCIENTIFIC LITERATURE,=,1.0,DRUG LABEL,https://pubmed.ncbi.nlm.nih.gov/24323472,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,,Candida albicans,
19376,ibrexafungerp,5462,"1,3-Beta-D-glucan-UDP glucosyltransferase",Enzyme,Q6FTN8,FKS1,Q6FTN8_CANGA,7.830,,IC50,...,SCIENTIFIC LITERATURE,=,1.0,SCIENTIFIC LITERATURE,https://pubmed.ncbi.nlm.nih.gov/24323472,https://www.accessdata.fda.gov/drugsatfda_docs...,INHIBITOR,,Candida glabrata,


Drop rows that don't have a new ID: 

In [None]:
# read_csv turns both empty fields and the literal 'NA' into NaN by default,
# so after the reload above an equality test against 'NA' matches nothing.
# Treat missing values and the 'NA' marker alike as "no new ID".
has_new_id = df['NEW_ID'].notna() & (df['NEW_ID'] != 'NA')
df = df[has_new_id]

Keep only rows whose target is in the Monarch nodes:

In [None]:
# Boolean mask: True where the mapped target id is one of the Monarch node ids.
in_monarch = df['NEW_ID'].isin(nodes['id'])
df = df[in_monarch]

Save the Drug-Target dataframe as .csv: 

In [None]:
df.to_csv('drug.target.final.ELA.csv', encoding = 'utf-8-sig') 

Create and save a dataframe containing the drug nodes: 

In [None]:
# One row per distinct (drug name, structure id) pair, written out as the
# drug node table.
df_drugs = df.loc[:, ['DRUG_NAME', 'STRUCT_ID']].drop_duplicates()
df_drugs.to_csv('drug_nodes_ELA.csv', encoding = 'utf-8-sig')

In [None]:
df

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM,NEW_ID
21,phenylbutanoic acid,24,Aldose reductase,Enzyme,P15121,AKR1B1,ALDR_HUMAN,4.02,,IC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:381
59,aceclofenac,43,Aldose reductase,Enzyme,P07943,Akr1b1,ALDR_RAT,5.89,,IC50,...,DRUG MATRIX,=,,,,,,,Rattus norvegicus,RGD:2092
67,acemetacin,47,Lactoylglutathione lyase,Enzyme,Q04760,GLO1,LGUL_HUMAN,4.89,,Ki,...,CHEMBL,=,,,,,,Tchem,Homo sapiens,HGNC:4323
174,acetylcysteine,66,Cytochrome c,Enzyme,P99999,CYCS,CYC_HUMAN,,,,...,WOMBAT-PK,,,,,,,Tbio,Homo sapiens,HGNC:19986
175,acetylcysteine,66,Vascular endothelial growth factor A,Secreted,P15692,VEGFA,VEGFA_HUMAN,,,,...,WOMBAT-PK,,,,,,,Tclin,Homo sapiens,HGNC:12680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19194,lonafarnib,5414,GTPase KRas,Enzyme,P01116,KRAS,RASK_HUMAN,8.28,,IC50,...,IUPHAR,=,,,,,INHIBITOR,Tchem,Homo sapiens,HGNC:6407
19218,vadadustat,5420,Vascular endothelial growth factor A,Secreted,P15692,VEGFA,VEGFA_HUMAN,5.12,,EC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:12680
19220,vadadustat,5420,Vascular endothelial growth factor A,Unclassified,P15692,VEGFA,VEGFA_HUMAN,5.12,,EC50,...,CHEMBL,=,,,,,,Tclin,Homo sapiens,HGNC:12680
19302,estetrol,5450,Estrogen receptor beta,Nuclear hormone receptor,Q92731,ESR2,ESR2_HUMAN,7.72,,Ki,...,SCIENTIFIC LITERATURE,=,1.0,DRUG LABEL,https://pubmed.ncbi.nlm.nih.gov/18464023,https://www.accessdata.fda.gov/drugsatfda_docs...,MODULATOR,Tclin,Homo sapiens,HGNC:3468


## **Drug-Disease Information**

### **Text to CSV**

If the Drug-Disease CSV has already been saved, skip the cells below and run only the loading cell at the end of this section; otherwise continue:

Create a Dataframe that will contain the Drug-Disease information:

In [None]:
column_names = ["DRUG_ID", "DRUG_NAME", "DISEASES", 'PHASE']

df = pd.DataFrame(columns = column_names)

Read the .txt file downloaded from TTD and convert it into a pandas dataframe:

In [None]:
with open('drug_to_disease.txt') as f:
  lines = f.readlines()

In [None]:
# Walk the TTD lines: DRUGNAME / TTDDRUID lines set the current drug, every
# INDICATI line emits one (id, name, disease, phase) row, and a blank line
# clears the current drug.
drug_id = ''
drug_name = ''
records = []

for line in lines:
  if line.startswith('DRUGNAME'):
    drug_name = re.search(r'\t(.*)\n', line).group(1)
  elif line.startswith('TTDDRUID'):
    drug_id = re.search(r'\t(.*)\n', line).group(1)
  elif line.startswith('INDICATI'):
    # Disease: text between the first tab and the last '['.
    # Phase: everything after the first ']'.
    disease = re.search(r'\t(.*)\[', line).group(1)
    phase = re.search(r'\](.*)', line).group(1)
    records.append([drug_id, drug_name, disease, phase])
  elif line.startswith('\n'):
    drug_id = ''
    drug_name = ''

# Build the frame once from the collected rows rather than growing it one
# df.loc[i] assignment at a time.
df = pd.DataFrame(records, columns=column_names)


In [None]:
df

Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE
0,D00ABE,ALD-301,Ischemia,Phase 2
1,D00ABE,ALD-301,Peripheral arterial disease,Phase 2
2,D00ABO,KW-2449,Acute myeloid leukaemia,Phase 1
3,D00ABW,Opterone,Hormone deficiency,Discontinued in Phase 1
4,D00ACC,ND1251,Depression,Discontinued in Phase 1
...,...,...,...,...
28973,DZTX12,ASC-J9,End-stage renal disease,Phase 2
28974,DZU72C,OKI 179,Solid tumour/cancer,Phase 1
28975,DZVF15,TRN-110,Attention deficit hyperactivity disorder,Clinical Trial
28976,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2


In [None]:
df.to_csv('drug_to_disease (1).csv', encoding = 'utf-8-sig') 

Run only this cell if you have already saved the Drug-Disease interactions: 

In [None]:
df = pd.read_csv('drug_to_disease (1).csv', header=0, index_col=0)

In [None]:
df

Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE
0,D00ABE,ALD-301,Ischemia,Phase 2
1,D00ABE,ALD-301,Peripheral arterial disease,Phase 2
2,D00ABO,KW-2449,Acute myeloid leukaemia,Phase 1
3,D00ABW,Opterone,Hormone deficiency,Discontinued in Phase 1
4,D00ACC,ND1251,Depression,Discontinued in Phase 1
...,...,...,...,...
28973,DZTX12,ASC-J9,End-stage renal disease,Phase 2
28974,DZU72C,OKI 179,Solid tumour/cancer,Phase 1
28975,DZVF15,TRN-110,Attention deficit hyperactivity disorder,Clinical Trial
28976,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2


In [None]:
df.to_csv('drug_to_disease (1).csv', encoding = 'utf-8-sig') 

### **Load CSVs**

Load the drug nodes (obtained from Drug Central) and the Monarch nodes: 

In [None]:
graph_drugs = pd.read_csv('drug_nodes_ELA.csv', header=0)
nodes = pd.read_csv('monarch_nodes_v2022-12-29.csv', header=0)

Get list of unique drugs and diseases:

In [None]:
# Monarch nodes in the 'DISO' semantic group, their distinct ids, and their
# distinct names in lower case.
is_disease = nodes['semantic_groups'] == 'DISO'
graph_diseases = nodes[is_disease]
unique_dis_id = graph_diseases['id'].unique()
unique_diseases = [name.lower() for name in graph_diseases['name'].unique()]

In [None]:
# Distinct drug names from the drug node table, lower-cased for matching.
unique_drugs = [drug.lower() for drug in graph_drugs['DRUG_NAME'].unique()]
len(unique_drugs)

229

At this point, use the SORTA tool (https://sorta.molgeniscloud.org/menu/main/sorta?) to match the TTD disease names to Human Phenotype Ontology (HPO) terms. The resulting file is also available in the project's GitHub repository:

In [None]:
matched = pd.read_csv('matched.csv', header = 0, delimiter = ';')

In [None]:
matched

Unnamed: 0,Name,ontologyTermName,ontologyTermIRI,score,validated,review
0,respiratory failure,Respiratory failure,http://purl.obolibrary.org/obo/HP_0002878,100.00,False,False
1,sexual dysfunction,Male sexual dysfunction,http://purl.obolibrary.org/obo/HP_0040307,86.49,False,False
2,achondroplasia,Bronchodysplasia,http://purl.obolibrary.org/obo/HP_0006533,62.50,False,False
3,glabellar frown line,Prominent glabella,http://purl.obolibrary.org/obo/HP_0002057,59.46,False,False
4,testicular germ cell tumour,Testicular neoplasm,http://purl.obolibrary.org/obo/HP_0010788,71.11,False,False
...,...,...,...,...,...,...
1796,systemic mastocytosis,Mastocytosis,http://purl.obolibrary.org/obo/HP_0100495,77.42,False,False
1797,acute iron or aluminum toxicity,Abnormal total iron binding capacity,http://purl.obolibrary.org/obo/HP_0033212,48.15,False,False
1798,chronic inflammatory demyelinating polyneuropathy,Acute demyelinating polyneuropathy,http://purl.obolibrary.org/obo/HP_0007131,64.86,False,False
1799,aortic aneurysm,Aortic aneurysm,http://purl.obolibrary.org/obo/HP_0004942,100.00,False,False


Select only those IDs with a score greater than 80:

In [None]:
# Take an explicit copy of the filtered rows: later cells add and overwrite
# columns on `matched`, which on a bare slice raises SettingWithCopyWarning.
matched = matched[matched['score'] > 80].copy()

Create the final ID:

In [None]:
matched['ID'] = matched['ontologyTermIRI'].str.split('/obo/').str[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched['ID'] = matched['ontologyTermIRI'].str.split('/obo/').str[1]


In [None]:
# Collapse every run of non-alphanumeric characters into ':' so that an id
# such as 'HP_0004950' becomes 'HP:0004950'.
matched['ID'] = [re.sub("[^0-9a-zA-Z]+", ":", term) for term in matched['ID']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched['ID'] = new_id


In [None]:
# Count the rows whose drug name and disease name both appear verbatim
# (case-insensitively) among the graph's drugs and diseases.
i = sum(
  1
  for _, row in df.iterrows()
  if row['DRUG_NAME'].lower() in unique_drugs
  and row['DISEASES'].lower() in unique_diseases
)

### **Merging**

In [None]:
# Lower-cased disease names with every run of non-alphanumeric characters
# replaced by a single space.
modified = [re.sub("[^0-9a-zA-Z]+", " ", disease).lower() for disease in df['DISEASES']]

In [None]:
df['Name'] = modified

In [None]:
df['Name'] = df['Name'].str.strip()
matched['Name'] = matched['Name'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matched['Name'] = matched['Name'].str.strip()


In [None]:
# Left join: every drug-disease row is kept, with the match columns left
# empty when the normalised name has no match.
# NOTE(review): the result has more rows than `df` (last index 29565 vs 28976
# in the displayed outputs), which suggests `matched` holds repeated 'Name'
# values - confirm whether the duplicated rows are intended.
final = pd.merge(df, matched, on = 'Name', how = 'left')

In [None]:
final

Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,score,validated,review,ID
0,D00ABE,ALD-301,Ischemia,Phase 2,ischemia,,,,,,
1,D00ABE,ALD-301,Peripheral arterial disease,Phase 2,peripheral arterial disease,Peripheral arterial stenosis,http://purl.obolibrary.org/obo/HP_0004950,100.0,False,False,HP:0004950
2,D00ABO,KW-2449,Acute myeloid leukaemia,Phase 1,acute myeloid leukaemia,Acute myeloid leukemia,http://purl.obolibrary.org/obo/HP_0004808,100.0,False,False,HP:0004808
3,D00ABW,Opterone,Hormone deficiency,Discontinued in Phase 1,hormone deficiency,,,,,,
4,D00ACC,ND1251,Depression,Discontinued in Phase 1,depression,Depressivity,http://purl.obolibrary.org/obo/HP_0000716,100.0,False,False,HP:0000716
...,...,...,...,...,...,...,...,...,...,...,...
29562,DZU72C,OKI 179,Solid tumour/cancer,Phase 1,solid tumour cancer,,,,,,
29563,DZVF15,TRN-110,Attention deficit hyperactivity disorder,Clinical Trial,attention deficit hyperactivity disorder,Attention deficit hyperactivity disorder,http://purl.obolibrary.org/obo/HP_0007018,100.0,False,False,HP:0007018
29564,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358
29565,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358


In [None]:
final.to_csv('drug_to_disease_HP.csv', encoding = 'utf-8-sig') 

### **Comparison**

In [None]:
final = pd.read_csv('drug_to_disease_HP.csv', header=0)

In [None]:
final_filtered = final[final['score'] == 100]

In [None]:
len(final_filtered)

12151

In [None]:
final_filtered

Unnamed: 0.1,Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,score,validated,review,ID
1,1,D00ABE,ALD-301,Peripheral arterial disease,Phase 2,peripheral arterial disease,Peripheral arterial stenosis,http://purl.obolibrary.org/obo/HP_0004950,100.0,False,False,HP:0004950
2,2,D00ABO,KW-2449,Acute myeloid leukaemia,Phase 1,acute myeloid leukaemia,Acute myeloid leukemia,http://purl.obolibrary.org/obo/HP_0004808,100.0,False,False,HP:0004808
4,4,D00ACC,ND1251,Depression,Discontinued in Phase 1,depression,Depressivity,http://purl.obolibrary.org/obo/HP_0000716,100.0,False,False,HP:0000716
11,11,D00AHT,PRAME antigen-specific cancer immunotherapeutic,Non-small-cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358
12,12,D00AHT,PRAME antigen-specific cancer immunotherapeutic,Non-small-cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358
...,...,...,...,...,...,...,...,...,...,...,...,...
29556,29556,DZNH43,MT-3724,B-cell lymphoma,Phase 1,b cell lymphoma,B-cell lymphoma,http://purl.obolibrary.org/obo/HP_0012191,100.0,False,False,HP:0012191
29561,29561,DZTX12,ASC-J9,End-stage renal disease,Phase 2,end stage renal disease,Stage 5 chronic kidney disease,http://purl.obolibrary.org/obo/HP_0003774,100.0,False,False,HP:0003774
29563,29563,DZVF15,TRN-110,Attention deficit hyperactivity disorder,Clinical Trial,attention deficit hyperactivity disorder,Attention deficit hyperactivity disorder,http://purl.obolibrary.org/obo/HP_0007018,100.0,False,False,HP:0007018
29564,29564,DZW53X,GSK4069889,Non-small cell lung cancer,Phase 2,non small cell lung cancer,Non-small cell lung carcinoma,http://purl.obolibrary.org/obo/HP_0030358,100.0,False,False,HP:0030358


In [None]:
column_names = ["DRUG_ID", "DRUG_NAME", "DISEASES", 'PHASE']

df = pd.DataFrame(columns = column_names)

In [None]:
unique_drugs

['phenylbutanoic acid',
 'aceclofenac',
 'acemetacin',
 'acetylcysteine',
 'acetyldigitoxin',
 'adenosine',
 'adenosine triphosphate',
 'adenosine phosphate',
 'aminopicoline',
 'amodiaquine',
 'androstenediol',
 'apomorphine',
 'aripiprazole',
 'arsenic trioxide',
 'atorvastatin',
 'azathioprine',
 'bendamustine',
 'benserazide',
 'betamethasone',
 'bortezomib',
 'bumetanide',
 'carmustine',
 'carvedilol',
 'celecoxib',
 'chlorambucil',
 'chlorcyclizine',
 'chloroquine',
 'chlorotrianisene',
 'chlorzoxazone',
 'cianidanol',
 'clioquinol',
 'clomipramine',
 'clotrimazole',
 'clozapine',
 'cortisone acetate',
 'ciclosporin',
 'dantrolene',
 'dasatinib',
 'delavirdine',
 'demeclocycline',
 'deslanoside',
 'desoximetasone',
 'dexamethasone',
 'diazepam',
 'dichlorophen',
 'dicoumarol',
 'diethylstilbestrol',
 'digitoxin',
 'digoxin',
 'doxorubicin',
 'econazole',
 'edetic acid',
 'epalrestat',
 'estradiol',
 'estramustine phosphate',
 'estriol succinate',
 'ethacrynic acid',
 'ethinylestr

In [None]:
# Keep only rows whose drug is among the graph's drugs AND whose HPO id is
# among the graph's disease ids. A single boolean mask replaces dropping rows
# one at a time while iterating; the copy makes the result an independent
# frame for the cells that modify it afterwards.
drug_known = final_filtered['DRUG_NAME'].str.lower().isin(unique_drugs)
disease_known = final_filtered['ID'].isin(unique_dis_id)
final_filtered = final_filtered[drug_known & disease_known].copy()
len(final_filtered)

28

In [None]:
# Replace DRUG_ID with the STRUCT_ID of the drug of the same name in
# `graph_drugs`. Names are compared in lower case on both sides, and the
# first occurrence wins when a name is listed more than once.
struct_id_by_name = (
  graph_drugs
  .assign(name_key=graph_drugs['DRUG_NAME'].str.lower())
  .drop_duplicates('name_key')
  .set_index('name_key')['STRUCT_ID']
)
struct_ids = final_filtered['DRUG_NAME'].str.lower().map(struct_id_by_name)

# Fail loudly, naming the drugs, rather than writing missing ids.
missing = final_filtered.loc[struct_ids.isna(), 'DRUG_NAME'].unique()
if len(missing) > 0:
  raise KeyError(f"No STRUCT_ID found for: {list(missing)}")

final_filtered = final_filtered.assign(DRUG_ID=struct_ids)
len(final_filtered)

28

In [None]:
final_filtered

Unnamed: 0.1,Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,score,validated,review,ID
1302,1302,91,Adenosine triphosphate,Bradycardia,Discontinued in Phase 2,bradycardia,Bradycardia,http://purl.obolibrary.org/obo/HP_0001662,100.0,False,False,HP:0001662
2262,2262,2351,Raloxifene,Osteoporosis,Approved,osteoporosis,Osteoporosis,http://purl.obolibrary.org/obo/HP_0000939,100.0,False,False,HP:0000939
3015,3015,882,Digoxin,Heart failure,Approved,heart failure,Congestive heart failure,http://purl.obolibrary.org/obo/HP_0001635,100.0,False,False,HP:0001635
3016,3016,882,Digoxin,Arrhythmia,Approved,arrhythmia,Arrhythmia,http://purl.obolibrary.org/obo/HP_0011675,100.0,False,False,HP:0011675
3978,3978,1021,Epalrestat,Pain,Investigative,pain,Pain,http://purl.obolibrary.org/obo/HP_0012531,100.0,False,False,HP:0012531
4141,4141,2179,Pioglitazone,Obesity,Investigative,obesity,Obesity,http://purl.obolibrary.org/obo/HP_0001513,100.0,False,False,HP:0001513
4323,4323,568,Celecoxib,Pain,Phase 3,pain,Pain,http://purl.obolibrary.org/obo/HP_0012531,100.0,False,False,HP:0012531
6768,6768,1989,Omapatrilat,Hypertension,Terminated,hypertension,Hypertension,http://purl.obolibrary.org/obo/HP_0000822,100.0,False,False,HP:0000822
9064,9064,813,Deslanoside,Heart failure,Approved,heart failure,Congestive heart failure,http://purl.obolibrary.org/obo/HP_0001635,100.0,False,False,HP:0001635
9065,9065,813,Deslanoside,Arrhythmia,Approved,arrhythmia,Arrhythmia,http://purl.obolibrary.org/obo/HP_0011675,100.0,False,False,HP:0011675


In [None]:
final_filtered = final_filtered.drop(labels = ['Unnamed: 0', 'score'], axis =1)

In [None]:
final_filtered

Unnamed: 0,DRUG_ID,DRUG_NAME,DISEASES,PHASE,Name,ontologyTermName,ontologyTermIRI,validated,review,ID
1302,91,Adenosine triphosphate,Bradycardia,Discontinued in Phase 2,bradycardia,Bradycardia,http://purl.obolibrary.org/obo/HP_0001662,False,False,HP:0001662
2262,2351,Raloxifene,Osteoporosis,Approved,osteoporosis,Osteoporosis,http://purl.obolibrary.org/obo/HP_0000939,False,False,HP:0000939
3015,882,Digoxin,Heart failure,Approved,heart failure,Congestive heart failure,http://purl.obolibrary.org/obo/HP_0001635,False,False,HP:0001635
3016,882,Digoxin,Arrhythmia,Approved,arrhythmia,Arrhythmia,http://purl.obolibrary.org/obo/HP_0011675,False,False,HP:0011675
3978,1021,Epalrestat,Pain,Investigative,pain,Pain,http://purl.obolibrary.org/obo/HP_0012531,False,False,HP:0012531
4141,2179,Pioglitazone,Obesity,Investigative,obesity,Obesity,http://purl.obolibrary.org/obo/HP_0001513,False,False,HP:0001513
4323,568,Celecoxib,Pain,Phase 3,pain,Pain,http://purl.obolibrary.org/obo/HP_0012531,False,False,HP:0012531
6768,1989,Omapatrilat,Hypertension,Terminated,hypertension,Hypertension,http://purl.obolibrary.org/obo/HP_0000822,False,False,HP:0000822
9064,813,Deslanoside,Heart failure,Approved,heart failure,Congestive heart failure,http://purl.obolibrary.org/obo/HP_0001635,False,False,HP:0001635
9065,813,Deslanoside,Arrhythmia,Approved,arrhythmia,Arrhythmia,http://purl.obolibrary.org/obo/HP_0011675,False,False,HP:0011675


In [None]:
final_filtered.to_csv('drug_to_disease_final_v2_ELA.csv', encoding = 'utf-8-sig') 