## **Import**

In [None]:
import re
import numpy as np
import pandas as pd
import urllib.parse
import urllib.request
import requests

## **Drug-Target Information**

### **Load Files**

Load the information from Drug Central and Monarch: 

In [None]:
# Tab-separated drug-target interaction table; the first row is the header.
df = pd.read_csv('drug.target.interaction.tsv', header=0, sep='\t')
# Graph nodes table; its 'id' column is used further down to keep only known targets.
nodes = pd.read_csv('graph_nodes_v2022-01-11.csv', header=0)

In [None]:
df['NEW_ID'] = ''

### **Uniprot to Monarch IDs**

Use the UniProt API to obtain new target IDs: 

In [None]:
import re
import time
import json
import zlib
from xml.etree import ElementTree
from urllib.parse import urlparse, parse_qs, urlencode
import requests
from requests.adapters import HTTPAdapter, Retry


# Seconds to sleep between status polls while a mapping job is still running.
POLLING_INTERVAL = 3
# Base URL that every ID-mapping endpoint path below is appended to.
API_URL = "https://rest.uniprot.org"


# Shared HTTPS session: up to 5 retries with a 0.25 backoff factor whenever
# the server answers 500, 502, 503 or 504.
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))


def check_response(response):
    """Raise for an HTTP error status, printing the error body first.

    Args:
        response: object exposing ``raise_for_status()``, ``json()`` and
            ``text`` (a ``requests.Response`` in this notebook).

    Raises:
        requests.HTTPError: re-raised unchanged after the body is printed.
    """
    try:
        response.raise_for_status()
    except requests.HTTPError:
        # An error page is not guaranteed to be JSON; fall back to the raw
        # text so a decoding failure never hides the real HTTP error.
        try:
            print(response.json())
        except ValueError:
            print(response.text)
        raise


def submit_id_mapping(from_db, to_db, ids):
    """Submit an ID-mapping job and return its job identifier.

    Args:
        from_db: name of the source database.
        to_db: name of the target database.
        ids: iterable of identifier strings; sent as one comma-joined value.

    Returns:
        The ``jobId`` field of the JSON reply.
    """
    payload = {"from": from_db, "to": to_db, "ids": ",".join(ids)}
    response = requests.post(f"{API_URL}/idmapping/run", data=payload)
    check_response(response)
    job = response.json()
    return job["jobId"]


def get_next_link(headers):
    """Return the URL of the next results page, or None when there is none.

    Args:
        headers: mapping of response headers; only ``"Link"`` is inspected.

    Returns:
        The URL between ``<`` and ``>`` when the header starts with a
        ``<...>; rel="next"`` entry, otherwise None.
    """
    if "Link" not in headers:
        return None
    found = re.match(r'<(.+)>; rel="next"', headers["Link"])
    return found.group(1) if found else None


def check_id_mapping_results_ready(job_id):
    """Poll the status endpoint until the job has finished.

    Args:
        job_id: identifier returned by ``submit_id_mapping``.

    Returns:
        True when the finished job reports any results or failed IDs,
        False when both are empty or absent.

    Raises:
        Exception: carrying the reported ``jobStatus`` when it is neither
            ``"NEW"`` nor ``"RUNNING"``.
    """
    while True:
        request = session.get(f"{API_URL}/idmapping/status/{job_id}")
        check_response(request)
        j = request.json()
        if "jobStatus" not in j:
            # No status field: the reply already carries the outcome. Either
            # key may be missing, so read both defensively.
            return bool(j.get("results") or j.get("failedIds"))
        # A job that is still queued ("NEW") is just as pending as one that is
        # "RUNNING"; keep polling instead of treating it as a failure.
        if j["jobStatus"] in ("NEW", "RUNNING"):
            print(f"Retrying in {POLLING_INTERVAL}s")
            time.sleep(POLLING_INTERVAL)
        else:
            raise Exception(j["jobStatus"])


def get_batch(batch_response, file_format, compressed):
    """Yield the decoded content of every page after ``batch_response``.

    Follows the "next" link in each response's headers until none is left.

    Args:
        batch_response: the response whose headers point at the next page.
        file_format: format name passed through to ``decode_results``.
        compressed: compression flag passed through to ``decode_results``.

    Yields:
        One decoded page per followed link.
    """
    next_url = get_next_link(batch_response.headers)
    while next_url:
        page = session.get(next_url)
        page.raise_for_status()
        yield decode_results(page, file_format, compressed)
        next_url = get_next_link(page.headers)


def combine_batches(all_results, batch_results, file_format):
    """Merge one decoded page into the results accumulated so far.

    Args:
        all_results: accumulated results (dict for "json", list otherwise).
        batch_results: the newly decoded page, same shape as ``all_results``.
        file_format: "json", "tsv" or any other format name.

    Returns:
        For "json", ``all_results`` itself, extended in place; for every
        other format a new concatenated list.
    """
    if file_format == "tsv":
        # Skip the first line of the page so the header is not repeated.
        return all_results + batch_results[1:]
    if file_format != "json":
        return all_results + batch_results
    for key in ("results", "failedIds"):
        if key in batch_results and batch_results[key]:
            all_results[key] += batch_results[key]
    return all_results


def get_id_mapping_results_link(job_id):
    """Return the ``redirectURL`` reported by the job's details endpoint.

    Args:
        job_id: identifier returned by ``submit_id_mapping``.
    """
    details = session.get(f"{API_URL}/idmapping/details/{job_id}")
    check_response(details)
    return details.json()["redirectURL"]


def decode_results(response, file_format, compressed):
    """Turn a response body into a Python value according to its format.

    Args:
        response: object exposing ``content``, ``text`` and ``json()``.
        file_format: "json", "tsv", "xlsx", "xml" or any other name.
        compressed: when true, ``response.content`` is gzip-decompressed first.

    Returns:
        "json" -> parsed JSON; "tsv" -> list of non-empty lines;
        "xlsx" -> one-element list holding the raw bytes;
        "xml" -> one-element list holding the text; otherwise the text.
    """
    if not compressed:
        if file_format == "json":
            return response.json()
        if file_format == "tsv":
            return [row for row in response.text.split("\n") if row]
        if file_format == "xlsx":
            return [response.content]
        if file_format == "xml":
            return [response.text]
        return response.text

    # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    raw = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
    if file_format == "xlsx":
        # Binary payload: hand the bytes back without decoding.
        return [raw]
    text = raw.decode("utf-8")
    if file_format == "json":
        return json.loads(text)
    if file_format == "tsv":
        return [row for row in text.split("\n") if row]
    if file_format == "xml":
        return [text]
    return text


def get_xml_namespace(element):
    """Return the namespace URI embedded in ``element.tag``, or "".

    The URI is read from a leading ``{...}`` in the tag; a tag without one
    yields the empty string.
    """
    found = re.match(r"\{(.*)\}", element.tag)
    if found is None:
        return ""
    return found.group(1)


def merge_xml_results(xml_results):
    """Merge several XML documents into the first one and serialise it.

    Args:
        xml_results: list of XML strings; the first supplies the root element.

    Returns:
        UTF-8 encoded bytes of the merged document, with an XML declaration.
    """
    entry_tag = "{http://uniprot.org/uniprot}entry"
    merged_root = ElementTree.fromstring(xml_results[0])
    for xml_doc in xml_results[1:]:
        for entry in ElementTree.fromstring(xml_doc).findall(entry_tag):
            # insert(-1, ...) places the entry just before the root's current
            # last child, so that child stays last in the merged document.
            merged_root.insert(-1, entry)
    namespace = get_xml_namespace(merged_root[0])
    # Register it as the default namespace so tags are written without a prefix.
    ElementTree.register_namespace("", namespace)
    return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)


def print_progress_batches(batch_index, size, total):
    """Print how many results have been fetched after a given batch.

    Args:
        batch_index: zero-based index of the batch just fetched.
        size: number of results per batch.
        total: total number of results; the printed count is capped at it.
    """
    fetched_so_far = (batch_index + 1) * size
    if fetched_so_far > total:
        fetched_so_far = total
    print(f"Fetched: {fetched_so_far} / {total}")


def get_id_mapping_results_search(url):
    """Fetch every page of a paginated results URL and combine them.

    The format, page size and compression flag are read from the URL's query
    string; a missing ``size`` is set to 500 before the first request.

    Args:
        url: results URL, e.g. the value returned by
            ``get_id_mapping_results_link``.

    Returns:
        The combined results; for the "xml" format, the merged document
        produced by ``merge_xml_results``.
    """
    url_parts = urlparse(url)
    params = parse_qs(url_parts.query)
    file_format = params.get("format", ["json"])[0]
    if "size" in params:
        size = int(params["size"][0])
    else:
        size = 500
        params["size"] = size
    compressed = params.get("compressed", ["false"])[0].lower() == "true"

    # Rebuild the URL so an added default page size is actually sent.
    first_page_url = url_parts._replace(query=urlencode(params, doseq=True)).geturl()
    first_page = session.get(first_page_url)
    check_response(first_page)
    results = decode_results(first_page, file_format, compressed)
    total = int(first_page.headers["x-total-results"])
    print_progress_batches(0, size, total)

    following_pages = get_batch(first_page, file_format, compressed)
    for page_number, page in enumerate(following_pages, 1):
        results = combine_batches(results, page, file_format)
        print_progress_batches(page_number, size, total)

    if file_format == "xml":
        return merge_xml_results(results)
    return results


def get_id_mapping_results_stream(url):
    """Fetch all results in a single request through the stream endpoint.

    A URL that does not already contain ``/stream/`` has ``/results/``
    rewritten to ``/results/stream/`` first. Format and compression are read
    from the URL's query string.

    Args:
        url: results URL, paginated or streaming.

    Returns:
        The decoded body as produced by ``decode_results``.
    """
    if "/stream/" not in url:
        url = url.replace("/results/", "/results/stream/")
    response = session.get(url)
    check_response(response)
    params = parse_qs(urlparse(url).query)
    file_format = params.get("format", ["json"])[0]
    compressed = params.get("compressed", ["false"])[0].lower() == "true"
    return decode_results(response, file_format, compressed)


# Smoke test: map a single UniProt accession to HGNC.
job_id = submit_id_mapping(
    from_db="UniProtKB_AC-ID", to_db="HGNC", ids=["Q12809"]
)
# Start from None so the print below cannot hit an undefined (or stale) name
# when the readiness check reports that nothing was returned.
results = None
if check_id_mapping_results_ready(job_id):
    link = get_id_mapping_results_link(job_id)
    results = get_id_mapping_results_search(link)
    # Equivalently using the stream endpoint which is more demanding
    # on the API and so is less stable:
    # results = get_id_mapping_results_stream(link)

print(results)
# Expected shape: {'results': [{'from': <accession>, 'to': <mapped id>}], 'failedIds': [...]}

In [None]:
# Target database to request for each organism, and the prefix put in front of
# the returned identifier. HGNC and MGI get no prefix (presumably because the
# returned IDs already carry one -- TODO confirm against the API output).
ORGANISM_TO_DB = {
  'Homo sapiens': ('HGNC', ''),
  'Rattus norvegicus': ('RGD', 'RGD:'),
  'Mus musculus': ('MGI', ''),
  # The prefix needs the same ':' separator as every other prefix here.
  'Drosophila melanogaster': ('FlyBase', 'FlyBase:'),
  'Caenorhabditis elegans': ('WormBase', 'WormBase:'),
  'Danio rerio': ('ZFIN', 'ZFIN:'),
  'Saccharomyces cerevisiae': ('SGD', 'SGD:'),
}
# Any other organism is mapped through Ensembl.
FALLBACK_DB = ('Ensembl', 'ENSEMBL:')

# (accession, target db) -> first mapped ID, or None when nothing was mapped.
# The same accession appears on many rows; each is queried only once.
mapped_by_query = {}
n_rows = df.shape[0]

for i in range(n_rows):
  print('Going through row', i, ' out of', n_rows)

  # Only the first accession of a '|'-separated list is looked up.
  accession = df.at[i, 'ACCESSION'].split('|')[0]
  print(accession)
  org = df.at[i, 'ORGANISM']
  print(org)

  to, pre = ORGANISM_TO_DB.get(org, FALLBACK_DB)

  query = (accession, to)
  if query not in mapped_by_query:
    try:
      job_id = submit_id_mapping(
        from_db="UniProtKB_AC-ID", to_db=to, ids=[accession])
      # Reset per query so a job that returns nothing can never reuse the
      # hits of a previous row.
      hits = []
      if check_id_mapping_results_ready(job_id):
        link = get_id_mapping_results_link(job_id)
        hits = get_id_mapping_results_search(link)['results']
    except Exception as error:
      # Best effort: report the failure and leave this row's NEW_ID untouched.
      # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
      print('ID mapping failed for', accession, ':', error)
      continue
    mapped_by_query[query] = hits[0]['to'] if hits else None

  mapped = mapped_by_query[query]
  # Single-step .at assignment instead of chained df['NEW_ID'][i] = ...
  df.at[i, 'NEW_ID'] = 'NA' if mapped is None else pre + mapped

In [None]:
df

Drop rows that don't have a new ID: 

In [None]:
# Keep every row whose NEW_ID is not the 'NA' marker.
df = df[df['NEW_ID'] != 'NA']

Keep only rows whose target is in the Monarch nodes:

In [None]:
# Keep only rows whose mapped target ID appears in the nodes table.
target_in_graph = df['NEW_ID'].isin(nodes['id'])
df = df[target_in_graph]

Save the Drug-Target dataframe as .csv: 

In [None]:
df.to_csv('drug_target_edges.csv', encoding = 'utf-8-sig') 

Create and save a dataframe containing the drug nodes: 

In [None]:
# One row per distinct (drug name, structure ID) pair.
df_drugs = df[['DRUG_NAME', 'STRUCT_ID']].drop_duplicates()
df_drugs.to_csv('drug_nodes.csv', encoding='utf-8-sig')

In [None]:
df

## **Drug-Disease Information**

### **Text to CSV**

If the Drug-Disease CSV has already been saved, run only the last cell of this section (the one that reloads it); otherwise continue:

Create a Dataframe that will contain the Drug-Disease information:

In [None]:
# Columns of the drug-disease table filled by the parsing loop below.
column_names = ["DRUG_ID", "DRUG_NAME", "DISEASES", 'PHASE']

# Start with an empty frame; the parsing loop appends one row per indication.
df = pd.DataFrame(columns = column_names)

Read the .txt file downloaded from TTD and convert it into a pandas dataframe:

In [None]:
# Read the whole file into memory, one list element per line (newlines kept).
with open('drug_to_disease.txt') as ttd_file:
  lines = list(ttd_file)

In [None]:
# Walk the file line by line, carrying the most recent drug ID and name
# forward and emitting one record per INDICATI line. A blank line resets the
# carried values.
drug_id = ''
name = ''
records = []

for line in lines:
  if line.startswith('DRUGNAME'):
    # Everything between the first tab and the end of the line.
    name = re.search(r'\t(.*)\n', line).group(1)
  elif line.startswith('TTDDRUID'):
    drug_id = re.search(r'\t(.*)\n', line).group(1)
  elif line.startswith('INDICATI'):
    # Raw strings: '\[' and '\]' are invalid escapes in a normal literal.
    disease_match = re.search(r'\t(.*)\[', line)  # first tab up to the last '['
    phase_match = re.search(r'\](.*)', line)      # rest of the line after the first ']'
    records.append([drug_id, name, disease_match.group(1), phase_match.group(1)])
  elif line.startswith('\n'):
    drug_id = ''
    name = ''

# Build the frame once from the collected records rather than growing it row
# by row with df.loc[i], which re-allocates on every insertion.
df = pd.DataFrame(records, columns=["DRUG_ID", "DRUG_NAME", "DISEASES", 'PHASE'])


In [None]:
df

In [None]:
df.to_csv('drug_to_disease.csv', encoding = 'utf-8-sig') 

Run only this cell if you have already saved the Drug-Disease interactions: 

In [None]:
df = pd.read_csv('drug_to_disease.csv', header=0, index_col=0)

In [None]:
df

### **Load CSVs**

Load the drug nodes (obtained from Drug Central) and the Monarch nodes: 

In [None]:
graph_drugs = pd.read_csv('drug_nodes.csv', header=0)
nodes = pd.read_csv('graph_nodes_v2022-01-11.csv', header=0)

Get list of unique drugs and diseases:

In [None]:
# Nodes whose semantic group is 'DISO' are the disease nodes.
graph_diseases = nodes[nodes['semantic_groups'] == 'DISO']
unique_dis_id = graph_diseases['id'].unique()
# Distinct disease names, lower-cased.
unique_diseases = [
  disease_name.lower() for disease_name in graph_diseases['name'].unique()
]

In [None]:
# Distinct drug names from the drug nodes file, lower-cased.
unique_drugs = [
  drug_name.lower() for drug_name in graph_drugs['DRUG_NAME'].unique()
]
len(unique_drugs)

At this point, use the SORTA tool (https://sorta.molgeniscloud.org/menu/main/sorta?) to match the TTD disease entries to Human Phenotype Ontology (HPO) IDs. The resulting file (`matched.csv`) is also available in the project's GitHub repository:

In [None]:
matched = pd.read_csv('matched.csv', header = 0, delimiter = ';')

In [None]:
matched

Select only those IDs with a score greater than 80:

In [None]:
# Keep matches scoring above 80. Take an explicit copy: the following cells
# add and overwrite columns on `matched`, which on a filtered slice triggers
# SettingWithCopyWarning.
matched = matched[matched['score'] > 80].copy()

Create the final ID:

In [None]:
# Keep the part of the ontology term IRI that follows '/obo/'.
matched['ID'] = matched['ontologyTermIRI'].str.split('/obo/').str[1]

In [None]:
# Replace every run of non-alphanumeric characters in the ID with a single ':'.
new_id = [re.sub("[^0-9a-zA-Z]+", ":", raw_id) for raw_id in matched['ID']]
matched['ID'] = new_id


### **Merging**

In [None]:
# Collapse every run of non-alphanumeric characters in a disease name into a
# single space, then lower-case the result.
modified = [
  re.sub("[^0-9a-zA-Z]+", " ", disease).lower() for disease in df['DISEASES']
]

In [None]:
df['Name'] = modified

In [None]:
df['Name'] = df['Name'].str.strip()
matched['Name'] = matched['Name'].str.strip()

In [None]:
# Left join on the normalised name: every row of `df` is kept, and rows
# without a counterpart in `matched` get missing values in its columns.
final = pd.merge(df, matched, on = 'Name', how = 'left')

In [None]:
final

In [None]:
final.to_csv('drug_to_disease_merged.csv', encoding = 'utf-8-sig') 

### **Comparison**

In [None]:
final = pd.read_csv('drug_to_disease_merged.csv', header=0)

In [None]:
final_filtered = final[final['score'] > 80]

In [None]:
len(final_filtered)

In [None]:
final_filtered

In [None]:
# NOTE(review): this rebinds `df` to an empty frame, but no later cell visible
# in this notebook reads `df` or `column_names` -- confirm whether this cell
# is still needed.
column_names = ["DRUG_ID", "DRUG_NAME", "DISEASES", 'PHASE']

df = pd.DataFrame(columns = column_names)

In [None]:
unique_drugs

In [None]:
# Keep only rows whose drug (case-insensitively) and disease ID both exist in
# the graph. One vectorised mask replaces dropping rows one at a time inside
# iterrows(), which rebuilt the frame on every dropped row.
drug_in_graph = final_filtered['DRUG_NAME'].str.lower().isin(unique_drugs)
disease_in_graph = final_filtered['ID'].isin(unique_dis_id)
# .copy() so later column assignments act on an independent frame.
final_filtered = final_filtered[drug_in_graph & disease_in_graph].copy()
len(final_filtered)

In [None]:
# Lower-cased drug name -> STRUCT_ID, keeping the first STRUCT_ID listed for a
# name. Both sides of the lookup are lower-cased; comparing a lower-cased name
# against the raw DRUG_NAME column finds nothing for any name that is not
# already lower-case.
struct_id_by_name = (
  graph_drugs
  .assign(name_lower=graph_drugs['DRUG_NAME'].str.lower())
  .drop_duplicates(subset='name_lower')
  .set_index('name_lower')['STRUCT_ID']
)
drug_ids = final_filtered['DRUG_NAME'].str.lower().map(struct_id_by_name)

# Fail loudly, naming the drugs, rather than with a bare IndexError.
unmatched = final_filtered.loc[drug_ids.isna(), 'DRUG_NAME'].unique()
if len(unmatched) > 0:
  raise KeyError(f'No STRUCT_ID found in drug_nodes.csv for: {list(unmatched)}')

# Replace the DRUG_ID column with the STRUCT_ID of each drug.
final_filtered = final_filtered.assign(DRUG_ID=drug_ids)
len(final_filtered)

In [None]:
final_filtered

In [None]:
final_filtered = final_filtered.drop(labels = ['Unnamed: 0', 'score'], axis =1)

In [None]:
final_filtered

In [None]:
final_filtered.to_csv('drug_disease_edges.csv', encoding = 'utf-8-sig') 