# Creating the "PREDICT" dataset based on various data sources

Version 1.0.0 (December 28th 2022). Please run the notebooks "FEATURELESS_dataset-v1.0.0.ipynb" and "TRANSCRIPT_dataset-v1.0.0" beforehand.

## Librairies

In [1]:
import pandas as pd
import numpy as np
import subprocess as sb
import os
import pickle

from multiprocessing import cpu_count
from joblib import Parallel, delayed
from time import time
from itertools import product
n_jobs=cpu_count()-2
assert n_jobs > 0 and n_jobs < cpu_count()
parallel = Parallel(n_jobs=n_jobs, backend='loky')

import sys
sys.path.insert(0, "../../src/")

import paths_global
import utils
import data_processing

## Local paths

In [2]:
## Where database files are stored
print('root_folder="%s"' % paths_global.root_folder)
## Where intermediary files are stored
print('data_folder="%s"' % paths_global.data_folder)

root_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/data/"
data_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/RECeSS/cfdr/data/"


In [3]:
predict_folder = paths_global.data_folder+"PREDICT_v1.0.0/"
sb.Popen(["mkdir", "-p", predict_folder])
## Where PREDICT dataset files are stored
print('predict_folder="%s"' % predict_folder)

predict_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/RECeSS/cfdr/data/PREDICT_v1.0.0/"


## Drug and disease identifiers

In [4]:
assert os.path.exists(paths_global.data_folder+"drugbankid2drugname.pck")
with open(paths_global.data_folder+"drugbankid2drugname.pck", "rb") as f:
    di_drugbankid2drugname = pickle.load(f)
    
assert os.path.exists(paths_global.data_folder+"omimid2diseasename.pck")
with open(paths_global.data_folder+"omimid2diseasename.pck", "rb") as f:
    di_omimid2diseasename = pickle.load(f)
    
cids_file = paths_global.data_folder+"medgenid2diseasename.pck"
if (not os.path.exists(cids_file)):
    di_medgenid2diseasename = {}
else:
    with open(cids_file, "rb") as f:
        di_medgenid2diseasename = pickle.load(f)
        
pubchem_file = paths_global.data_folder+"pubchemid2drugname.pck"
if (not os.path.exists(pubchem_file)):
    di_pubchemid2drugname = {}
else:
    with open(pubchem_file, "rb") as f:
        di_pubchemid2drugname = pickle.load(f)

## I. Matrix A : $N_S \times N_D$ of drug-disease associations

In [5]:
A = pd.read_csv(paths_global.data_folder+"FEATURELESS_v1.0.0/all_ratings.csv", index_col=0)
A

Unnamed: 0,C1851649,C0042133,C5193005,C2676676,C1704272,C4722327,C1858361,C2676676.1,C4310232,C0029456,...,C0242770,C1880129,C0022661,C0236792,C1135191,C0149516,C1835407,C0016667,C0039445,C5203670
657181,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB00010,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5311128,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB00017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5311065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
442872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
442021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
ratings_A = utils.matrix2ratings(A, "ind_id", "drug_id", "rating")

print("Sparsity = "+str(utils.compute_sparsity(A))+"%")
utils.print_dataset(ratings_A, "ind_id", "drug_id", "rating")
ratings_A.T

Sparsity = 0.35070326199346175%
Ndrugs=1599	Ndiseases=1599
8658 positive	320 negative	2547823 unknown matchings


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8968,8969,8970,8971,8972,8973,8974,8975,8976,8977
ind_id,C1851649,C0042133,C5193005,C2676676,C1704272,C4722327,C1858361,C0034013,C0014175,C1858361,...,C0014544,C0014544,C0014544,C0014544,C0014544,C0014544,C0014544,C0014544,C0014544,C5203670
drug_id,657181,657181,657181,657181,657181,657181,657181,657181,657181,DB00010,...,2310,492405,4375,3352,3373,5917,442872,442021,4843,DB12466
rating,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0


## II. Drug-drug similarity matrix

We build a larger version of the similarity matrices mentioned in the [PREDICT](https://dx.doi.org/10.1038%2Fmsb.2011.26) paper.

### II.1. Drug-drug similarity: Chemical drug-drug similarities (from structures in DrugBank or in PubChem)

This yields a dataframe `chemical_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains, when computable, drug-pairwise Tanimoto scores between SMILES fingerprints.

In [7]:
chemical_similarity_file = predict_folder+"chemical_similarity_df.csv"

if (not os.path.exists(chemical_similarity_file)):
    drugbank_chem_file = paths_global.drugbank_folder+"STRUCTURES/Structure External Links/structure links.csv"
    if (not os.path.exists(drugbank_chem_file)):
        sb.call("unzip "+path+"drugbank_all_structure_links.csv.zip", shell=True)
    chemical_based = pd.read_csv(drugbank_chem_file, sep=",", index_col=0, header=0)[["SMILES"]]
    chemical_based = chemical_based.dropna()
    ndrugs=chemical_based.shape[0]
    
    def job_function(i, j, run_id=None, seed=None):
        if (str(seed)!="None"):
            np.random.seed(seed)
        if (i<j):
            return 0.
        elif (i==j):
            return 1.
        else:
            return data_processing.smiles_similarity(chemical_based.values[i,0], chemical_based.values[j,0])

    start = time()
    print("%d drugs, %d jobs" % (ndrugs, n_jobs))
    if (n_jobs==1):
        chemical_similarity = [job_function(i, j) for i, j in product(range(ndrugs), range(ndrugs))]
    else:
        seeds = [np.random.randint(int(10e6)) for _ in range(ndrugs*ndrugs)]
        chemical_similarity = parallel(delayed(job_function)(run_id//ndrugs, run_id%ndrugs, run_id, seed) for run_id, seed in enumerate(seeds))
    
    chemical_similarity = np.array(chemical_similarity).reshape((ndrugs, ndrugs))
    chemical_similarity += chemical_similarity.T
    chemical_similarity[range(ndrugs), range(ndrugs)] = 1.
    end = time()-start

    drug_index=chemical_based.index  
    chemical_similarity_df = pd.DataFrame(chemical_similarity, index=drug_index, columns=drug_index)
    chemical_similarity_df.to_csv(chemical_similarity_file)
    print("Computation took %.2f sec. on %d jobs" % (end, n_jobs))

    drugnames = [di_drugbankid2drugname.get(cid, None) for cid in chemical_similarity_df.index]
    di_drugname2pubchemid = {di_pubchemid2drugname[k] : k for k in di_pubchemid2drugname}
    pubchem_ids = [di_drugname2pubchemid.get(d, None) for d in drugnames]
    drug_ids = [ip for ip, p in enumerate(pubchem_ids) if (p)]
    chemical_custom = chemical_similarity_df.iloc[drug_ids,drug_ids]
    chemical_custom.index = [p for p in pubchem_ids if (p)]
    chemical_custom.columns = chemical_custom.index

    chemical_custom.index = chemical_custom.index.astype(str)
    chemical_custom.columns = chemical_custom.columns.astype(str)

    chemical_custom.to_csv(chemical_similarity_file)

chemical_custom = pd.read_csv(chemical_similarity_file, index_col=0, header=0)

### II.2. Drug-drug similarity: Side effect drug-drug similarities (from reported SE in SIDER)

This yields a dataframe `se_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains, when computable, drug-pairwise Jaccard scores between lists of SIDER-reported side effects.

In [8]:
se_similarity_file = predict_folder+"se_similarity_df.csv"

if (not os.path.exists(se_similarity_file)):
    drug_names = pd.read_csv(sider_folder+"drug_names.tsv", header=None, sep="\t", 
                             index_col=0)
    drug_names.columns = ["drug_name"]
    drug_names = drug_names.to_dict()["drug_name"]
    se = pd.read_csv(sider_folder+"meddra_all_se.tsv", header=None, sep="\t", 
                           index_col=0)
    se.index = [drug_names[s] for s in se.index]
    se = se[se.columns[[2, 3]]]
    se.columns = ['MedDRA SE concept type', 'UMLS concept id for MedDRA term']
    ## preferred term, unique
    se = se.loc[se['MedDRA SE concept type'] == "PT"][se.columns[-1:]]

    print("%d drugs" % (len(se.index)))
    start = time()
    se_based = data_processing.sideeffect_similarity(se)
    end = time()-start
    print("Computation took %.2f sec." % end)
    
    drug_index=se.index
    se_similarity_df = pd.DataFrame(se_based, index=drug_index, columns=drug_index)
    
    ## Standardize drug names
    def standard_name_from_SIDER(drug):
        if (drug == "1,25(OH)2D3"):
            drug = "Calcitriol"
        elif (drug == '5-ASA'):
            drug = "Mesalazine"
        elif (drug == "18F-FDG"):
            drug = "Fludeoxyglucose (18F)"
        elif (drug == "18F-flutemetamol"):
            drug = "Flutemetamol (18F)"
        elif (drug == "4-AP"):
            drug = "Dalfampridine"
        elif (drug == '4-PBA'):
            drug = "Phenylbutyric acid"
        elif (drug == "4-methylpyrazole"):
            drug = "Fomepizole"
        elif (drug == "5-FU"):
            drug = "Fluorouracil"
        elif (drug == "5-aminolevulinic"):
            drug = "Aminolevulinic acid"
        elif (drug == "5-aza-2'-deoxycytidine"):
            drug = "Decitabine"
        elif (drug == "5-azacytidine"):
            drug = "Azacitidine"
        elif (drug == "5-fluorocytosine"):
            drug = "Flucytosine"
        elif (drug == "5-methyltetrahydrofolate"):
            drug = "Levomefolic acid"
        elif (drug == "6-thioguanine"):
            drug = "Tioguanine"
        elif (drug == "68Ga"):
            drug = "Ga 68 PSMA-11"
        elif (drug == "8-MOP"):
            drug = "Methoxsalen"
        elif (drug == "A77"):
            drug = "Teriflunomide"
        elif (drug == "ACTH(1-39"):
            drug = "Corticotropin"
        elif (drug == "AMD3100"):
            drug = "Plerixafor"
        elif (drug == "AN2690"):
            drug = "Tavaborole"
        elif (drug == 'Almeta'):
            drug = "Alclometasone"
        elif (drug == "Azarga"):
            drug = "Brinzolamide"
        elif (drug == "BCNU"):
            drug = "Carmustine"
        elif (drug == "BPTI"):
            drug = "Aprotinin"
        elif (drug == "Benicar-HCT"):
            drug = "Olmesartan"
        elif (drug == "Buscopan"):
            drug = "Butylscopolamine"
        elif (drug == "Cancidas"):
            drug = "Caspofungin"
        elif (drug == "Cantril"):
            drug = "Mepenzolate"
        elif (drug == "Colimycin"):
            drug = "Colistin"
        elif (drug == "DFMO"):
            drug = "Eflornithine"
        elif (drug == "DMSO"):
            drug = "Dimethyl sulfoxide"
        elif (drug == "Diane-35"):
            drug = "Cyproterone acetate"
        elif (drug == "EACA"):
            drug = "Aminocaproic acid"
        elif (drug == "EDTA"):
            drug = "Edetic acid"
        elif (drug == "Estrofem"):
            drug = "Estradiol"
        elif (drug == "FAMP"):
            drug = "Fludarabine"
        elif (drug == "FK463"):
            drug = "Micafungin"
        elif (drug == "FTY720"):
            drug = "Fingolimod"
        elif (drug == "FdUrd"):
            drug = "Floxuridine"
        elif (drug == "Fe(III"):
            drug = "Ferric derisomaltose"
        elif (drug == "Forteo"):
            drug = "Teriparatide"
        elif (drug == "Gd-DTPA"):
            drug = "Gadopentetic acid"
        elif (drug == "Gd-EOB-DTPA"):
            drug = "Gadoxetic acid"
        elif (drug == 'Glat'):
            drug = "Glatiramer"
        elif (drug == "Humalog"):
            drug = "Insulin lispro"
        elif (drug == "Implanon"):
            drug = "Etonogestrel"
        elif (drug == "Insulin"):
            drug = "Insulin human"
        elif (drug == "Kaluril"):
            drug = "Amiloride"
        elif (drug == 'L-threo-DOPS'):
            drug = "Droxidopa"
        elif (drug == 'LMWH'):
            drug = "Heparin"
        elif (drug == 'LY146032'):
            drug = "Daptomycin"
        elif (drug == "Lantus"):
            drug = "Insulin glargine"
        elif (drug == "Leuprorelin"):
            drug = "Leuprolide"
        elif (drug == "Locorten"):
            drug = "Flumethasone"
        elif (drug == "Lovaza"):
            drug = "Omega-3-acid ethyl esters"
        elif (drug == "Lyxumia"):
            drug = "Lixisenatide"
        elif (drug == "MDV3100"):
            drug = "Enzalutamide"
        elif (drug == "MK-462"):
            drug = "Rizatriptan"
        elif (drug == "Madopar"):
            drug = "Carbidopa"
        elif (drug == "Mersyndol"):
            drug = "Codeine"
        elif (drug == "Metrodin"):
            drug = "Urofollitropin"
        elif (drug == "MnDPDP"):
            drug = "Mangafodipir"
        elif (drug == "N-carbamylglutamate"):
            drug = "Carglumic acid"
        elif (drug == "N-methylscopolamine"):
            drug = "Methscopolamine"
        elif (drug == 'NTBC'):
            drug = "Nitisinone"
        elif (drug == "NovoLog"):
            drug = "Insulin aspart"
        elif (drug == 'NuvaRing'):
            drug = "Etonogestrel"
        elif (drug == 'Nuvocid'):
            drug = "Oritavancin"
        elif (drug == "Optison"):
            drug = "Perflutren"
        elif (drug == "PCI-32765"):
            drug = "Ibrutinib"
        elif (drug == "PEP005"):
            drug = "Ingenol mebutate"
        elif (drug == "PGE1"):
            drug = "Alprostadil"
        elif (drug == "PGE2"):
            drug = "Dinoprostone"
        elif (drug == "Paroven"):
            drug = "Oxerutins"
        elif (drug == "Photofrin"):
            drug = "Porfimer sodium"
        elif (drug == 'Prednefrin'):
            drug = "Prednisolone acetate"
        elif (drug == "Promacta"):
            drug = "Eltrombopag"
        elif (drug == "Prussian"):
            drug = "Prussian blue"
        elif (drug == "Refludan"):
            drug = "Lepirudin"
        elif (drug == "Revasc"):
            drug = "Desirudin"
        elif (drug in ["SOM230", "Signifor"]):
            drug = "Pasireotide"
        elif (drug == "SU5416"):
            drug = "Semaxanib"
        elif (drug == 'Sativex'):
            drug = "Nabiximols"
        elif (drug == "SonoVue"):
            drug = "Sulfur hexafluoride"
        elif (drug == "Stalevo"):
            drug = "Carbidopa"
        elif (drug == 'Symlin'):
            drug = "Pramlintide"
        elif (drug == "TMC207"):
            drug = "Bedaquiline"
        elif (drug == "TMC435"):
            drug = "Simeprevir"
        elif (drug == "TPGS"):
            drug = "Tocofersolan"
        elif (drug == "TR-700"):
            drug = "Tedizolid"
        elif (drug == "Tadenan"):
            drug = "Pygeum africanum bark"
        elif (drug == "Timentin"):
            drug = "Ticarcillin"
        elif (drug == "TlCl"):
            drug = "Thallous Chloride"
        elif (drug == "Triphasil"):
            drug = "Levonorgestrel"
        elif (drug == "Trisequens"):
            drug = "Estradiol"
        elif (drug == "UDCA"):
            drug = "Ursodeoxycholic acid"
        elif (drug == 'VACV'):
            drug = "Human vaccinia virus immune globulin"
        elif (drug == "Vallergan"):
            drug = "Alimemazine"
        elif (drug == "Vancocine"):
            drug = "Vancomycin"
        elif (drug == "Westcort"):
            drug = "Hydrocortisone valerate"
        elif (drug == "ZnCl2"):
            drug = "Zinc chloride"
        elif (drug == "Zyprexa"):
            drug = "Olanzapine"
        elif (drug == "acetohydroxamic"):
            drug = "Acetohydroxamic acid"
        elif (drug == "actinomycin"):
            drug = "Dactinomycin"
        elif (drug == "alendronate"):
            drug = "Alendronic acid"
        elif (drug == "alpha-methyl-p-tyrosine"):
            drug = "Racemetyrosine"
        else:
            drug = drug.split("-")[-1]
            drug = drug[0].upper()+drug[1:]
        return drug

    drugnames = list(se_similarity_df.index)
    di_drugname2pubchemid = {di_pubchemid2drugname[k] : k for k in di_pubchemid2drugname}
    pubchem_ids = [di_drugname2pubchemid.get(standard_name_from_SIDER(d), None) for d in drugnames]
    notfound_ids = [standard_name_from_SIDER(drugnames[ip]) for ip, p in enumerate(pubchem_ids) if (not p)]
    got_pubchem_ids = []
    sz = 100
    for i in range(0,len(notfound_ids),sz):
        print("%d/%d" % (i, len(notfound_ids)))
        got_pubchem_ids += get_pubchem_id(notfound_ids[i:(i+sz)])
        
    pubchem_ids = [di_drugname2pubchemid.get(standard_name_from_SIDER(d), None) for d in drugnames]
    idx_notfound=0
    for i, p in enumerate(pubchem_ids):
        if (not p):
            pubchem_ids[i] = got_pubchem_ids[idx_notfound]
            idx_notfound += 1
    print((idx_notfound, len(got_pubchem_ids)))
    
    drug_ids = [ip for ip, p in enumerate(pubchem_ids) if (p)]
    se_custom = se_similarity_df.iloc[drug_ids,drug_ids]
    se_custom.index = [p for p in pubchem_ids if (p)]
    se_custom.columns = se_custom.index

    custom = pd.read_csv(paths_global.data_folder+"FEATURELESS_v1.0.0/all_ratings.csv", index_col=0, header=0)

    ids = [str(x) for x in custom.index if (x not in se_custom.index.astype(str))]
    se_custom = pd.concat((se_custom, pd.DataFrame([], index=ids, columns=ids)))
    se_custom.index = se_custom.index.astype(str)
    se_custom.columns = se_custom.columns.astype(str)
    se_custom = se_custom.loc[~se_custom.index.duplicated()]
    se_custom = se_custom.loc[chemical_custom.index][chemical_custom.index]
    np.fill_diagonal(se_custom.values,1)

    se_custom.index = se_custom.index.astype(str)
    se_custom.columns = se_custom.columns.astype(str)
    
    se_custom.to_csv(se_similarity_file)
se_custom = pd.read_csv(se_similarity_file, header=0, index_col=0)
se_custom

Unnamed: 0,5280453,6238,450503,15950376,1727,4775,3406,4075,3385,137,...,16135415,35370,60490,23994,60854,60857,5732,5734,5735,5311507
5280453,1.000000,0.112903,0.042553,0.031250,0.149123,0.099237,0.135593,0.138743,0.143541,0.109677,...,0.166667,0.161435,0.176471,0.032967,0.125326,0.163701,0.145897,0.173010,0.164234,0.172566
6238,0.112903,1.000000,0.080000,0.145833,0.208333,0.086957,0.111111,0.056757,0.107955,0.113043,...,0.089888,0.085859,0.156627,0.020408,0.060274,0.076046,0.081169,0.068841,0.074219,0.105000
450503,0.042553,0.080000,1.000000,0.071429,0.093023,0.052632,0.063830,0.017391,0.040268,0.060241,...,0.033557,0.023392,0.076923,0.000000,0.014620,0.025316,0.024476,0.024096,0.017316,0.034286
15950376,0.031250,0.145833,0.071429,1.000000,0.142857,0.051724,0.085106,0.014409,0.033113,0.022989,...,0.033333,0.023256,0.075472,0.100000,0.014577,0.029536,0.027972,0.019920,0.021645,0.028249
1727,0.149123,0.208333,0.093023,0.142857,1.000000,0.134146,0.152778,0.103448,0.153374,0.152381,...,0.198718,0.162011,0.219178,0.023810,0.095101,0.131148,0.120275,0.116279,0.107438,0.144385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60857,0.163701,0.076046,0.025316,0.029536,0.131148,0.115830,0.102767,0.309255,0.219048,0.108392,...,0.251634,0.278481,0.144578,0.008439,0.294382,1.000000,0.380952,0.344444,0.380952,0.261538
5732,0.145897,0.081169,0.024476,0.027972,0.120275,0.104235,0.100334,0.296296,0.189041,0.098802,...,0.288690,0.250689,0.127946,0.003472,0.282787,0.380952,1.000000,0.315271,0.389189,0.284916
5734,0.173010,0.068841,0.024096,0.019920,0.116279,0.157692,0.098113,0.330337,0.200000,0.111486,...,0.312292,0.308176,0.120755,0.008032,0.348624,0.344444,0.315271,1.000000,0.391813,0.278788
5735,0.164234,0.074219,0.017316,0.021645,0.107438,0.080769,0.106122,0.273942,0.193651,0.091873,...,0.271186,0.265176,0.139918,0.004348,0.285068,0.380952,0.389189,0.391813,1.000000,0.244582


### II.3 Drug-drug similarity: (Target) sequence drug-drug similarities

This yields a dataframe `sequence_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains, when computable, average drug-pairwise alignment scores between gene target sequences. We use the normalization suggested in [Bleakley and Yamanishi (2009)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2735674/).

In [9]:
from Bio import Align
from Bio import SeqIO
from time import time

seq_similarity_file = predict_folder+"sequence_similarity_df.csv"
seq_cst_similarity_file = predict_folder+"TARGET_SIMILARITY/sequence_cst_similarity_df.txt"

if (not os.path.exists(seq_cst_similarity_file)):
    sb.Popen(("mkdir -p "+predict_folder+"TARGET_SIMILARITY/".split(" ")))
    sequence_aligner = Align.PairwiseAligner()
    fname = drugbank_folder+"TARGET SEQUENCES/Drug Target Sequences/gene.fasta"
    ## Target sequences
    target_sequences = [record.seq for record in SeqIO.parse(fname, "fasta")]
    
    ## Following the normalization suggested in Bleakley and Yamanishi (2009)
    from scipy.stats.mstats import gmean as geometric_mean
    start=time()
    sequence_CST_similarity = [sequence_aligner.score(seq, seq) for seq in target_sequences]
    end=time()-start
    print("Computation took %.2f sec." % (end))
    norm_cst = geometric_mean(sequence_CST_similarity)
    with open(seq_cst_similarity_file, "w+") as f:
        f.write(str(norm_cst))
else:
    with open(seq_cst_similarity_file, "r") as f:
        norm_cst = float(f.read())
        
if (not os.path.exists(predict_folder+"TARGET_SIMILARITY/save_sequence_similarity_df.csv")):
    sequence_aligner = Align.PairwiseAligner()
    fname = drugbank_folder+"TARGET SEQUENCES/Drug Target Sequences/gene.fasta"
    ## Target sequences
    target_sequences = [record.seq for record in SeqIO.parse(fname, "fasta")]
    ## Associated DrugBank ids
    sequence_drug_ids = sb.check_output("cat '"+fname+"' | grep '>drugbank_target|' | sed -e 's/^.*(//g' | sed -e 's/)$//g'", shell=True).decode("utf-8").split("\n")
    drug_ids_per_seq = [idx.split("; ") for idx in sequence_drug_ids]
    ## unique DrugBank ids
    drug_ids = list(set(filter(lambda x : len(x) > 0, [y for x in drug_ids_per_seq for y in x])))
    ## consider only drugbank ids in custom dataset for computational reasons
    drugnames = [di_drugbankid2drugname.get(cid, None) for cid in drug_ids]
    di_drugname2pubchemid = {di_pubchemid2drugname[k] : k for k in di_pubchemid2drugname}
    pubchem_ids = [di_drugname2pubchemid.get(d, None) for d in drugnames]
    drug_notnone_ids = [ip for ip, p in enumerate(pubchem_ids) if (p)]
    drug_ids = [drug_ids[i] for i in drug_notnone_ids]
    print(len(drug_ids))

    ndrugs=len(drug_ids)
    
    def seq_function(i, j):
        print((i,j,ndrugs))
        if (i>j):
            return 0.
        else:
            drug_i = drug_ids[i]
            drug_j = drug_ids[j]
            ## all sequences in which drug_i appears
            target_i_ids = [idx for idx, drug_ls in enumerate(drug_ids_per_seq) if (drug_i in drug_ls)]
            target_seq_i = [target_sequences[idx_i] for idx_i in target_i_ids]
            if (i==j):
                align_scores = [sequence_aligner.score(s1, s2) for s1 in target_seq_i for s2 in target_seq_i]
            else:
                ## all sequences in which drug_j appears
                target_j_ids = [idx for idx, drug_ls in enumerate(drug_ids_per_seq) if (drug_j in drug_ls)]
                target_seq_j = [target_sequences[idx_j] for idx_j in target_j_ids]
                align_scores = [sequence_aligner.score(s1, s2) for s1 in target_seq_i for s2 in target_seq_j]
            score = np.mean(align_scores) if (len(align_scores)>0) else np.nan
            return score
        
    start = time()
    if (os.path.exists(predict_folder+"TARGET_SIMILARITY/sequence_similarity.out")):
        sequence_similarity = np.loadtxt(predict_folder+"TARGET_SIMILARITY/sequence_similarity.out")
    else:
        sequence_similarity = np.zeros((ndrugs, ndrugs))
    start_i = 0
    for i in range(start_i,ndrugs):
        for j in range(i, ndrugs):
            sequence_similarity[i,j] = seq_function(i, j)
        np.savetxt(predict_folder+"TARGET_SIMILARITY/sequence_similarity.out", sequence_similarity)
    end = time()-start
    print("Computation took %.2f sec." % end)
    
    sequence_similarity = np.loadtxt(predict_folder+"TARGET_SIMILARITY/sequence_similarity.out")
    sequence_similarity_df = pd.DataFrame(sequence_similarity, index=drug_ids, columns=drug_ids)
    sequence_similarity_df.to_csv(predict_folder+"TARGET_SIMILARITY/save_sequence_similarity_df.csv")
                                  
sequence_similarity_df = pd.read_csv(predict_folder+"TARGET_SIMILARITY/save_sequence_similarity_df.csv", index_col=0)

if (not os.path.exists(seq_similarity_file)):
    diag = np.diag(sequence_similarity_df.values)
    sequence_similarity_df += sequence_similarity_df.T
    np.fill_diagonal(sequence_similarity_df.values, diag)
    ## normalisation
    sequence_similarity_df = sequence_similarity_df/norm_cst
    
    drugnames = [di_drugbankid2drugname.get(cid, None) for cid in sequence_similarity_df.index]
    di_drugname2pubchemid = {di_pubchemid2drugname[k] : k for k in di_pubchemid2drugname}
    pubchem_ids = [di_drugname2pubchemid.get(d, None) for d in drugnames]
    drug_ids = [ip for ip, p in enumerate(pubchem_ids) if (p)]
    sequence_custom = sequence_similarity_df.iloc[drug_ids,drug_ids]
    sequence_custom.index = [p for p in pubchem_ids if (p)]
    sequence_custom.columns = sequence_custom.index

    sequence_custom.index = sequence_custom.index.astype(str)
    sequence_custom.columns = sequence_custom.columns.astype(str)

    sequence_custom.to_csv(seq_similarity_file)

sequence_custom = pd.read_csv(seq_similarity_file, index_col=0)

### II.4 Drug-drug similarity: Closeness in the human PPI network drug target to drug target similarities

This yields a dataframe `network_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains, when computable, the transformed distance between drug targets in the PPI using the Floyd-Warshall algorithm. We apply to the Floyd-Warshall cost matrix the transformation suggested by the [PREDICT paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159979/).

In [10]:
network_folder = predict_folder+"NETWORK_SIMILARITY/"
network_similarity_file = predict_folder+"network_similarity_df.csv"

## 1. Mapping drugs and genes
if (not os.path.exists(network_folder+"SAVE_network_similarity_df.csv")):
    sb.Popen(("mkdir -p "+network_folder).split(" "))
    fname=paths_global.drugbank_folder+"PROTEIN IDENTIFIERS/Drug Target Identifiers/all.csv"

    protein_db = pd.read_csv(fname, index_col=5).query("Species=='Humans'")[["Gene Name", "Drug IDs"]]
    drug_protein_db = protein_db[["Drug IDs"]].groupby(level=0).apply(lambda x : "; ".join(list(sorted(set(list(x.values.flatten()))))))
    drug_gene_db = protein_db[["Gene Name"]].groupby(level=0).apply(lambda x : "; ".join(list(map(str,set(x.values.flatten())))))
    protein_db = pd.DataFrame([], index=drug_protein_db.index)
    protein_db["Drug IDs"] = drug_protein_db
    protein_db["Gene Name"] = drug_gene_db

    drug_ids = list(set([y for x in list(protein_db["Drug IDs"]) for y in x.split("; ")]))
    ## consider only drugbank ids in custom dataset for computational reasons
    drugnames = [di_drugbankid2drugname.get(cid, None) for cid in drug_ids]
    di_drugname2pubchemid = {di_pubchemid2drugname[k] : k for k in di_pubchemid2drugname}
    pubchem_ids = [di_drugname2pubchemid.get(d, None) for d in drugnames]
    drug_notnone_ids = [ip for ip, p in enumerate(pubchem_ids) if (p)]
    drug_ids = [drug_ids[i] for i in drug_notnone_ids]

    keep_ids = [ix for ix, x in enumerate(list(protein_db["Drug IDs"])) if (any([y in drug_ids for y in x.split("; ")]))]
    protein_db = protein_db.iloc[keep_ids]

    from NORDic.UTILS.STRING_utils import get_app_name_STRING, get_network_from_STRING
    network_fname = network_folder+"human_ppi_network.csv"
    ## 9606=human, retrieve PPI from STRING
    if (not os.path.exists(network_fname)):
        gene_list = list(protein_db["Gene Name"].unique())
        taxon_id=9606 #human
        app_name = get_app_name_STRING(paths_global.string_file)
        ppi = get_network_from_STRING(gene_list, taxon_id, min_score=0, network_type="functional", add_nodes=0, 
                                      app_name=app_name, version="11.5", quiet=False)
        ppi.to_csv(network_fname)
    ppi = pd.read_csv(network_fname, index_col=0)
    
    ## selection of interactions
    score_thres=0

    ## Get drug-gene associations
    gene_targets = list(protein_db["Gene Name"])

    ppi = pd.read_csv(network_fname, sep="\t", header=0, index_col=0)
    ppi_subset = ppi.loc[ppi["score"]>score_thres][["preferredName_A","preferredName_B","score"]]
    ppi_subset = ppi_subset.loc[[(a in gene_targets) and (b in gene_targets) for a,b in zip(list(ppi_subset["preferredName_A"]),list(ppi_subset["preferredName_B"]))]]
    ppi_subset["cost"] = 1-ppi_subset[["score"]].values.flatten() #cost
    ppi_subset = ppi_subset[["preferredName_A","preferredName_B","cost"]]
    ppi_subset = ppi_subset.drop_duplicates(keep="first")

    ## 1 is the maximum cost of edge
    max_cost=1.
    wam = ppi_subset.pivot(index='preferredName_A', columns='preferredName_B', values='cost').fillna(max_cost)
    for g in wam.index:
        if (g not in wam.columns):
            wam[g] = [max_cost]*wam.shape[0]
    for g in wam.columns:
        if (g not in wam.index):
            wam.loc[g] = [max_cost]*wam.shape[1]
    wam = wam.loc[wam.index][wam.index]

    ## Compute distance between target genes
    from scipy.sparse.csgraph import floyd_warshall

    dists = floyd_warshall(wam.values.copy(order='C'), directed=False)
    dists_df = pd.DataFrame(dists, index=wam.index, columns=wam.columns)

    A_mat, b_mat = 0.9*np.exp(1), 1.
    network_similarity = A_mat*np.exp(-b_mat*dists_df)
    network_similarity /= np.max(network_similarity)

    network_similarity_1 = pd.DataFrame(0., index=drug_ids, columns=network_similarity.columns)
    for drug in drug_ids:
        keep_drug_ids = [idx for idx in protein_db.index if (drug in protein_db.loc[idx]["Drug IDs"])]
        targeted_genes = [x for x in list(protein_db.loc[keep_drug_ids]["Gene Name"]) if (x in network_similarity.index)]
        network_similarity_1.loc[drug] = network_similarity.loc[targeted_genes].mean(axis=0)
        
    network_similarity_df = pd.DataFrame([], index=drug_ids)
    for drug in drug_ids:
        keep_drug_ids = [idx for idx in protein_db.index if (drug in protein_db.loc[idx]["Drug IDs"])]
        targeted_genes = [x for x in list(protein_db.loc[keep_drug_ids]["Gene Name"]) if (x in network_similarity.index)]
        network_similarity_df[drug] = network_similarity_1[targeted_genes].mean(axis=1)
        
    network_similarity_df.to_csv(network_folder+"SAVE_network_similarity_df.csv")
    
network_similarity_df = pd.read_csv(network_folder+"SAVE_network_similarity_df.csv", index_col=0)

if (not os.path.exists(network_similarity_file)):
    drugnames = [di_drugbankid2drugname.get(cid, None) for cid in network_similarity_df.index]
    di_drugname2pubchemid = {di_pubchemid2drugname[k] : k for k in di_pubchemid2drugname}
    pubchem_ids = [di_drugname2pubchemid.get(d, None) for d in drugnames]
    drug_ids = [ip for ip, p in enumerate(pubchem_ids) if (p)]
    network_custom = network_similarity_df.iloc[drug_ids,drug_ids]
    network_custom.index = [p for p in pubchem_ids if (p)]
    network_custom.columns = network_custom.index

    network_custom.index = network_custom.index.astype(str)
    network_custom.columns = network_custom.columns.astype(str)

    sb.call("mkdir -p "+data_folder+"custom_PREDICT/", shell=True)
    network_similarity_file = data_folder+"custom_PREDICT/network_similarity_df.csv"
    network_custom.to_csv(network_similarity_file)
    
network_custom = pd.read_csv(network_similarity_file, index_col=0)

### II.5 Drug-drug similarity: Gene Onthology drug-drug similarities

This yields a dataframe `go_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains the similarity between Gene Onthology annotations.

In [11]:
go_similarity_file = predict_folder+"go_similarity_df.csv"
    
if (not os.path.exists(go_similarity_file)):
    ## Get drug-protein-gene associations
    fname=paths_global.drugbank_folder+"PROTEIN IDENTIFIERS/Drug Target Identifiers/all.csv"

    protein_db = pd.read_csv(fname, index_col=5).query("Species=='Humans'")[["Gene Name", "Drug IDs"]]
    drug_protein_db = protein_db[["Drug IDs"]].groupby(level=0).apply(lambda x : "; ".join(list(sorted(set(list(x.values.flatten()))))))
    drug_gene_db = protein_db[["Gene Name"]].groupby(level=0).apply(lambda x : "; ".join(list(map(str,set(x.values.flatten())))))
    protein_db = pd.DataFrame([], index=drug_protein_db.index)
    protein_db["Drug IDs"] = drug_protein_db
    protein_db["Gene Name"] = drug_gene_db

    drug_ids = list(set([y for x in list(protein_db["Drug IDs"]) for y in x.split("; ")]))
    ## consider only drugbank ids in custom dataset for computational reasons
    drugnames = [di_drugbankid2drugname.get(cid, None) for cid in drug_ids]
    di_drugname2pubchemid = {di_pubchemid2drugname[k] : k for k in di_pubchemid2drugname}
    pubchem_ids = [di_drugname2pubchemid.get(d, None) for d in drugnames]
    drug_notnone_ids = [ip for ip, p in enumerate(pubchem_ids) if (p)]
    drug_ids = [drug_ids[i] for i in drug_notnone_ids]

    keep_ids = [ix for ix, x in enumerate(list(protein_db["Drug IDs"])) if (any([y in drug_ids for y in x.split("; ")]))]
    protein_db = protein_db.iloc[keep_ids]

    go_sim_fname = "go_similarity"
    if (not os.path.exists(predict_folder+"GO_SIMILARITY/"+go_sim_fname+".csv")):
        sb.Popen(("mkdir -p "+predict_folder+"GO_SIMILARITY/").split(" "))
        proteins = list(set(list(protein_db.index)))
        with open("proteins.txt", "w+") as f:
            f.write(",".join(proteins))
        process = sb.Popen(["Rscript", "../src/Rscript_GOsim.R"])
        process.wait()
        sb.Popen(["mv", "go_similarity.csv", "go_similarity_MF.csv", 
                  "go_similarity_CC.csv", "go_similarity_BP.csv", predict_folder+"GO_SIMILARITY/"])
    ## annotated by protein        
    go_similarity = pd.read_csv(predict_folder+"GO_SIMILARITY/"+go_sim_fname+".csv", index_col=0)

    go_similarity_mat = go_similarity.fillna(0.)

    go_similarity_1 = pd.DataFrame([], index=drug_ids, columns=go_similarity_mat.columns)
    for drug in drug_ids:
        keep_drug_ids = []
        for idx in protein_db.index[3:]:
            df = protein_db.loc[idx]["Drug IDs"]
            if ("str" in str(type(df))):
                drs = df.split("; ")
            else:
                drs = [x for ls in list(df) for x in ls.split("; ")]
            if (drug in drs):
                keep_drug_ids.append(idx)
        keep_drug_ids = list(set(keep_drug_ids))
        if (len(keep_drug_ids)==0):
            continue
        go_similarity_1.loc[drug] = list(go_similarity_mat.loc[keep_drug_ids].mean(axis=0))
        
    ## annotated by DrugBank ID 
    go_similarity_df = pd.DataFrame([], index=drug_ids)
    for drug in drug_ids:
        keep_drug_ids = []
        for idx in protein_db.index[3:]:
            df = protein_db.loc[idx]["Drug IDs"]
            if ("str" in str(type(df))):
                drs = df.split("; ")
            else:
                drs = [x for ls in list(df) for x in ls.split("; ")]
            if (drug in drs):
                keep_drug_ids.append(idx)
        keep_drug_ids = list(set(keep_drug_ids))
        if (len(keep_drug_ids)==0):
            continue
        go_similarity_df[drug] = go_similarity_1[keep_drug_ids].mean(axis=1)

    drugnames = [di_drugbankid2drugname.get(cid, None) for cid in go_similarity_df.index]
    di_drugname2pubchemid = {di_pubchemid2drugname[k] : k for k in di_pubchemid2drugname}
    pubchem_ids = [di_drugname2pubchemid.get(d, None) for d in drugnames]
    drug_ids = [ip for ip, p in enumerate(pubchem_ids) if (p)]
    go_custom = go_similarity_df.iloc[drug_ids,drug_ids]
    go_custom.index = [p for p in pubchem_ids if (p)]
    go_custom.columns = go_custom.index

    ## annotated by PubChem CID mainly
    go_custom.index = go_custom.index.astype(str)
    go_custom.columns = go_custom.columns.astype(str)

    go_custom.to_csv(go_similarity_file)
    
go_custom = pd.read_csv(go_similarity_file, index_col=0)

### II.6 Drug-drug similarity 6: Transcriptomic-based

"Given genetic signatures of diseases obtained from gene expression experiments, we used a Jaccard score between every pair of signatures, taking into account the direction of the response of each gene. That is, the total number of mutual upregulated genes and mutual downregulated genes over the unified list of all genes. Signature genes with inconsistent regulation directionality for the same disease across various experiments (i.e., registered as both upregulated and downregulated across various experiments for the same disease) were filtered, allowing for up to 10% expression measurement errors." 

This yields a dataframe `go_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains the similarity between Gene Onthology annotations.

Signatures were obtained in the notebook TRANSCRIPT_dataset-v1.0.0.ipynb.

In [12]:
if (not os.path.exists(predict_folder+"signature_similarity_df.csv")):

    from sklearn.metrics import pairwise_distances

    S = pd.read_csv(paths_global.data_folder+"TRANSCRIPT_v1.0.0/all_drugs_+LINCS.csv", index_col=0, header=0)
    ## Look only at the direction of differential expression
    S[S<0] = -1
    S[S>0] = 1
    # sum of mutual up/down-regulated genes divided by total number of genes
    Dists = (S.shape[0]-pairwise_distances(S.T, metric='l1'))/S.shape[0] 
    signature_custom = pd.DataFrame(Dists, index=S.columns, columns=S.columns)
    
    signature_custom.index = signature_custom.index.astype(str)
    signature_custom.columns = signature_custom.columns.astype(str)
    
    signature_custom.to_csv(predict_folder+"signature_similarity_df.csv")
    
signature_custom = pd.read_csv(predict_folder+"signature_similarity_df.csv", index_col=0)
signature_custom

Unnamed: 0,36314,11354606,679,2554,126941,3033,444795,5757,19649,6279,...,5281078,16129706,2310,4375,3352,3373,5917,442872,442021,4843
36314,1.000000,0.922423,0.917092,0.917092,0.919255,0.927368,0.922037,0.926905,0.918251,0.918251,...,0.205069,0.214495,0.210478,0.216195,0.216350,0.207078,0.218359,0.205378,0.211096,0.213723
11354606,0.922423,1.000000,0.917246,0.917246,0.917710,0.918869,0.919873,0.926905,0.913769,0.913769,...,0.207696,0.208623,0.212023,0.215423,0.210168,0.213877,0.213414,0.211250,0.213723,0.209241
679,0.917092,0.917246,1.000000,1.000000,0.924587,0.920028,0.919487,0.927755,0.915237,0.915237,...,0.222145,0.213182,0.209628,0.214418,0.219363,0.214264,0.213027,0.212255,0.213027,0.212718
2554,0.917092,0.917246,1.000000,1.000000,0.924587,0.920028,0.919487,0.927755,0.915237,0.915237,...,0.222145,0.213182,0.209628,0.214418,0.219363,0.214264,0.213027,0.212255,0.213027,0.212718
126941,0.919255,0.917710,0.924587,0.924587,1.000000,0.923582,0.919332,0.928991,0.916783,0.916783,...,0.224308,0.215500,0.214573,0.216118,0.209473,0.214109,0.214418,0.213027,0.217663,0.210709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3373,0.207078,0.213877,0.214264,0.214264,0.214109,0.209396,0.218127,0.215268,0.205223,0.205223,...,0.186215,0.305362,0.331479,0.281409,0.159017,1.000000,0.329161,0.214650,0.259002,0.189306
5917,0.218359,0.213414,0.213027,0.213027,0.214418,0.219286,0.219518,0.221140,0.211405,0.211405,...,0.205532,0.241539,0.248802,0.298563,0.160408,0.329161,1.000000,0.231494,0.188688,0.155772
442872,0.205378,0.211250,0.212255,0.212255,0.213027,0.212950,0.215809,0.220522,0.207850,0.207850,...,0.252357,0.260547,0.208159,0.257302,0.156854,0.214650,0.231494,1.000000,0.242621,0.171380
442021,0.211096,0.213723,0.213027,0.213027,0.217663,0.219595,0.212409,0.221913,0.214341,0.214341,...,0.251584,0.219595,0.246948,0.237058,0.225931,0.259002,0.188688,0.242621,1.000000,0.266110


## III. Disease-disease similarity database

### III.1 Disease-disease similarity: Phenotype disease-disease similarities

Identifying similarity between MeSH terms appearing in the medical description of diseases from the OMIM database. 

"We used the phenotypic similarity constructed by [van Driel et al (2006)](https://www.nature.com/articles/5201585.pdf). The phenotypic similarity was constructed by identifying similarity between MeSH terms (Lipscomb, 2000) appearing in the medical description of diseases from the OMIM database (Hamosh et al, 2002)". We actually use a posterior package for computing similarities, described in [this paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-020-03583-6).

In order to run this code, you need to sign up to [a UMLS Terminology Services Account](https://uts.nlm.nih.gov/uts/signup-login) (with your academic e-mail address for instance). The review of the registration application takes up to 3 business days. Then
1. Download file [public_mm_linux_main_2020.tar.bz2](https://data.lhncbc.nlm.nih.gov/umls-restricted/ii/tools/MetaMap/download/public_mm_linux_main_2020.tar.bz2) once logged in
2. Place that file in folder `~/`

For more information about MetaMap, please check the [documentation](https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/README.html#Getting%20the%20distribution).

This yields a dataframe `disease_phenotype_similarity_df.csv` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains the similarity between disease phenotype annotations.

In [13]:
if (not os.path.exists(predict_folder+"disease_phenotype_similarity_df.csv")):

    A = pd.read_csv(paths_global.data_folder+"FEATURELESS_v1.0.0/all_ratings.csv", index_col=0)
    sb.Popen(["mkdir", "-p", predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/"])

    ## Retrieve MeSH terms from MedGen Concept IDs
    if (not os.path.exists(predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/disease_phenotype_MeshTERMS.out")):
        with open("diseases.txt", "w+") as f:
            f.write(','.join(list(A.columns)))  
        process = sb.Popen("bash ../src/Bashscript_PHENOsim.sh")
        process.wait()
        sb.Popen(["mv", "disease_phenotype_MeshTERMS.out", predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/"])

    disease_MeshTERMS = pd.read_csv(predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/disease_phenotype_MeshTERMS.out", index_col=None, header=None)
    disease_MeshTERMS.columns = ["Disease UMLS","MeSH ID"]
    disease_MeshTERMS = disease_MeshTERMS.loc[disease_MeshTERMS["MeSH ID"].astype(str)!=" None"]
    disease_MeshTERMS["MeSH ID"] = [x[2:-1] for x in list(disease_MeshTERMS["MeSH ID"])]
    disease_MeshTERMS.index = list(disease_MeshTERMS["MeSH ID"])
    disease_MeshTERMS["Disease UMLS"] = [x[1:-1] for x in list(disease_MeshTERMS["Disease UMLS"])]
    mesh_terms = list(disease_MeshTERMS["MeSH ID"])

    ## Compute MeSH term similarity
    if (not os.path.exists(predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/disease_phenotype_similarity.out")):
        method, category = "lin", "C"
        cmd = ["source ~/miniconda3/etc/profile.d/conda.sh"]
        cmd += ["conda activate pysim"]
        cmd += ["~/public_mm/bin/skrmedpostctl start"]
        cmd += ["~/public_mm/bin/wsdserverctl start"]
        cmd_py = ["diseases_=[\""+"\",\"".join(mesh_terms)+"\"]"]
        cmd_py += ["from pyMeSHSim.Sim.similarity import termComp"]
        cmd_py += ["import pandas as pd"]
        cmd_py += ["import numpy as np"]
        cmd_py += ["simCom = termComp()"]
        cmd_py += ['calcSim = lambda xii,xjj: simCom.termSim(dui1=xii, dui2=xjj, method=\"'+method+'\", category=\"'+category+'\")']
        cmd_py += ['exec("def calcSimWrapper(xii, xjj):\\n\\ttry:\\n\\t\\treturn calcSim(xii,xjj)\\n\\texcept:\\n\\t\\treturn 0\\n")']
        cmd_py += ["simmat = pd.DataFrame([],index=diseases_)"]
        cmd_py += ['exec("for i, xi in enumerate(list(set(diseases_))):\\n\\tsimmat[xi] = [0]*(i+1)+[calcSimWrapper(xi,xj) for j, xj in enumerate(diseases_[(i+1):])]\\n\\n")']
        cmd_py += ['simmat.to_csv("disease_phenotype_similarity.out")']
        cmd += ["python3 -c \'"+";".join(cmd_py)+"\'"]
        cmd += ["~/public_mm/bin/wsdserverctl stop"]
        cmd += ["~/public_mm/bin/skrmedpostctl stop"]
        cmd += ["conda deactivate"]
        sb.Popen([y for x in cmd for y in x.split(" ")+[";"]])
        sb.Popen(["mv", "disease_phenotype_similarity.out", predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/"])

    simmat = pd.read_csv(predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/disease_phenotype_similarity.out", index_col=0)
    disease_phenotype_similarity_df = simmat[mesh_terms]
    disease_phenotype_similarity_df += disease_phenotype_similarity_df.T
    np.fill_diagonal(disease_phenotype_similarity_df.values,1)

    disease_id = [str(disease_MeshTERMS.loc[x]["Disease UMLS"]) for x in mesh_terms]
    disease_phenotype_similarity_df.index = disease_id
    disease_phenotype_similarity_df.columns = disease_id

    disease_phenotype_similarity_df.index = disease_phenotype_similarity_df.index.astype(str)
    disease_phenotype_similarity_df.columns = disease_phenotype_similarity_df.columns.astype(str)
    
    disease_phenotype_similarity_df.to_csv(predict_folder+"disease_phenotype_similarity_df.csv")

disease_phenotype_custom = pd.read_csv(predict_folder+"disease_phenotype_similarity_df.csv", index_col=0)
disease_phenotype_custom

Unnamed: 0,C0042133,C1704272,C1858361,C4310232,C0029456,C1851377,C0270327,C1833268,C0024141,C1852092,...,C0012813,C0014474,C1845903,C1956346,C0242770,C0022661,C0236792,C1135191,C0016667,C0039445
C0042133,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.316805,0.000000,0.000000,0.0,0.000000,0.0,0.00000,0.000000,0.000000
C1704272,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.307626,0.307626,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.388069,0.0,0.00000,0.000000,0.000000
C1858361,0.0,0.000000,0.000000,0.000000,0.422435,0.000000,0.000000,0.000000,0.390624,0.000000,...,0.398025,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00000,0.000000,0.000000
C4310232,0.0,0.000000,0.000000,0.000000,0.430398,0.000000,0.000000,0.000000,0.000000,0.432798,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00000,0.000000,0.000000
C0029456,0.0,0.000000,0.422435,0.430398,0.000000,0.000000,0.000000,0.000000,0.000000,0.477832,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0022661,0.0,0.388069,0.000000,0.000000,0.000000,0.000000,0.427991,0.427991,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00000,0.000000,0.000000
C0236792,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.00000,0.000000,0.000000
C1135191,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.391671,0.0,0.000000,0.0,0.00000,0.000000,0.245870
C0016667,0.0,0.000000,0.000000,0.000000,0.000000,0.481842,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.839769,0.000000,0.0,0.000000,0.0,0.00000,0.000000,0.401891


### III.2 Disease-disease similarity: Semantic phenotypic disease-disease similarities 

"Hierarchical structure of the HPO together with the mapping provided by HPO between ontology nodes and OMIM diseases to construct a semantic similarity score based on Resnik."

This yields a dataframe `disease_semantic_similarity_df.csv` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains the semantic similarity score between disease ontologies. We use scripts from the following [repository](https://github.com/DGRC-PT/HPOSim_Helper_Scripts).

In [14]:
semantic_folder = predict_folder+"DISEASE_SEMANTIC_SIMILARITY/"
sb.Popen(("mkdir -p "+semantic_folder).split(" "))

if (not os.path.exists(predict_folder+"disease_semantic_similarity_df.csv")):

    A = pd.read_csv(paths_global.data_folder+"FEATURELESS_v1.0.0/all_ratings.csv", index_col=0)
    diseases = list(A.columns) # MedGen Concept IDs
    disease_list = list(set([di_medgenid2diseasename[disease.split(".")[0]] for disease in diseases]))

    inv_map = {v:k for k, v in di_omimid2diseasename.items()}
    disease_list_ = ["OMIM:"+(inv_map.get(d)[1:]) for d in disease_list if (d in inv_map)]

    if (not os.path.exists(semantic_folder+"disease_semantic_similarity.out")):

        process = sb.Popen("bash ../src/Bashscript_SEMANTICsim.sh".split(" "))
        process.wait()

        with open("diseases.txt", "w") as f:
            f.write(",".join(disease_list_)+"\n")

        sb.Popen(("mkdir -p "+semantic_folder).split(" "))
        cmd_R = ["library(HPOSim);ls<-unlist(read.delim(\"diseases.txt\", sep=\",\", header=F))"]
        cmd_R += ["mat<-getDiseaseListSim(ls,combinemethod=\"funSimMax\",method=\"Resnik\",verbose=TRUE)"]
        cmd_R += ["write.csv(mat,\"disease_semantic_similarity.out\")"]
        sb.Popen("R -e '"+(";".join(cmd_R)))

        sb.Popen("rm diseases.txt".split(" "))
        sb.Popen(("mv disease_semantic_similarity.out "+semantic_folder).split(" "))

    disease_semantic_similarity_df = pd.read_csv(semantic_folder+"disease_semantic_similarity.out", index_col=0)
    disease_semantic_similarity_df.index = omim_list_
    disease_semantic_similarity_df.columns = omim_list_
    disease_list_ = [di_omimid2diseasename["D"+x.split("OMIM:")[-1]] for x in omim_list_]
    invert_map = {v: k for k,v in di_medgenid2diseasename.items()}
    disease_list_ = [invert_map.get(d,None) for d in disease_list_ if (invert_map.get(d,None) is not None)]
    disease_semantic_similarity_df.index = disease_list_
    disease_semantic_similarity_df.columns = disease_list_
    disease_semantic_custom = disease_semantic_similarity_df
    disease_semantic_custom.index = disease_semantic_custom.index.astype(str)
    disease_semantic_custom.columns = disease_semantic_custom.columns.astype(str)
    disease_semantic_custom.to_csv(predict_folder+"disease_semantic_similarity_df.csv")
    
disease_semantic_similarity_df = pd.read_csv(predict_folder+"disease_semantic_similarity_df.csv", index_col=0)
disease_semantic_similarity_df

Unnamed: 0,C0008732,C1832855,C1868001,C1852223,C1843771,C1852092,C0270853,CN263178,C2750850,C1840264,...,C1840333,C5399971,C1861864,C1840172,C1850141,C1835047,C1843765,C1851055,C1840373,C3714927
C0008732,6.385549,0.000000,0.000000,4.405463,3.836702,4.405463,4.405463,0.000000,4.320906,2.202732,...,5.589068,3.031711,4.426020,3.070166,3.492540,0,4.405463,2.226376,3.086376,4.145003
C1832855,0.000000,7.439855,2.409154,0.000000,0.000000,0.000000,0.000000,1.606103,0.000000,0.000000,...,0.000000,0.000000,0.000000,5.169734,0.000000,0,0.000000,0.000000,0.000000,0.000000
C1868001,0.000000,2.409154,5.359532,0.000000,2.520726,0.000000,0.000000,2.409154,0.000000,0.000000,...,0.000000,2.520726,0.000000,3.619338,0.000000,0,0.000000,0.000000,2.520726,0.000000
C1852223,4.405463,0.000000,0.000000,3.421025,4.405463,4.579616,2.088279,0.000000,0.000000,2.088279,...,0.000000,4.405463,0.000000,0.000000,4.405463,0,4.579616,0.000000,4.405463,4.405463
C1843771,3.836702,0.000000,2.520726,4.405463,5.456336,4.405463,4.405463,2.077075,4.320906,4.282263,...,0.607193,4.614710,0.556594,3.020263,2.832083,0,4.884694,2.226376,4.085042,3.145428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C1835047,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,0.000000
C1843765,4.405463,0.000000,0.000000,4.579616,4.884694,4.798484,4.405463,0.000000,0.000000,2.667575,...,0.000000,4.884694,0.000000,0.000000,4.405463,0,6.267000,0.000000,4.884694,4.405463
C1851055,2.226376,0.000000,0.000000,0.000000,2.226376,0.000000,0.000000,0.000000,0.000000,0.000000,...,2.226376,2.226376,2.226376,4.401776,3.659651,0,0.000000,6.672326,3.659651,2.226376
C1840373,3.086376,0.000000,2.520726,4.405463,4.085042,4.405463,4.405463,1.005913,0.000000,4.282263,...,4.690214,3.350557,3.519859,1.994234,4.862964,0,4.884694,3.659651,4.559734,2.879483


### III.3 Disease-disease similarity: Genetic based disease-disease similarities

"Given genetic signatures of diseases obtained from gene expression experiments, we used a Jaccard score between every pair of signatures, taking into account the direction of the response of each gene. That is, the total number of mutual upregulated genes and mutual downregulated genes over the unified list of all genes. Signature genes with inconsistent regulation directionality for the same disease across various experiments (*i*.*e*., registered as both upregulated and downregulated across various experiments for the same disease) were filtered, allowing for up to 10% expression measurement errors." 

Transcriptomic phenotypes were obtained in the notebook *TRANSCRIPT_dataset.ipynb*.

This yields a dataframe `disease_genetic_similarity_df.csv` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains the Jaccard similarity score between disease transcriptomic phenotypes.

Similar to Section [**II.6 Drug-drug similarity 6: Transcriptomic-based**](http://localhost:8888/notebooks/PREDICT_dataset.ipynb#II.6-Drug-drug-similarity-6:-Transcriptomic-based).

In [15]:
if (not os.path.exists(predict_folder+"disease_signature_similarity_df.csv")):

    from sklearn.metrics import pairwise_distances

    P = pd.read_csv(paths_global.data_folder+"TRANSCRIPT_v1.0.0/all_diseases.csv", index_col=0, header=0)
    ## Look only at the direction of differential expression
    P[P<0] = -1
    P[P>0] = 1
    # sum of mutual up/down-regulated genes divided by total number of genes
    Dists = (P.shape[0]-pairwise_distances(P.T, metric='l1'))/P.shape[0] 
    disease_signature_custom = pd.DataFrame(Dists, index=P.columns, columns=P.columns)
    disease_signature_custom.index = disease_signature_custom.index.astype(str)
    disease_signature_custom.columns = disease_signature_custom.columns.astype(str)
    disease_signature_custom.to_csv(predict_folder+"disease_signature_similarity_df.csv")

disease_signature_custom = pd.read_csv(predict_folder+"disease_signature_similarity_df.csv", index_col=0)
disease_signature_custom

Unnamed: 0,C2936783,C2239176,C3553462,C0035235,C0032285,C0010346,C0009324,C0029408,C0001973,C3495559,...,C1260899,C1527336,C0275804,C0017168,C0003615,C0003872,C0014544,C0040028,C0040034,C0018802
C2936783,1.000000,1.000000,0.913972,0.913972,0.920584,0.928940,0.928940,0.925816,0.927487,0.924435,...,0.924508,0.930102,0.933953,0.926760,0.931992,0.914554,0.926687,0.918768,0.930538,0.929449
C2239176,1.000000,1.000000,0.913972,0.913972,0.920584,0.928940,0.928940,0.925816,0.927487,0.924435,...,0.924508,0.930102,0.933953,0.926760,0.931992,0.914554,0.926687,0.918768,0.930538,0.929449
C3553462,0.913972,0.913972,1.000000,1.000000,0.913900,0.914699,0.913246,0.913173,0.923418,0.913246,...,0.913754,0.914699,0.928141,0.913536,0.913100,0.915716,0.914045,0.913391,0.925452,0.923491
C0035235,0.913972,0.913972,1.000000,1.000000,0.913900,0.914699,0.913246,0.913173,0.923418,0.913246,...,0.913754,0.914699,0.928141,0.913536,0.913100,0.915716,0.914045,0.913391,0.925452,0.923491
C0032285,0.920584,0.920584,0.913900,0.913900,1.000000,0.921456,0.921311,0.917024,0.927559,0.919712,...,0.922982,0.920294,0.931846,0.921601,0.922183,0.919858,0.921819,0.923491,0.927850,0.926179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0003872,0.914554,0.914554,0.915716,0.915716,0.919858,0.915280,0.914554,0.913609,0.923563,0.921529,...,0.914336,0.915280,0.928577,0.913100,0.915861,1.000000,0.918114,0.915571,0.925888,0.924217
C0014544,0.926687,0.926687,0.914045,0.914045,0.921819,0.930902,0.921601,0.929085,0.931047,0.932936,...,0.925307,0.926833,0.934026,0.923926,0.930902,0.918114,1.000000,0.921601,0.931192,0.929521
C0040028,0.918768,0.918768,0.913391,0.913391,0.923491,0.924580,0.921093,0.922328,0.928795,0.922401,...,0.924798,0.923127,0.931338,0.924726,0.922110,0.915571,0.921601,1.000000,0.927777,0.926978
C0040034,0.930538,0.930538,0.925452,0.925452,0.927850,0.928213,0.930538,0.929884,0.942454,0.934607,...,0.928722,0.929085,0.948194,0.929521,0.933735,0.925888,0.931192,0.927777,1.000000,0.941510


### III.4 Disease-disease similarity: Gene sequence based disease-disease similarities

This yields a dataframe `disease_sequence_similarity_df.csv` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains, when computable, average drug-pairwise alignment scores between gene target sequences. We use the normalization suggested in [Bleakley and Yamanishi (2009)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2735674/).

Similar to Section [**II.3 Drug-drug similarity 3: (Target) sequence drug-drug similarities**](http://localhost:8888/notebooks/PREDICT_dataset.ipynb#II.3-Drug-drug-similarity-3:-(Target)-sequence-drug-drug-similarities). If one truly wants an equivalent to the computations made for drugs, they should use gene targets retrieved from DisGeNet.

In [16]:
sb.Popen(["mkdir", "-p", predict_folder+"DISEASE_TARGET_SIMILARITY/"])

from Bio import Align
from Bio import SeqIO
from time import time

seq_cst_similarity_file = predict_folder+"DISEASE_TARGET_SIMILARITY/disease_sequence_cst_similarity_df.txt"
if (not os.path.exists(seq_cst_similarity_file)):
    sequence_aligner = Align.PairwiseAligner()
    fname = drugbank_folder+"TARGET SEQUENCES/Drug Target Sequences/gene.fasta"
    ## Target sequences
    target_sequences = [record.seq for record in SeqIO.parse(fname, "fasta")]
    
    ## Following the normalization suggested in Bleakley and Yamanishi (2009)
    from scipy.stats.mstats import gmean as geometric_mean
    start=time()
    sequence_CST_similarity = [sequence_aligner.score(seq, seq) for seq in target_sequences]
    end=time()-end
    print("Computation took %.2f sec." % (end))
    norm_cst = geometric_mean(sequence_CST_similarity)
    with open(seq_cst_similarity_file, "w+") as f:
        f.write(str(norm_cst))
else:
    with open(seq_cst_similarity_file, "r") as f:
        norm_cst = float(f.read())
        
seq_fname = paths_global.drugbank_folder+"TARGET SEQUENCES/Drug Target Sequences/gene.fasta"
proteins_list = sb.check_output("cat '"+seq_fname+"' | grep '>drugbank_target|' | cut -d' ' -f1 | cut -d'|' -f2", shell=True).decode("utf-8").split("\n")[:-1]

sb.Popen(["mkdir", "-p", predict_folder+"DisGeNet/"])

if (not os.path.exists(predict_folder+"DisGeNet/disease_db_DisGeNet.csv")):
    
    from NORDic.UTILS.DISGENET_utils import get_user_key_DISGENET, get_genes_proteins_from_DISGENET
    A = pd.read_csv(paths_global.data_folder+"FEATURELESS_v1.0.0/all_ratings.csv", index_col=0)
    disease_list = list(A.columns)
    
    user_key = get_user_key_DISGENET(paths_global.disgenet_file)
    disease_db = get_genes_proteins_from_DISGENET(disease_list, limit=3000, source="CURATED", min_score=0.35, 
                                min_ei=1., min_dsi=0.25, min_dpi=0, chunksize=100, user_key=user_key, quiet=False)
    
    disease_db.to_csv(predict_folder+"DisGeNet/disease_db_DisGeNet.csv")
    
disease_db = pd.read_csv(predict_folder+"DisGeNet/disease_db_DisGeNet.csv", index_col=0)

if (not os.path.exists(predict_folder+"disease_sequence_similarity_df.csv")):
    sequence_aligner = Align.PairwiseAligner()
    fname = paths_global.drugbank_folder+"TARGET SEQUENCES/Drug Target Sequences/gene.fasta"
    ## Target sequences
    target_sequences = [record.seq for record in SeqIO.parse(fname, "fasta")]
    ndiseases=disease_db.shape[0]
    
    precompute_scores = {}
    def compute_score(si,sj):
        return precompute_scores.get((si,sj), sequence_aligner.score(si, sj))
    
    def disease_seq_function(i, j):
        print((i,j,ndiseases))
        if (i>j):
            return 0.
        else:
            disease_i = disease_db.index[i]
            disease_j = disease_db.index[j]
            ## all sequences in which disease_i appears
            target_i_ids = [ix for ix, protein in enumerate(proteins_list) if (protein in disease_db.loc[disease_i]["Protein"])]
            target_seq_i = [target_sequences[idx_i] for idx_i in target_i_ids]
            if (i==j):
                align_scores = [compute_score(s1,s2) for s1 in target_seq_i for s2 in target_seq_i]
            else:
                ## all sequences in which disease_j appears
                target_j_ids = [ix for ix, protein in enumerate(proteins_list) if (protein in disease_db.loc[disease_j]["Protein"])]
                target_seq_j = [target_sequences[idx_j] for idx_j in target_j_ids]
                align_scores = [compute_score(s1,s2) for s1 in target_seq_i for s2 in target_seq_j]
            score = np.mean(align_scores) if (len(align_scores)>0) else np.nan
            return score

    start = time()
    if (os.path.exists(predict_folder+"DISEASE_TARGET_SIMILARITY/disease_sequence_similarity.out")):
        disease_sequence_similarity = np.loadtxt(predict_folder+"DISEASE_TARGET_SIMILARITY/disease_sequence_similarity.out")
    else:
        disease_sequence_similarity = np.zeros((ndiseases, ndiseases))
    start_i = 0
    for i in range(start_i, ndiseases):
        for j in range(i, ndiseases):
            disease_sequence_similarity[i,j] = disease_seq_function(i, j)
        np.savetxt(predict_folder+"DISEASE_TARGET_SIMILARITY/disease_sequence_similarity.out", disease_sequence_similarity)
    end = time()-start
    print("Computation took %.2f sec." % end)

    disease_sequence_similarity = np.loadtxt(predict_folder+"DISEASE_TARGET_SIMILARITY/disease_sequence_similarity.out")
    disease_sequence_similarity_df = pd.DataFrame(disease_sequence_similarity, index=disease_db.index, columns=disease_db.index)
    disease_sequence_similarity_df

    diag = np.diag(disease_sequence_similarity_df.values)
    disease_sequence_similarity_df += disease_sequence_similarity_df.T
    np.fill_diagonal(disease_sequence_similarity_df.values, diag)
    ## normalisation
    disease_sequence_custom = disease_sequence_similarity_df/norm_cst
    disease_sequence_custom.index = disease_sequence_custom.index.astype(str)
    disease_sequence_custom.columns = disease_sequence_custom.columns.astype(str)
    
    disease_sequence_custom.to_csv(predict_folder+"disease_sequence_similarity_df.csv")

disease_sequence_custom = pd.read_csv(predict_folder+"disease_sequence_similarity_df.csv", index_col=0)

### III.5 Disease-disease similarity: Closeness in the human PPI network disease-disease similarities

This yields a dataframe `disease_network_similarity_df.csv` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains, when computable, the transformed distance between drug targets in the PPI using the Floyd-Warshall algorithm. We apply to the Floyd-Warshall cost matrix the transformation suggested by the [PREDICT paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159979/).

Similar to Section [**II.4 Drug-drug similarity 4: Closeness in the human PPI network drug-drug similarities**](http://localhost:8888/notebooks/PREDICT_dataset.ipynb#II.4-Drug-drug-similarity:-Closeness-in-the-human-PPI-network-drug-target-to-drug-target-similarities).

In [17]:
network_folder = predict_folder+"DISEASE_NETWORK_SIMILARITY/"
sb.Popen(["mkdir", "-p", network_folder])

sb.Popen(["mkdir", "-p", predict_folder+"DisGeNet/"])
if (not os.path.exists(predict_folder+"DisGeNet/disease_db_DisGeNet.csv")):
    
    from NORDic.UTILS.DISGENET_utils import get_user_key_DISGENET, get_genes_proteins_from_DISGENET
    A = pd.read_csv(paths_global.data_folder+"FEATURELESS_v1.0.0/all_ratings.csv", index_col=0)
    disease_list = list(A.columns)
    
    user_key = get_user_key_DISGENET(paths_global.disgenet_file)
    disease_db = get_genes_proteins_from_DISGENET(disease_list, limit=3000, source="CURATED", min_score=0.35, 
                                min_ei=1., min_dsi=0.25, min_dpi=0, chunksize=100, user_key=user_key, quiet=False)
    
    disease_db.to_csv(predict_folder+"DisGeNet/disease_db_DisGeNet.csv")
    
disease_db = pd.read_csv(predict_folder+"DisGeNet/disease_db_DisGeNet.csv", index_col=0)
protein_targets = list(set([y for x in disease_db["Protein"] for y in x.split("; ")]))

if (not os.path.exists(predict_folder+"disease_network_similarity_df.csv")):
    from NORDic.UTILS.STRING_utils import get_app_name_STRING, get_network_from_STRING
    network_fname = network_folder+"disease_human_ppi_network.csv"
    ## 9606=human, retrieve PPI from STRING
    if (not os.path.exists(network_fname)):
        gene_list = list(protein_db["Gene Name"].unique())
        taxon_id=9606 #human
        app_name = get_app_name_STRING(paths_global.string_file)
        ppi = get_network_from_STRING(gene_list, taxon_id, min_score=0, network_type="functional", add_nodes=0, 
                                      app_name=app_name, version="11.5", quiet=False)
        ppi.to_csv(network_fname)
    ppi = pd.read_csv(network_fname, index_col=0)

    ## selection of interactions
    score_thres=0

    ## Get disease-gene associations
    gene_targets = list(set([y for x in disease_db["Gene Name"] for y in x.split("; ")]))

    ppi = pd.read_csv(network_fname, sep="\t", header=0, index_col=0)
    ppi_subset = ppi.loc[ppi["score"]>score_thres][["preferredName_A","preferredName_B","score"]]
    ppi_subset = ppi_subset.loc[[(a in gene_targets) and (b in gene_targets) for a,b in zip(list(ppi_subset["preferredName_A"]),list(ppi_subset["preferredName_B"]))]]
    ppi_subset["cost"] = 1-ppi_subset[["score"]].values.flatten() #cost
    ppi_subset = ppi_subset[["preferredName_A","preferredName_B","cost"]]
    ppi_subset = ppi_subset.drop_duplicates(keep="first")

    ## 1 is the maximum cost of edge
    max_cost=1.
    wam = ppi_subset.pivot(index='preferredName_A', columns='preferredName_B', values='cost').fillna(max_cost)
    for g in wam.index:
        if (g not in wam.columns):
            wam[g] = [max_cost]*wam.shape[0]
    for g in wam.columns:
        if (g not in wam.index):
            wam.loc[g] = [max_cost]*wam.shape[1]
    wam = wam.loc[wam.index][wam.index]

    ## Compute distance between target genes
    from scipy.sparse.csgraph import floyd_warshall

    dists = floyd_warshall(wam.values.copy(order='C'), directed=False)
    dists_df = pd.DataFrame(dists, index=wam.index, columns=wam.columns)

    A_mat, b_mat = 0.9*np.exp(1), 1.
    disease_network_similarity = A_mat*np.exp(-b_mat*dists_df)
    disease_network_similarity /= np.max(disease_network_similarity)

    diseases = pd.read_csv(ratings_file, index_col=0).columns
    disease_network_similarity_1 = pd.DataFrame(0., index=diseases, columns=disease_network_similarity.columns)
    for disease in diseases:
        if (disease not in disease_db.index):
            continue
        genes_disease = disease_db.loc[disease]["Gene Name"].split("; ")
        targeted_genes = [x for x in genes_disease if (x in disease_network_similarity.index)]
        disease_network_similarity_1.loc[disease] = list(disease_network_similarity.loc[targeted_genes].mean(axis=0))

    disease_network_similarity_df = pd.DataFrame([], index=diseases)
    for disease in diseases:
        if (disease not in disease_db.index):
            disease_network_similarity_df[disease] = np.nan
            continue
        genes_disease = disease_db.loc[disease]["Gene Name"].split("; ")
        targeted_genes = [x for x in genes_disease if (x in disease_network_similarity.index)]
        disease_network_similarity_df[disease] = disease_network_similarity_1[targeted_genes].mean(axis=1)
    disease_network_similarity_df.index = disease_network_similarity_df.index.astype(str)
    disease_network_similarity_df.columns = disease_network_similarity_df.columns.astype(str)
    
    disease_network_similarity_df.to_csv(predict_folder+"disease_network_similarity_df.csv")
    
disease_network_custom = pd.read_csv(predict_folder+"disease_network_similarity_df.csv", index_col=0)

### III.6 Disease-disease similarity: Gene Onthology disease-disease similarities

This yields a dataframe `disease_go_similarity_df` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains the similarity between Gene Onthology annotations.

Similar to Section [**II.5 Drug-drug similarity: Gene Onthology drug-drug similarities**](http://localhost:8888/notebooks/PREDICT_dataset.ipynb#II.5-Drug-drug-similarity:-Gene-Onthology-drug-drug-similarities).

In [18]:
sb.Popen(["mkdir", "-p", predict_folder+"DisGeNet/"])
if (not os.path.exists(predict_folder+"DisGeNet/disease_db_DisGeNet.csv")):
    
    from NORDic.UTILS.DISGENET_utils import get_user_key_DISGENET, get_genes_proteins_from_DISGENET
    A = pd.read_csv(paths_global.data_folder+"FEATURELESS_v1.0.0/all_ratings.csv", index_col=0)
    disease_list = list(A.columns)
    
    user_key = get_user_key_DISGENET(paths_global.disgenet_file)
    disease_db = get_genes_proteins_from_DISGENET(disease_list, limit=3000, source="CURATED", min_score=0.35, 
                                min_ei=1., min_dsi=0.25, min_dpi=0, chunksize=100, user_key=user_key, quiet=False)
    
    disease_db.to_csv(predict_folder+"DisGeNet/disease_db_DisGeNet.csv")
    
disease_db = pd.read_csv(predict_folder+"DisGeNet/disease_db_DisGeNet.csv", index_col=0)

if (not os.path.exists(predict_folder+"disease_go_similarity_df.csv")):
    disease_go_sim_fname = "disease_go_similarity"
    if (not os.path.exists(predict_folder+"DISEASE_GO_SIMILARITY/"+disease_go_sim_fname+".csv")):
        sb.Popen(("mkdir -p "+predict_folder+"DISEASE_GO_SIMILARITY/").split(" "))
        proteins = list(set([y for x in list(disease_db["Protein"]) for y in x.split("; ")]))
        with open("proteins.txt", "w+") as f:
            f.write(",".join(proteins))
            process = sb.Popen(["Rscript", "../src/Rscript_GOsim.R"])
            process.wait()
            sb.Popen(["mv", "disease_go_similarity.csv", "disease_go_similarity_MF.csv", 
                      "disease_go_similarity_CC.csv", "disease_go_similarity_BP.csv", 
                      predict_folder+"DISEASE_GO_SIMILARITY/"])
    ## annotated by protein
    disease_go_similarity = pd.read_csv(predict_folder+"DISEASE_GO_SIMILARITY/"+disease_go_sim_fname+".csv", index_col=0)

    disease_go_similarity_mat = disease_go_similarity.fillna(0.)

    diseases = pd.read_csv(paths_global.data_folder+"FEATURELESS_v1.0.0/all_ratings.csv", index_col=0).columns
    disease_go_similarity_1 = pd.DataFrame(0., index=diseases, columns=disease_go_similarity_mat.columns)
    for disease in diseases:
        if (disease not in disease_db.index):
            continue
        protein_disease = disease_db.loc[disease]["Protein"].split("; ")
        targeted_proteins = [x for x in protein_disease if (x in disease_go_similarity_mat.index)]
        disease_go_similarity_1.loc[disease] = list(disease_go_similarity_mat.loc[targeted_proteins].mean(axis=0))
    disease_go_similarity_1

    disease_go_similarity_df = []
    for disease in diseases:
        if (disease not in disease_db.index):
            disease_go_similarity_df.append(pd.DataFrame([], index=diseases, columns=[disease]))
            continue
        proteins_disease = disease_db.loc[disease]["Protein"].split("; ")
        targeted_proteins = [x for x in proteins_disease if (x in disease_go_similarity_1.columns)]
        disease_go_similarity_df.append(pd.DataFrame(disease_go_similarity_1[targeted_proteins].mean(axis=1), columns=[disease]))
    disease_go_similarity_df = disease_go_similarity_df[0].join(disease_go_similarity_df[1:], how="outer")
    disease_go_similarity_df.index = disease_go_similarity_df.index.astype(str)
    disease_go_similarity_df.columns = disease_go_similarity_df.columns.astype(str)
    
    disease_go_similarity_df.to_csv(predict_folder+"disease_go_similarity_df.csv")
    
disease_go_similarity_df = pd.read_csv(predict_folder+"disease_go_similarity_df.csv", index_col=0)
disease_go_similarity_df

Unnamed: 0,C1851649,C0042133,C5193005,C2676676,C1704272,C4722327,C1858361,C2676676.1,C4310232,C0029456,...,C0242770,C1880129,C0022661,C0236792,C1135191,C0149516,C1835407,C0016667,C0039445,C5203670
C1851649,,0.000000,,0.000000,,,0.000000,,0.000000,0.000000,...,,,0.000000,0.000000,,,,,0.000000,
C0042133,,0.417667,,0.188667,,,0.142500,,0.099500,0.139000,...,,,0.141179,0.136333,,,,,0.157333,
C5193005,,0.000000,,0.000000,,,0.000000,,0.000000,0.000000,...,,,0.000000,0.000000,,,,,0.000000,
C2676676,,0.188667,,0.437333,,,0.142500,,0.124167,0.130567,...,,,0.148026,0.133333,,,,,0.191889,
C1704272,,0.000000,,0.000000,,,0.000000,,0.000000,0.000000,...,,,0.000000,0.000000,,,,,0.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0149516,,0.000000,,0.000000,,,0.000000,,0.000000,0.000000,...,,,0.000000,0.000000,,,,,0.000000,
C1835407,,0.000000,,0.000000,,,0.000000,,0.000000,0.000000,...,,,0.000000,0.000000,,,,,0.000000,
C0016667,,0.000000,,0.000000,,,0.000000,,0.000000,0.000000,...,,,0.000000,0.000000,,,,,0.000000,
C0039445,,0.157333,,0.191889,,,0.133722,,0.097056,0.146489,...,,,0.142205,0.124000,,,,,0.264185,


## IV. Build final  drug-drug, disease-disease and drug-disease matrices

Disease ids: Concept IDs, drug ids: DrugBank or PubChem CIDs. Build matrices A (ratings) drugs $\times$ diseases, P (disease features) disease features $\times$ diseases, S (drug features) drug features $\times$ drugs, with NaN for missing values.

### IV.1. Merge matrices

In [19]:
path_dataset = predict_folder

## Drug-disease associations
custom = pd.read_csv(paths_global.data_folder+"FEATURELESS_v1.0.0/all_ratings.csv", index_col=0)

## Drug-drug similarities
ftypes = ["chemical", "se", "sequence", "network", "go", "signature"]
## Disease-disease similaries
ftypes += ["disease_"+x for x in ["phenotype", "semantic", "sequence", "network", "go", "signature"]]

fnames = [path_dataset+x+"_similarity_df.csv" for x in ftypes]
fdict = {ftype: fnames[ift] for ift, ftype in enumerate(ftypes)}

for ftype in fdict:
    df = pd.read_csv(fdict[ftype], index_col=0, header=0)
    df.index = [ftype+"-"+str(x) for x in df.index]
    globals()[ftype+"_custom"] = df
    
## Concatenate all features
S = pd.concat([eval(ftype+"_custom") for ftype in ftypes if ("disease" not in ftype)], axis=0, join="outer")
P = pd.concat([eval(ftype+"_custom") for ftype in ftypes if ("disease" in ftype)], axis=0, join="outer")

## Restrict to diseases in custom (association matrix)
P = P.drop_duplicates(keep="first")[list(set([p for p in custom.columns if (p in P.columns)]))]
P_PREDICT = P.loc[~P.index.duplicated()]

## Restrict to drugs in custom (association matrix)
S = S.drop_duplicates(keep="first")[list(set([x for x in list(custom.index) if (x in S.columns)]))]
S_PREDICT = S.loc[~S.index.duplicated()]

## Restrict association matrix
A = custom[P.columns].loc[S.columns]
A_PREDICT = A.loc[~A.index.duplicated()]
A_PREDICT.index = A_PREDICT.index.astype(str)

S_PREDICT.to_csv(predict_folder+"items.csv")
P_PREDICT.to_csv(predict_folder+"users.csv")
A_PREDICT.to_csv(predict_folder+"ratings_mat.csv")

ratings_A = utils.matrix2ratings(A_PREDICT, "ind_id", "drug_id", "rating")
print("Sparsity = "+str(utils.compute_sparsity(A_PREDICT))+"%")
print("%d drug features %d disease features" % (S_PREDICT.shape[0], P_PREDICT.shape[0]))
utils.print_dataset(ratings_A, "ind_id", "drug_id", "rating")
ratings_A.T

Sparsity = 0.3823575264626351%
6030 drug features 2361 disease features
Ndrugs=1395	Ndiseases=1501
8240 positive	295 negative	2085360 unknown matchings


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8525,8526,8527,8528,8529,8530,8531,8532,8533,8534
ind_id,C0003467,C1269683.1,C0040558,C1332061,C0008513,C0040560,C0009324,C0037299,C0032302,C0151175,...,C1853965,C0009443,C0027424,C0006266,C1260880,C1859648,C0035455,C0037195,C1869116,C0265962
drug_id,3299,3299,5215,5215,5215,5215,5215,5215,54671203,54671203,...,9294,9294,9294,9294,9294,9294,9294,9294,9294,9294
rating,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### IV.2. Generating matrices which can be shared 

In [20]:
A_PREDICT = pd.read_csv(predict_folder+"ratings_mat.csv")
drugs, diseases = list(A_PREDICT.index), list(A_PREDICT.columns)

restrict = lambda df, ls : df[[s for s in ls if (s in df.columns)]]

for matrix_name in ["se","signature","disease_phenotype","disease_semantic"]:
    df = restrict(eval(matrix_name+"_custom"), drugs if ("disease" not in matrix_name) else diseases)
    df.to_csv(matrix_name+"_PREDICT_matrix.csv")