# Creating the "PREDICT" dataset based on various data sources

Version 2.0.0 (May 29th 2023). Please run the notebooks "FEATURELESS_dataset.ipynb" and "TRANSCRIPT_dataset" beforehand.

## Librairies

In [1]:
import pandas as pd
import numpy as np
import subprocess as sb
import os
import pickle

from multiprocessing import cpu_count
from joblib import Parallel, delayed
from time import time
from itertools import product
n_jobs=cpu_count()-2
assert n_jobs > 0 and n_jobs < cpu_count()
parallel = Parallel(n_jobs=n_jobs, backend='loky')

import sys
sys.path.insert(0, "../src/")

import paths_global
import utils
import data_processing

## Local paths

In [2]:
## Where database files are stored
print('root_folder="%s"' % paths_global.root_folder)
## Where intermediary files are stored
print('data_folder="%s"' % paths_global.data_folder)

root_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/data/"
data_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/RECeSS/cfdr/data/"


In [3]:
predict_folder = paths_global.data_folder+"PREDICT/"
sb.Popen(["mkdir", "-p", predict_folder])
## Where PREDICT dataset files are stored
print('predict_folder="%s"' % predict_folder)

predict_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/RECeSS/cfdr/data/PREDICT/"


## Drug and disease identifiers

In [4]:
assert os.path.exists(paths_global.data_folder+"drugbankid2drugname.pck")
with open(paths_global.data_folder+"drugbankid2drugname.pck", "rb") as f:
    di_drugbankid2drugname = pickle.load(f)
    
assert os.path.exists(paths_global.data_folder+"omimid2diseasename.pck")
with open(paths_global.data_folder+"omimid2diseasename.pck", "rb") as f:
    di_omimid2diseasename = pickle.load(f)
    
cids_file = paths_global.data_folder+"medgenid2diseasename.pck"
if (not os.path.exists(cids_file)):
    di_medgenid2diseasename = {}
else:
    with open(cids_file, "rb") as f:
        di_medgenid2diseasename = pickle.load(f)
        
pubchem_file = paths_global.data_folder+"pubchemid2drugname.pck"
if (not os.path.exists(pubchem_file)):
    di_pubchemid2drugname = {}
else:
    with open(pubchem_file, "rb") as f:
        di_pubchemid2drugname = pickle.load(f)

## I. Matrix A : $N_S \times N_D$ of drug-disease associations

In [5]:
A = utils.load_dataset("FEATURELESS", save_folder=paths_global.data_folder)["ratings_mat"]
A

Unnamed: 0,C0272275,C0585362,C3163899,C1319317,C0280324,C0007102,C0010674,C0079773,C0003873,CN263340,...,C1865810,C1848042,C1838261,C1832605,C1832474,C1866519,C1866041,C1866040,C1864068,C5203670
CID104999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID442021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID442872,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB13415,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DB16355,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB16393,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB16394,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB16416,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
ratings_A = utils.matrix2ratings(A, "ind_id", "drug_id", "rating")

print("Sparsity = "+str(utils.compute_sparsity(A))+"%")
utils.print_dataset(ratings_A, "ind_id", "drug_id", "rating")
ratings_A.T

Sparsity = 0.33728805072386664%
Ndrugs=1600	Ndiseases=1576
8397 positive	225 negative	2512978 unknown matchings


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8612,8613,8614,8615,8616,8617,8618,8619,8620,8621
ind_id,C0006840,C0006840,C0006840,C0006840,C0149893,C0035235,C0339170,C0042510,C0149782,C0279639,...,C1851649,C1851649,C1851649,C1851649,C1851649,C1851649,C1851649,C1851649,C1851649,C1851649
drug_id,CID104999,CID442021,CID442872,DB13415,DB00001,DB00002,DB00002,DB00002,DB00002,DB00002,...,DB14761,DB15661,DB15718,DB15940,DB15941,DB16355,DB16393,DB16394,DB16416,DB16691
rating,-1,-1,-1,-1,1,1,1,1,1,1,...,-1,1,-1,1,1,1,1,1,1,1


## II. Drug-drug similarity matrix

We build a larger version of the similarity matrices mentioned in the [PREDICT](https://dx.doi.org/10.1038%2Fmsb.2011.26) paper.

### II.1. Drug-drug similarity: Chemical drug-drug similarities (from structures in DrugBank or in PubChem)

This yields a dataframe `chemical_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains, when computable, drug-pairwise Tanimoto scores between SMILES fingerprints.

In [7]:
drugbank_ids = [x if (x[:len("DB")]=="DB") else None for x in list(A.index)]
pubchem_ids = [int(x[len("CID"):]) if (x[:len("CID")]=="CID") else None for x in list(A.index)]
assert len([x for x in drugbank_ids+pubchem_ids if (x is not None)])==A.shape[0]
chemical_similarity_file = predict_folder+"chemical_similarity_df.csv"

if (not os.path.exists(chemical_similarity_file)):
    drugbank_chem_file = paths_global.drugbank_folder+"STRUCTURES/Structure External Links/structure links.csv"
    if (not os.path.exists(drugbank_chem_file)):
        sb.call("unzip "+path+"drugbank_all_structure_links.csv.zip", shell=True)
    chemical_based = pd.read_csv(drugbank_chem_file, sep=",", index_col=0, header=0)[["SMILES"]]
    chemical_based = chemical_based.loc[[d for d in drugbank_ids if (d in chemical_based.index)]].dropna()
    chemical_based2 = pd.DataFrame([[data_processing.get_pubchem_smiles(cid)] 
                                    for cid in pubchem_ids if (cid is not None)], 
                            index=["CID"+str(x) for x in pubchem_ids if (x is not None)], columns=["SMILES"])
    chemical_based = pd.concat((chemical_based,chemical_based2), axis=0)
    ndrugs=chemical_based.shape[0]
    print("Found drugs %d/%d" % (ndrugs, len(drugbank_ids)))
    
    def job_function(i, j, run_id=None, seed=None):
        if (str(seed)!="None"):
            np.random.seed(seed)
        if (i<j):
            return 0.
        elif (i==j):
            return 1.
        else:
            return data_processing.smiles_similarity(chemical_based.values[i,0], chemical_based.values[j,0])

    start = time()
    print("%d drugs, %d jobs" % (ndrugs, n_jobs))
    if (n_jobs==1):
        chemical_similarity = [job_function(i, j) for i, j in product(range(ndrugs), range(ndrugs))]
    else:
        seeds = [np.random.randint(int(10e6)) for _ in range(ndrugs*ndrugs)]
        chemical_similarity = parallel(delayed(job_function)(run_id//ndrugs, run_id%ndrugs, run_id, seed) for run_id, seed in enumerate(seeds))
    
    chemical_similarity = np.array(chemical_similarity).reshape((ndrugs, ndrugs))
    chemical_similarity += chemical_similarity.T
    chemical_similarity[range(ndrugs), range(ndrugs)] = 1.
    end = time()-start

    drug_index=chemical_based.index  
    chemical_similarity_df = pd.DataFrame(chemical_similarity, index=drug_index, columns=drug_index)
    chemical_similarity_df.to_csv(chemical_similarity_file)
    print("Computation took %.2f sec. on %d jobs" % (end, n_jobs))
chemical_similarity_df = pd.read_csv(chemical_similarity_file, header=0, index_col=0, sep=",")

### II.2. Drug-drug similarity: Side effect drug-drug similarities (from reported SE in SIDER)

This yields a dataframe `se_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains, when computable, drug-pairwise Jaccard scores between lists of SIDER-reported side effects.

In [8]:
drugbank_ids = [x if (x[:len("DB")]=="DB") else None for x in list(A.index)]
pubchem_ids = [int(x[len("CID"):]) if (x[:len("CID")]=="CID") else None for x in list(A.index)]
assert len([x for x in drugbank_ids+pubchem_ids if (x is not None)])==A.shape[0]
se_similarity_file = predict_folder+"se_similarity_df.csv"

sb.Popen(["mkdir", "-p", predict_folder+"SE_SIMILARITY/"])

if (not os.path.exists(se_similarity_file)):
    drug_names = pd.read_csv(paths_global.sider_folder+"drug_names.tsv", header=None, sep="\t", 
                             index_col=0)
    drug_names.columns = ["drug_name"]
    drug_names = drug_names.to_dict()["drug_name"]
    se = pd.read_csv(paths_global.sider_folder+"meddra_all_se.tsv", header=None, sep="\t", 
                           index_col=0)
    se.index = [drug_names[s] for s in se.index]
    se = se[se.columns[[2, 3]]]
    se.columns = ['MedDRA SE concept type', 'UMLS concept id for MedDRA term']
    ## preferred term, unique
    se = se.loc[se['MedDRA SE concept type'] == "PT"][se.columns[-1:]]
    
    drugnames = list(set(list(se.index)))
    if (not os.path.exists(predict_folder+"SE_SIMILARITY/se_drugnames_PubChem.csv")):
        pubchem_ids_SIDER = data_processing.get_pubchemcid_from_SIDER(drugnames, paths_global.data_folder)
        df = pd.DataFrame([pubchem_ids_SIDER], index=["PubChemCID"], columns=drugnames).T
        df.to_csv(predict_folder+"SE_SIMILARITY/se_drugnames_PubChem.csv")
    if (not os.path.exists(predict_folder+"SE_SIMILARITY/se_drugnames_DrugBank.csv")):
        drugbank_ids_SIDER = data_processing.get_drugbank_ids(drugnames, paths_global.data_folder)
        df = pd.DataFrame([drugbank_ids_SIDER], index=["DrugBank"], columns=drugnames).T
        df.to_csv(predict_folder+"SE_SIMILARITY/se_drugnames_DrugBank.csv")
        
    ## Missing DrugBank ids
    missing_drugbank_ids = {'Locorten': None, 'APAs': None, 'luliconazole': "DB08933", 'acipimox': "DB09055", 
            'demethyl': None, 'monomethylfumarate': "DB14219", '18F-flutemetamol': "DB09151", 
            'tetrahydrozoline': "DB06764", 'LMWH': "DB01109", 'arsenic': "DB01169", 'Sativex': "DB00470", 
            'lactate': None, 'Almeta': None, 'clobetasone': "DB13158", 'iodide': "DB12754", 
            'NSC': None, 'pirfenidone': "DB04951", 'mTHPC': None, 'n-3': None, 'dextran': "DB09255",
            'Lu': "DB09068", 'nalidixic': "DB00779", 'Vancocine': "DB00512", 'chloride': "DB14547", 
            'Madopar': "DB01235", 'nitrogen': "DB09152", 'emedastine': "DB01084", 'cyproterone': "DB04839", 
            'fusidic': "DB02703", 'alpha-methyl-p-tyrosine': "DB16306", 'teduglutide': "DB08900", 
            'MK-462': "DB00953", 'mebeverine': "DB12554", 'Protirelin': "DB09421", 'telavancin': "DB06402", 
            'copolymer': "DB05259", 'nefopam': "DB12293", 'pyrantel': "DB11156", 
            'yttrium': "DB13076", 'x': None, 'estramustine': "DB01196", 'sodium': "DB14516", 'citric': "DB04272", 
            'TR-700': "DB14569", 'methylene': None, 'Metrodin': "DB00094", 'lodoxamide': "DB06794", 
            'rufinamide': "DB06201", 'poly(styrene': "DB01344", '17-hydroxyprogesterone': "DB14570", 
            'olodaterol': "DB09080", 'thiazide': "DB00880", 'ethacrynic': "DB00903", 'AN2690': "DB09041", 
            'iloperidone': "DB04946", 'tramazoline': "DB13064", 'SOM230': "DB06663", 'zoledronic': "DB00399", 
            'graphene': "DB15994", 'nitrite': "DB12529", 'anthracycline': "DB00694", 'UDCA': "DB01586", 
            'LiOH': "DB14506", 'S-benzoylmercaptoacetyltriglycine': "DB14082", 'megestrol': "DB00351", 
            'SU5416': "DB06436", 'tenoxicam': "DB00469", 'MnDPDP': "DB06796", 'delta': None, 
            'clevidipine': "DB04920", 'Glat': None, 'trimethoprim-sulfamethoxazole': None, 'Revasc': "DB11095", 
            'methylergometrine': "DB00353", 'belinostat': "DB05015", 'Vitrum': "DB09532", 'quinaprilat': "DB14217",
            'nicorandil': "DB09220", 'ramiprilat': "DB14208", 'isopropyl': "DB02325", 'mesna': "DB09110", 
            'Benicar-HCT': "DB00999", 'chromium': "DB11136", '4-PBA': "DB06819", 'isomannide': "DB09401", 
            'lithium': "DB14509", 'diethylenetriaminepentaacetic': "DB14007", 'doripenem': "DB06211", 
            'hydroxypropyl': None, 'penicillin': "DB01053", 'fenofibric': "DB13873", 'cytokinin': "DB11336", 
            'samarium': "DB12403", 'FAMP': "DB01073", 'mefenamic': "DB00784", 'gold': "DB14154", 
            'umeclidinium': "DB09076", 'trans': None, 'CGP': None, 'dextrorphan': "DB14682", 'IdUrd': "DB00249", 
            'ammonium': "DB06768", 'Triphasil': "DB00977", 'bicarbonate': "DB15926", 'o291': None,
            '5-methyltetrahydrofolate': "DB11256", 'AC1O3HA7': None, 'polythiazide': "DB01324", 'K779': None, 
            'biguanide': "DB13100", 'dothiepin': "DB09167", 'methyl': "DB00992", 'amorolfine': "DB09056", 
            'CAS': None, 'lasofoxifene': "DB06202", 'benzyl': None, 'acetate': "DB14511", 'Depreotide': "DB11628", 
            'halobetasol': "DB00596", 'Histrelin': "DB06788", 'fidaxomicin': "DB08874", 'strontium': None, 
            'zidovudine/lamivudine': "DB00495", 'eicosapentaenoic': "DB00159", 'besifloxacin': "DB06771", 
            'calcium': "DB01373", '2-hydroxysuccinaldehyde': None, 'sacrosidase': "DB06760", 
            'N-carbamylglutamate': "DB06775", 'Diane-35': "DB04839", 'polymyxin': "DB00781", 
            'Hoe': "DB06196", 'phenylbutyric': "DB06819", 'hepatitis': "DB11627", 'ICI': "DB00947", 
            '18F-FDG': "DB09502", 'NTBC': "DB00348", 'p-aminohippurate': "DB00345", 'lacosamide': "DB06218", 
            'Nuvocid': None, 'silver': "DB12965", 'bile': None, 'Brolene': None, 'paraldehyde': "DB09117", 
            'd-telaprevir': "DB05521", 'fluoride': "DB11257", 'EACA': "DB00513", 
            'tetrofosmin': "DB11180", 'hetastarch': "DB09106", 'colestimide': "DB11634", 'tiaprofenic': "DB01600", 
            'Paroven': "DB15826", 'alogliptin': "DB06203", 'radium': None, 'Prussian': "DB06783", 
            'retinoic': "DB00755", 'nitrous': "DB09112", 'phosphate': "DB02831", 'Insulin': "DB00030", 
            'epitopic': "DB06781", 'moxonidine': "DB09242", 'glucose': None, 
            'Tadenan':"DB00632", 'sulfonamide': "DB00259", 'isosulfan': "DB09136", 'lormetazepam': "DB13872", 
            'gadolinium': "DB12091", 'oCRH': "DB09067", 'diflorasone': "DB00223", 
            'phenyllactate': None, 'FP-CIT': None, 'dexpanthenol': "DB09357", 'moexiprilat': "DB14210", 
            'selenium': "DB11135", 'sulconazole': "DB06820", 'salmon': None, 'polyoxyethylene': None, 
            'fluticasone/salmeterol': "DB13867", 'Oestrogen': "DB00783", 'SonoVue': "DB11104", 'nitric': "DB00435", 
            'Fe(III': "DB01592", 'ceftizoxime': "DB01332", 'Ge-132': "DB17608", 'conjugated': None, 
            'cortisone': "DB14681", 'T-A2-3': "DB06149", 'L-threo-DOPS': None, 'Cantril': "DB04843", 
            'bromcresol': None, 'ACTH(1-39': "DB01285", 'NuvaRing': "DB00977", 
            'irbesartan-hydrochlorothiazide': "DB00999", 'ammonia': "DB11118", 'cephem': "DB00833", 'ALX': None, 
            'Optison': "DB00556", 'bendamustine': "DB06769", 'hydroxyl': None, 'acetohydroxamic': "DB00551", 
            'A77': "DB08880", 'zinc': "DB01593", 'VACV': None, 'FdUrd': "DB00322", 
            'Gd-EOB-DTPA': "DB08884", 'Prednefrin': "DB15566", 'clobetasol': "DB11750", 'vitamin': None, 
            'Org': "DB06785", 'lanthanum': "DB06792", 'hydroxybutyrate': "DB01440", 
            'flumethasone': "DB00663", 'mitomycin': "DB00305", 'Kaluril': "DB00594", 'technetium-99m': "DB14227", 
            'insulin': "DB00030", 'florbetaben': "DB09148", 'hemin': "DB03404", 'iron': "DB01592", 
            'pentastarch': "DB09111", 'benzylpenicilloyl': "DB00895", 'malvidin': None, 'thiosulfate': None, 
            'pitavastatin': "DB08860", 'magnesium': "DB14513", '5-aminolevulinic': "DB00855", 'barium': "DB11150", 
            'abacavir-lamivudine': "DB01048", 'copper': "DB09130", 'tasimelteon': "DB09071", 
            'florbetapir': "DB09149", 'ziconotide': "DB06283", 'tesamorelin': "DB08869", '68Ga': "DB14524", 
            'pizotifen': "DB06153", 'gamma-aminobutyric': "DB02530", 'trandolaprilat': "DB14209", 
            'medroxyprogesterone': "DB00603", 'linaclotide': "DB08890", 'fluocinolone': "DB12553", 
            'pentosan': "DB00686", 'Trisequens': None, 'mafenide': "DB06795", 
            'pertechnetate': "DB09314", 'dimercaptosuccinic': "DB14089", 'glucagon': "DB00040", 
            'tropisetron':"DB11699", 'v': None, 'levodopa/carbidopa': "DB01235", 
            'Timentin': "DB00766", 'tetraen': "DB00375", 'Asp-Tyr(SO3H)-Met-Gly-Trp-Met-Asp-Phe-NH2': "DB09142", 
            'mycophenolic': "DB01024", 'vorapaxar': "DB09030", 'Stalevo': "DB00494", 'benzathine': "DB01053", 
            'estrone': None, 'mebrofenin': "DB15779", 'benzoyl': "DB09096", 'betaine': "DB06756", 'L-Dmp': None, 
            'HMDP': "DB14159", 'iotrolan': "DB09487", 'spinosad': "DB08823", 'benzydamine': "DB09084", 
            'estriol': "DB04573", 'TPGS': "DB11635", 'indacaterol': "DB05039", 'prostaglandin': None, 
            'potassium': None
    }
    
    pubchem_df = pd.read_csv(predict_folder+"SE_SIMILARITY/se_drugnames_PubChem.csv", index_col=0)
    drugbank_df = pd.read_csv(predict_folder+"SE_SIMILARITY/se_drugnames_DrugBank.csv", index_col=0) 
    drugbank_df[drugbank_df.columns[0]] = [missing_drugbank_ids.get(x,drugbank_df.loc[x]["DrugBank"]) for x in drugbank_df.index]
    print("Found drugs: %d/%d (PubChem) %d/%d (DrugBank)" % (pubchem_df.dropna().shape[0], pubchem_df.shape[0],
                                                            drugbank_df.dropna().shape[0],drugbank_df.shape[0]))
    
    if (pubchem_df.dropna().shape[0]!=pubchem_df.shape[0]):
        missing = list(pubchem_df.loc[pd.isnull(pubchem_df["PubChemCID"])].index)
        #print(missing)

    pubchem_di = pubchem_df.loc[~pd.isnull(pubchem_df["PubChemCID"])]["PubChemCID"].astype(int).to_dict()
    inv_map = {int(v):k for k,v in pubchem_di.items()}
    drugnames_inA_pubchem = [inv_map.get(int(x),x) for x in pubchem_ids if (x in inv_map)]
    print("Found drugs in A (PubchemCID): %d/%d" % (len(drugnames_inA_pubchem), len(pubchem_ids)))
    drugbank_di = drugbank_df.loc[~pd.isnull(drugbank_df["DrugBank"])]["DrugBank"].astype(str).to_dict()
    inv_map = {v:k for k,v in drugbank_di.items()}
    drugnames_inA_drugbank = [inv_map[x] for x in drugbank_ids if (x in inv_map)]
    drugnames_inA = drugnames_inA_pubchem+drugnames_inA_drugbank
    print("Found drugs in A (DrugBank): %d/%d (total %d)" % (len(drugnames_inA_drugbank), len(pubchem_ids),
                                                            len(drugnames_inA)))
    
    se = se.loc[[d for d in drugnames_inA if (d in se.index)]]
    se.index = ["CID"+str(pubchem_di[x]) if (x in drugnames_inA_pubchem) else drugbank_di[x] for x in se.index]

    print("%d drugs" % (len(se.index.unique())))
    start = time()
    se_similarity_df = data_processing.sideeffect_similarity(se)
    end = time()-start
    print("Computation took %.2f sec." % end)
    
    se_similarity_df.columns = se_similarity_df.index
    se_similarity_df.to_csv(se_similarity_file)
se_similarity_df = pd.read_csv(se_similarity_file, header=0, index_col=0)
se_similarity_df

Unnamed: 0,DB00001,DB00007,DB00014,DB00030,DB00035,DB00040,DB00046,DB00047,DB00049,DB00063,...,DB09101,DB09110,DB09112,DB09118,DB09152,DB09167,DB09265,DB09357,DB09462,DB09526
DB00001,1.000000,0.150289,0.168459,0.150327,0.180851,0.049020,0.129771,0.141104,0.157895,0.283465,...,0.076923,0.183673,0.061947,0.075630,0.100000,0.111111,0.114035,0.090909,0.142857,0.020000
DB00007,0.150289,1.000000,0.436314,0.290023,0.227666,0.023026,0.117460,0.160714,0.101538,0.108761,...,0.071429,0.181250,0.041801,0.074434,0.067692,0.163636,0.074675,0.033113,0.101266,0.009901
DB00014,0.168459,0.436314,1.000000,0.277045,0.259786,0.030172,0.138211,0.177778,0.139442,0.161417,...,0.079498,0.228916,0.050000,0.074380,0.082677,0.181818,0.092827,0.043478,0.117409,0.012987
DB00030,0.150327,0.290023,0.277045,1.000000,0.233766,0.027132,0.159091,0.241877,0.118280,0.142349,...,0.075758,0.173145,0.041199,0.071161,0.067376,0.141892,0.083650,0.035019,0.135338,0.015625
DB00035,0.180851,0.227666,0.259786,0.233766,1.000000,0.054688,0.213793,0.215909,0.159236,0.136905,...,0.157895,0.303226,0.057143,0.155556,0.096154,0.223529,0.156716,0.079365,0.246377,0.031746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DB09167,0.111111,0.163636,0.181818,0.141892,0.223529,0.068966,0.107438,0.124183,0.138211,0.137405,...,0.178947,0.230769,0.139785,0.163265,0.093220,1.000000,0.177083,0.105882,0.160714,0.011494
DB09265,0.114035,0.074675,0.092827,0.083650,0.156716,0.142857,0.140845,0.166667,0.144737,0.129412,...,0.204082,0.126316,0.127660,0.150943,0.101449,0.177083,1.000000,0.108108,0.203125,0.029412
DB09357,0.090909,0.033113,0.043478,0.035019,0.079365,0.166667,0.127273,0.075269,0.133333,0.084507,...,0.142857,0.100000,0.062500,0.105263,0.096154,0.105882,0.108108,1.000000,0.160000,0.066667
DB09462,0.142857,0.101266,0.117409,0.135338,0.246377,0.096154,0.272727,0.192982,0.209302,0.141414,...,0.245902,0.265306,0.076923,0.164179,0.094118,0.160714,0.203125,0.160000,1.000000,0.040000


### II.3 Drug-drug similarity: (Target) sequence drug-drug similarities

This yields a dataframe `sequence_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains, when computable, average drug-pairwise alignment scores between gene target sequences. We use the normalization suggested in [Bleakley and Yamanishi (2009)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2735674/).

In [9]:
from Bio import SeqIO
    
seq_similarity_file = predict_folder+"sequence_similarity_df.csv"

drugbank_ids = [x if (x[:len("DB")]=="DB") else None for x in list(A.index)]

fname = paths_global.drugbank_folder+"TARGET SEQUENCES/Drug Target Sequences/gene.fasta"
## Target sequences
target_sequences = [record.seq for record in SeqIO.parse(fname, "fasta")]
## Associated DrugBank ids
sequence_drug_ids = sb.check_output("cat '"+fname+"' | grep '>drugbank_target|' | sed -e 's/^.*(//g' | sed -e 's/)$//g'", shell=True).decode("utf-8").split("\n")
drug_ids_per_seq = [[x for x in idx.split("; ") if (len(x)>0)] for idx in sequence_drug_ids]
drug_ids_per_seq = [d for d in drug_ids_per_seq if (len(d)>0)]
## unique DrugBank ids
drug_ids = list(set([y for x in drug_ids_per_seq for y in x]))
## consider only drugbank ids in dataset
drug_ids = [x for x in drug_ids if (x in drugbank_ids)]
print("Found drugs %d/%d" % (len(drug_ids), len(drugbank_ids)))

def compute_target_similarity(seq_similarity_file, targets_ids, targets_ids_per_seq, target_sequences,
                              njobs, save_folder="./"):
    from Bio import Align
    from scipy.stats.mstats import gmean as geometric_mean

    if (not os.path.exists(save_folder+"sequence_cst_similarity_df.txt")):
        sequence_aligner = Align.PairwiseAligner()
        fname = paths_global.drugbank_folder+"TARGET SEQUENCES/Drug Target Sequences/gene.fasta"
        ## Target sequences
        target_sequences = [record.seq for record in SeqIO.parse(fname, "fasta")]

        start=time()
        sequence_CST_similarity = [sequence_aligner.score(seq, seq) for seq in target_sequences]
        end=time()-start
        print("Computation took %.2f sec." % (end))
        norm_cst = geometric_mean(sequence_CST_similarity)
        with open(save_folder+"sequence_cst_similarity_df.txt", "w+") as f:
            f.write(str(norm_cst))
    else:
        with open(save_folder+"sequence_cst_similarity_df.txt", "r") as f:
            norm_cst = float(f.read())

    if (not os.path.exists(seq_similarity_file)):
        start = time()
        sequence_similarity_df = data_processing.sequence_similarity(targets_ids, targets_ids_per_seq, 
                                    target_sequences, save_folder=save_folder, njobs=njobs)
        end = time()-start
        print("Computation took %.2f sec. on %d jobs" % (end, n_jobs))
        sequence_similarity_df.to_csv(seq_similarity_file)
    return norm_cst

sb.Popen(["mkdir", "-p", predict_folder+"TARGET_SIMILARITY/"])
norm_cst = compute_target_similarity(seq_similarity_file, drug_ids, drug_ids_per_seq, target_sequences, 
                          save_folder=predict_folder+"TARGET_SIMILARITY/", njobs=n_jobs)
sequence_similarity_df = pd.read_csv(seq_similarity_file, index_col=0)/norm_cst

Found drugs 1372/1622


### II.4 Drug-drug similarity: Closeness in the human PPI network drug target to drug target similarities

This yields a dataframe `network_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains, when computable, the transformed distance between drug targets in the PPI using the Floyd-Warshall algorithm. We apply to the Floyd-Warshall cost matrix the transformation suggested by the [PREDICT paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159979/).

In [10]:
network_folder = predict_folder+"NETWORK_SIMILARITY/"
network_similarity_file = predict_folder+"network_similarity_df.csv"

## 1. Mapping drugs and genes
if (not os.path.exists(network_folder+"mapping_gene_drugs.csv")):
    fname=paths_global.drugbank_folder+"PROTEIN IDENTIFIERS/Drug Target Identifiers/all.csv"
    protein_db = pd.read_csv(fname, index_col=5).query("Species=='Humans'")[["Gene Name", "Drug IDs"]]
    drug_protein_db = protein_db[["Drug IDs"]].groupby(level=0).apply(lambda x : "; ".join(list(sorted(set(list(x.values.flatten()))))))
    drug_gene_db = protein_db[["Gene Name"]].groupby(level=0).apply(lambda x : "; ".join(list(map(str,set(x.values.flatten())))))
    protein_db = pd.DataFrame([], index=drug_protein_db.index)
    protein_db["Drug IDs"] = drug_protein_db
    protein_db["Gene Name"] = drug_gene_db
    drug_ids = list(set([y for x in list(protein_db["Drug IDs"]) for y in x.split("; ")]))
    ## consider only drugbank ids in custom dataset for computational reasons
    drug_ids = [x for x in drug_ids if (x in A.index)]
    keep_ids = [ix for ix, x in enumerate(list(protein_db["Drug IDs"])) if (any([y in drug_ids for y in x.split("; ")]))]
    protein_db = protein_db.iloc[keep_ids].loc[protein_db["Gene Name"]!="nan"]
    protein_db["Drug IDs"] = ["; ".join([xx for xx in x.split("; ") if (xx in drug_ids)]) for x in protein_db["Drug IDs"]]
    protein_db.to_csv(network_folder+"mapping_gene_drugs.csv")
protein_db = pd.read_csv(network_folder+"mapping_gene_drugs.csv", index_col=0)

protein_db["Target"] = protein_db["Drug IDs"]

def compute_network_similarity(network_folder, network_similarity_file, protein_db):
    from scipy.sparse.csgraph import floyd_warshall
    ## 2. Retrieve the PPI network involving these genes using the package NORDic to call the STRING database
    from NORDic.UTILS.STRING_utils import get_app_name_STRING, get_network_from_STRING

    network_fname = network_folder+"human_ppi_network.csv"
    sb.Popen(["mkdir", "-p", "/".join(network_fname.split("/")[:-1])])

    if (not os.path.exists(network_fname)):
        gene_list = list(protein_db["Gene Name"].unique())
        taxon_id=9606 #human
        app_name = get_app_name_STRING(paths_global.string_file)
        ppi = get_network_from_STRING(gene_list, taxon_id, min_score=0, network_type="functional", add_nodes=0, 
                                      app_name=app_name, version="11.5", quiet=False)
        ppi.to_csv(network_fname)
    ppi = pd.read_csv(network_fname, index_col=0)
    ppi["cost"] = 1-ppi["score"].values.flatten() #cost for the Floyd-Warshall algorithm
    ppi = ppi[["preferredName_A","preferredName_B","cost"]]

    ## 3. Prepare the cost matrix for the Floyd-Warshall algorithm (1 is the maximum cost of edge)
    max_cost=1.
    wam = ppi.pivot(index='preferredName_A', columns='preferredName_B', values='cost').fillna(max_cost)
    for g in wam.index:
        if (g not in wam.columns):
            wam[g] = [max_cost]*wam.shape[0]
    for g in wam.columns:
        if (g not in wam.index):
            wam.loc[g] = [max_cost]*wam.shape[1]
    wam = wam.loc[wam.index][wam.index]

    ## 4. Compute distance between target genes
    dists = floyd_warshall(wam.values.copy(order='C'), directed=False)
    dists_df = pd.DataFrame(dists, index=wam.index, columns=wam.columns)

    ## 5. Apply the transformation
    A_mat, b_mat = 0.9*np.exp(1), 1.
    net_sim = A_mat*np.exp(-b_mat*dists_df)
    net_sim /= np.max(net_sim)

    cols, ids = [], []
    for x in net_sim.columns:
        idnames = list(protein_db.loc[protein_db["Gene Name"]==x]["Target"])
        if (len(idnames)==0):
            continue
        nb = len(idnames[0].split("; "))
        cols += [x]*nb
        ids += idnames[0].split("; ")
    network_similarity = net_sim.loc[cols][cols]
    network_similarity.columns = ids
    network_similarity.index = ids

    ## 6. Combine similarity from all targets for all drugs
    network_similarity_df = network_similarity.groupby(level=0).mean().T.groupby(level=0).mean().T
    network_similarity_df.to_csv(network_similarity_file)

compute_network_similarity(network_folder, network_similarity_file, protein_db)
network_similarity_df = pd.read_csv(network_similarity_file, index_col=0)

### II.5 Drug-drug similarity: Gene Onthology drug-drug similarities

This yields a dataframe `go_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains the similarity between Gene Onthology annotations.

In [11]:
go_folder = predict_folder+"GO_SIMILARITY/"
go_sim_fname = predict_folder+"go_similarity_df.csv"

## Get drug-protein-gene associations
assert os.path.exists(predict_folder+"NETWORK_SIMILARITY/mapping_gene_drugs.csv")
protein_db = pd.read_csv(predict_folder+"NETWORK_SIMILARITY/mapping_gene_drugs.csv", index_col=0)
protein_db["Target"] = protein_db["Drug IDs"]

def compute_GO_similarity(go_sim_fname, go_folder, protein_db):
    sb.Popen(["mkdir", "-p", go_folder])
    
    if (not os.path.exists(go_sim_fname)):
        if (not os.path.exists(go_folder+"GO_similarity.csv")):
            proteins = list(set(list(protein_db.index)))
            with open("proteins.txt", "w+") as f:
                f.write(",".join(proteins))
            process = sb.Popen(["Rscript", "../src/Rscript_GOsim.R"])
            process.wait()
            sb.Popen(["mv", "GO_similarity.csv", "GO_similarity_MF.csv", 
                      "GO_similarity_CC.csv", "GO_similarity_BP.csv", go_folder])

        ## Annotated by protein UniProt ID
        go_protein_similarity = pd.read_csv(go_folder+"GO_similarity.csv", index_col=0).fillna(0.)

        cols, ids = [], []
        for x in go_protein_similarity.columns:
            idnames = protein_db.loc[x]["Target"].split(";")
            nb = len(idnames)
            cols += [x]*nb
            ids += idnames
        go_similarity_df = go_protein_similarity.loc[cols][cols]
        go_similarity_df.columns = ids
        go_similarity_df.index = ids
        
        ## Combine similarity from all targets for all drugs
        go_similarity_df = go_similarity_df.groupby(level=0).mean().T.groupby(level=0).mean().T
        go_similarity_df.to_csv(go_sim_fname)
    
compute_GO_similarity(go_sim_fname, go_folder, protein_db)
go_similarity_df = pd.read_csv(go_sim_fname, index_col=0)
go_similarity_df

Unnamed: 0,DB00005,DB00013,DB00014,DB00015,DB00022,DB00031,DB00041,DB00046,DB00047,DB00051,...,DB09310,DB09331,DB09343,DB09462,DB09517,DB09568,DB11363,DB11365,DB11595,DB11606
DB00005,0.227313,0.147571,0.134762,0.147714,0.161762,0.139971,0.176175,0.145833,0.145833,0.147429,...,0.072762,0.130476,0.071714,0.062048,0.083429,0.064886,0.141048,0.105476,0.174762,0.138524
DB00013,0.147571,0.316185,0.096000,0.299815,0.131611,0.284156,0.136111,0.178833,0.178833,0.227333,...,0.116222,0.135111,0.114667,0.092400,0.116889,0.093778,0.170333,0.126333,0.203000,0.233111
DB00014,0.134762,0.096000,0.432333,0.083333,0.182333,0.093400,0.167333,0.161167,0.161167,0.099000,...,0.054000,0.083000,0.065333,0.033267,0.089667,0.074933,0.120000,0.096000,0.090333,0.083000
DB00015,0.147714,0.299815,0.083333,0.333593,0.112444,0.289822,0.123370,0.168667,0.168667,0.235444,...,0.127667,0.142667,0.113667,0.093178,0.127444,0.090333,0.163333,0.129444,0.211444,0.267000
DB00022,0.161762,0.131611,0.182333,0.112444,0.401417,0.118400,0.257833,0.143667,0.143667,0.134167,...,0.047333,0.105833,0.044667,0.044233,0.043000,0.064333,0.128833,0.116833,0.150500,0.094833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DB09568,0.064886,0.093778,0.074933,0.090333,0.064333,0.085867,0.071511,0.115967,0.115967,0.103267,...,0.056533,0.113933,0.143800,0.162307,0.101800,0.258133,0.123800,0.081400,0.101733,0.083267
DB11363,0.141048,0.170333,0.120000,0.163333,0.128833,0.174933,0.159111,0.231667,0.231667,0.193667,...,0.104667,0.145000,0.106333,0.111800,0.114000,0.123800,0.403667,0.109000,0.153333,0.127667
DB11365,0.105476,0.126333,0.096000,0.129444,0.116833,0.118333,0.110333,0.143500,0.143500,0.138333,...,0.054333,0.084333,0.103333,0.066667,0.108667,0.081400,0.109000,0.454667,0.154667,0.121667
DB11595,0.174762,0.203000,0.090333,0.211444,0.150500,0.188267,0.176333,0.160000,0.160000,0.214333,...,0.064000,0.166667,0.105667,0.092333,0.102000,0.101733,0.153333,0.154667,0.354667,0.173667


### II.6 Drug-drug similarity 6: Transcriptomic-based

"Given genetic signatures of diseases obtained from gene expression experiments, we used a Jaccard score between every pair of signatures, taking into account the direction of the response of each gene. That is, the total number of mutual upregulated genes and mutual downregulated genes over the unified list of all genes. Signature genes with inconsistent regulation directionality for the same disease across various experiments (i.e., registered as both upregulated and downregulated across various experiments for the same disease) were filtered, allowing for up to 10% expression measurement errors." 

This yields a dataframe `go_similarity_df` of size #drugs x #drugs, where identifiers are PubChem or DrugBank identifiers, which contains the similarity between Gene Onthology annotations.

Signatures were obtained in the notebook TRANSCRIPT_dataset.ipynb.

In [12]:
genetic_fname = predict_folder+"signature_similarity_df.csv"

def compute_genetic_similarity(df, genetic_fname):
    from sklearn.metrics import pairwise_distances
    ## Look only at the direction of differential expression
    df[df<0] = -1
    df[df>0] = 1
    # sum of mutual up/down-regulated genes divided by total number of genes
    Dists = (df.shape[0]-pairwise_distances(df.T, metric='l1'))/df.shape[0] 
    genetic_similarity_df = pd.DataFrame(Dists, index=df.columns, columns=df.columns)
    genetic_similarity_df.to_csv(genetic_fname)
    
S = pd.read_csv(paths_global.data_folder+"TRANSCRIPT/all_drugs_+LINCS.csv", index_col=0, header=0)
compute_genetic_similarity(S, genetic_fname)
genetic_similarity_df = pd.read_csv(genetic_fname, index_col=0)
genetic_similarity_df

Unnamed: 0,DB00091,DB00121,DB00126,DB00130,DB00133,DB00136,DB00158,DB00163,DB00177,DB00181,...,DB09213,DB09256,DB09324,DB09462,DB09477,DB09555,DB09570,DB11582,DB13740,DB14126
DB00091,1.000000,1.000000,0.921566,0.922298,0.920834,0.923364,0.923364,0.933617,0.921699,0.935482,...,0.325255,0.323923,0.324988,0.321127,0.325255,0.323390,0.325122,0.321526,0.323257,0.322991
DB00121,1.000000,1.000000,0.921566,0.922298,0.920834,0.923364,0.923364,0.933617,0.921699,0.935482,...,0.325255,0.323923,0.324988,0.321127,0.325255,0.323390,0.325122,0.321526,0.323257,0.322991
DB00126,0.921566,0.921566,1.000000,0.926560,0.931620,0.930421,0.925894,0.939610,0.927692,0.942806,...,0.338971,0.334044,0.340169,0.346961,0.341900,0.338571,0.341234,0.341634,0.343099,0.342433
DB00130,0.922298,0.922298,0.926560,1.000000,0.924496,0.927159,0.924629,0.935548,0.925894,0.937679,...,0.337173,0.338238,0.337572,0.339037,0.333844,0.334510,0.339836,0.337173,0.335575,0.336374
DB00133,0.920834,0.920834,0.931620,0.924496,1.000000,0.927692,0.924096,0.939410,0.928158,0.943006,...,0.336507,0.336108,0.338238,0.336374,0.336640,0.332512,0.338771,0.340502,0.335974,0.336241
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DB09555,0.323390,0.323390,0.338571,0.334510,0.332512,0.335442,0.336907,0.337706,0.336041,0.339037,...,0.336174,0.390505,0.340302,0.385578,0.366269,1.000000,0.302217,0.385312,0.343099,0.334576
DB09570,0.325122,0.325122,0.341234,0.339836,0.338771,0.337572,0.344231,0.343165,0.337772,0.342899,...,0.320727,0.350556,0.396498,0.414342,0.366269,0.302217,1.000000,0.366136,0.441641,0.420867
DB11582,0.321526,0.321526,0.341634,0.337173,0.340502,0.339836,0.337839,0.344497,0.345895,0.340902,...,0.350556,0.305014,0.340835,0.370664,0.349224,0.385312,0.366136,1.000000,0.377988,0.387842
DB13740,0.323257,0.323257,0.343099,0.335575,0.335974,0.336773,0.334110,0.341967,0.341634,0.339836,...,0.315267,0.357081,0.359611,0.413143,0.318730,0.343099,0.441641,0.377988,1.000000,0.443105


## III. Disease-disease similarity database

### III.1 Disease-disease similarity: Phenotype disease-disease similarities

Identifying similarity between MeSH terms appearing in the medical description of diseases from the OMIM database. 

"We used the phenotypic similarity constructed by [van Driel et al (2006)](https://www.nature.com/articles/5201585.pdf). The phenotypic similarity was constructed by identifying similarity between MeSH terms (Lipscomb, 2000) appearing in the medical description of diseases from the OMIM database (Hamosh et al, 2002)". We actually use a posterior package for computing similarities, described in [this paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-020-03583-6).

In order to run this code, you need to sign up to [a UMLS Terminology Services Account](https://uts.nlm.nih.gov/uts/signup-login) (with your academic e-mail address for instance). The review of the registration application takes up to 3 business days. Then
1. Download file [public_mm_linux_main_2020.tar.bz2](https://data.lhncbc.nlm.nih.gov/umls-restricted/ii/tools/MetaMap/download/public_mm_linux_main_2020.tar.bz2) once logged in
2. Place that file in folder `~/`

For more information about MetaMap, please check the [documentation](https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/README.html#Getting%20the%20distribution).

This yields a dataframe `disease_phenotype_similarity_df.csv` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains the similarity between disease phenotype annotations.

In [13]:
if (not os.path.exists(predict_folder+"disease_phenotype_similarity_df.csv")):

    A = utils.load_dataset("FEATURELESS", save_folder=paths_global.data_folder)["ratings_mat"]
    sb.Popen(["mkdir", "-p", predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/"])

    ## Retrieve MeSH terms from MedGen Concept IDs
    if (not os.path.exists(predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/disease_phenotype_MeshTERMS.out")):
        with open("diseases.txt", "w+") as f:
            f.write(','.join(list(A.columns)))  
        process = sb.Popen("bash ../src/Bashscript_PHENOsim.sh")
        process.wait()
        sb.Popen(["mv", "disease_phenotype_MeshTERMS.out", predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/"])

    disease_MeshTERMS = pd.read_csv(predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/disease_phenotype_MeshTERMS.out", index_col=None, header=None)
    disease_MeshTERMS.columns = ["Disease UMLS","MeSH ID"]
    disease_MeshTERMS = disease_MeshTERMS.loc[disease_MeshTERMS["MeSH ID"].astype(str)!=" None"]
    disease_MeshTERMS["MeSH ID"] = [x[2:-1] for x in list(disease_MeshTERMS["MeSH ID"])]
    disease_MeshTERMS.index = list(disease_MeshTERMS["MeSH ID"])
    disease_MeshTERMS["Disease UMLS"] = [x[1:-1] for x in list(disease_MeshTERMS["Disease UMLS"])]
    mesh_terms = list(disease_MeshTERMS["MeSH ID"])

    ## Compute MeSH term similarity
    if (not os.path.exists(predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/disease_phenotype_similarity.out")):
        method, category = "lin", "C"
        cmd = ["source ~/miniconda3/etc/profile.d/conda.sh"]
        cmd += ["conda activate pysim"]
        cmd += ["~/public_mm/bin/skrmedpostctl start"]
        cmd += ["~/public_mm/bin/wsdserverctl start"]
        cmd_py = ["diseases_=[\""+"\",\"".join(mesh_terms)+"\"]"]
        cmd_py += ["from pyMeSHSim.Sim.similarity import termComp"]
        cmd_py += ["import pandas as pd"]
        cmd_py += ["import numpy as np"]
        cmd_py += ["simCom = termComp()"]
        cmd_py += ['calcSim = lambda xii,xjj: simCom.termSim(dui1=xii, dui2=xjj, method=\"'+method+'\", category=\"'+category+'\")']
        cmd_py += ['exec("def calcSimWrapper(xii, xjj):\\n\\ttry:\\n\\t\\treturn calcSim(xii,xjj)\\n\\texcept:\\n\\t\\treturn 0\\n")']
        cmd_py += ["simmat = pd.DataFrame([],index=diseases_)"]
        cmd_py += ['exec("for i, xi in enumerate(list(set(diseases_))):\\n\\tsimmat[xi] = [0]*(i+1)+[calcSimWrapper(xi,xj) for j, xj in enumerate(diseases_[(i+1):])]\\n\\n")']
        cmd_py += ['simmat.to_csv("disease_phenotype_similarity.out")']
        cmd += ["python3 -c \'"+";".join(cmd_py)+"\'"]
        cmd += ["~/public_mm/bin/wsdserverctl stop"]
        cmd += ["~/public_mm/bin/skrmedpostctl stop"]
        cmd += ["conda deactivate"]
        sb.Popen([y for x in cmd for y in x.split(" ")+[";"]])
        sb.Popen(["mv", "disease_phenotype_similarity.out", predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/"])

    simmat = pd.read_csv(predict_folder+"DISEASE_PHENOTYPE_SIMILARITY/disease_phenotype_similarity.out", index_col=0)
    disease_phenotype_similarity_df = simmat[mesh_terms]
    disease_phenotype_similarity_df += disease_phenotype_similarity_df.T
    for i in range(disease_phenotype_similarity_df.shape[0]):
        disease_phenotype_similarity_df.iloc[i, i] =1 
    disease_phenotype_similarity_df = disease_phenotype_similarity_df.loc[~disease_phenotype_similarity_df.index.duplicated()]
    disease_phenotype_similarity_df = disease_phenotype_similarity_df.T.loc[~disease_phenotype_similarity_df.columns.duplicated()].T

    cols, ids = [], []
    for x in mesh_terms:
        idnames = str(disease_MeshTERMS.loc[x]["Disease UMLS"]).split("\n")
        if (len(idnames)>1):
            idnames = idnames[:-1]
        nb = len(idnames)
        cols += [x]*nb
        ids += idnames

    disease_phenotype_similarity_df = disease_phenotype_similarity_df.loc[cols][cols]
    disease_phenotype_similarity_df.index = ids
    disease_phenotype_similarity_df.columns = ids
    disease_phenotype_similarity_df.to_csv(predict_folder+"disease_phenotype_similarity_df.csv")

disease_phenotype_similarity_df = pd.read_csv(predict_folder+"disease_phenotype_similarity_df.csv", index_col=0)
disease_phenotype_similarity_df

Unnamed: 0,C0007102,D003550 C0010674,D003550 C0392164,C0079773,C0003873,C0003872,C0038013,C0034013,C0014175,C0524910,...,C2930981,C0149530,C1865810,C1838261,C1832605,C1832474,C1866519,C1866041,C1866040,C1864068
C0007102,1.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
D003550 C0010674,0.0,1.0,1.0,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.462672,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
D003550 C0392164,0.0,1.0,1.0,0.00000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.462672,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
C0079773,0.0,0.0,0.0,1.00000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.403220,0.403220,0.403220,0.403220,0.403220,0.403220,0.403220
C0003873,0.0,0.0,0.0,0.00000,1.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.672742,0.672742,0.672742,0.672742,0.672742,0.672742,0.672742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C1832474,0.0,0.0,0.0,0.40322,0.672742,0.0,0.0,0.427785,0.0,0.0,...,0.0,0.0,0.000000,2.000000,2.000000,1.000000,2.000000,2.000000,2.000000,2.000000
C1866519,0.0,0.0,0.0,0.40322,0.672742,0.0,0.0,0.427785,0.0,0.0,...,0.0,0.0,0.000000,2.000000,2.000000,2.000000,1.000000,2.000000,2.000000,2.000000
C1866041,0.0,0.0,0.0,0.40322,0.672742,0.0,0.0,0.427785,0.0,0.0,...,0.0,0.0,0.000000,2.000000,2.000000,2.000000,2.000000,1.000000,2.000000,2.000000
C1866040,0.0,0.0,0.0,0.40322,0.672742,0.0,0.0,0.427785,0.0,0.0,...,0.0,0.0,0.000000,2.000000,2.000000,2.000000,2.000000,2.000000,1.000000,2.000000


### III.2 Disease-disease similarity: Semantic phenotypic disease-disease similarities 

"Hierarchical structure of the HPO together with the mapping provided by HPO between ontology nodes and OMIM diseases to construct a semantic similarity score based on Resnik."

This yields a dataframe `disease_semantic_similarity_df.csv` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains the semantic similarity score between disease ontologies. We use scripts from the following [repository](https://github.com/DGRC-PT/HPOSim_Helper_Scripts).

In [14]:
semantic_folder = predict_folder+"DISEASE_SEMANTIC_SIMILARITY/"

if (not os.path.exists(predict_folder+"disease_semantic_similarity_df.csv")):

    A = utils.load_dataset("FEATURELESS", save_folder=paths_global.data_folder)["ratings_mat"]
    diseases = list(A.columns) # MedGen Concept IDs
    disease_list = list(set([di_medgenid2diseasename[disease.split(".")[0]] for disease in diseases]))

    inv_map = {v:k for k, v in di_omimid2diseasename.items()}
    disease_list_ = ["OMIM:"+(inv_map.get(d)[1:]) for d in disease_list if (d in inv_map)]

    if (not os.path.exists(semantic_folder+"disease_semantic_similarity.out")):

        process = sb.Popen("bash ../src/Bashscript_SEMANTICsim.sh".split(" "))
        process.wait()

        with open("diseases.txt", "w") as f:
            f.write(",".join(disease_list_)+"\n")

        sb.Popen(("mkdir -p "+semantic_folder).split(" "))
        cmd_R = ["library(HPOSim);ls<-unlist(read.delim(\"diseases.txt\", sep=\",\", header=F))"]
        cmd_R += ["mat<-getDiseaseListSim(ls,combinemethod=\"funSimMax\",method=\"Resnik\",verbose=TRUE)"]
        cmd_R += ["write.csv(mat,\"disease_semantic_similarity.out\")"]
        sb.Popen("R -e '"+(";".join(cmd_R)))

        sb.Popen("rm diseases.txt".split(" "))
        sb.Popen(("mv disease_semantic_similarity.out "+semantic_folder).split(" "))

    disease_semantic_similarity_df = pd.read_csv(semantic_folder+"disease_semantic_similarity.out", index_col=0)
    disease_semantic_similarity_df.index = disease_list_
    disease_semantic_similarity_df.columns = disease_list_
    disease_list_ = [di_omimid2diseasename["D"+x.split("OMIM:")[-1]] for x in disease_list_]
    invert_map = {v: k for k,v in di_medgenid2diseasename.items()}
    disease_list_ = [invert_map.get(d,None) for d in disease_list_ if (invert_map.get(d,None) is not None)]
    disease_semantic_similarity_df.index = disease_list_
    disease_semantic_similarity_df.columns = disease_list_
    disease_semantic_similarity_df.to_csv(predict_folder+"disease_semantic_similarity_df.csv")
    
disease_semantic_similarity_df = pd.read_csv(predict_folder+"disease_semantic_similarity_df.csv", index_col=0)
disease_semantic_similarity_df

Unnamed: 0,C0020481,C3463897,C1868001,C2749485,C2675609,C0003467,C4551825,C0001973,C1274743,C0238344,...,C1840264,C5193005,C1843773,C1835028,C0745103,C0029456,C0023234,C1861864,C0162309,C1848066
C0020481,4.991208,0.000000,1.119318,1.119318,0.000000,0,1.119318,0.000000,0.000000,1.119318,...,0.0,0.000000,0.000000,1.119318,3.880516,0.000000,0.000000,0.000000,1.119318,0.000000
C3463897,0.000000,0.677136,0.000000,0.677136,0.000000,0,0.677136,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
C1868001,1.119318,0.000000,0.857426,0.857426,0.000000,0,0.410417,0.000000,0.000000,0.410417,...,0.0,0.447009,0.447009,0.410417,0.410417,0.000000,0.000000,0.000000,0.857426,0.447009
C2749485,1.119318,0.677136,0.857426,3.289999,inf,0,1.547995,0.341902,1.360326,1.023991,...,0.0,1.795841,1.348896,1.044522,1.733418,0.701652,inf,1.032564,inf,1.237351
C2675609,0.000000,0.000000,0.000000,inf,3.180463,0,0.000000,0.000000,0.642474,0.000000,...,0.0,1.126739,0.000000,0.000000,0.000000,0.701652,inf,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0029456,0.000000,0.000000,0.000000,0.701652,0.701652,0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.701652,0.000000,0.000000,0.000000,2.073377,0.701652,0.000000,0.000000,0.000000
C0023234,0.000000,0.000000,0.000000,inf,inf,0,0.000000,0.000000,0.354614,0.347833,...,0.0,0.603164,0.000000,0.216186,0.202601,0.701652,2.920103,0.000000,0.000000,0.000000
C1861864,0.000000,0.000000,0.000000,1.032564,0.000000,0,0.000000,0.000000,0.000000,0.000000,...,0.0,1.032564,0.000000,0.000000,1.032564,0.000000,0.000000,2.236650,1.032564,0.000000
C0162309,1.119318,0.000000,0.857426,inf,0.000000,0,0.644520,0.341902,1.360326,0.534524,...,0.0,0.885896,1.190497,0.428788,0.880471,0.000000,0.000000,1.032564,3.239884,1.226721


### III.3 Disease-disease similarity: Genetic based disease-disease similarities

"Given genetic signatures of diseases obtained from gene expression experiments, we used a Jaccard score between every pair of signatures, taking into account the direction of the response of each gene. That is, the total number of mutual upregulated genes and mutual downregulated genes over the unified list of all genes. Signature genes with inconsistent regulation directionality for the same disease across various experiments (*i*.*e*., registered as both upregulated and downregulated across various experiments for the same disease) were filtered, allowing for up to 10% expression measurement errors." 

Transcriptomic phenotypes were obtained in the notebook *TRANSCRIPT_dataset.ipynb*.

This yields a dataframe `disease_genetic_similarity_df.csv` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains the Jaccard similarity score between disease transcriptomic phenotypes.

Similar to Section [**II.6 Drug-drug similarity 6: Transcriptomic-based**](#II.6-Drug-drug-similarity-6:-Transcriptomic-based).

In [15]:
genetic_similarity_fname = paths_global.data_folder+"PREDICT/disease_genetic_similarity_df.csv"

P = pd.read_csv(paths_global.data_folder+"TRANSCRIPT/all_diseases.csv", index_col=0, header=0)
compute_genetic_similarity(P, genetic_similarity_fname)
disease_genetic_similarity_df = pd.read_csv(genetic_similarity_fname, index_col=0)
disease_genetic_similarity_df

Unnamed: 0,C0346629,C2239176,C3553462,C0035235,C0032285,C0010346,C0009324,C0029408,C0001973,C2973725,...,C0275804,C0017168,C0003615,C0003872,C0014544,C0038436,C0040028,C0040034,C0018802,C0036323
C0346629,1.000000,1.000000,0.920887,0.920887,0.926968,0.934652,0.934652,0.931779,0.933316,0.928304,...,0.939262,0.932647,0.937458,0.921422,0.932581,0.927903,0.925297,0.936122,0.935120,0.931511
C2239176,1.000000,1.000000,0.920887,0.920887,0.926968,0.934652,0.934652,0.931779,0.933316,0.928304,...,0.939262,0.932647,0.937458,0.921422,0.932581,0.927903,0.925297,0.936122,0.935120,0.931511
C3553462,0.920887,0.920887,1.000000,1.000000,0.920821,0.921556,0.920219,0.920152,0.929574,0.921622,...,0.933917,0.920486,0.920086,0.922491,0.920954,0.921221,0.920353,0.931445,0.929641,0.931912
C0035235,0.920887,0.920887,1.000000,1.000000,0.920821,0.921556,0.920219,0.920152,0.929574,0.921622,...,0.933917,0.920486,0.920086,0.922491,0.920954,0.921221,0.920353,0.931445,0.929641,0.931912
C0032285,0.926968,0.926968,0.920821,0.920821,1.000000,0.927770,0.927636,0.923694,0.933382,0.924362,...,0.937325,0.927903,0.928438,0.926300,0.928104,0.928772,0.929641,0.933650,0.932113,0.932915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0038436,0.927903,0.927903,0.921221,0.921221,0.928772,0.926567,0.929507,0.926366,0.932981,0.926901,...,0.938928,0.928972,0.928037,0.924161,0.930910,1.000000,0.928705,0.935921,0.933449,0.932915
C0040028,0.925297,0.925297,0.920353,0.920353,0.929641,0.930643,0.927436,0.928571,0.934518,0.925631,...,0.936857,0.930776,0.928371,0.922357,0.927903,0.928705,1.000000,0.933583,0.932848,0.933917
C0040034,0.936122,0.936122,0.931445,0.931445,0.933650,0.933984,0.936122,0.935521,0.947080,0.934585,...,0.952359,0.935186,0.939062,0.931846,0.936723,0.935921,0.933583,1.000000,0.946211,0.944073
C0018802,0.935120,0.935120,0.929641,0.929641,0.932113,0.932313,0.932848,0.932915,0.943004,0.939463,...,0.950555,0.933249,0.936055,0.930309,0.935186,0.933449,0.932848,0.946211,1.000000,0.940799


### III.4 Disease-disease similarity: Gene sequence based disease-disease similarities

This yields a dataframe `disease_sequence_similarity_df.csv` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains, when computable, average drug-pairwise alignment scores between gene target sequences. We use the normalization suggested in [Bleakley and Yamanishi (2009)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2735674/).

Similar to Section [**II.3 Drug-drug similarity 3: (Target) sequence drug-drug similarities**](#II.3-Drug-drug-similarity-3:-(Target)-sequence-drug-drug-similarities).

#### III.4.i Using genes from the disease phenotypes as in [Section III.3](#III.3-Disease-disease-similarity:-Genetic-based-disease-disease-similarities)

If one uses genes from disease "phenotypes" as computed above, then they should use this code:

In [16]:
sb.Popen(["mkdir", "-p", predict_folder+"DISEASE_TARGET_SIMILARITY/"])

if (not os.path.exists(predict_folder+"DISEASE_TARGET_SIMILARITY/disease_db_phenotypes.csv")):

    seq_fname = paths_global.drugbank_folder+"TARGET SEQUENCES/Drug Target Sequences/gene.fasta"
    proteins_list = sb.check_output("cat '"+seq_fname+"' | grep '>drugbank_target|' | cut -d' ' -f1 | cut -d'|' -f2", shell=True).decode("utf-8").split("\n")[:-1]

    protein_fname=paths_global.drugbank_folder+"PROTEIN IDENTIFIERS/Drug Target Identifiers/all.csv"
    protein_db = pd.read_csv(protein_fname, index_col=5).query("Species=='Humans'")[["Gene Name", "Drug IDs"]]
    drug_protein_db = protein_db[["Drug IDs"]].groupby(level=0).apply(lambda x : "; ".join(list(sorted(set(list(x.values.flatten()))))))
    drug_gene_db = protein_db[["Gene Name"]].groupby(level=0).apply(lambda x : "; ".join(list(map(str,set(x.values.flatten())))))
    protein_db = pd.DataFrame([], index=drug_protein_db.index)
    protein_db["Drug IDs"] = drug_protein_db
    protein_db["Gene Name"] = drug_gene_db
    protein_in_list = [x for x in protein_db.index if (x in proteins_list)]
    protein_db = protein_db.loc[protein_in_list]
    protein_db["Protein"] = protein_in_list
    protein_db.index = protein_db["Gene Name"]

    P = pd.read_csv(paths_global.data_folder+"TRANSCRIPT/all_diseases.csv", index_col=0, header=0)
    diseases_in_list = [[P.index[ix] for ix, x in enumerate(list(P[disease])) if (x != 0)] for disease in P.columns]

    disease_db = pd.DataFrame([], index=P.columns, columns=["Protein", "Gene Name"])
    for ip, p in enumerate(P.columns):
        genes = [g for g in diseases_in_list[ip] if (g in protein_db.index)]
        proteins = list(protein_db.loc[genes]["Protein"])
        disease_db.loc[p] = ["; ".join(proteins), "; ".join(genes)]
        
    disease_db.to_csv(predict_folder+"DISEASE_TARGET_SIMILARITY/disease_db_phenotypes.csv")
    
disease_db = pd.read_csv(predict_folder+"DISEASE_TARGET_SIMILARITY/disease_db_phenotypes.csv", index_col=0)

#### III.4.ii Using disease-associated genes from DisGeNet

However, if one truly wants an equivalent to the computations made for drugs, they should use gene targets retrieved from DisGeNet (using package NORDic).

In [17]:
sb.Popen(["mkdir", "-p", predict_folder+"DISEASE_TARGET_SIMILARITY/"])

if (not os.path.exists(predict_folder+"DISEASE_TARGET_SIMILARITY/disease_db_DisGeNet.csv")):
    
    from NORDic.UTILS.DISGENET_utils import get_user_key_DISGENET, get_genes_proteins_from_DISGENET
    A = utils.load_dataset("FEATURELESS", save_folder=paths_global.data_folder)["ratings_mat"]
    disease_list = list(A.columns)
    
    user_key = get_user_key_DISGENET(paths_global.disgenet_file)
    disease_db = get_genes_proteins_from_DISGENET(disease_list, limit=3000, source="CURATED", min_score=0.35, 
                                min_ei=1., min_dsi=0.25, min_dpi=0, chunksize=100, user_key=user_key, quiet=False)
    
    disease_db.to_csv(predict_folder+"DISEASE_TARGET_SIMILARITY/disease_db_DisGeNet.csv")
    
disease_db = pd.read_csv(predict_folder+"DISEASE_TARGET_SIMILARITY/disease_db_DisGeNet.csv", index_col=0)
disease_db

Unnamed: 0_level_0,Protein,Gene Name
diseaseid,Unnamed: 1_level_1,Unnamed: 2_level_1
C0001206,Q96P66; O00170,AIP; GPR101
C0003872,O43734; P29460,IL12B; TRAF3IP2
C0003873,Q13609; Q8N103; P19878; Q7Z698; O95267; P51451...,ACAN; AFF3; AGER; ALOX5; ANKRD55; ANXA3; ARID5...
C0004096,P05305; Q8TAX9; P08887; P05091; Q6K0P9; Q6ZTQ4...,ALDH2; AREG; ARG2; BCL2; CDHR3; CTNNA3; DNAH5;...
C0007102,P26358; P26447; Q9UIF7; Q05209; P01579; Q9UJU2...,AXIN2; BAX; BCL2; BECN1; CBR1; CCAT1; CCND2; D...
...,...,...
C3495438,P78363,ABCA4
C3550234,P50542,PEX5
C3888093,O75342,ALOX12B
C4083045,Q6Y7W6,GIGYF2


In [18]:
disease_db = pd.read_csv(predict_folder+"DISEASE_TARGET_SIMILARITY/disease_db_DisGeNet.csv", index_col=0)
disease_db.to_csv(predict_folder+"DISEASE_TARGET_SIMILARITY/mapping_gene_diseases.csv")

#### III.4.iii Compute gene target sequence similarities

In [19]:
## Get disease-protein associations
assert os.path.exists(predict_folder+"DISEASE_TARGET_SIMILARITY/mapping_gene_diseases.csv")
disease_db = pd.read_csv(predict_folder+"DISEASE_TARGET_SIMILARITY/mapping_gene_diseases.csv", index_col=0)

from Bio import SeqIO
fname = paths_global.drugbank_folder+"TARGET SEQUENCES/Drug Target Sequences/gene.fasta"
## Target sequences
target_sequences = [record.seq for record in SeqIO.parse(fname, "fasta")]
proteins_list = sb.check_output("cat '"+fname+"' | grep '>drugbank_target|' | cut -d' ' -f1 | cut -d'|' -f2", shell=True).decode("utf-8").split("\n")[:-1]
## One-to-one correspondance
protein_ids_per_seq = [[p] for p in proteins_list]
## Consider only proteins associated with diseases in dataset
protein_ids_inA = [y for x in disease_db["Protein"] for y in x.split("; ")]
protein_ids = [p for p in proteins_list if (p in protein_ids_inA)]
print("Found disease-associated proteins %d/%d" % (len(protein_ids), len(protein_ids_inA)))

disease_seq_similarity_file = predict_folder+"disease_sequence_similarity_df.csv"
norm_cst = compute_target_similarity(disease_seq_similarity_file, protein_ids, protein_ids_per_seq, 
                    target_sequences, save_folder=predict_folder+"DISEASE_TARGET_SIMILARITY/", njobs=n_jobs)
disease_sequence_similarity_df = pd.read_csv(disease_seq_similarity_file, index_col=0)/norm_cst

Found disease-associated proteins 926/3333


### III.5 Disease-disease similarity: Closeness in the human PPI network disease-disease similarities

This yields a dataframe `disease_network_similarity_df.csv` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains, when computable, the transformed distance between drug targets in the PPI using the Floyd-Warshall algorithm. We apply to the Floyd-Warshall cost matrix the transformation suggested by the [PREDICT paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159979/).

Similar to Section [**II.4 Drug-drug similarity 4: Closeness in the human PPI network drug-drug similarities**](#II.4-Drug-drug-similarity:-Closeness-in-the-human-PPI-network-drug-target-to-drug-target-similarities).

In [20]:
## Get disease-protein associations
assert os.path.exists(predict_folder+"DISEASE_TARGET_SIMILARITY/mapping_gene_diseases.csv")
disease_db = pd.read_csv(predict_folder+"DISEASE_TARGET_SIMILARITY/mapping_gene_diseases.csv", index_col=0)

protein_list = [y for p in disease_db["Protein"] for y in p.split("; ") if (len(y)>0)]
gene_list = [y for p in disease_db["Gene Name"] for y in p.split("; ")  if (len(y)>0)]
protein_list, gene_list, disease_list = [], [], []
for idx in disease_db.index:
    protein_ls = [y for y in disease_db.loc[idx]["Protein"].split("; ")]
    gene_ls = [y for y in disease_db.loc[idx]["Gene Name"].split("; ")][:len(protein_ls)]
    if (len(protein_ls)!=len(gene_ls)):
        print(idx)
    protein_list += protein_ls
    gene_list += gene_ls
    disease_list += [idx]*len(protein_ls)
di = dict(zip(protein_list, gene_list))
disease_db2 = pd.DataFrame(disease_list, index=protein_list, columns=["Target"])
disease_db2 = pd.DataFrame(disease_db2.groupby(level=0).apply(lambda x : "; ".join(list(x["Target"]))), 
                           columns=["Target"])
disease_db2["Protein"] = disease_db2.index
disease_db2["Gene Name"] = [di[i] for i in disease_db2.index]

disease_network_folder = predict_folder+"DISEASE_NETWORK_SIMILARITY/"
disease_network_similarity_file = predict_folder+"disease_network_similarity_df.csv"

compute_network_similarity(disease_network_folder, disease_network_similarity_file, disease_db2)
disease_network_similarity_file = pd.read_csv(disease_network_similarity_file, index_col=0)

### III.6 Disease-disease similarity: Gene Onthology disease-disease similarities

This yields a dataframe `disease_go_similarity_df` of size #diseases x #diseases, where identifiers are MedGen Concept ID identifiers, which contains the similarity between Gene Onthology annotations.

Similar to Section [**II.5 Drug-drug similarity: Gene Onthology drug-drug similarities**](#II.5-Drug-drug-similarity:-Gene-Onthology-drug-drug-similarities).

In [21]:
## Get disease-protein associations
assert os.path.exists(predict_folder+"DISEASE_TARGET_SIMILARITY/mapping_gene_diseases.csv")
disease_db = pd.read_csv(predict_folder+"DISEASE_TARGET_SIMILARITY/mapping_gene_diseases.csv", index_col=0)

protein_list = [y for p in disease_db["Protein"] for y in p.split("; ") if (len(y)>0)]
gene_list = [y for p in disease_db["Gene Name"] for y in p.split("; ")  if (len(y)>0)]
protein_list, gene_list, disease_list = [], [], []
for idx in disease_db.index:
    protein_ls = [y for y in disease_db.loc[idx]["Protein"].split("; ")]
    gene_ls = [y for y in disease_db.loc[idx]["Gene Name"].split("; ")][:len(protein_ls)]
    if (len(protein_ls)!=len(gene_ls)):
        print(idx)
    protein_list += protein_ls
    gene_list += gene_ls
    disease_list += [idx]*len(protein_ls)
di = dict(zip(protein_list, gene_list))
disease_db2 = pd.DataFrame(disease_list, index=protein_list, columns=["Target"])
disease_db2 = pd.DataFrame(disease_db2.groupby(level=0).apply(lambda x : "; ".join(list(x["Target"]))), 
                           columns=["Target"])
disease_db2["Protein"] = disease_db2.index
disease_db2["Gene Name"] = [di[i] for i in disease_db2.index]
disease_db2 = disease_db2.loc[[s for s in disease_db2.index if (s!="None")]]

disease_go_folder = predict_folder+"DISEASE_GO_SIMILARITY/"
disease_go_sim_fname = predict_folder+"disease_go_similarity_df.csv"

try:
    compute_GO_similarity(disease_go_sim_fname, disease_go_folder, disease_db2)
except:
    GO_similarity = pd.read_csv(predict_folder+"DISEASE_GO_SIMILARITY/GO_similarity.csv", index_col=0)
    GO_similarity = GO_similarity.loc[[s for s in GO_similarity.index if (len(str(s).split("."))==1 and (str(s)!="None"))]]
    GO_similarity = GO_similarity[[s for s in GO_similarity.columns if (len(str(s).split("."))==1 and (str(s)!="None"))]]
    GO_similarity.to_csv(predict_folder+"DISEASE_GO_SIMILARITY/GO_similarity.csv")
    compute_GO_similarity(disease_go_sim_fname, disease_go_folder, disease_db2)
disease_go_similarity_df = pd.read_csv(disease_go_sim_fname, index_col=0)
disease_go_similarity_df

Unnamed: 0,C0001126,C0001418,C0001627,C0001973,C0002170,C0002171,C0002395,C0002736,C0002874,C0002895,...,C4310232,C4310803,C4551482,C4551825,C4551864,C4551906,C4551951,C4551981,C4552070,C4721891
C0001126,0.354889,0.080994,0.054500,0.100187,0.104556,0.076778,0.113359,0.081333,0.078222,0.085333,...,0.061556,0.087889,0.042750,0.136333,0.052611,0.057947,0.109444,0.098889,0.114111,0.164444
C0001418,0.080994,0.157003,0.122944,0.112458,0.127407,0.110049,0.146031,0.120355,0.155404,0.146037,...,0.130926,0.093185,0.119764,0.090796,0.130870,0.079229,0.167463,0.083417,0.145593,0.137741
C0001627,0.054500,0.122944,0.264167,0.108523,0.131333,0.102889,0.123090,0.098500,0.125967,0.133500,...,0.110500,0.083333,0.119583,0.070000,0.094000,0.081228,0.164333,0.063667,0.115000,0.103667
C0001973,0.100187,0.112458,0.108523,0.145978,0.115674,0.094727,0.129338,0.106611,0.109800,0.118747,...,0.167682,0.102288,0.093288,0.099955,0.092333,0.073234,0.133500,0.086780,0.128015,0.118136
C0002170,0.104556,0.127407,0.131333,0.115674,0.246750,0.115944,0.126628,0.106444,0.123500,0.116722,...,0.110000,0.136667,0.114417,0.077000,0.114500,0.066658,0.129000,0.079917,0.160667,0.107000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C4551906,0.057947,0.079229,0.081228,0.073234,0.066658,0.069076,0.080167,0.084649,0.099375,0.070667,...,0.054965,0.042614,0.075149,0.067649,0.093325,0.206158,0.092246,0.069675,0.071018,0.087456
C4551951,0.109444,0.167463,0.164333,0.133500,0.129000,0.120889,0.183385,0.141333,0.174733,0.176333,...,0.162333,0.089000,0.157583,0.125667,0.126000,0.092246,0.511667,0.112333,0.151667,0.205000
C4551981,0.098889,0.083417,0.063667,0.086780,0.079917,0.087278,0.108141,0.084222,0.081700,0.089722,...,0.090000,0.075500,0.070958,0.104833,0.072667,0.069675,0.112333,0.239417,0.080833,0.095167
C4552070,0.114111,0.145593,0.115000,0.128015,0.160667,0.143111,0.158872,0.136167,0.135800,0.144667,...,0.133333,0.162667,0.099500,0.099333,0.112333,0.071018,0.151667,0.080833,0.523000,0.144667


## IV. Build final  drug-drug, disease-disease and drug-disease matrices

Disease ids: Concept IDs, drug ids: DrugBank or PubChem CIDs. Build matrices A (ratings) drugs $\times$ diseases, P (disease features) disease features $\times$ diseases, S (drug features) drug features $\times$ drugs, with NaN for missing values.

### IV.1. Merge matrices

In [22]:
## Drug-disease associations
A = utils.load_dataset("FEATURELESS", save_folder=paths_global.data_folder)["ratings_mat"]

## Drug-drug similarities
ftypes = ["chemical", "se", "sequence", "network", "go", "signature"]
## Disease-disease similaries
ftypes += ["disease_"+x for x in ["phenotype", "semantic", "sequence", "network", "go", "phenotype"]]

fnames = [predict_folder+x+"_similarity_df.csv" for x in ftypes]
fdict = {ftype: fnames[ift] for ift, ftype in enumerate(ftypes)}

for ftype in fdict:
    df = pd.read_csv(fdict[ftype], index_col=0, header=0)
    df.index = [ftype+"-"+str(x) for x in df.index]
    globals()[ftype] = df
    
## Concatenate all features
S = pd.concat([eval(ftype) for ftype in ftypes if ("disease" not in ftype)], axis=0, join="outer")
P = pd.concat([eval(ftype) for ftype in ftypes if ("disease" in ftype)], axis=0, join="outer")

## Restrict to diseases in A (association matrix)
P = P.drop_duplicates(keep="first")[list(set([p for p in A.columns if (p in P.columns)]))]
P_PREDICT = P.loc[~P.index.duplicated()]

## Restrict to drugs in A (association matrix)
S = S.drop_duplicates(keep="first")[list(set([x for x in list(A.index) if (x in S.columns)]))]
S_PREDICT = S.loc[~S.index.duplicated()]

## Restrict association matrix
A = A[P.columns].loc[S.columns]
A_PREDICT = A.loc[~A.index.duplicated()]

S_PREDICT.to_csv(predict_folder+"items.csv")
P_PREDICT.to_csv(predict_folder+"users.csv")
A_PREDICT.to_csv(predict_folder+"ratings_mat.csv")

ratings_A = utils.matrix2ratings(A_PREDICT, "ind_id", "drug_id", "rating")
print("Sparsity = "+str(utils.compute_sparsity(A_PREDICT))+"%")
print("%d drug features %d disease features" % (S_PREDICT.shape[0], P_PREDICT.shape[0]))
utils.print_dataset(ratings_A, "ind_id", "drug_id", "rating")
ratings_A.T

Sparsity = 0.34230379461772326%
6265 drug features 2914 disease features
Ndrugs=1351	Ndiseases=1066
5624 positive	152 negative	1434390 unknown matchings


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5766,5767,5768,5769,5770,5771,5772,5773,5774,5775
ind_id,C0279702,C0032463,C2930802,C0011608,C3245525,C0268242,C0043388,C0030593,C1276801,C0007131,...,C0006840,C2713442,C1969710,C1845336,C1839927,C1842632,C0700345,C0023234,C0030593,C0007131
drug_id,DB09276,DB08871,DB01262,DB01262,DB01262,DB00591,DB00591,DB00591,DB00591,DB00591,...,DB00776,DB00776,DB00776,DB00776,DB00776,DB00776,DB00776,DB00547,DB00547,DB00547
rating,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [23]:
dataset = utils.load_dataset("PREDICT", save_folder=paths_global.data_folder)
[(d, dataset[d].shape) for d in dataset]

[('ratings_mat', (1577, 1070)),
 ('users', (2914, 1070)),
 ('items', (6265, 1577))]

### IV.2. Generating matrices which can be shared 

In [24]:
drugs, diseases = list(dataset["ratings_mat"].index), list(dataset["ratings_mat"].columns)

restrict = lambda df, ls : df[[s for s in ls if (s in df.columns)]]

for matrix_name in ["se","signature","disease_phenotype","disease_semantic"]:
    df = restrict(eval(matrix_name), drugs if ("disease" not in matrix_name) else diseases)
    df.to_csv(matrix_name+"_PREDICT_matrix.csv")