# Creating the "TRANSCRIPT" dataset based on transcriptomics

Version 2.0.0 (May 29th 2023). Please run the notebook "FEATURELESS_dataset.ipynb" beforehand.

## Libraries

In [1]:
import numpy as np
import pandas as pd
import subprocess as sb
import os

from time import sleep
from tqdm import tqdm
import requests
import pickle
import json

import sys
sys.path.insert(0, "../src/")

import utils
import paths_global
import data_processing

from joblib import Parallel, delayed
from multiprocessing import cpu_count
## Number of parallel workers: all cores but two, with at least 1 and at most 3 workers
njobs=min(max(1,cpu_count()-2),3)

## Use DrugBank IDs (True) or PubChem CIDs (False) as drug identifiers in the drug feature matrix?
use_drugbank_ids = True

## Local paths

In [2]:
## Display the folders configured in module paths_global
## Where database files are stored
print('root_folder="%s"' % paths_global.root_folder)
## Where intermediary files are stored
print('data_folder="%s"' % paths_global.data_folder)

root_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/data/"
data_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/RECeSS/cfdr/data/"


In [3]:
## Where TRANSCRIPT dataset files are stored
transcript_folder = paths_global.data_folder+"TRANSCRIPT/"
## Create the folder (and its parents) synchronously, so that it is guaranteed to
## exist before any later cell writes into it; nothing happens if it already exists
os.makedirs(transcript_folder, exist_ok=True)
print('transcript_folder="%s"' % transcript_folder)

transcript_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/RECeSS/cfdr/data/TRANSCRIPT/"


## Drug and disease identifiers

In [4]:
## Mandatory mappings: the notebook stops here if one of the files is missing
drugbank_pck = paths_global.data_folder+"drugbankid2drugname.pck"
assert os.path.exists(drugbank_pck)
with open(drugbank_pck, "rb") as f:
    di_drugbankid2drugname = pickle.load(f)

omim_pck = paths_global.data_folder+"omimid2diseasename.pck"
assert os.path.exists(omim_pck)
with open(omim_pck, "rb") as f:
    di_omimid2diseasename = pickle.load(f)

## Optional mappings: an empty dictionary is used when the file does not exist
cids_file = paths_global.data_folder+"medgenid2diseasename.pck"
di_medgenid2diseasename = {}
if (os.path.exists(cids_file)):
    with open(cids_file, "rb") as f:
        di_medgenid2diseasename = pickle.load(f)

pubchem_file = paths_global.data_folder+"pubchemid2drugname.pck"
di_pubchemid2drugname = {}
if (os.path.exists(pubchem_file)):
    with open(pubchem_file, "rb") as f:
        di_pubchemid2drugname = pickle.load(f)

## I. Matrix A : $N_S \times N_D$ of drug-disease associations

In [5]:
## Load the drug-disease association matrix stored in the FEATURELESS dataset
## (as displayed below: drug identifiers as rows, disease concept identifiers as columns)
A = utils.load_dataset("FEATURELESS", save_folder=paths_global.data_folder)['ratings_mat']
A

Unnamed: 0,C0272275,C0585362,C3163899,C1319317,C0280324,C0007102,C0010674,C0079773,C0003873,CN263340,...,C1865810,C1848042,C1838261,C1832605,C1832474,C1866519,C1866041,C1866040,C1864068,C5203670
CID104999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID442021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CID442872,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB13415,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DB16355,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB16393,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB16394,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DB16416,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
## Convert the association matrix into a table with columns "ind_id", "drug_id" and "rating"
ratings_A = utils.matrix2ratings(A, "ind_id", "drug_id", "rating")

## Summary of the association matrix (sparsity, numbers of drugs, diseases and matchings)
print("Sparsity = "+str(utils.compute_sparsity(A))+"%")
utils.print_dataset(ratings_A, "ind_id", "drug_id", "rating")
## Display the table transposed (one column per association)
ratings_A.T

Sparsity = 0.33728805072386664%
Ndrugs=1600	Ndiseases=1576
8397 positive	225 negative	2512978 unknown matchings


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8612,8613,8614,8615,8616,8617,8618,8619,8620,8621
ind_id,C0006840,C0006840,C0006840,C0006840,C0149893,C0035235,C0339170,C0042510,C0149782,C0279639,...,C1851649,C1851649,C1851649,C1851649,C1851649,C1851649,C1851649,C1851649,C1851649,C1851649
drug_id,CID104999,CID442021,CID442872,DB13415,DB00001,DB00002,DB00002,DB00002,DB00002,DB00002,...,DB14761,DB15661,DB15718,DB15940,DB15941,DB16355,DB16393,DB16394,DB16416,DB16691
rating,-1,-1,-1,-1,1,1,1,1,1,1,...,-1,1,-1,1,1,1,1,1,1,1


## II. Build matrix P : $N_F \times N_D$ of disease features

Here, features are disease "differential" phenotypes, that is vectors of genewise expression changes due to disease (by comparing differential expression between patient and healthy groups). We use [CREEDS](https://maayanlab.cloud/CREEDS/) (free access, without registration), which reports manually curated disease phenotypes and crowd-sourced phenotypes (see [paper](https://www.nature.com/articles/ncomms12846)).

### II.a. Conversion of genes into their corresponding human orthologs

Sometimes the phenotypes are computed on animal models, hence the need to convert the genes of that species to their (one-to-one) human orthologs. Here, besides human data, we only consider mouse models.
- go to https://www.ensembl.org/biomart/martview/
- select in "Dataset": "Mouse genes (GRCm39)"
- "Filters": None
- "Attributes": 
	Gene > "Gene Stable ID", "Gene name"
	Homologues > Human > "Human gene stable ID", "Human homology type", "Human gene name"
- select "Results" and "Go"

In [7]:
## Where the orthologs files are stored
## (the next cell reads "mouse_human_matchings.tsv" from this folder)
print('orthologs_folder="%s"' % paths_global.orthologs_folder)

orthologs_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/data/orthologs/"


In [8]:
## Dictionary {mouse gene name: human gene name}, restricted to one-to-one orthologs
mouse_human = pd.read_csv(paths_global.orthologs_folder+"mouse_human_matchings.tsv", sep='\t', header=0, index_col=None)
mouse_human = (
    mouse_human
    .loc[mouse_human["Human homology type"]=="ortholog_one2one", ["Gene name","Human gene name"]]
    .set_index("Gene name")["Human gene name"]
    .to_dict()
)

### II.b. Retrieval of differential phenotypes

In [9]:
## Where the CREEDS files are stored
## (the next cell downloads the CREEDS JSON files into this folder when they are missing)
print('creeds_folder="%s"' % paths_global.creeds_folder)

creeds_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/data/CREEDS/"


In [10]:
## Letter used in CREEDS file names for each type of curation
type_di = {"manual": "v", "automatic": "p"}
## Families of CREEDS signatures to download
fs = ["disease_signatures", "single_drug_perturbations"]
base_url = "https://maayanlab.cloud/CREEDS/download/"

## Create the target folder without going through a shell
os.makedirs(paths_global.creeds_folder, exist_ok=True)
for f in fs:
    for kt in type_di:
        fname = f+"-"+type_di[kt]+"1.0.json"
        if (not os.path.exists(paths_global.creeds_folder+fname)):
            ## Arguments are passed as a list: no shell is involved, paths need no quoting
            wget_status = sb.call(["wget", "-O", paths_global.creeds_folder+fname, base_url+fname])
            if (wget_status != 0):
                ## "wget -O" creates the output file even when the download fails:
                ## remove it, otherwise the next run would consider the file as present
                if (os.path.exists(paths_global.creeds_folder+fname)):
                    os.remove(paths_global.creeds_folder+fname)
                raise RuntimeError("Download of %s failed (wget exit status %d)" % (base_url+fname, wget_status))

def iterator_json(fname, idi):
    """Read one record of a JSON file without loading the whole file in Python.

    The content of the file is cut at every "}" character (each "}" is replaced
    by a line break with sed), and line number idi+1 of the result is selected
    with head and tail. The first and last characters of that line are dropped,
    the characters '"}' are appended, and the resulting text is parsed as JSON.
    The whole file is streamed again at every call.

    Parameters
    ----------
    fname : str
        path to the JSON file
    idi : int
        0-based position of the selected line

    Returns
    -------
    content : object or None
        the parsed record, or None if the selected line is empty

    Raises
    ------
    json.JSONDecodeError
        if the rebuilt text is not valid JSON
    """
    ## Local import so that the cell does not depend on another cell
    import shlex
    ## The path is quoted so that spaces or shell metacharacters in it do not break the pipeline
    cmd = "cat "+shlex.quote(fname)+" | sed 's/\\}/\\n/g' | head -n"+str(idi+1)+" | tail -n1"
    ## [:-1] drops the trailing line break of the command output
    res = sb.check_output(cmd, shell=True).decode("utf-8")[:-1]
    if (len(res) == 0):
        return None
    ## Drop the leading separator and the last character, then close the record again
    res = res[1:-1]+"\"}"
    content = json.loads(res)
    return content

## Build the matrix of manually curated disease signatures (genes x diseases),
## restricted to diseases in A, unless it was already saved in P_manual.csv
if (not os.path.exists(transcript_folder+"P_manual.csv")):
    P_manual = {}
    for k in ["manual"]:
        fname=paths_global.creeds_folder+"disease_signatures-"+type_di[k]+"1.0.json"
        ## NOTE(review): reading starts at position 1, so the record at position 0
        ## of the file is never read -- confirm that this is intended
        ii = 1
        while (True):
            content = iterator_json(fname, ii)
            if (content is None):
                ## empty line: stop reading
                break
            disease_cid = content["umls_cui"]
            organism = content["organism"]
            print((disease_cid, organism))
            ii += 1
            if (organism not in ["human", "mouse"]):
                continue
            if ((disease_cid not in A.columns) or (disease_cid in P_manual)):
                ## disease absent from A, or signature already retrieved for this disease
                continue
            if (organism == "human"):
                sig = dict(content['down_genes']+content['up_genes'])
            else:
                ## mouse: keep genes with a one-to-one human ortholog, under their human name
                sig = dict([[mouse_human[g],gi] for g,gi in content['down_genes']+content['up_genes'] if (g in mouse_human)])
            P_manual[disease_cid] = sig
    pd.DataFrame(P_manual).fillna(0.).to_csv(transcript_folder+"P_manual.csv")
P_manual = pd.read_csv(transcript_folder+"P_manual.csv", index_col=0)
P_manual

Unnamed: 0,C0028043,C0011860,C0023418,C0033860,C0032460,C0004096,C0025202,C0236780,C0003873,C0020474,...,C0017168,C0003615,C0003872,C0014544,C0024419,C0038436,C0040028,C0040034,C0018802,C0036323
BEX1,-0.087223,0.0,0.0,0.0,0.147680,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000e+00
ZNF423,-0.077739,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000e+00
NME1,-0.063306,0.0,0.0,0.0,0.021828,0.0,0.0,0.000000,0.0,0.0,...,0.027088,0.022987,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000e+00
ISL1,-0.061242,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000e+00
TUBB2B,-0.060581,0.0,0.0,0.0,0.000000,0.0,0.0,0.018135,0.0,0.0,...,0.000000,0.000000,0.0,0.070766,0.0,0.0,0.0,0.0,0.0,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LCN8,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.553410e-10
ACSF2,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.527251e-10
WDR55,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.523806e-10
MBD3L1,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.516381e-10


In [11]:
## Build the matrix of crowd-sourced disease signatures (genes x diseases),
## restricted to diseases in A, unless it was already saved in P_auto.csv
if (not os.path.exists(transcript_folder+"P_auto.csv")):
    P_auto = {}
    seen_diseases = []
    ## Labels which do not designate a specific disease
    unspecified_diseases = ["Syndrome", "incurable diseases","Virus Diseases"]
    unspecified_diseases += ["Possessed","Disease","Disease Progression","disease transmission"]
    ## Labels for which the lookup of a concept ID raised an error,
    ## along with the label which is queried instead
    renamed_diseases = {
        "Seryl-tRNA synthetase, mitochondrial": "Seryl-tRNA synthetase",
        "Alzheimer's Disease Pathway KEGG": "Alzheimer Disease",
        "Metastatic Renal Cell Cancer": "Renal Cell Cancer",
        "1-acylglycerol-3-phosphate O-acyltransferase ABHD5": "ABHD5",
        "Lipoma-preferred partner": "Lipoma",
        "Refractory anaemia with excess blasts": "Refractory cytopenias",
    }
    ## Labels for which the lookup raised an error and which are ignored (unspecified)
    ignored_diseases = ["S-adenosyl-L-methionine", "metaplastic cell transformation", "lipophosphoglycan"]
    for k in ["automatic"]:
        fname=paths_global.creeds_folder+"disease_signatures-"+type_di[k]+"1.0.json"
        ## NOTE(review): reading starts at position 1, so the record at position 0
        ## of the file is never read -- confirm that this is intended
        ii = 1
        while (True):
            content = iterator_json(fname, ii)
            if (content is None):
                print("End of file")
                break
            disease_cid = []
            ii += 1
            organism = content["organism"]
            if (not "disease_name" in content):
                continue
            if (content["disease_name"] in seen_diseases):
                ## each disease label is processed only once
                continue
            print((ii-1,organism, content["disease_name"]))
            ## Keep human and mouse signatures (organism might also be the list ['human', 'mouse'])
            if (organism not in ["mouse", "human"] and str(organism)!=str(["human", "mouse"])):
                continue
            seen_diseases.append(content["disease_name"])
            for disease in content["disease_name"].split("|"):
                if (disease in unspecified_diseases):
                    ## unspecified diseases
                    continue
                try:
                    cid = utils.get_concept_id(disease)
                    if ((cid in A.columns) and (cid not in P_auto)):
                        disease_cid.append(cid)
                except Exception as err:
                    ## "except Exception" (instead of a bare except) lets KeyboardInterrupt through
                    if (disease in ignored_diseases):
                        continue
                    if (disease not in renamed_diseases):
                        ## unknown label: stop so that it can be curated by hand
                        print(disease)
                        raise ValueError("No concept ID could be retrieved for disease '%s'" % disease) from err
                    disease = renamed_diseases[disease]
                    cid = utils.get_concept_id(disease)
                    if ((cid in A.columns) and (cid not in P_auto)):
                        disease_cid.append(cid)
            if (len(disease_cid) == 0):
                continue
            ## The signature is the same for all concept IDs of this record
            if ((organism == "human") or ("human" in organism)):
                sig = dict(content['down_genes']+content['up_genes'])
            elif (organism == "mouse"):
                ## keep genes with a one-to-one human ortholog, under their human name
                sig = dict([[mouse_human[g],gi] for g,gi in content['down_genes']+content['up_genes'] if (g in mouse_human)])
            for dcid in disease_cid:
                P_auto.setdefault(dcid, sig)
    pd.DataFrame(P_auto).fillna(0.).to_csv(transcript_folder+"P_auto.csv")
P_auto = pd.read_csv(transcript_folder+"P_auto.csv", index_col=0)
P_auto

Unnamed: 0,C0346629,C2239176,C3553462,C0035235,C0032285,C0010346,C0009324,C0029408,C0001973,C2973725,...,C0019655,C5230306,C0020951,C0024299,C0085311,C0027708,C0038013,C1835407,C0345908,C0400827
USMG5,-0.158146,-0.158146,0.000000,0.000000,0.0,0.038415,0.038415,0.038415,0.0,-0.015251,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
SLC26A2,-0.131811,-0.131811,0.000000,0.000000,0.0,-0.075141,-0.075141,-0.075141,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
GUCA2A,-0.130771,-0.130771,0.000000,0.000000,0.0,-0.101133,-0.101133,-0.101133,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.113759
AQP8,-0.126526,-0.126526,0.000000,0.000000,0.0,-0.035313,-0.035313,-0.035313,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014923
CEACAM6,-0.125712,-0.125712,0.050276,0.050276,0.0,0.118054,0.118054,0.118054,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GAK,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010037
CAPN5,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010035
ARHGAP44,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009932
GOLGA3,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009742


Merge the two datasets (favoring manually curated signatures over the other ones)

In [12]:
## Column-wise merge: for a disease present in both matrices, the manually curated
## signature (unpacked last) replaces the automatic one
P = pd.DataFrame({**P_auto.to_dict(), **P_manual.to_dict()}).fillna(0.)
P.to_csv(transcript_folder+"all_diseases.csv")
P

Unnamed: 0,C0346629,C2239176,C3553462,C0035235,C0032285,C0010346,C0009324,C0029408,C0001973,C2973725,...,C0275804,C0017168,C0003615,C0003872,C0014544,C0038436,C0040028,C0040034,C0018802,C0036323
USMG5,-0.158146,-0.158146,0.000000,0.000000,0.0,0.038415,0.000000,0.0,0.0,-0.015251,...,0.0,0.000000,0.000000,0.0,0.022347,0.0,0.0,0.0,0.0,0.000000e+00
SLC26A2,-0.131811,-0.131811,0.000000,0.000000,0.0,-0.075141,-0.112273,0.0,0.0,0.000000,...,0.0,0.000000,-0.036328,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00
GUCA2A,-0.130771,-0.130771,0.000000,0.000000,0.0,-0.101133,-0.167974,0.0,0.0,0.000000,...,0.0,0.000000,-0.026553,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00
AQP8,-0.126526,-0.126526,0.000000,0.000000,0.0,-0.035313,-0.085438,0.0,0.0,0.000000,...,0.0,0.000000,-0.033508,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00
CEACAM6,-0.125712,-0.125712,0.050276,0.050276,0.0,0.118054,0.000000,0.0,0.0,0.000000,...,0.0,-0.017911,-0.014329,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NCAPG2,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,1.558824e-10
WDR90,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,1.556552e-10
WDR55,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,1.523806e-10
MBD3L1,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,1.516381e-10


In [13]:
## Number (and percentage) of diseases in A which have a signature in P
n_A_in_P = sum(1 for x in A.columns if (x in P.columns))
"%d/%d (%.2f perc.)" % (n_A_in_P, A.shape[1], n_A_in_P*100/A.shape[1])

'151/1576 (9.58 perc.)'

In [14]:
## Number (and percentage) of diseases in P which are present in A
n_P_in_A = sum(1 for x in P.columns if (x in A.columns))
"%d/%d (%.2f perc.)" % (n_P_in_A, P.shape[1], n_P_in_A*100/P.shape[1])

'151/151 (100.00 perc.)'

## III. Build matrix S : $N_F \times N_S$ of drug features

### III.a. Using [CREEDS](https://maayanlab.cloud/CREEDS/) dataset

Here, features are drug signatures, that is, vectors of genewise expression changes due to treatment (by comparing differential expression between treated and untreated groups). We use [CREEDS](https://maayanlab.cloud/CREEDS/) (free access, without registration), which reports manually curated and crowd-sourced drug signatures (see [paper](https://www.nature.com/articles/ncomms12846)). Contrary to [LINCS](https://lincsproject.org/LINCS/tools/workflows/find-the-best-place-to-obtain-the-lincs-l1000-data) L1000 data (see [paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5990023/)), those drug signatures are computed on GEO datasets, which might avoid two caveats: using measurements from immortalized cells, and relying on computational inference for most genes. As for disease phenotypes, whenever necessary, we convert gene names to their one-to-one human orthologs.

In [15]:
## Display the folders configured in module paths_global which are used in this section
## Where the orthologs files are stored
print('orthologs_folder="%s"' % paths_global.orthologs_folder)
## Where the CREEDS files are stored
print('creeds_folder="%s"' % paths_global.creeds_folder)

orthologs_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/data/orthologs/"
creeds_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/data/CREEDS/"


In [16]:
## Letter used in CREEDS file names for each type of curation
type_di = {"manual": "v", "automatic": "p"}
## Families of CREEDS signatures to download
fs = ["disease_signatures", "single_drug_perturbations"]
base_url = "https://maayanlab.cloud/CREEDS/download/"

## Create the target folder without going through a shell
os.makedirs(paths_global.creeds_folder, exist_ok=True)
for f in fs:
    for kt in type_di:
        fname = f+"-"+type_di[kt]+"1.0.json"
        if (not os.path.exists(paths_global.creeds_folder+fname)):
            ## Arguments are passed as a list: no shell is involved, paths need no quoting
            wget_status = sb.call(["wget", "-O", paths_global.creeds_folder+fname, base_url+fname])
            if (wget_status != 0):
                ## "wget -O" creates the output file even when the download fails:
                ## remove it, otherwise the next run would consider the file as present
                if (os.path.exists(paths_global.creeds_folder+fname)):
                    os.remove(paths_global.creeds_folder+fname)
                raise RuntimeError("Download of %s failed (wget exit status %d)" % (base_url+fname, wget_status))
            
def iterator_json(fname, idi):
    """Read one record of a JSON file without loading the whole file in Python.

    The content of the file is cut at every "}" character (each "}" is replaced
    by a line break with sed), and line number idi+1 of the result is selected
    with head and tail. The first and last characters of that line are dropped,
    the characters '"}' are appended, and the resulting text is parsed as JSON.
    The whole file is streamed again at every call.

    Parameters
    ----------
    fname : str
        path to the JSON file
    idi : int
        0-based position of the selected line

    Returns
    -------
    content : object or None
        the parsed record, or None if the selected line is empty

    Raises
    ------
    json.JSONDecodeError
        if the rebuilt text is not valid JSON
    """
    ## Local import so that the cell does not depend on another cell
    import shlex
    ## The path is quoted so that spaces or shell metacharacters in it do not break the pipeline
    cmd = "cat "+shlex.quote(fname)+" | sed 's/\\}/\\n/g' | head -n"+str(idi+1)+" | tail -n1"
    ## [:-1] drops the trailing line break of the command output
    res = sb.check_output(cmd, shell=True).decode("utf-8")[:-1]
    if (len(res) == 0):
        return None
    ## Drop the leading separator and the last character, then close the record again
    res = res[1:-1]+"\"}"
    content = json.loads(res)
    return content

## Build the matrix of manually curated drug signatures (genes x drugs), restricted
## to drugs in A, unless it was already saved in S_manual.csv.
## Columns are named "<PubChem CID>--<DrugBank ID>" in the saved file.
if (not os.path.exists(transcript_folder+"S_manual.csv")):
    S_manual = {}
    for k in ["manual"]:
        fname=paths_global.creeds_folder+"single_drug_perturbations-"+type_di[k]+"1.0.json"
        ## NOTE(review): reading starts at position 1, so the record at position 0
        ## of the file is never read -- confirm that this is intended
        ii = 1
        while (True):
            content = iterator_json(fname, ii)
            if (content is None):
                print("End of file")
                break
            pubchem_cid = str(content["pubchem_cid"])
            organism = content["organism"]
            print((ii,pubchem_cid, organism))
            ii += 1
            ## Only human and mouse signatures with a PubChem CID are considered
            if ((organism not in ["human", "mouse"]) or (pubchem_cid == "None")):
                continue
            drugbank_id = str(content["drugbank_id"])
            drug_cid = pubchem_cid+"--"+drugbank_id
            ## The drug is in A either under its DrugBank ID or under "CID<PubChem CID>"
            is_inA = (drugbank_id in A.index) or ("CID"+pubchem_cid in A.index)
            if ((not is_inA) or (drug_cid in S_manual)):
                continue
            if (organism == "human"):
                sig = dict(content['down_genes']+content['up_genes'])
            else:
                ## mouse: keep genes with a one-to-one human ortholog, under their human name
                sig = dict([[mouse_human[g],gi] for g,gi in content['down_genes']+content['up_genes'] if (g in mouse_human)])
            S_manual[drug_cid] = sig
    pd.DataFrame(S_manual).fillna(0.).to_csv(transcript_folder+"S_manual.csv")
S_manual = pd.read_csv(transcript_folder+"S_manual.csv", index_col=0)

## Split the column names: the PubChem CID (integer) names the column,
## and the matching DrugBank ID is stored in id_matchings
id_pairs = [s.split("--") for s in S_manual.columns]
id_matchings = {int(pair[0]):pair[1] for pair in id_pairs}
S_manual.columns = [int(pair[0]) for pair in id_pairs]

S_manual

Unnamed: 0,2771,3385,702,444795,14985,123631,5288826,446541,1983,441203,...,149096,10836,5379,5311054,57469,3410,5754,31401,6013,5394
CYAT1,-0.373633,0.0,0.0,0.0,0.0,0.0,0.0,-0.137561,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014591,0.000000
IGLC7,-0.332506,0.0,0.0,0.0,0.0,0.0,0.0,-0.023814,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
IGHV4-31,-0.168607,0.0,0.0,0.0,0.0,0.0,0.0,-0.028592,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027523,0.000000
IGHM,-0.158196,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
IGHV3-23,-0.145723,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SH2B3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.021457
ACTR1A,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.016300
SCAP,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.015281
TMEM237,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.014068


In [17]:
## Convert PubChem to DrugBank in S_auto
## The dictionary {PubChem CID: DrugBank ID or None} is built once, then cached in pubchem2drugbank.pck
## NOTE(review): when the cache file is missing, this cell reads S_auto, which is only
## assigned in the next cell of the notebook -- on a fresh kernel without the cache file,
## this cell fails with a NameError; confirm the intended order of execution
if (not os.path.exists(transcript_folder+"pubchem2drugbank.pck")):
    converted_pubchem = utils.get_pubchem_drugbank(list(S_auto.columns), id_folder=paths_global.data_folder)

    id_matchings_auto = dict(zip(list(S_auto.columns), converted_pubchem))
    ## Remove the entry without identifier, if any (no error if there is none)
    id_matchings_auto.pop(None, None)
    
    ## Hand-written matchings (None: no DrugBank ID is associated with the PubChem CID)
    missing_PubChem_ids = {
        5702198: "DB00515", 23939: "DB12257", 23400779: None, 139199449: None, 11354606: "DB08871", 93405: None, 
        679: "DB01093", 2554: "DB00564", 19649: "DB01082", 2349: "DB01053", 439530: "DB08437", 4125253: "DB09255", 
        6279: "DB00603", 24947: None, 5997: "DB04540", 10133: "DB06777", 439520: None, 5793: "DB01914", 
        51063134:"DB11228", 136022106: None, 9887053: "DB00526", 962: "DB09145", 6072: "DB16771", 180: "DB02203", 
        6001: None, 62389: None, 2336: None, 712: "DB03843", 44176389: "DB11191", 60823: "DB01076", 148124: "DB01248", 
        3026: "DB13716", 5282315: "DB00864", 442530: None, 8133: "DB09287", 8894: "DB02412", 5462310: "DB09278", 
        6741: "DB00959", 2284: "DB00181", 6087: "DB01603", 22880: None, 5092: "DB01954", 11030410: None, 
        3715: "DB00328", 55245: "DB00834", 54675783: "DB01017", 47499: "DB04570", 4054: "DB01043", 4497: "DB00393", 
        1775: "DB00252", 4946: "DB00571", 54454: "DB00641", 22420: "DB00531", 6781: None, 445154: "DB02709", 
        13804: "DB12153", 31348: "DB04272", 199: "DB08838", 5920: "DB00279", 6140: "DB00120", 2797: None, 2331: None, 
        445858: "DB07767", 5128032: "DB00142", 5238: "DB11119", 445639: "DB04224", 985: "DB03796", 445638: "DB04257", 
        5281: "DB03193", 15951529: "DB08899", 5995: "DB01420", 54671203: "DB00254", 175: "DB14511", 176870: "DB00530", 
        161500: "DB04568", 253877: None, 216345: "DB17039", 6137: "DB00134", 5962: "DB00123", 123591: None, 15625: None,
        63090: None, 5359596: None, 439260: "DB00368", 3397: "DB00499", 5280961: "DB01645", 5281708: "DB13182", 
        60961: "DB00640", 5152: "DB00938", 1388: None, 45480545: None, 6436079: None, 6251: "DB00742", 
        3713609: "DB00988", 31268: "DB16858", 5460769: "DB00290", 5991: "DB00977", 1088: "DB12519", 1068: None, 
        186907: None, 16132265: "DB01285", 124886: "DB00143", 5951: "DB00133", 101620072: None, 123865: "DB04263", 
        54676228: "DB00554", 6569: None, 6579: None, 8343: None, 65999: "DB00966", 247: "DB06756", 5460341: "DB01373", 
        16682999: "DB13909", 5287971: "DB15834", 9877608: None, 5280352: None, 4098: "DB04819", 313: "DB13366", 
        5284616: "DB00877", 3339:"DB01039", 5282796: None, 77999: "DB00412", 213013: "DB12792", 16131098: None, 
        41322: None, 6057: "DB00135", 61739: "DB11123", 522834: None, 4463: "DB00238", 45488079: None, 
        216326: "DB00480", 10095659: "DB12645", 2726: "DB00477", 6623: "DB06973", 7577: None, 171548: "DB00121", 
        5284373: "DB00091", 3776: "DB02325", 2214: "DB12618", 3961: "DB00678", 4763: "DB01174", 26945: None, 
        2723628: "DB01154", 2189414: None, 446953: None, 3696: "DB00458", 45588096: "DB01276", 457193: "DB00970", 
        5921: None, 60700: "DB01030", 5234: "DB09153", 5282379: "DB00982", 2796: "DB00636", 590836: None, 
        135436526: None, 5280878: "DB11858", 439456: "DB03401", 3440921: None, 892: "DB13178", 52940265: "DB11858", 
        11092:"DB16830", 5591: "DB00197", 73509: "DB02624", 10221437: None, 10635: "DB02901", 3821: "DB01221", 
        5988: "DB02772", 261166: None, 237332: "DB12298", 259: None, 1549107:"DB02509", 25201456: None, 
        132477936: None, 5961: "DB00130", 443295: None, 53630484: "DB13178", 969516: "DB11672", 5280805: "DB01698", 
        5352062: "DB06176", 54675776: "DB00759", 71384: "DB00380", 40469134: "DB00997", 5957: "DB00171", 
        100016: "DB03068", 5280492: "DB12961", 3899: "DB01097", 387447: "DB00188", 44450793: "DB00369",
        5359581: None, 5073: "DB00734", 6442177: "DB01590", 5280453: "DB00136", 6575: "DB13323", 31373: None, 
        54670067: "DB00126", 25199807: None, 6124: None, 312: 'DB14547', 440667: "DB08830", 5530: "DB00752", 
        64802: None, 23963: "DB14160", 26042: "DB09536", 444732: "DB04297", 242: "DB03793", 223368: "DB02187", 
        5035: "DB00481", 446155: "DB01095", 4174: "DB01011", 6342: None, 5280906: None, 13711: "DB02594", 
        11953960: None, 14806: "DB09321", 11561907: None, 445580: "DB03756", 7618: "DB13747", 135761165: "DB00158", 
        135410875: "DB00642", 123286: None, 23974: None, 25058126: None, 5773: "DB09008", 3712: "DB12881", 
        244: "DB06770", 22094277: None, 402: None, 3779: "DB01064", 87642: "DB01992", 5282452: "DB08860", 
        5870: "DB00655", 5461123: "DB12982", 62640: "DB09322", 5154: None, 157278295: None, 3036:"DB13424", 
        9929901: "DB12647", 65126: None, 166653: None, 6047: "DB01235", 25074470: "DB06825", 887: None, 
        107969: "DB08868", 5280335: "DB03203", 5283560: None, 107970: "DB08868", 5329102: "DB01268", 107689: "DB14475", 
        23662274: "DB00119", 20806: None, 5978: "DB00541", 13342: "DB00570", 6167: "DB01394", 265237: None, 
        12967: None, 61672: None, 4837: "DB00592", 2162: "DB00381", 10039: None, 91474: None, 60846: "DB00177", 
        190217: None, 5943: None, 23976: "DB11136", 3136: None, 6287: "DB00161", 1794427: "DB12029", 
        5280343: "DB04216", 5281600: None
    }
    ## Priority (from lowest to highest): matchings from the manually curated signatures,
    ## matchings retrieved for S_auto, hand-written matchings
    id_matchings_all = id_matchings.copy()
    id_matchings_all.update(id_matchings_auto)
    id_matchings_all.update(missing_PubChem_ids)

    with open(transcript_folder+"pubchem2drugbank.pck", "wb") as f:
        pickle.dump(id_matchings_all, f)

with open(transcript_folder+"pubchem2drugbank.pck", "rb") as f:
    id_matchings_all = pickle.load(f) 

In [18]:
def iterator_json(fname, idi):
    """Read one record of a JSON file, with a second attempt on ill-formatted records.

    The content of the file is cut at every "}" character (each "}" is replaced
    by a line break with sed), and line number idi+1 of the result is selected
    with head and tail. The first and last characters of that line are dropped,
    the characters '"}' are appended, and the resulting text is parsed as JSON.
    If parsing fails, the value following 'drug_name": "' is stripped from the
    characters { } [ ] ' and backslash, the text located between the end of that
    value and the last double quote of the record is dropped, and parsing is
    attempted a second time. The whole file is streamed again at every call.

    Parameters
    ----------
    fname : str
        path to the JSON file
    idi : int
        0-based position of the selected line

    Returns
    -------
    content : object, None or 0
        the parsed record; None if the selected line is empty;
        0 if the record could not be parsed, even after sanitization
    """
    ## Local import so that the cell does not depend on another cell
    import shlex
    ## The path is quoted so that spaces or shell metacharacters in it do not break the pipeline
    cmd = "cat "+shlex.quote(fname)+" | sed 's/\\}/\\n/g' | head -n"+str(idi+1)+" | tail -n1"
    ## [:-1] drops the trailing line break of the command output
    res = sb.check_output(cmd, shell=True).decode("utf-8")[:-1]
    if (len(res) == 0):
        return None
    ## Drop the leading separator and the last character, then close the record again
    res = res[1:-1]+"\"}"
    try:
        return json.loads(res)
    except ValueError:
        ## json.JSONDecodeError is a subclass of ValueError; other errors are not hidden
        pass
    ## sanitize...
    marker = "drug_name\": \""
    before_name = res.split(marker)[0]
    after_marker = res.split(marker)[-1]
    drug_name = after_marker.split("\"")[0]
    for char in ["{", "}", "[", "]", "\'", "\\"]:
        drug_name = drug_name.replace(char, "")
    res = before_name+marker+drug_name+"\""+after_marker.split("\"")[-1]
    try:
        return json.loads(res)
    except ValueError:
        return 0
        
## Build the matrix of crowd-sourced drug signatures (genes x drugs),
## unless it was already saved in S_auto.csv
if (not os.path.exists(transcript_folder+"S_auto.csv")):
    S_auto = {}
    seen_drugs = []
    for k in ["automatic"]:
        fname=paths_global.creeds_folder+"single_drug_perturbations-"+type_di[k]+"1.0.json"
        ## NOTE(review): reading starts at position 1, so the record at position 0
        ## of the file is never read -- confirm that this is intended
        ii = 1
        while (True):
            content = iterator_json(fname, ii)
            if (content is None):
                ## end of file
                print("End of file")
                break
            ii += 1
            if (not isinstance(content, dict)):
                ## ill-formated (iterator_json returns 0 in that case)
                continue
            if (not "organism" in content):
                continue
            organism = content["organism"]
            if (not "drug_name" in content):
                continue
            drug_cid = []
            if (content["drug_name"] in seen_drugs):
                ## each drug label is processed only once
                continue
            print((ii-1,organism, content["drug_name"]))
            is_human = isinstance(organism, (str, list, tuple)) and ("human" in organism)
            is_mouse = (organism == "mouse")
            if (not (is_human or is_mouse)):
                ## Other organisms are skipped, as for disease signatures: no signature can be
                ## built for them, and the signature of a previous record must not be reused
                continue
            seen_drugs.append(content["drug_name"])
            cids = utils.get_pubchem_id(content["drug_name"].split("|"))
            drug_cid = [cid for cid in cids if (cid not in S_auto)]
            if (len(drug_cid) == 0):
                continue
            ## The signature is the same for all PubChem CIDs of this record
            if (is_human):
                sig = dict(content['down_genes']+content['up_genes'])
            else:
                ## mouse: keep genes with a one-to-one human ortholog, under their human name
                sig = dict([[mouse_human[g],gi] for g,gi in content['down_genes']+content['up_genes'] if (g in mouse_human)])
            for dcid in drug_cid:
                S_auto.setdefault(dcid, sig)
    pd.DataFrame(S_auto).fillna(0.).to_csv(transcript_folder+"S_auto.csv")
S_auto = pd.read_csv(transcript_folder+"S_auto.csv", index_col=0)

print(S_auto.shape)

## Drop columns without identifier, and name the other ones by their integer PubChem CID
S_auto = S_auto[[s for s in S_auto.columns if ("Unnamed" not in s)]]
S_auto.columns = [int(float(x)) for x in S_auto.columns]

if (use_drugbank_ids):
    ## PubChem CIDs without any entry in the PubChem-DrugBank matchings
    print([x for x in S_auto.columns if (x not in id_matchings_all)])
    n_retrieved = len([x for x in S_auto.columns if (x in id_matchings_all)])
    ## max(1, .) avoids a division by zero when there is no column
    print("%d drugs retrieved out of %d (%d perc)" % (n_retrieved, 
                                S_auto.shape[1], 
                                n_retrieved*100/max(1, S_auto.shape[1])))

    S_auto.columns = [id_matchings_all.get(s, int(s)) for s in S_auto.columns]
    
## Drop drugs matched to None, then keep the first signature for each remaining identifier
## (boolean masks select columns by position, even when identifiers are duplicated)
S_auto = S_auto.loc[:, [s is not None for s in S_auto.columns]]
S_auto = S_auto.loc[:, ~S_auto.columns.duplicated()]

S_auto

(11622, 291)
[]
290 drugs retrieved out of 290 (100 perc)


Unnamed: 0,DB00515,DB12257,DB01229,DB08871,DB01093,DB00564,DB00563,DB00586,DB00755,DB00783,...,DB00541,DB00570,DB01394,DB00592,DB00381,DB00177,DB11136,DB00161,DB12029,DB04216
CDC6,-0.148701,-0.148701,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000
SPP1,-0.119415,-0.119415,-0.014677,-0.014677,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.046817,-0.046817,-0.046817,0.000000,0.0,0.0,0.000000,0.0,0.115339,0.115339
CSDE1,-0.098104,-0.098104,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.031058,0.0,0.0,-0.007861,0.0,0.012258,0.012258
AKR1C2,-0.093002,-0.093002,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.020105,-0.020105,-0.020105,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000
TRIM33,-0.089014,-0.089014,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SMAD2,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,-0.006132,-0.006132
DIS3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.006887,0.006887
C3orf70,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.006294,0.006294
LIAS,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.005854,0.005854


Merge the two datasets (favoring manually curated signatures over the other ones)

In [19]:
A = utils.load_dataset("FEATURELESS", save_folder=paths_global.data_folder)['ratings_mat']

## Column-wise merge: a manually curated signature replaces the automatic signature
## stored under the same column name
S = S_auto.to_dict()
S.update(S_manual.to_dict())
S = pd.DataFrame(S).fillna(0.)

if (use_drugbank_ids):
    ## Columns named after a column of S_manual hold manually curated signatures
    manual_names = set(S_manual.columns)
    from_manual = [(s in manual_names) for s in S.columns]
    ## Name all columns by their DrugBank ID if any, and by "CID"+PubChem CID otherwise
    S.columns = [str(id_matchings_all.get(s, s)) for s in S.columns]
    S.columns = [s if (s[:len("DB")]=="DB") else "CID"+s for s in S.columns]
    ## Columns of S_auto are already named by DrugBank IDs whereas those of S_manual are
    ## named by PubChem CIDs, so the update above cannot replace an automatic signature by
    ## the manual signature of the same drug. Manually curated columns are moved first, so
    ## that keeping the first column for each identifier favors manually curated signatures
    manual_first = [i for i, m in enumerate(from_manual) if (m)]
    manual_first += [i for i, m in enumerate(from_manual) if (not m)]
    S = S.iloc[:, manual_first]
    S = S.loc[:, ~S.columns.duplicated()]
    ## Keep the drugs present in A, in the same order as in A
    S = S[[a for a in A.index if (a in S.columns)]]
else:
    ## Dictionary {DrugBank ID: PubChem CID}
    inv_map = {v:k for k,v in id_matchings_all.items()}
    ## Remove the entry for PubChem CIDs without DrugBank ID, if any (no error if there is none)
    inv_map.pop(None, None)
    A = A.loc[[a for a in A.index if (a in inv_map)]]
    S = S[[inv_map[a] for a in A.index]]
    
## Keep the first column for each identifier
S = S.loc[:, ~S.columns.duplicated()]
S.to_csv(transcript_folder+"all_drugs.csv")
    
S

Unnamed: 0,DB00091,DB00121,DB00126,DB00130,DB00133,DB00136,DB00158,DB00163,DB00177,DB00181,...,DB06770,DB06777,DB06825,DB08860,DB08868,DB08871,DB08899,DB09287,DB09321,DB09322
CDC6,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
SPP1,0.0,0.0,0.054299,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,-0.014677,0.0,0.0,0.0,0.0
CSDE1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,-0.017063,0.0,0.0,...,0.0,0.0,0.0,0.017448,0.0,0.000000,0.0,0.0,0.0,0.0
AKR1C2,0.0,0.0,-0.017830,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
TRIM33,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HRASLS,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
TMUB1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
TWF2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
C19orf60,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0


In [20]:
## Sanity check: no drug identified in A by its PubChem CID (that is, whose identifier
## does not start with "DB") names a column of S_auto or of S_manual
pubchem_ids = [int(s[len("CID"):]) for s in A.index if (not s.startswith("DB"))]
assert not any(p in S_auto.columns for p in pubchem_ids)
assert not any(p in S_manual.columns for p in pubchem_ids)

### III.b. Using [LINCS L1000](https://maayanlab.cloud/Harmonizome/resource/LINCS+L1000+Connectivity+Map)

In order to populate further matrix S, one can get profiles from LINCS L$1000$ Level $3$ data, and apply Characteristic Direction on them to get signatures similar to the ones in CREEDS.

In [21]:
## Where LINCS L1000 files are stored (folder configured in module paths_global)
print('lincs_folder="%s"' % paths_global.lincs_folder)

lincs_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/NetworkOrientedRepurposingofDrugs/lincs/"


In [22]:
## Reload the association matrix, both feature matrices and the PubChem-DrugBank matchings from disk
A = utils.load_dataset("FEATURELESS", save_folder=paths_global.data_folder)["ratings_mat"]
P = pd.read_csv(transcript_folder+"all_diseases.csv", engine="python", index_col=0)
S = pd.read_csv(transcript_folder+"all_drugs.csv", engine="python", index_col=0)
with open(transcript_folder+"pubchem2drugbank.pck", "rb") as f:
    id_matchings_all = pickle.load(f)

## Drugs in A without any column in S
other_drugs = [x for x in A.index if (str(x) not in S.columns)]

## Reduce the number of considered genes by using var_thres >> 0
var_thres=0
## Genes with a variance above the threshold across drugs, or across diseases
genes = list(S.index[(S.var(axis=1) > var_thres).values])
genes += list(P.index[(P.var(axis=1) > var_thres).values])
genes = list(set(genes))

print("#missing drugs from CREEDS=%d/%d\t#considered genes=%d" % (len(other_drugs), A.shape[0], len(genes)))

#missing drugs from CREEDS=1457/1622	#considered genes=15991


Convert gene names into EntrezGene IDs using the package NORDic

In [23]:
from NORDic.UTILS.utils_data import convert_genes_EntrezGene, convert_EntrezGene_LINCSL1000

## CSV cache of the gene name -> EntrezGene ID conversion; the conversion only runs when the file is absent
entrezgene_fname=transcript_folder+"entrezgenes_ids.csv"
if (not os.path.exists(entrezgene_fname)):
    ## First pass with NORDic on all genes
    ## (9606 is presumably the NCBI taxonomy ID for human -- confirm against the NORDic signature)
    probes = convert_genes_EntrezGene(genes, 9606, "drug_repurposing_datasets", chunksize=500, quiet=False)
    ## Genes whose "Gene ID" is "-" are retried one by one, in parallel, with data_processing.get_biomart
    other_genes = list(set(list(probes[probes["Gene ID"]=="-"].index)))
    list_genes = Parallel(n_jobs=njobs, backend='loky')(delayed(data_processing.get_biomart)("%d/%d" % (x_id+1,len(other_genes)), x) for x_id, x in enumerate(other_genes))
    other_probes = pd.DataFrame(list_genes, index=other_genes, columns=["Gene ID"])
    other_probes = other_probes[other_probes["Gene ID"]!="-"]
    probes = pd.concat((probes[probes["Gene ID"]!="-"], other_probes), axis=0)
    ## Set lookup: testing membership in the list other_genes for every gene was quadratic
    other_genes_set = set(other_genes)
    ## NOTE(review): this filter removes every gene of other_genes, including the ones that
    ## get_biomart resolved just above -- confirm that discarding them is intended
    probes = probes.loc[[g for g in genes if ((g not in other_genes_set) and (g in probes.index))]]
    probes.to_csv(entrezgene_fname)
## Always reload from the cache so that both code paths yield the same object
probes = pd.read_csv(entrezgene_fname,index_col=0)
print("Found genes: %d/%d" % (probes.shape[0], len(genes)))
probes.T

Found genes: 15165/15991


Unnamed: 0,KIAA1244,SCRN2,HNRNPA3,RPF2,GNG12,NRD1,CTR9,TBC1D21,SOX9-AS1,GDA,...,MMP2,NEK10,APOA5,POLR3GL,ITGA8,RAN,ZNF552,N4BP2L1,VAMP2,SFXN1
Gene ID,57221,90507,220988,84154,55970,4898,9646,161514,400618,8034; 9615,...,4313,152110,116519,84265,8516,5901,79818,90634,6844,94081


Convert EntrezGene IDs into LINCS L1000 gene symbols using the package NORDic

In [24]:
from NORDic.UTILS.LINCS_utils import *

def chunks(lst, n):
    """Lazily split lst into consecutive slices of length n (the last slice may be shorter)."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

## CSV cache of the EntrezGene ID -> LINCS L1000 gene symbol conversion
if (not os.path.exists(transcript_folder+"entrezid2symbols.csv")):
    ## File renamed after each call to convert_EntrezGene_LINCSL1000
    ## (presumably written by that call -- confirm in NORDic)
    tmp_pck = transcript_folder+"entrezGene_LINCSL1000.pck"
    ## Delete a leftover file synchronously: sb.Popen(["rm", ...]) returned without waiting,
    ## so the file could still exist when the first conversion started
    if (os.path.exists(tmp_pck)):
        os.remove(tmp_pck)
    user_key = get_user_key(paths_global.lincs_file)
    pert_df_lst = []
    ## Genes are converted by chunks of sz identifiers; count names the per-chunk cache file
    count, sz = 0, 100
    for EntrezGenes in chunks(list(probes["Gene ID"]), sz):
        count += sz
        chunk_pck = transcript_folder+"entrezGene_LINCSL1000_%d.pck" % count
        print("%d/%d" % (min(count, probes.shape[0]), probes.shape[0]))
        if (os.path.exists(chunk_pck)):
            ## Chunk already converted in a previous run: rebuild its data frame from the pickle
            with open(chunk_pck, "rb") as f:
                di = pickle.load(f)
            df = pd.DataFrame([di["entrez_ids"], di["pert_inames"]], 
                              index=["Entrez ID", "Gene Symbol"], columns=EntrezGenes).T
        else:
            print(len(EntrezGenes))
            df = convert_EntrezGene_LINCSL1000(transcript_folder,EntrezGenes,user_key,quiet=True)
            ## Rename synchronously: with sb.Popen(["mv", ...]) the move could still be pending
            ## when the next chunk was converted. A missing file is tolerated, as it was with "mv".
            if (os.path.exists(tmp_pck)):
                os.replace(tmp_pck, chunk_pck)
        pert_df_lst.append(df)
    pert_df = pd.concat(tuple(pert_df_lst), axis=0)
    pert_df.to_csv(transcript_folder+"entrezid2symbols.csv")
## Always reload from the cache, drop incomplete rows and index by gene symbol (first occurrence kept)
pert_df = pd.read_csv(transcript_folder+"entrezid2symbols.csv", index_col=0)
pert_df = pert_df.dropna()
pert_df.index = pert_df["Gene Symbol"]
pert_df = pert_df.loc[~pert_df.index.duplicated()]

## Converted gene symbols, and the ones that are rows of S and of P
genes = list(pert_df.index)
genes_inS = [g for g in genes if (g in S.index)]
genes_inP = [g for g in genes if (g in P.index)]

print("Genes in S %.2f perc. (N=%d)" % (len(genes_inS)*100/S.shape[0], len(genes_inS)))
print("Genes in P %.2f perc. (N=%d)" % (len(genes_inP)*100/P.shape[0], len(genes_inP)))

Genes in S 80.19 perc. (N=10870)
Genes in P 80.35 perc. (N=12025)


Convert the DrugBank IDs of the drugs missing from CREEDS into PubChem CIDs

In [25]:
## Convert the identifiers of the drugs of A.index that are missing from S into PubChem CIDs
if (not os.path.exists(transcript_folder+"drugbank2pubchem_all.pck")):

    with open(transcript_folder+"pubchem2drugbank.pck", "rb") as f:
        id_matchings_all = pickle.load(f)

    ## Reversed mapping (value -> key) of the loaded matchings, skipping keys matched to None
    inv_map = {
        id_matchings_all[key]: key
        for key in id_matchings_all if (id_matchings_all[key] is not None)
    }
    ## "CID"-prefixed identifiers are parsed directly; the others go through inv_map, 0 meaning "no match"
    other_drugs_converted = [
        int(drug[len("CID"):]) if drug.startswith("CID") else int(inv_map.get(drug, 0))
        for drug in other_drugs
    ]
    print("Drugs already to PubChemCIDs %d/%d" % (
        sum(int(cid>0) for cid in other_drugs_converted), len(other_drugs_converted)
    ))
    ## Split into the CIDs already known and the identifiers that still need a conversion
    other_drugs_converted_PubChem = [cid for cid in other_drugs_converted if (cid>0)]
    other_drugs_converted_DrugBank = [
        other_drugs[pos] for pos, cid in enumerate(other_drugs_converted) if (cid==0)
    ]

    ## Automatic conversion of the remaining identifiers (entries left as None count as failures)
    other_drugs_converted_DrugBank2Pubchem = utils.get_pubchem_id(other_drugs_converted_DrugBank)
    print("Drugs automatically converted to PubChemCIDs %d/%d" % (
        sum(int(cid is not None) for cid in other_drugs_converted_DrugBank2Pubchem),
        len(other_drugs_converted_DrugBank2Pubchem)
    ))
    
    ## Hard-coded DrugBank ID -> PubChem CID pairs, used right after these dictionaries as a fallback
    ## for the drugs that utils.get_pubchem_id left unmatched (None)
    missing_PubChem_CIDS = {
        'DB00001': 118856773, 'DB00010': 16132413, 'DB00017': 16220016, 'DB00030': 118984375, 'DB00039': 46506691, 
        'DB00040': 16132283, 'DB00052': 135087, 'DB00063': 448812, 'DB00103': 46508538, 'DB00107': 439302, 
        'DB00109': 16130199, 'DB00286': 656613, 'DB00375': 62816, 'DB00510': 3121, 'DB00667': 774, 'DB00994': 8378, 
        'DB01049': 71717894, 'DB01258': 5493444, "DB13179": 202225, 'DB01361': 202225, 'DB01402': 16682734, 
        'DB04895': 56603655, 'DB04897': 155886531, 'DB04899': 53325981, 'DB04932': 135565962, 'DB05773': 163341910, 
        'DB06219': 16134627, 'DB06283': 16135415, 'DB06285': 16133850, 'DB06655': 16134956, 'DB08900': 16139605, 
        'DB08923': 11751549, 'DB09043': 145994868, 'DB09127': 23672064, 'DB09258': 772, 'DB09468': 23672064, 
        'DB09469': 23672064, 'DB09485': 5862, 'DB09525': 23672064, 'DB11320': 774, 'DB11595': 347911207, 
        'DB15661': 145996610, 'DB16691': 155903259,
    }
    ## Hard-coded DrugBank ID -> identifier pairs ("SIDS" presumably stands for PubChem Substance IDs -- confirm).
    ## Within this cell only the number of entries is used, in the assertion on the drugs left without a CID.
    missing_PubChem_SIDS = {
        'DB00002': 46507042, 'DB00003': 46507792, 'DB00004':46506950, 'DB00005': 46506732, 'DB00008': 46504860,
        'DB00009': 46507035, 'DB00013': 46506299, 'DB00015': 46506092, 'DB00016': 46508122, 'DB00019': 46505853,
        'DB00020': 46507000, 'DB00022': 46506669, 'DB00023': 46507633, 'DB00026': 46507944, 'DB00031': 46508657, 
        'DB00036': 46507544, 'DB00038': 46506470, 'DB00041': 46508054, 'DB00043': 46507002, 'DB00046': 46507498, 
        'DB00047': 46507981, 'DB00048': 46506485, 'DB00049': 46504490, 'DB00051': 46504982, 'DB00053': 46508744, 
        'DB00054': 46505910, 'DB00056': 46505767, 'DB00059': 46505366, 'DB00060': 46504899, 'DB00065': 46505602, 
        'DB00068': 46504458, 'DB00072': 46507516, 'DB00073': 46505820, 'DB00074': 46505169, 'DB00078': 46506112, 
        'DB00082': 46505507, 'DB00085': 46504728, 'DB00087': 46507379, 'DB00088': 46507500, 'DB00090': 46505612,
        'DB00092': 46505772, 'DB00095': 46505252, 'DB00099': 46505833, 'DB00100': 46508858, 'DB00102': 46508246,
        'DB00108': 46505849, 'DB00110': 46506637, 'DB00111': 46504469, 'DB00112': 46504473, 'DB00407': 46505194,
        'DB00707': 46508600, 'DB00930': 46505437, 'DB01109': 46507594, 'DB01225': 46507450, 'DB01257': 46505429,
        'DB01266': 46506183, 'DB01269': 46505063, 'DB01270': 46506960, 'DB01271': 46507347, 'DB01272': 46504595,
        'DB01277': 46504889, 'DB01279': 46509151, 'DB01281': 46509198, 'DB01306': 46507309, 'DB01307': 46508877,
        'DB01309': 46504450, 'DB01344': 46507832, 'DB01432': 46506251, 'DB03404': 46504761, 'DB05259': 46505299, 
        'DB04941': 347909853, 'DB05311': 347910073, 'DB05332': 347910087, 'DB05336': 347910089, 'DB05578': 347910183,
        'DB05679': 347910190, 'DB05829': 347910255, 'DB06168': 347910340, 'DB06273': 347910344, 'DB06186': 347910341,
        'DB06317': 347910347, 'DB06366': 347910348, 'DB06372': 347910349, 'DB06415': 347910351, 'DB06439': 175427071,
        'DB06612': 347910353, 'DB06643': 347910354, 'DB06650': 347910355, 'DB06674': 347910358, 'DB06681': 347910359, 
        'DB06692': 347910361, 'DB06720': 347910363, 'DB06760': 347910368, 'DB06761': 347910369, 'DB08870': 347910376, 
        'DB08879': 347910378, 'DB08885': 347910379, 'DB08888': 347910381, 'DB08894': 175427138, 'DB08902': 347910384, 
        'DB08904': 347910385, 'DB08913': 175427151, 'DB08917': 347910386, 'DB08935': 347910388, 'DB09029': 347910391,
        'DB09033': 347910392, 'DB09035': 347910393, 'DB09036': 347910394, 'DB09045': 347910397, 'DB09046': 347910398,
        'DB09051': 347910399, 'DB09052': 347910400, 'DB09077': 347910402, 'DB09105': 347910406, 'DB09107': 347910408,
        'DB09109': 347910410, 'DB09122': 347910413, 'DB09228': 347910420, 'DB09263': 347910426, 'DB09264': 347910427,
        'DB09141': 347910415, 'DB09037': 347910395, 'DB09113': 347910412, 'DB09302': 347910431, 'DB09303': 347910432,
        'DB09310': 347910434, 'DB09331': 347910441, 'DB09559': 347910462, 'DB09560': 46505833, 'DB09564': 347910463,
        'DB09568': 347910465, 'DB11563': 347911202, 'DB11569': 347911203, 'DB11606': 347911216, 'DB11608': 347911218, 
        'DB11767': 347911238, 'DB15718': 434370493, 'DB15940': 472422363, 'DB15941': 472422356, 'DB16355': 472422617,
        'DB16393': 472423048, 'DB16394': 472423072, 'DB16416': 406589401,
    }
    ## Fall back on the hard-coded CIDs for the drugs the automatic conversion left unmatched (None)
    other_drugs_converted_DrugBank2Pubchem2 = [missing_PubChem_CIDS.get(other_drugs_converted_DrugBank[ix])
                                               if (x is None) else x
                                               for ix, x in enumerate(other_drugs_converted_DrugBank2Pubchem)]
    ## Identifiers of the drugs still without a CID. These are read from other_drugs_converted_DrugBank:
    ## reading them from the list of conversion results could only collect None values.
    missing_ids = [other_drugs_converted_DrugBank[ix]
                   for ix, x in enumerate(other_drugs_converted_DrugBank2Pubchem2) if (x is None)]
    ## The number of unmatched drugs is expected to equal the number of hard-coded SID entries
    assert len(missing_ids)==len(missing_PubChem_SIDS), \
        "%d drugs without a PubChem CID, expected %d" % (len(missing_ids), len(missing_PubChem_SIDS))
    print("Drugs converted to PubChemCIDs %d/%d" % (sum([int(x is not None) for x in other_drugs_converted_DrugBank2Pubchem2]), 
                                                 len(other_drugs_converted_DrugBank2Pubchem2)))

    ## Final identifier -> PubChem CID mapping.
    ## NOTE(review): CIDs obtained through inv_map are stored under the key "CID<cid>", not under the
    ## original identifier, so a later lookup by that identifier returns None -- confirm this is intended
    di_db2pc = dict([[other_drugs_converted_DrugBank[ix], x] 
                     for ix, x in enumerate(other_drugs_converted_DrugBank2Pubchem2) if (x is not None)]+[
        ["CID"+str(x), x] for x in other_drugs_converted_PubChem
    ])

    ## Cache the mapping; the existence of this file is what skips the conversion on later runs
    with open(transcript_folder+"drugbank2pubchem_all.pck", "wb") as f:
        pickle.dump(di_db2pc, f)

## Reload the cached identifier -> PubChem CID mapping (also covers runs where the cache already existed)
with open(transcript_folder+"drugbank2pubchem_all.pck", "rb") as f:
    di_db2pc = pickle.load(f)

## "CID"-prefixed identifiers are parsed directly, the others go through di_db2pc;
## drugs without any match (None) are dropped
other_drugs_converted = [
    cid for cid in (
        int(drug[len("CID"):]) if drug.startswith("CID") else di_db2pc.get(drug)
        for drug in other_drugs
    ) if (cid is not None)
]
"#drugs to retrieve = %d" % len(other_drugs_converted)

'#drugs to retrieve = 1314'

In [26]:
## CSV cache of the LINCS L1000 signatures; they are only retrieved when the file is absent
if (not os.path.exists(transcript_folder+"save_signatures.csv")):
    entrez_ids = list([int(x) for x in pert_df["Entrez ID"]])
    user_key = get_user_key(paths_global.lincs_file)
    signatures = data_processing.get_profiles(other_drugs_converted, entrez_ids, paths_global.lincs_folder, 
                        user_key, transcript_folder, nsigs=2, njobs=njobs, verbose=1)
    signatures.to_csv(transcript_folder+"save_signatures.csv")
signatures = pd.read_csv(transcript_folder+"save_signatures.csv", index_col=0)

## Relabel rows from Entrez IDs to gene symbols. The lookup table is built once (first occurrence
## wins, as list.index did) instead of rebuilding and scanning a list for every row.
## An Entrez ID absent from pert_df now raises KeyError (it used to be ValueError).
entrez2symbol = {}
for entrez_id, symbol in zip(pert_df["Entrez ID"], pert_df["Gene Symbol"]):
    entrez2symbol.setdefault(entrez_id, symbol)
ids = [entrez2symbol[int(idx)] for idx in signatures.index]
signatures.index = ids

if (use_drugbank_ids):
    with open(transcript_folder+"drugbank2pubchem_all.pck", "rb") as f:
        di_db2pc = pickle.load(f)
    ## PubChem CID -> identifier. NOTE(review): when several identifiers share a CID only the last
    ## one is kept, so the other identifiers get no signature column -- confirm this is acceptable
    inv_map = {di_db2pc[k]: k for k in di_db2pc}
    ## Columns without a known identifier are labelled "CID<cid>"
    signatures.columns = [inv_map.get(int(x),"CID"+x) for x in signatures.columns]

## Keep the first column for each duplicated label (boolean column mask instead of a double transpose)
signatures = signatures.loc[:, ~signatures.columns.duplicated()]
signatures.to_csv(transcript_folder+"LINCS_drugs.csv")

signatures

Unnamed: 0,CID104999,CID442021,CID442872,DB13415,DB00140,DB00146,DB00159,DB00162,DB00175,DB00178,...,DB09213,DB09256,DB09324,DB09462,DB09477,DB09555,DB09570,DB11582,DB13740,DB14126
DDR1,-0.011385,-0.007011,0.009737,-0.002140,-0.003961,-0.002325,-0.011763,-0.005968,0.003772,0.003344,...,-0.004071,-0.003107,0.002978,0.011529,-0.009122,-0.004648,0.001134,0.009293,0.014499,-0.004720
PAX8,0.004484,0.004681,0.006501,0.000159,0.001899,0.001326,-0.007291,-0.004797,0.013727,-0.014290,...,0.001130,-0.009153,-0.034952,0.000229,0.001231,0.004582,-0.007928,0.018032,-0.019435,-0.002117
GUCA1A,0.008441,0.004299,0.008334,0.004783,0.006605,0.006012,0.013492,0.003385,0.008456,0.004423,...,-0.002396,0.003924,-0.003996,0.004527,0.000276,-0.004380,0.003478,-0.001546,-0.004345,0.002923
ESRRA,0.005775,-0.007018,-0.002588,0.001703,-0.004231,0.006695,-0.001825,-0.006489,0.006006,0.008460,...,-0.004910,-0.005171,-0.005360,0.002091,0.009425,-0.006334,-0.006153,-0.006528,-0.004779,0.007278
TRADD,-0.000097,0.004065,0.013717,-0.006948,-0.000851,-0.002124,0.003906,0.002239,-0.004117,0.011190,...,-0.013210,0.000880,0.001616,-0.001374,0.002638,-0.006946,0.003484,-0.014328,-0.003361,-0.006004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CX3CL1,0.004330,-0.008391,-0.008366,-0.000073,0.009630,0.002415,0.005253,0.014312,-0.013535,-0.008043,...,0.006633,-0.001438,-0.008364,0.023915,-0.002456,0.003827,0.015452,0.000298,-0.004313,-0.021096
NPEPL1,0.010776,-0.000574,0.022847,0.005441,0.007439,0.003254,-0.012271,0.026158,-0.000092,-0.004647,...,0.002470,-0.000381,-0.005943,-0.001547,0.022926,-0.000473,-0.029043,-0.012521,0.012726,0.010101
ADAP1,0.004514,0.005196,0.015183,-0.008472,-0.001902,0.004233,0.001397,-0.005646,0.003017,-0.009838,...,-0.003458,-0.017579,0.001587,-0.004212,-0.001060,-0.010633,0.004246,0.011107,0.004181,0.014274
LRCH4,0.004431,0.013239,-0.004200,-0.009929,-0.006824,-0.024151,-0.005234,-0.003035,-0.000597,0.005603,...,-0.011609,0.005798,-0.002603,0.015365,-0.001079,0.006861,-0.002426,0.017324,-0.009514,-0.004729


In [27]:
## Reload the drug matrix and the LINCS signatures from their CSV files
S = pd.read_csv(transcript_folder+"all_drugs.csv", engine="python", index_col=0)
signatures = pd.read_csv(transcript_folder+"LINCS_drugs.csv", engine="python", index_col=0)
## Columns of S that also exist in the LINCS signatures are left out before the outer join,
## so the LINCS column is the one kept for those drugs; genes missing on either side are filled with 0
S_LINCS = (
    S.loc[:, [col for col in S.columns if (col not in signatures.columns)]]
    .join(signatures, how="outer")
    .fillna(0)
)
S_LINCS.to_csv(transcript_folder+"all_drugs_+LINCS.csv")
S_LINCS

Unnamed: 0,DB00091,DB00121,DB00126,DB00130,DB00133,DB00136,DB00158,DB00163,DB00177,DB00181,...,DB09213,DB09256,DB09324,DB09462,DB09477,DB09555,DB09570,DB11582,DB13740,DB14126
A1BG,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1CF,-0.018267,-0.018267,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.004380,-0.006586,0.000962,0.000293,0.001181,0.006511,0.004229,0.007622,0.004914,-0.000150
A2M,0.000000,0.000000,-0.122491,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,...,-0.020651,0.011084,-0.011960,0.010650,-0.022148,-0.014183,-0.014089,-0.002230,0.023162,0.023271
A4GALT,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,...,-0.002381,-0.004163,-0.000285,0.002298,0.007853,0.005124,-0.002902,0.000446,-0.006484,-0.000968
AAAS,0.000000,0.000000,0.000000,0.000000,0.0,0.0,-0.017861,0.0,-0.022418,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11B,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ZYX,0.000000,0.000000,0.000000,0.013278,0.0,0.0,0.000000,0.0,0.000000,0.0,...,-0.008080,0.003261,0.023956,-0.025460,-0.016827,-0.021151,0.023835,-0.014698,-0.010986,0.004513
ZZEF1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,...,-0.002808,0.001049,-0.003579,0.004715,0.004407,0.001406,0.000858,0.007074,0.000401,0.005786
ZZZ3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,...,-0.011529,-0.010208,0.003524,0.000473,-0.007399,-0.002713,-0.001057,-0.003695,-0.009134,0.001602


## IV. Merge A, P and S to get the final "TRANSCRIPT" dataset

Disease identifiers are Concept IDs; drug identifiers are DrugBank IDs or PubChem CIDs (depending on `use_drugbank_ids`). We build three matrices: A (ratings), of shape drugs $\times$ diseases; P (disease features), of shape disease features $\times$ diseases; and S (drug features), of shape drug features $\times$ drugs.

In [28]:
## Reload the ratings matrix, the disease matrix and the drug matrix augmented with LINCS signatures
A = utils.load_dataset("FEATURELESS", save_folder=paths_global.data_folder)["ratings_mat"]
P = pd.read_csv(transcript_folder+"all_diseases.csv", engine="python", index_col=0)
S = pd.read_csv(transcript_folder+"all_drugs_+LINCS.csv", engine="python", index_col=0)

if (not use_drugbank_ids):
    ## Re-index the drugs of A by PubChem CID, and keep the columns of S that match them
    A = A.loc[[drug for drug in A.index if (drug in di_db2pc)]]
    A.index = [int(di_db2pc[drug]) for drug in A.index]
    S.columns = S.columns.astype(int)
    S = S[[cid for cid in S.columns if (cid in A.index)]]

## Inner join on the row index: only the features shared by S and P are kept
SP = S.join(P, how="inner").fillna(0)
S_TRANSCRIPT = SP.loc[:, S.columns]
P_TRANSCRIPT = SP.loc[:, P.columns]
## Ratings restricted to the drugs (rows) and diseases (columns) that have features
A_TRANSCRIPT = A.loc[S_TRANSCRIPT.columns, P_TRANSCRIPT.columns]

## Export the three matrices of the dataset
S_TRANSCRIPT.to_csv(transcript_folder+"items.csv")
P_TRANSCRIPT.to_csv(transcript_folder+"users.csv")
A_TRANSCRIPT.to_csv(transcript_folder+"ratings_mat.csv")

## Summary of the resulting dataset
ratings_A = utils.matrix2ratings(A_TRANSCRIPT, "ind_id", "drug_id", "rating")
print("Sparsity = "+str(utils.compute_sparsity(A_TRANSCRIPT))+"%")
print("%d drug features %d disease features" % (
    S_TRANSCRIPT.shape[0], P_TRANSCRIPT.shape[0]
))
utils.print_dataset(ratings_A, "ind_id", "drug_id", "rating")
ratings_A.T

Sparsity = 0.4451022546805959%
12096 drug features 12096 disease features
Ndrugs=204	Ndiseases=116
401 positive	11 negative	23252 unknown matchings


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,402,403,404,405,406,407,408,409,410,411
ind_id,C0279702,C0028043,C0002395,C0033578,C0007137,C1621958,C0282488,C0346629,C0400827,C3550579,...,C0020951,C0028754,C0279702,C0014544,C0036341,C0085859,C0004626,C0279702,C3553462,C0036202
drug_id,DB00091,DB00091,DB00158,DB00158,DB00184,DB00188,DB00215,DB00244,DB00252,DB00254,...,DB06799,DB06803,DB08895,DB08901,DB09054,DB09118,DB09213,DB09213,DB09324,DB09555
rating,1,1,-1,1,1,-1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [29]:
## Reload the dataset named "TRANSCRIPT" from paths_global.data_folder through utils.load_dataset;
## the returned object is displayed as the output of the cell
utils.load_dataset("TRANSCRIPT", save_folder=paths_global.data_folder)

{'ratings_mat':          C0346629  C2239176  C3553462  C0035235  C0032285  C0010346  C0009324   
 DB00091         0         0         0         0         0         0         0  \
 DB00121         0         0         0         0         0         0         0   
 DB00126         0         0         0         0         0         0         0   
 DB00130         0         0         0         0         0         0         0   
 DB00133         0         0         0         0         0         0         0   
 ...           ...       ...       ...       ...       ...       ...       ...   
 DB09555         0         0         0         0         0         0         0   
 DB09570         0         0         0         0         0         0         0   
 DB11582         0         0         0         0         0         0         0   
 DB13740         0         0         0         0         0         0         0   
 DB14126         0         0         0         0         0         0         0   
 