In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
from cmapPy.pandasGEXpress.parse import parse
from scipy.stats import spearmanr as scor
from scipy.stats import mannwhitneyu as mwu

**Downloading LINCS data** \
I downloaded the [GSE92742](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92742) and the [GSE70138](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE70138) datasets from the Gene Expression Omnibus. From both datasets I downloaded the Level5 gene expression profiles, the metadata from the sig_info files and a gene info file.\
I downloaded the drug information and the sample information (compound metadata) from the [Drug Repurposing Hub](https://clue.io/repurposing).

In [2]:
#checking if the files are in the correct directory
import os
files_needed_GSE92742=['GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx',
               'GSE92742_Broad_LINCS_sig_info.txt','GSE92742_Broad_LINCS_gene_info.txt']
files_needed_GSE70138=['GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328.gctx',
                      'GSE70138_Broad_LINCS_sig_info.txt']
files_needed_rep=['repurposing_drugs_20180907.txt','repurposing_samples_20180907.txt']
files_GSE92742=os.listdir('../data/GSE92742/')
files_GSE70138=os.listdir('../data/GSE70138/')
files_rep=os.listdir('../data/repurposing/')
#compare each directory listing with its required files and name whatever is
#missing, so a failed check says which download has to be repeated
for folder,needed,present in [('../data/GSE92742/',files_needed_GSE92742,files_GSE92742),
                              ('../data/GSE70138/',files_needed_GSE70138,files_GSE70138),
                              ('../data/repurposing/',files_needed_rep,files_rep)]:
    missing=[f for f in needed if f not in present]
    assert not missing, 'Missing from %s: %s' % (folder,', '.join(missing))

**Importing gene ids** 

In [3]:
# Gene annotation table of GSE92742 (tab-separated); the first column becomes the index.
gene_ids=pd.read_csv(
    '../data/GSE92742/GSE92742_Broad_LINCS_gene_info.txt',
    sep='\t',
    header=0,
    index_col=0,
)
gene_ids.head()

Unnamed: 0_level_0,pr_gene_symbol,pr_gene_title,pr_is_lm,pr_is_bing
pr_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
780,DDR1,discoidin domain receptor tyrosine kinase 1,1,1
7849,PAX8,paired box 8,1,1
2978,GUCA1A,guanylate cyclase activator 1A,0,0
2049,EPHB3,EPH receptor B3,0,1
2101,ESRRA,estrogen related receptor alpha,0,1


In [4]:
# Keep only the rows flagged pr_is_lm == 1 and reduce the table to a Series
# that maps gene id (index) -> gene symbol.
fil=gene_ids['pr_is_lm'].eq(1)
gene_ids=gene_ids.loc[fil,'pr_gene_symbol']
gene_ids.head()

pr_gene_id
780      DDR1
7849     PAX8
6193     RPS5
23      ABCF1
9552    SPAG7
Name: pr_gene_symbol, dtype: object

In [5]:
# Cast the gene-id index to str; further down it is passed as `rid` to parse()
# and used as the column labels of the consensus tables.
gene_ids.index=gene_ids.index.astype(str)

In [6]:
#To calculate consensus signature,I used the MODZ method
#described in the original LINCS manuscript.
def calc_MODZ(data):
    """Collapse several signatures (rows of `data`) into one consensus profile.

    data: DataFrame with one signature per row and one measured gene per column.

    Returns a Series over data.columns:
      * 1 row   -> that row, unchanged
      * 2 rows  -> the column-wise mean
      * 3+ rows -> a weighted average whose weights come from the pairwise
                   Spearman correlations between the rows
    """
    n_signatures=len(data)
    if n_signatures==1:
        return data.iloc[0]
    if n_signatures==2:
        return np.mean(data,0)
    # correlation matrix between signatures (they are the columns of data.T)
    corr=scor(data.T)[0]
    # negative correlations are replaced by a small positive value
    corr[corr<0]=0.01
    # row sums minus 1 (the diagonal self-correlation), scaled to sum to 1
    raw_weights=corr.sum(axis=1)-1
    col_weights=(raw_weights/raw_weights.sum()).reshape((-1,1))
    consensus=np.dot(data.T,col_weights).ravel()
    return pd.Series(consensus,index=data.columns)

**Importing ligand-receptor network**\
I downloaded the receptor-ligand network from [here](https://zenodo.org/record/3260758/files/lr_network.rds) and preprocessed it in R. From this network I selected the ligand and receptor genes.

In [7]:
#column 'to': receptors, column 'from': ligands
ligand_receptor=pd.read_csv(
    '../data/lr_network.csv',
    sep=',',
    header=0,
    index_col=0,
)

In [8]:
# Values of the network's 'source' column that are kept by the filter in the next cell.
good_sources=['kegg_cytokines', 'kegg_neuroactive','pharmacology', 'ramilowski_known']

In [9]:
# Keep only the interactions whose 'source' is one of good_sources.
# Series.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
fil=ligand_receptor['source'].isin(good_sources)
ligand_receptor=ligand_receptor[fil]

In [10]:
# Distinct gene names on each side of the network ('from' = ligands, 'to' = receptors).
ligands=ligand_receptor['from'].unique()
receptors=ligand_receptor['to'].unique()

In [11]:
# Receptors first, then ligands, in one flat list.
l_r=[*receptors,*ligands]

**Getting the drugs from the Drug Repurposing Hub**

In [12]:
# Drug annotation table; the first 9 lines of the file are skipped before the header row.
drugs=pd.read_csv(
    '../data/repurposing/repurposing_drugs_20180907.txt',
    sep='\t',
    header=0,
    index_col=None,
    encoding='latin',
    skiprows=9,
)

In [13]:
# Drop the drugs that have no annotated target.
fil=drugs['target'].notnull()
drugs=drugs[fil]

In [14]:
drugs.head()

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
1,A-1070722,Preclinical,glycogen synthase kinase inhibitor,GSK3A|GSK3B,,
2,A-1120,Preclinical,retinoid receptor ligand,RBP4,,
3,A-317491,Preclinical,purinergic receptor antagonist,P2RX3,,
5,A-366,Preclinical,histone lysine methyltransferase inhibitor,EHMT1|EHMT2,,


In [15]:
#creating a file that has the drug, its target and the target type
def split_dataframe(one_line):
    """Expand one drug row into one row per target.

    one_line: Series for a single drug; its 'target' entry is a string that
    may hold several targets joined by '|'.

    Returns a DataFrame with the same columns as one_line's index. A row with a
    single target comes back as a one-row frame that keeps the row's label;
    a row with k targets comes back as k rows labelled 0..k-1, identical
    except for 'target'.
    """
    if '|' not in one_line['target']:
        return pd.DataFrame(one_line).T
    targets=one_line['target'].split('|')
    expanded=pd.DataFrame(index=range(len(targets)), columns=one_line.index)
    # broadcast every field of the drug to all rows, then overwrite 'target'
    for col in expanded.columns:
        expanded[col]=one_line[col]
    expanded['target']=targets
    return expanded

In [16]:
# One row per drug-target pair: expand every drug row with split_dataframe and
# concatenate the pieces once. Concatenating inside the loop copies the growing
# frame on every iteration (quadratic in the number of drugs).
pieces=[split_dataframe(drugs.loc[i]) for i in drugs.index]
# pd.concat raises on an empty list, so fall back to an empty frame with the same columns
results=pd.concat(pieces) if pieces else pd.DataFrame(columns=drugs.columns)

In [17]:
# Save the drug/target table (one row per drug-target pair) before any filtering.
results.to_csv('../results/lincs_drugs.csv', sep=',')

In [18]:
results.head()

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,
0,A-1070722,Preclinical,glycogen synthase kinase inhibitor,GSK3A,,
1,A-1070722,Preclinical,glycogen synthase kinase inhibitor,GSK3B,,
2,A-1120,Preclinical,retinoid receptor ligand,RBP4,,
3,A-317491,Preclinical,purinergic receptor antagonist,P2RX3,,


In [19]:
results.shape

(13097, 6)

In [20]:
# All names of interest in one list: receptors and ligands, followed by the drug names.
d=results['pert_iname'].unique().tolist()
l_r_d=[*l_r,*d]

In [21]:
# Keep the drug-target pairs whose target appears in l_r_d.
# Series.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
filt0=results['target'].isin(l_r_d)
results=results[filt0]

In [22]:
results.shape

(2518, 6)

In [23]:
# Keywords that are looked up among the words of each drug's 'moa' text further down.
activators=['agonist', 'activator', 'stimulant', 'enhancer', 'reactivator', 'inducer']
inhibitors=['inhibitor', 'antagonist', 'blocker', 'downregulator', 'destabilizer']

In [24]:
# Rows without a mechanism of action cannot be classified, so they are dropped.
fil=results['moa'].notna()
results=results[fil]

In [25]:
#arrange them in order: relabel the rows 0..n-1
results.index=pd.RangeIndex(len(results))

In [26]:
#Flagging each drug-target pair by its mechanism of action:
#'activator' / 'inhibitor' hold the number of distinct keywords from the
#`activators` / `inhibitors` lists that occur among the words of the moa text
#(0 = no keyword found).
activator_words=set(activators)
inhibitor_words=set(inhibitors)
#Split on whitespace AND on '|'. The target column of this table joins several
#entries with '|'; a moa cell written the same way ('x agonist|y antagonist')
#would otherwise yield the token 'agonist|y', which matches no keyword.
#NOTE(review): assumes moa can be '|'-joined like target - no effect if it never is.
moa_words=results['moa'].str.replace('|',' ',regex=False).str.split()
results['activator']=moa_words.map(lambda words: len(activator_words & set(words)))
results['inhibitor']=moa_words.map(lambda words: len(inhibitor_words & set(words)))

In [27]:
results.head()

Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication,activator,inhibitor
0,"[sar9,met(o2)11]-substance-p",Preclinical,tachykinin antagonist,TACR1,,,0,1
1,A-987306,Preclinical,histamine receptor antagonist,AVPR1A,,,0,1
2,A-987306,Preclinical,histamine receptor antagonist,CCR1,,,0,1
3,A-987306,Preclinical,histamine receptor antagonist,HTR1A,,,0,1
4,A-987306,Preclinical,histamine receptor antagonist,HTR1B,,,0,1


In [28]:
# Save the filtered drug/target table together with its activator / inhibitor columns.
results.to_csv('../results/lincs_drugs_act_inhib.csv', sep=',')

**Importing the gse92742 and gse70138 files**

In [29]:
# Signature metadata of both datasets, indexed by the first column of each file.
# The directories are spelled 'GSE92742' / 'GSE70138', exactly as in the file check at
# the top of the notebook; the lower-case spelling only resolves on case-insensitive
# filesystems.
# low_memory=False makes pandas infer each column's dtype from the whole file.
# NOTE(review): the warning left in this cell's saved output looks like pandas'
# mixed-dtype warning, which this option avoids - confirm on the next run.
gse92742=pd.read_csv('../data/GSE92742/GSE92742_Broad_LINCS_sig_info.txt', sep='\t', header=0, index_col=0, low_memory=False)
gse70138=pd.read_csv('../data/GSE70138/GSE70138_Broad_LINCS_sig_info.txt', sep='\t', header=0, index_col=0, low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [30]:
gse92742['pert_type'].unique() 

array(['ctl_vehicle', 'trt_cp', 'ctl_untrt', 'trt_sh.cgs',
       'ctl_vehicle.cns', 'ctl_vector.cns', 'ctl_untrt.cns', 'trt_sh.css',
       'trt_lig', 'ctl_vector', 'trt_sh', 'trt_oe', 'trt_oe.mut'],
      dtype=object)

In [31]:
gse70138['pert_type'].unique()

array(['ctl_vehicle', 'trt_cp', 'trt_xpr', 'ctl_untrt', 'ctl_vector'],
      dtype=object)

In [32]:
#filtering by perturbation type: only the listed treatment types are kept
#(Series.isin replaces np.in1d, which is deprecated as of NumPy 2.0)
good_92742=['trt_sh.cgs','trt_lig','trt_cp','trt_oe']
fil1=gse92742['pert_type'].isin(good_92742)
gse92742=gse92742[fil1]

good_70138=['trt_cp','trt_xpr']
fil2=gse70138['pert_type'].isin(good_70138)
gse70138=gse70138[fil2]

In [33]:
#filtering receptors, ligands and drugs: keep the signatures whose perturbagen is in l_r_d
#(Series.isin replaces np.in1d, which is deprecated as of NumPy 2.0)
#.copy() because a column is added to these frames in the next cell; without it
#pandas may emit SettingWithCopyWarning for the filtered slice
filt1=gse92742['pert_iname'].isin(l_r_d)
gse92742=gse92742[filt1].copy()

filt2=gse70138['pert_iname'].isin(l_r_d)
gse70138=gse70138[filt2].copy()

In [34]:
#Giving signs for the perturbations:
#CRISPR (trt_xpr), shRNA (trt_sh.cgs): -1
#ligand (trt_lig), overexpression (trt_oe): +1
#every other perturbation type (e.g. trt_cp) keeps the initial 0
gse70138['sign']=0
gse92742['sign']=0

#the boolean masks are passed to .loc directly; the original shRNA mask ended in a
#stray comma, which turned it into a one-element tuple instead of a mask
fil_xpr=gse70138['pert_type']=='trt_xpr'
gse70138.loc[fil_xpr,'sign']=-1
fil_sh=gse92742['pert_type']=='trt_sh.cgs'
gse92742.loc[fil_sh,'sign']=-1
fil_lig=gse92742['pert_type']=='trt_lig'
gse92742.loc[fil_lig,'sign']=+1
fil_oe=gse92742['pert_type']=='trt_oe'
gse92742.loc[fil_oe,'sign']=+1

In [35]:
# Save the filtered signature metadata of both datasets, including the new 'sign' column.
gse70138.to_csv('../results/LINCS_gse70138.csv', sep=',')
gse92742.to_csv('../results/LINCS_gse92742.csv', sep=',')

**Creating a separate file for each perturbation type**

In [36]:
# GSE70138 signatures with pert_type 'trt_xpr', written to their own file.
fil_xpr2=gse70138['pert_type'].eq('trt_xpr')
gse70138_trt_xpr=gse70138.loc[fil_xpr2]
gse70138_trt_xpr.to_csv('../results/LINCS_gse70138_trt_xpr.csv', sep=',')

In [37]:
# GSE70138 signatures with pert_type 'trt_cp', written to their own file.
fil_cp1=gse70138['pert_type'].eq('trt_cp')
gse70138_trt_cp=gse70138.loc[fil_cp1]
gse70138_trt_cp.to_csv('../results/LINCS_gse70138_trt_cp.csv', sep=',')

In [38]:
# GSE92742 signatures with pert_type 'trt_cp', written to their own file.
fil_cp2=gse92742['pert_type'].eq('trt_cp')
gse92742_trt_cp=gse92742.loc[fil_cp2]
gse92742_trt_cp.to_csv('../results/LINCS_gse92742_trt_cp.csv', sep=',')

In [39]:
# GSE92742 signatures with pert_type 'trt_sh.cgs', written to their own file.
fil_sh2=gse92742['pert_type'].eq('trt_sh.cgs')
gse92742_trt_sh=gse92742.loc[fil_sh2]
gse92742_trt_sh.to_csv('../results/LINCS_gse92742_trt_sh.csv', sep=',')

In [40]:
# GSE92742 signatures with pert_type 'trt_lig', written to their own file.
fil_lig2=gse92742['pert_type'].eq('trt_lig')
gse92742_trt_lig=gse92742.loc[fil_lig2]
gse92742_trt_lig.to_csv('../results/LINCS_gse92742_trt_lig.csv', sep=',')

In [41]:
# GSE92742 signatures with pert_type 'trt_oe', written to their own file.
fil_oe2=gse92742['pert_type'].eq('trt_oe')
gse92742_trt_oe=gse92742.loc[fil_oe2]
gse92742_trt_oe.to_csv('../results/LINCS_gse92742_trt_oe.csv', sep=',')

**Creating consensus signatures**

In [42]:
# Reload the trt_xpr signature metadata from the CSV written earlier in the notebook.
gse70138_trt_xpr=pd.read_csv('../results/LINCS_gse70138_trt_xpr.csv', sep=',',header=0, index_col=0, low_memory=False)

In [43]:
#rows are the perturbed genes, columns are the measured landmark genes from LINCS
#(the table starts out empty and is filled row by row in the next cell)
genes_perturbed=gse70138_trt_xpr['pert_iname'].unique()
consensus_signatures_gse70138_trt_xpr=pd.DataFrame(
    index=genes_perturbed,
    columns=gene_ids.index.astype(str),
)
consensus_signatures_gse70138_trt_xpr.head()

pr_gene_id,780,7849,6193,23,9552,387,10921,10285,533,6194,...,54681,11000,6915,6253,7264,5467,2767,23038,57048,79716
CXCR4,,,,,,,,,,,...,,,,,,,,,,
TGFBR1,,,,,,,,,,,...,,,,,,,,,,
FN1,,,,,,,,,,,...,,,,,,,,,,
TGFBR2,,,,,,,,,,,...,,,,,,,,,,
AXL,,,,,,,,,,,...,,,,,,,,,,


In [44]:
# One consensus signature per perturbed gene: read the landmark-gene values of all
# of the gene's signatures from the gctx file and collapse them with calc_MODZ.
# The data directory is spelled 'GSE70138', exactly as in the file check at the top
# of the notebook; the lower-case spelling only resolves on case-insensitive filesystems.
gctx_70138='../data/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328.gctx'
landmark_ids=gene_ids.index.astype(str)  # loop-invariant, computed once
for i,gene in enumerate(genes_perturbed):
    if (i%100)==0:
        print('Done for %i genes' %i)
    fil=gse70138_trt_xpr['pert_iname']==gene
    samples=gse70138_trt_xpr.index[fil]
    # parse() returns the selected rows x columns; transpose so signatures are rows
    expression=parse(gctx_70138,cid=samples,rid=landmark_ids).data_df.T[landmark_ids]
    consensus_signatures_gse70138_trt_xpr.loc[gene]=calc_MODZ(expression)

Done for 0 genes


In [45]:
# Relabel the columns from gene ids to gene symbols (gene_ids maps id -> symbol).
consensus_signatures_gse70138_trt_xpr.columns=gene_ids[consensus_signatures_gse70138_trt_xpr.columns].values

In [46]:
# Save the trt_xpr consensus signatures of GSE70138.
consensus_signatures_gse70138_trt_xpr.to_csv('../results/consensus_signature_gse70138_trt_xpr.csv',sep=',')

In [47]:
# Reload the trt_cp signature metadata of GSE70138 from the CSV written earlier in the notebook.
gse70138_trt_cp=pd.read_csv('../results/LINCS_gse70138_trt_cp.csv', sep=',',header=0, index_col=0, low_memory=False)

In [48]:
# Empty result table: one row per distinct pert_iname, one column per landmark gene id.
genes_perturbed1=gse70138_trt_cp['pert_iname'].unique()
consensus_signatures_gse70138_trt_cp=pd.DataFrame(
    index=genes_perturbed1,
    columns=gene_ids.index,
)
consensus_signatures_gse70138_trt_cp.head()

pr_gene_id,780,7849,6193,23,9552,387,10921,10285,533,6194,...,54681,11000,6915,6253,7264,5467,2767,23038,57048,79716
CP-724714,,,,,,,,,,,...,,,,,,,,,,
neratinib,,,,,,,,,,,...,,,,,,,,,,
crizotinib,,,,,,,,,,,...,,,,,,,,,,
tozasertib,,,,,,,,,,,...,,,,,,,,,,
PD-0325901,,,,,,,,,,,...,,,,,,,,,,


In [49]:
# One consensus signature per perturbagen (pert_iname): read the landmark-gene values
# of all of its signatures from the gctx file and collapse them with calc_MODZ.
# The data directory is spelled 'GSE70138', exactly as in the file check at the top
# of the notebook; the lower-case spelling only resolves on case-insensitive filesystems.
gctx_70138='../data/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328.gctx'
for i,gene in enumerate(genes_perturbed1):
    if (i%100)==0:
        print('Done for %i genes' %i)
    fil=gse70138_trt_cp['pert_iname']==gene
    samples=gse70138_trt_cp.index[fil]
    # parse() returns the selected rows x columns; transpose so signatures are rows
    expression=parse(gctx_70138,cid=samples,rid=gene_ids.index).data_df.T[gene_ids.index]
    consensus_signatures_gse70138_trt_cp.loc[gene]=calc_MODZ(expression)

Done for 0 genes
Done for 100 genes
Done for 200 genes
Done for 300 genes
Done for 400 genes
Done for 500 genes
Done for 600 genes
Done for 700 genes
Done for 800 genes
Done for 900 genes
Done for 1000 genes
Done for 1100 genes
Done for 1200 genes
Done for 1300 genes


In [50]:
# Relabel the columns from gene ids to gene symbols (gene_ids maps id -> symbol).
consensus_signatures_gse70138_trt_cp.columns=gene_ids[consensus_signatures_gse70138_trt_cp.columns].values

In [51]:
# Save the trt_cp consensus signatures of GSE70138.
consensus_signatures_gse70138_trt_cp.to_csv('../results/consensus_signature_gse70138_trt_cp.csv',sep=',')

In [52]:
# Reload the trt_cp signature metadata of GSE92742 from the CSV written earlier in the notebook.
gse92742_trt_cp=pd.read_csv('../results/LINCS_gse92742_trt_cp.csv', sep=',',header=0, index_col=0, low_memory=False)

In [53]:
# Empty result table: one row per distinct pert_iname, one column per landmark gene id.
genes_perturbed2=gse92742_trt_cp['pert_iname'].unique()
consensus_signatures_gse92742_trt_cp=pd.DataFrame(
    index=genes_perturbed2,
    columns=gene_ids.index,
)
consensus_signatures_gse92742_trt_cp.head()

pr_gene_id,780,7849,6193,23,9552,387,10921,10285,533,6194,...,54681,11000,6915,6253,7264,5467,2767,23038,57048,79716
trichostatin-a,,,,,,,,,,,...,,,,,,,,,,
geldanamycin,,,,,,,,,,,...,,,,,,,,,,
iloprost,,,,,,,,,,,...,,,,,,,,,,
wortmannin,,,,,,,,,,,...,,,,,,,,,,
calcitriol,,,,,,,,,,,...,,,,,,,,,,


In [54]:
# One consensus signature per pert_iname: read the landmark-gene values of all of its
# signatures from the gctx file and collapse them with calc_MODZ.
for i,gene in enumerate(genes_perturbed2):
    if i%100==0:
        print('Done for %i genes' %i)
    fil=gse92742_trt_cp['pert_iname']==gene
    samples=gse92742_trt_cp.index[fil]
    parsed=parse('../data/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx',
                 cid=samples,rid=gene_ids.index)
    # transpose so that signatures are rows, then order the columns like gene_ids
    expression=parsed.data_df.T[gene_ids.index]
    consensus_signatures_gse92742_trt_cp.loc[gene]=calc_MODZ(expression)

Done for 0 genes
Done for 100 genes
Done for 200 genes
Done for 300 genes
Done for 400 genes
Done for 500 genes
Done for 600 genes
Done for 700 genes
Done for 800 genes
Done for 900 genes
Done for 1000 genes
Done for 1100 genes
Done for 1200 genes
Done for 1300 genes
Done for 1400 genes
Done for 1500 genes
Done for 1600 genes
Done for 1700 genes
Done for 1800 genes


In [55]:
consensus_signatures_gse92742_trt_cp.head()

pr_gene_id,780,7849,6193,23,9552,387,10921,10285,533,6194,...,54681,11000,6915,6253,7264,5467,2767,23038,57048,79716
trichostatin-a,-0.847422,-0.503865,0.483912,0.132198,-4.53212,0.634433,-0.698595,-1.79651,1.22057,-0.768776,...,-2.1688,-0.0347143,2.49821,1.53952,-0.665192,3.59326,1.76075,2.27205,0.0706258,-1.3203
geldanamycin,0.542107,0.291332,-0.0559945,0.232725,1.71355,-0.569987,-0.462787,1.32345,-1.24771,-0.341131,...,0.31085,0.235081,0.410122,0.351053,-1.19802,0.734019,-0.0621831,0.679144,-0.185019,0.608295
iloprost,-0.145694,0.157691,0.0225018,-0.261297,-0.333339,-0.146206,-0.101394,0.0563712,0.00170397,-0.124097,...,0.182753,-0.176382,-0.0254652,-0.331386,0.104815,0.103503,0.222015,-0.0912689,-0.128151,-0.0233869
wortmannin,1.05525,0.461242,-0.0889274,0.0519562,0.647678,-0.272643,-0.342935,-0.397896,-1.00563,0.143087,...,0.401039,0.440965,0.483755,0.574008,-0.675819,0.927863,-0.383918,1.2194,0.524388,0.736217
calcitriol,0.342617,0.389063,-0.0431887,-0.00860405,0.00678472,0.179245,-0.33049,-0.290733,-0.395502,-0.206041,...,0.146251,-0.00878185,0.195036,-0.208913,-0.0821654,0.0700581,0.200104,-0.00942942,-0.0878858,0.11443


In [56]:
# Relabel the columns from gene ids to gene symbols (gene_ids maps id -> symbol).
consensus_signatures_gse92742_trt_cp.columns=gene_ids[consensus_signatures_gse92742_trt_cp.columns].values

In [57]:
# Save the trt_cp consensus signatures of GSE92742.
consensus_signatures_gse92742_trt_cp.to_csv('../results/consensus_signature_gse92742_trt_cp.csv',sep=',')

In [58]:
# Reload the trt_sh.cgs signature metadata of GSE92742 from the CSV written earlier in the notebook.
gse92742_trt_sh=pd.read_csv('../results/LINCS_gse92742_trt_sh.csv', sep=',',header=0, index_col=0, low_memory=False)

In [59]:
# Empty result table: one row per distinct pert_iname, one column per landmark gene id.
genes_perturbed3=gse92742_trt_sh['pert_iname'].unique()
consensus_signatures_gse92742_trt_sh=pd.DataFrame(
    index=genes_perturbed3,
    columns=gene_ids.index,
)
consensus_signatures_gse92742_trt_sh.head()

pr_gene_id,780,7849,6193,23,9552,387,10921,10285,533,6194,...,54681,11000,6915,6253,7264,5467,2767,23038,57048,79716
A2M,,,,,,,,,,,...,,,,,,,,,,
ACVR1,,,,,,,,,,,...,,,,,,,,,,
ACVR1B,,,,,,,,,,,...,,,,,,,,,,
ADAM15,,,,,,,,,,,...,,,,,,,,,,
ADAM17,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# One consensus signature per pert_iname: read the landmark-gene values of all of its
# signatures from the gctx file and collapse them with calc_MODZ.
for i,gene in enumerate(genes_perturbed3):
    if i%100==0:
        print('Done for %i genes' %i)
    fil=gse92742_trt_sh['pert_iname']==gene
    samples=gse92742_trt_sh.index[fil]
    parsed=parse('../data/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx',
                 cid=samples,rid=gene_ids.index)
    # transpose so that signatures are rows, then order the columns like gene_ids
    expression=parsed.data_df.T[gene_ids.index]
    consensus_signatures_gse92742_trt_sh.loc[gene]=calc_MODZ(expression)

Done for 0 genes
Done for 100 genes
Done for 200 genes


In [None]:
consensus_signatures_gse92742_trt_sh.head()

In [None]:
# Relabel the columns from gene ids to gene symbols (gene_ids maps id -> symbol).
consensus_signatures_gse92742_trt_sh.columns=gene_ids[consensus_signatures_gse92742_trt_sh.columns].values

In [None]:
# Save the trt_sh.cgs consensus signatures of GSE92742.
consensus_signatures_gse92742_trt_sh.to_csv('../results/consensus_signature_gse92742_trt_sh.csv',sep=',')

In [None]:
# NOTE(review): no cell visible in this notebook writes LINCS_gse92742_trt_sh_fil.csv.
# It has to be produced elsewhere before this cell can run on a fresh checkout -
# TODO document (or add) the step that creates it.
gse92742_trt_sh_fil=pd.read_csv('../results/LINCS_gse92742_trt_sh_fil.csv', sep=',',header=0, index_col=0, low_memory=False)

In [None]:
# Empty result table: one row per distinct pert_iname, one column per landmark gene id.
genes_perturbed7=gse92742_trt_sh_fil['pert_iname'].unique()
consensus_signatures_gse92742_trt_sh_fil=pd.DataFrame(
    index=genes_perturbed7,
    columns=gene_ids.index,
)
consensus_signatures_gse92742_trt_sh_fil.head()

In [None]:
# One consensus signature per pert_iname: read the landmark-gene values of all of its
# signatures from the gctx file and collapse them with calc_MODZ.
for i,gene in enumerate(genes_perturbed7):
    if i%100==0:
        print('Done for %i genes' %i)
    fil=gse92742_trt_sh_fil['pert_iname']==gene
    samples=gse92742_trt_sh_fil.index[fil]
    parsed=parse('../data/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx',
                 cid=samples,rid=gene_ids.index)
    # transpose so that signatures are rows, then order the columns like gene_ids
    expression=parsed.data_df.T[gene_ids.index]
    consensus_signatures_gse92742_trt_sh_fil.loc[gene]=calc_MODZ(expression)

In [None]:
consensus_signatures_gse92742_trt_sh_fil.head()

In [None]:
# Relabel the columns from gene ids to gene symbols (gene_ids maps id -> symbol).
consensus_signatures_gse92742_trt_sh_fil.columns=gene_ids[consensus_signatures_gse92742_trt_sh_fil.columns].values

In [None]:
# Save the consensus signatures computed from the trt_sh_fil metadata.
consensus_signatures_gse92742_trt_sh_fil.to_csv('../results/consensus_signature_gse92742_trt_sh_fil.csv',sep=',')

In [None]:
# Reload the trt_lig signature metadata of GSE92742 from the CSV written earlier in the notebook.
gse92742_trt_lig=pd.read_csv('../results/LINCS_gse92742_trt_lig.csv', sep=',',header=0, index_col=0, low_memory=False)

In [None]:
# Empty result table: one row per distinct pert_iname, one column per landmark gene id.
genes_perturbed4=gse92742_trt_lig['pert_iname'].unique()
consensus_signatures_gse92742_trt_lig=pd.DataFrame(
    index=genes_perturbed4,
    columns=gene_ids.index,
)
consensus_signatures_gse92742_trt_lig.head()

In [None]:
# One consensus signature per pert_iname: read the landmark-gene values of all of its
# signatures from the gctx file and collapse them with calc_MODZ.
for i,gene in enumerate(genes_perturbed4):
    if i%100==0:
        print('Done for %i genes' %i)
    fil=gse92742_trt_lig['pert_iname']==gene
    samples=gse92742_trt_lig.index[fil]
    parsed=parse('../data/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx',
                 cid=samples,rid=gene_ids.index)
    # transpose so that signatures are rows, then order the columns like gene_ids
    expression=parsed.data_df.T[gene_ids.index]
    consensus_signatures_gse92742_trt_lig.loc[gene]=calc_MODZ(expression)

In [None]:
consensus_signatures_gse92742_trt_lig.head()

In [None]:
# Relabel the columns from gene ids to gene symbols (gene_ids maps id -> symbol).
consensus_signatures_gse92742_trt_lig.columns=gene_ids[consensus_signatures_gse92742_trt_lig.columns].values

In [None]:
# Save the trt_lig consensus signatures of GSE92742.
consensus_signatures_gse92742_trt_lig.to_csv('../results/consensus_signature_gse92742_trt_lig.csv',sep=',')

In [None]:
# Reload the trt_oe signature metadata of GSE92742 from the CSV written earlier in the notebook.
gse92742_trt_oe=pd.read_csv('../results/LINCS_gse92742_trt_oe.csv', sep=',',header=0, index_col=0, low_memory=False)

In [None]:
# Empty result table: one row per distinct pert_iname, one column per landmark gene id.
genes_perturbed5=gse92742_trt_oe['pert_iname'].unique()
consensus_signatures_gse92742_trt_oe=pd.DataFrame(
    index=genes_perturbed5,
    columns=gene_ids.index,
)
consensus_signatures_gse92742_trt_oe.head()

In [None]:
# One consensus signature per pert_iname: read the landmark-gene values of all of its
# signatures from the gctx file and collapse them with calc_MODZ.
for i,gene in enumerate(genes_perturbed5):
    if i%100==0:
        print('Done for %i genes' %i)
    fil=gse92742_trt_oe['pert_iname']==gene
    samples=gse92742_trt_oe.index[fil]
    parsed=parse('../data/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx',
                 cid=samples,rid=gene_ids.index)
    # transpose so that signatures are rows, then order the columns like gene_ids
    expression=parsed.data_df.T[gene_ids.index]
    consensus_signatures_gse92742_trt_oe.loc[gene]=calc_MODZ(expression)

In [None]:
consensus_signatures_gse92742_trt_oe.head()

In [None]:
# Relabel the columns from gene ids to gene symbols (gene_ids maps id -> symbol).
consensus_signatures_gse92742_trt_oe.columns=gene_ids[consensus_signatures_gse92742_trt_oe.columns].values

In [None]:
# Save the trt_oe consensus signatures of GSE92742.
consensus_signatures_gse92742_trt_oe.to_csv('../results/consensus_signature_gse92742_trt_oe.csv',sep=',')