# 01 Data preparation

Process the xlsx files in the data folder into suitable inputs and generate the other input files.

In [1]:
import pandas as pd
import numpy as np
import os
import pickle

In [2]:
# Folder holding the raw xlsx inputs, and the folder this notebook writes its results to.
data_dir, out_dir = "00_input", "01_outputs_2020"

# Make sure the output folder is there; nothing happens if it already exists.
os.makedirs(out_dir, exist_ok=True)

## Modules

Load data about modules and drugs.

In [3]:
# Module definitions live on the "modules" sheet; the first column is used as the index.
modules_df = pd.read_excel(
    os.path.join(data_dir, "ALL_DATA_2020.xlsx"),
    sheet_name="modules",
    index_col=0,
)

# The index labels of that sheet are taken as the selected modules.
selected_modules = modules_df.index.to_list()
print("Selected modules list: ", len(selected_modules), selected_modules)

Selected modules list:  10 ['CDK1', 'CDK2', 'PAK', 'PI3K', 'ERK', 'BET', 'Aurora', 'TGFbR', 'VEGFR', 'PKC']


In [4]:
# Drug, module and IC50 columns are read from the "IC50s" sheet.
IC50_df = pd.read_excel(
    os.path.join(data_dir, "ALL_DATA_2020.xlsx"),
    sheet_name="IC50s",
)
# Show the table as loaded, i.e. still with the original "IC50, uM" header.
display(IC50_df)
# Shorten the header so the column can be reached as IC50_df.IC50 further down.
IC50_df = IC50_df.rename(columns={"IC50, uM": "IC50"})

# Distinct modules, in order of first appearance in the sheet.
modules = IC50_df["Module"].unique().tolist()
n_modules = len(modules)
print("Modules list: ", n_modules, modules)

# One entry per row of the sheet (no de-duplication).
drugs = IC50_df["Drug"].tolist()
n_drugs = len(drugs)
print("Drugs list: ", n_drugs, drugs)

Unnamed: 0,Drug,Module,"IC50, uM"
0,CGP-60474,CDK1,0.01
1,dinaciclib,CDK1,0.004
2,PHA-767491,CDK1,0.25
3,roscovitine,CDK2,2.0
4,PF-03758309,PAK,0.002
5,PF-04691502,PI3K,0.01
6,XL-147,PI3K,1.0
7,GSK-2334470,PI3K,0.1
8,A-66,PI3K,0.5
9,BGT-226,PI3K,0.03


Modules list:  10 ['CDK1', 'CDK2', 'PAK', 'PI3K', 'ERK', 'BET', 'Aurora', 'TGFbR', 'VEGFR', 'PKC']
Drugs list:  38 ['CGP-60474', 'dinaciclib', 'PHA-767491', 'roscovitine', 'PF-03758309', 'PF-04691502', 'XL-147', 'GSK-2334470', 'A-66', 'BGT-226', 'BX-795', 'AZ-628', 'FR-180204', 'GDC-0879', 'GW-5074', 'PD-0325901', 'PD-184352', 'TAK-733', 'dabrafenib', 'refametinib', 'RAF-265', 'I-BET-762', 'I-BET-151', 'JQ-1-R', 'PFI-1', 'GSK-1070916', 'MLN-8054', 'ZM-447439', 'alisertib', 'SB-525334', 'rebastinib', 'lenvatinib', 'orantinib', 'foretinib', 'cediranib', 'sunitinib', 'tivozanib', 'enzastaurin']


## LINCS metadata

In [5]:
# Signature-level metadata for the LINCS HUVEC experiments; the first column becomes the index.
sig_info_df = pd.read_excel(
    os.path.join(data_dir, "LINCS_2020_HUVEC_sig_info.xlsx"),
    index_col=0,
)
display(sig_info_df)

Unnamed: 0_level_0,cell,plate,time,level_3_samples,samples_number,pert_type,pert_drug,targets,targets_number,dose,dose_float
level_5_sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LJP007_HUVEC_24H:B19,HUVEC,LJP007,24 h,LJP007_HUVEC_24H_X1_B21:B19|LJP007_HUVEC_24H_X...,3,trt_cp,A-66,,0,10 uM,10.00
LJP007_HUVEC_24H:B20,HUVEC,LJP007,24 h,LJP007_HUVEC_24H_X1_B21:B20|LJP007_HUVEC_24H_X...,3,trt_cp,A-66,,0,3.33 uM,3.33
LJP007_HUVEC_24H:B21,HUVEC,LJP007,24 h,LJP007_HUVEC_24H_X1_B21:B21|LJP007_HUVEC_24H_X...,3,trt_cp,A-66,,0,1.11 uM,1.11
LJP007_HUVEC_24H:B22,HUVEC,LJP007,24 h,LJP007_HUVEC_24H_X1_B21:B22|LJP007_HUVEC_24H_X...,3,trt_cp,A-66,,0,0.37 uM,0.37
LJP007_HUVEC_24H:B23,HUVEC,LJP007,24 h,LJP007_HUVEC_24H_X1_B21:B23|LJP007_HUVEC_24H_X...,3,trt_cp,A-66,,0,0.12 uM,0.12
...,...,...,...,...,...,...,...,...,...,...,...
REP.B011_HUVEC.A_24H:O08_2,HUVEC,REP.B011,24 h,REP.B011_HUVEC.A_24H_X1_B29:O08|REP.B011_HUVEC...,2,trt_cp,roscovitine,"CDK2, CDK7, CDK9",3,0.74 uM,0.74
REP.B011_HUVEC.A_24H:O09_2,HUVEC,REP.B011,24 h,REP.B011_HUVEC.A_24H_X1_B29:O09|REP.B011_HUVEC...,2,trt_cp,roscovitine,"CDK2, CDK7, CDK9",3,0.25 uM,0.25
REP.B011_HUVEC.A_24H:O10_2,HUVEC,REP.B011,24 h,REP.B011_HUVEC.A_24H_X1_B29:O10|REP.B011_HUVEC...,2,trt_cp,roscovitine,"CDK2, CDK7, CDK9",3,0.08 uM,0.08
REP.B011_HUVEC.A_24H:O11_2,HUVEC,REP.B011,24 h,REP.B011_HUVEC.A_24H_X1_B29:O11|REP.B011_HUVEC...,2,trt_cp,roscovitine,"CDK2, CDK7, CDK9",3,0.03 uM,0.03


In [6]:
# Keep only the signatures whose perturbing drug is in the drugs-of-interest list.
is_selected_drug = sig_info_df["pert_drug"].isin(drugs)
sig_info_df = sig_info_df[is_selected_drug]

# Show what is left after filtering.
display(sig_info_df)

Unnamed: 0_level_0,cell,plate,time,level_3_samples,samples_number,pert_type,pert_drug,targets,targets_number,dose,dose_float
level_5_sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LJP007_HUVEC_24H:B19,HUVEC,LJP007,24 h,LJP007_HUVEC_24H_X1_B21:B19|LJP007_HUVEC_24H_X...,3,trt_cp,A-66,,0,10 uM,10.00
LJP007_HUVEC_24H:B20,HUVEC,LJP007,24 h,LJP007_HUVEC_24H_X1_B21:B20|LJP007_HUVEC_24H_X...,3,trt_cp,A-66,,0,3.33 uM,3.33
LJP007_HUVEC_24H:B21,HUVEC,LJP007,24 h,LJP007_HUVEC_24H_X1_B21:B21|LJP007_HUVEC_24H_X...,3,trt_cp,A-66,,0,1.11 uM,1.11
LJP007_HUVEC_24H:B22,HUVEC,LJP007,24 h,LJP007_HUVEC_24H_X1_B21:B22|LJP007_HUVEC_24H_X...,3,trt_cp,A-66,,0,0.37 uM,0.37
LJP007_HUVEC_24H:B23,HUVEC,LJP007,24 h,LJP007_HUVEC_24H_X1_B21:B23|LJP007_HUVEC_24H_X...,3,trt_cp,A-66,,0,0.12 uM,0.12
...,...,...,...,...,...,...,...,...,...,...,...
REP.B011_HUVEC.A_24H:O08_2,HUVEC,REP.B011,24 h,REP.B011_HUVEC.A_24H_X1_B29:O08|REP.B011_HUVEC...,2,trt_cp,roscovitine,"CDK2, CDK7, CDK9",3,0.74 uM,0.74
REP.B011_HUVEC.A_24H:O09_2,HUVEC,REP.B011,24 h,REP.B011_HUVEC.A_24H_X1_B29:O09|REP.B011_HUVEC...,2,trt_cp,roscovitine,"CDK2, CDK7, CDK9",3,0.25 uM,0.25
REP.B011_HUVEC.A_24H:O10_2,HUVEC,REP.B011,24 h,REP.B011_HUVEC.A_24H_X1_B29:O10|REP.B011_HUVEC...,2,trt_cp,roscovitine,"CDK2, CDK7, CDK9",3,0.08 uM,0.08
REP.B011_HUVEC.A_24H:O11_2,HUVEC,REP.B011,24 h,REP.B011_HUVEC.A_24H_X1_B29:O11|REP.B011_HUVEC...,2,trt_cp,roscovitine,"CDK2, CDK7, CDK9",3,0.03 uM,0.03


In [7]:
# Distinct signature ids present in the metadata index.
sig_ids = sig_info_df.index.unique().to_list()
print("Experiments ids list from LINCS: ", len(sig_ids))

Experiments ids list from LINCS:  429


Confirm the data by checking the drugs of interest against the filtered LINCS metadata.

In [8]:
# Compare the length of the drug list with the number of distinct drugs in the metadata.
n_drugs_in_lincs = len(sig_info_df["pert_drug"].unique())
print(f"Number of drugs of interest:\t{len(drugs)}")
print(f"Number of drugs in LINCS data:\t{n_drugs_in_lincs}")

# Number of signatures available per drug.
display(sig_info_df.value_counts("pert_drug"))

Number of drugs of interest:	38
Number of drugs in LINCS data:	38


pert_drug
roscovitine    33
PD-0325901     29
MLN-8054       24
CGP-60474      24
AZ-628         18
PF-03758309    15
cediranib      12
lenvatinib     12
refametinib    12
RAF-265        12
PFI-1          12
PD-184352      12
dabrafenib     12
JQ-1-R         12
I-BET-762      12
I-BET-151      12
GW-5074        12
GSK-2334470    12
GDC-0879       12
BX-795         12
PF-04691502    11
GSK-1070916    11
ZM-447439      11
foretinib       6
rebastinib      6
orantinib       6
sunitinib       6
A-66            6
PHA-767491      6
alisertib       6
XL-147          6
TAK-733         6
SB-525334       6
FR-180204       6
tivozanib       6
enzastaurin     5
BGT-226         5
dinaciclib      3
Name: count, dtype: int64

## LINCS data

In [9]:
# Expression matrix; transposed after loading so that the sheet's rows become columns.
Data_df = pd.read_excel(
    os.path.join(data_dir, "LINCS_2020_HUVEC_Data_norm.xlsx"),
    index_col=0,
).transpose()
display(Data_df)

Unnamed: 0,LJP007_HUVEC_24H:B19,LJP007_HUVEC_24H:B20,LJP007_HUVEC_24H:B21,LJP007_HUVEC_24H:B22,LJP007_HUVEC_24H:B23,LJP007_HUVEC_24H:B24,LJP007_HUVEC_24H:C01,LJP007_HUVEC_24H:C02,LJP007_HUVEC_24H:C03,LJP007_HUVEC_24H:C04,...,REP.A011_HUVEC.A_24H:O09_2,REP.A011_HUVEC.A_24H:O10_2,REP.A011_HUVEC.A_24H:O11_2,REP.A011_HUVEC.A_24H:O12_2,REP.B011_HUVEC.A_24H:O07_2,REP.B011_HUVEC.A_24H:O08_2,REP.B011_HUVEC.A_24H:O09_2,REP.B011_HUVEC.A_24H:O10_2,REP.B011_HUVEC.A_24H:O11_2,REP.B011_HUVEC.A_24H:O12_2
AARS,0.006899,-0.036417,-0.002468,-0.192234,0.193749,-0.987884,-0.107851,-0.026385,0.125566,-0.088968,...,0.599166,-1.938309,-1.550059,0.181616,0.300304,-0.060096,0.328879,-0.603046,0.142453,-0.611021
ABCB6,-0.153516,0.090083,0.100917,-0.050100,0.074066,-0.659458,0.085049,0.124600,-0.207017,-0.206216,...,1.495430,-0.652769,-1.641520,0.452230,0.603896,-0.425903,-0.276004,0.646096,0.440571,-0.243154
ABCC5,0.430885,0.988469,-0.617515,-0.851815,-0.762765,-0.806991,-0.460982,-0.416399,-0.401315,-0.600982,...,-1.589408,1.927692,1.893692,-2.560858,2.216461,-0.799989,-0.612189,1.197562,-0.605189,-0.966489
ABCF1,0.869589,0.250189,0.483856,0.259573,0.420089,0.115573,0.727923,0.141889,-0.109810,0.062656,...,-0.044570,-0.984220,-1.935670,0.140505,-0.223652,-0.113703,0.028747,0.226997,0.001048,0.286047
ABCF3,-0.049203,-0.110871,0.241329,-0.277354,-0.815970,0.044405,-0.292220,-0.077737,-0.243837,-0.102671,...,-0.710602,-0.706151,-1.285051,0.228374,-0.079478,-0.725977,-0.238903,-0.174853,-0.239252,-0.088803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF395,1.322245,0.406061,-0.584673,1.496211,2.218262,-0.601197,0.650295,-0.629755,0.837345,-0.014839,...,1.362500,-2.039700,-2.237650,0.414100,0.654866,-0.642534,0.903966,0.226066,0.694516,0.293466
ZNF451,-0.331278,-0.115078,0.114506,0.072972,-0.202944,-0.456144,-0.127895,0.128838,0.231789,0.071722,...,-2.694105,-1.195880,-1.509356,0.117395,-0.017080,-0.051730,0.123271,-0.477355,0.139670,-0.051455
ZNF586,-0.805566,-0.388083,-0.484466,-0.304266,-0.467732,-0.605591,-0.890166,-0.414350,-0.539833,-0.558366,...,0.291264,-0.746286,-1.077836,0.505365,0.361900,0.422150,0.339700,0.567650,0.171450,0.292250
ZNF589,-0.300999,-0.058116,-0.237699,0.468434,0.532901,-0.904574,0.014768,0.084234,0.139318,0.111168,...,-2.220960,-0.700336,1.503390,0.547189,1.006954,0.560229,0.872654,-0.904821,-0.011846,0.653454


In [10]:
# Row labels of the expression matrix, reported as the landmark gene list.
genes = Data_df.index.to_list()
n_genes = len(genes)
print("Landmark genes list: ", n_genes, genes)

Landmark genes list:  978 ['AARS', 'ABCB6', 'ABCC5', 'ABCF1', 'ABCF3', 'ABHD4', 'ABHD6', 'ABL1', 'ACAA1', 'ACAT2', 'ACBD3', 'ACD', 'ACLY', 'ACOT9', 'ADAM10', 'ADAT1', 'ADGRE5', 'ADGRG1', 'ADH5', 'ADI1', 'ADO', 'ADRB2', 'AGL', 'AKAP8', 'AKAP8L', 'AKR7A2', 'AKT1', 'ALAS1', 'ALDH7A1', 'ALDOA', 'ALDOC', 'AMDHD2', 'ANKRD10', 'ANO10', 'ANXA7', 'APBB2', 'APOE', 'APP', 'APPBP2', 'ARFIP2', 'ARHGAP1', 'ARHGEF12', 'ARHGEF2', 'ARID4B', 'ARID5B', 'ARL4C', 'ARNT2', 'ARPP19', 'ASAH1', 'ASCC3', 'ATF1', 'ATF5', 'ATF6', 'ATG3', 'ATMIN', 'ATP11B', 'ATP1B1', 'ATP2C1', 'ATP6V0B', 'ATP6V1D', 'AURKA', 'AURKB', 'AXIN1', 'B4GAT1', 'BACE2', 'BAD', 'BAG3', 'BAMBI', 'BAX', 'BCL2', 'BCL7B', 'BDH1', 'BECN1', 'BHLHE40', 'BID', 'BIRC2', 'BIRC5', 'BLCAP', 'BLMH', 'BLVRA', 'BMP4', 'BNIP3', 'BNIP3L', 'BPHL', 'BRCA1', 'BTK', 'BUB1B', 'BZW2', 'C2CD2', 'C2CD2L', 'C2CD5', 'C5', 'CAB39', 'CALM3', 'CALU', 'CAMSAP2', 'CANT1', 'CAPN1', 'CARMIL1', 'CASC3', 'CASK', 'CASP10', 'CASP2', 'CASP3', 'CASP7', 'CAST', 'CAT', 'CBLB', 'CBR1

### Add experiments with siCDK2

In [11]:
# Load the additional siCDK2 experiments; the first column of the sheet is used as the index.
experiments_add = pd.read_excel(os.path.join(data_dir, "ALL_DATA_2020.xlsx"), sheet_name = "add_experiments_CDK2", index_col = 0)

# Expand to the full gene list: rows that are absent from the sheet stay at 0.
# The frame is sized from the sheet's own columns instead of a hard-coded count,
# so adding or removing experiments in the sheet needs no code change.
experiments_add_df = pd.DataFrame(0.0, index = genes, columns = experiments_add.columns)
experiments_add_df.loc[experiments_add.index] = experiments_add

display(experiments_add_df)

# Append the extra experiments as new columns. Any copies left over from a previous
# run of this cell are dropped first, so re-running the cell does not duplicate them.
Data_df = pd.concat(
    [Data_df.drop(columns = experiments_add_df.columns, errors = "ignore"), experiments_add_df],
    axis = 1,
)
display(Data_df)

Unnamed: 0,siCDK2_STAT_vs_CTL_STAT,siCDK2_STAT_vs_CTL_STAT.1,siCDK2_STAT_vs_CTL_STAT.2,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.1,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.2
AARS,0.0,0.0,0.0,0.000000,0.000000,0.000000
ABCB6,0.0,0.0,0.0,-0.018243,-0.018243,-0.018243
ABCC5,0.0,0.0,0.0,0.695906,0.695906,0.695906
ABCF1,0.0,0.0,0.0,0.000000,0.000000,0.000000
ABCF3,0.0,0.0,0.0,0.404739,0.404739,0.404739
...,...,...,...,...,...,...
ZNF395,0.0,0.0,0.0,0.000000,0.000000,0.000000
ZNF451,0.0,0.0,0.0,-0.327347,-0.327347,-0.327347
ZNF586,0.0,0.0,0.0,-0.556124,-0.556124,-0.556124
ZNF589,0.0,0.0,0.0,0.406653,0.406653,0.406653


Unnamed: 0,LJP007_HUVEC_24H:B19,LJP007_HUVEC_24H:B20,LJP007_HUVEC_24H:B21,LJP007_HUVEC_24H:B22,LJP007_HUVEC_24H:B23,LJP007_HUVEC_24H:B24,LJP007_HUVEC_24H:C01,LJP007_HUVEC_24H:C02,LJP007_HUVEC_24H:C03,LJP007_HUVEC_24H:C04,...,REP.B011_HUVEC.A_24H:O09_2,REP.B011_HUVEC.A_24H:O10_2,REP.B011_HUVEC.A_24H:O11_2,REP.B011_HUVEC.A_24H:O12_2,siCDK2_STAT_vs_CTL_STAT,siCDK2_STAT_vs_CTL_STAT.1,siCDK2_STAT_vs_CTL_STAT.2,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.1,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.2
AARS,0.006899,-0.036417,-0.002468,-0.192234,0.193749,-0.987884,-0.107851,-0.026385,0.125566,-0.088968,...,0.328879,-0.603046,0.142453,-0.611021,0.0,0.0,0.0,0.000000,0.000000,0.000000
ABCB6,-0.153516,0.090083,0.100917,-0.050100,0.074066,-0.659458,0.085049,0.124600,-0.207017,-0.206216,...,-0.276004,0.646096,0.440571,-0.243154,0.0,0.0,0.0,-0.018243,-0.018243,-0.018243
ABCC5,0.430885,0.988469,-0.617515,-0.851815,-0.762765,-0.806991,-0.460982,-0.416399,-0.401315,-0.600982,...,-0.612189,1.197562,-0.605189,-0.966489,0.0,0.0,0.0,0.695906,0.695906,0.695906
ABCF1,0.869589,0.250189,0.483856,0.259573,0.420089,0.115573,0.727923,0.141889,-0.109810,0.062656,...,0.028747,0.226997,0.001048,0.286047,0.0,0.0,0.0,0.000000,0.000000,0.000000
ABCF3,-0.049203,-0.110871,0.241329,-0.277354,-0.815970,0.044405,-0.292220,-0.077737,-0.243837,-0.102671,...,-0.238903,-0.174853,-0.239252,-0.088803,0.0,0.0,0.0,0.404739,0.404739,0.404739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF395,1.322245,0.406061,-0.584673,1.496211,2.218262,-0.601197,0.650295,-0.629755,0.837345,-0.014839,...,0.903966,0.226066,0.694516,0.293466,0.0,0.0,0.0,0.000000,0.000000,0.000000
ZNF451,-0.331278,-0.115078,0.114506,0.072972,-0.202944,-0.456144,-0.127895,0.128838,0.231789,0.071722,...,0.123271,-0.477355,0.139670,-0.051455,0.0,0.0,0.0,-0.327347,-0.327347,-0.327347
ZNF586,-0.805566,-0.388083,-0.484466,-0.304266,-0.467732,-0.605591,-0.890166,-0.414350,-0.539833,-0.558366,...,0.339700,0.567650,0.171450,0.292250,0.0,0.0,0.0,-0.556124,-0.556124,-0.556124
ZNF589,-0.300999,-0.058116,-0.237699,0.468434,0.532901,-0.904574,0.014768,0.084234,0.139318,0.111168,...,0.872654,-0.904821,-0.011846,0.653454,0.0,0.0,0.0,0.406653,0.406653,0.406653


In [12]:
# Column labels of the combined data matrix, in column order.
exp_ids = Data_df.columns.to_list()
n_experiments = len(exp_ids)
print("Experiments ids list to modeling: ", n_experiments)

Experiments ids list to modeling:  435


## Inhibitor concentrations, IC50, and perturbation matrices

In [13]:
# Both matrices have one row per module and one column per experiment.
matrix_shape = (n_modules, n_experiments)
inhib_conc_matrix = np.zeros(matrix_shape)  # concentrations default to 0
ic50_matrix = np.ones(matrix_shape)         # IC50 values default to 1

In [14]:
# Fill both matrices: row = module position, column = experiment position in exp_ids.
for row, module in enumerate(modules):
    belongs_to_module = IC50_df.Module == module
    for drug in IC50_df.Drug[belongs_to_module].tolist():
        # IC50 of this drug; exactly one row of the IC50 table must match
        drug_ic50 = IC50_df.IC50[IC50_df.Drug == drug].values
        print(drug, drug_ic50)
        assert drug_ic50.size == 1
        # all signatures in the metadata that used this drug
        treated_exp_ids = sig_info_df.index[sig_info_df.pert_drug == drug].tolist()
        print(len(treated_exp_ids), treated_exp_ids)
        for exp_id in treated_exp_ids:
            col = exp_ids.index(exp_id)
            # dose of this signature; exactly one metadata row must match
            dose = sig_info_df.dose_float[sig_info_df.index == exp_id].values
            assert dose.size == 1
            # write dose and IC50 into the (module, experiment) cell
            inhib_conc_matrix[row, col] = dose.item()
            ic50_matrix[row, col] = drug_ic50.item()

CGP-60474 [0.01]
24 ['LJP007_HUVEC_24H:C19', 'LJP007_HUVEC_24H:C20', 'LJP007_HUVEC_24H:C21', 'LJP007_HUVEC_24H:C23', 'LJP007_HUVEC_24H:C24', 'LJP008_HUVEC.A_24H:C19', 'LJP008_HUVEC.A_24H:C20', 'LJP008_HUVEC.A_24H:C21', 'LJP008_HUVEC.A_24H:C22', 'LJP008_HUVEC.A_24H:C23', 'LJP008_HUVEC.A_24H:C24', 'LJP008_HUVEC_24H:C20', 'LJP008_HUVEC_24H:C24', 'LJP009_HUVEC.A_24H:C19', 'LJP009_HUVEC.A_24H:C20', 'LJP009_HUVEC.A_24H:C21', 'LJP009_HUVEC.A_24H:C22', 'LJP009_HUVEC.A_24H:C23', 'LJP009_HUVEC.A_24H:C24', 'LJP009_HUVEC_24H:C19', 'LJP009_HUVEC_24H:C20', 'LJP009_HUVEC_24H:C22', 'LJP009_HUVEC_24H:C23', 'LJP009_HUVEC_24H:C24']
dinaciclib [0.004]
3 ['LJP007_HUVEC_24H:E21', 'LJP007_HUVEC_24H:E22', 'LJP007_HUVEC_24H:E24']
PHA-767491 [0.25]
6 ['LJP007_HUVEC_24H:J01', 'LJP007_HUVEC_24H:J02', 'LJP007_HUVEC_24H:J03', 'LJP007_HUVEC_24H:J04', 'LJP007_HUVEC_24H:J05', 'LJP007_HUVEC_24H:J06']
roscovitine [2.]
33 ['REP.A011_HUVEC.A_24H:O07', 'REP.A011_HUVEC.A_24H:O09', 'REP.A011_HUVEC.A_24H:O10', 'REP.A011_HUVEC

In [15]:
# Wrap the raw arrays in labelled frames (rows: modules, columns: experiment ids) for export.
axis_labels = {"index": modules, "columns": exp_ids}
inhib_conc_df = pd.DataFrame(inhib_conc_matrix, **axis_labels)
ic50_df = pd.DataFrame(ic50_matrix, **axis_labels)

display(ic50_df)
display(inhib_conc_df)

Unnamed: 0,LJP007_HUVEC_24H:B19,LJP007_HUVEC_24H:B20,LJP007_HUVEC_24H:B21,LJP007_HUVEC_24H:B22,LJP007_HUVEC_24H:B23,LJP007_HUVEC_24H:B24,LJP007_HUVEC_24H:C01,LJP007_HUVEC_24H:C02,LJP007_HUVEC_24H:C03,LJP007_HUVEC_24H:C04,...,REP.B011_HUVEC.A_24H:O09_2,REP.B011_HUVEC.A_24H:O10_2,REP.B011_HUVEC.A_24H:O11_2,REP.B011_HUVEC.A_24H:O12_2,siCDK2_STAT_vs_CTL_STAT,siCDK2_STAT_vs_CTL_STAT.1,siCDK2_STAT_vs_CTL_STAT.2,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.1,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.2
CDK1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CDK2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
PAK,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
PI3K,0.5,0.5,0.5,0.5,0.5,0.5,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ERK,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
BET,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Aurora,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
TGFbR,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
VEGFR,1.0,1.0,1.0,1.0,1.0,1.0,0.01,0.01,0.01,0.01,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
PKC,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,LJP007_HUVEC_24H:B19,LJP007_HUVEC_24H:B20,LJP007_HUVEC_24H:B21,LJP007_HUVEC_24H:B22,LJP007_HUVEC_24H:B23,LJP007_HUVEC_24H:B24,LJP007_HUVEC_24H:C01,LJP007_HUVEC_24H:C02,LJP007_HUVEC_24H:C03,LJP007_HUVEC_24H:C04,...,REP.B011_HUVEC.A_24H:O09_2,REP.B011_HUVEC.A_24H:O10_2,REP.B011_HUVEC.A_24H:O11_2,REP.B011_HUVEC.A_24H:O12_2,siCDK2_STAT_vs_CTL_STAT,siCDK2_STAT_vs_CTL_STAT.1,siCDK2_STAT_vs_CTL_STAT.2,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.1,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.2
CDK1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CDK2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.08,0.03,0.01,0.0,0.0,0.0,0.0,0.0,0.0
PAK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PI3K,10.0,3.33,1.11,0.37,0.12,0.04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BET,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aurora,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TGFbR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VEGFR,0.0,0.0,0.0,0.0,0.0,0.0,10.0,3.33,1.11,0.37,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PKC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# create binary perturbation matrix: 1 wherever a non-zero inhibitor concentration is recorded
pert_df = pd.DataFrame(
    np.where(inhib_conc_matrix != 0, 1, 0),
    index = inhib_conc_df.index,
    columns = inhib_conc_df.columns,
)

# The added siCDK2 experiments carry no inhibitor concentration, so they are flagged
# as CDK2 perturbations explicitly. Rows and columns are selected by label rather than
# by position, so the result does not depend on the module order or on how many
# LINCS experiments precede the added ones.
# Guard first: assigning through .loc with an unknown row label would silently add a new row.
assert "CDK2" in pert_df.index, "CDK2 module missing from perturbation matrix"
pert_df.loc["CDK2", experiments_add_df.columns] = 1

display(pert_df)

Unnamed: 0,LJP007_HUVEC_24H:B19,LJP007_HUVEC_24H:B20,LJP007_HUVEC_24H:B21,LJP007_HUVEC_24H:B22,LJP007_HUVEC_24H:B23,LJP007_HUVEC_24H:B24,LJP007_HUVEC_24H:C01,LJP007_HUVEC_24H:C02,LJP007_HUVEC_24H:C03,LJP007_HUVEC_24H:C04,...,REP.B011_HUVEC.A_24H:O09_2,REP.B011_HUVEC.A_24H:O10_2,REP.B011_HUVEC.A_24H:O11_2,REP.B011_HUVEC.A_24H:O12_2,siCDK2_STAT_vs_CTL_STAT,siCDK2_STAT_vs_CTL_STAT.1,siCDK2_STAT_vs_CTL_STAT.2,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.1,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.2
CDK1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CDK2,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
PAK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PI3K,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ERK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BET,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Aurora,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TGFbR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
VEGFR,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
PKC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Global responses for DPD modules

In [17]:
# load STV data frame; the first column of the sheet is used as the index
STVs = pd.read_excel(os.path.join(data_dir, "ALL_DATA_2020.xlsx"), sheet_name = "STVs", index_col = 0)

# Expand to the row labels of Data_df: rows that are absent from the sheet stay at 0.
# The frame is sized from the sheet's own columns instead of a hard-coded count,
# so the number of STV columns in the sheet can change without touching this cell.
STV_df = pd.DataFrame(0.0, index = Data_df.index, columns = STVs.columns)
STV_df.loc[STVs.index] = STVs

display(STV_df)

Unnamed: 0,remod,FSS,OSS
AARS,0.000000,0.000000,0.000000
ABCB6,-0.083943,-0.012487,-0.014844
ABCC5,-0.014500,0.000000,-0.024863
ABCF1,0.000000,0.000000,0.000000
ABCF3,0.019767,0.000000,0.005727
...,...,...,...
ZNF395,0.006993,-0.044617,0.032520
ZNF451,-0.016303,0.022245,-0.017704
ZNF586,-0.027161,0.000000,-0.007869
ZNF589,-0.013882,0.015627,-0.007826


In [18]:
# Project every experiment onto every STV column in a single matrix product.
# Entry (experiment, state) is the dot product of that experiment's column of Data_df
# with that state's column of STV_df, matched by row position.
# This replaces a nested loop that re-transposed Data_df for every single entry;
# values agree with the loop up to floating-point rounding.
DPD_df = pd.DataFrame(
    Data_df.to_numpy().T @ STV_df.to_numpy(),
    index = Data_df.columns,
    columns = STV_df.columns,
)

display(DPD_df)

Unnamed: 0,remod,FSS,OSS
LJP007_HUVEC_24H:B19,-0.024074,-1.740634,0.720341
LJP007_HUVEC_24H:B20,0.397650,0.423876,0.494894
LJP007_HUVEC_24H:B21,0.772521,0.793323,0.525337
LJP007_HUVEC_24H:B22,0.111395,0.588123,0.158974
LJP007_HUVEC_24H:B23,-0.014829,0.324496,0.340764
...,...,...,...
siCDK2_STAT_vs_CTL_STAT.1,0.265222,-0.761037,-0.202065
siCDK2_STAT_vs_CTL_STAT.2,0.265222,-0.761037,-0.202065
siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT,1.324112,-4.195989,9.271936
siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.1,1.324112,-4.195989,9.271936


In [19]:
# Transpose DPD_df so its columns become rows; this orientation is stored as R_global.
R_global_DPD_df = DPD_df.transpose()
display(R_global_DPD_df)

Unnamed: 0,LJP007_HUVEC_24H:B19,LJP007_HUVEC_24H:B20,LJP007_HUVEC_24H:B21,LJP007_HUVEC_24H:B22,LJP007_HUVEC_24H:B23,LJP007_HUVEC_24H:B24,LJP007_HUVEC_24H:C01,LJP007_HUVEC_24H:C02,LJP007_HUVEC_24H:C03,LJP007_HUVEC_24H:C04,...,REP.B011_HUVEC.A_24H:O09_2,REP.B011_HUVEC.A_24H:O10_2,REP.B011_HUVEC.A_24H:O11_2,REP.B011_HUVEC.A_24H:O12_2,siCDK2_STAT_vs_CTL_STAT,siCDK2_STAT_vs_CTL_STAT.1,siCDK2_STAT_vs_CTL_STAT.2,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.1,siCDK2_PSS_vs_CTL_STAT_ADJ_CTL_PSS_vs_CTL_STAT.2
remod,-0.024074,0.39765,0.772521,0.111395,-0.014829,-0.14277,1.53228,-0.138874,0.492911,-0.047575,...,0.379346,0.97792,0.32694,-0.311052,0.265222,0.265222,0.265222,1.324112,1.324112,1.324112
FSS,-1.740634,0.423876,0.793323,0.588123,0.324496,-0.281456,-3.035403,-0.809173,-0.345464,0.128354,...,-1.038011,-0.005622,0.445622,0.061587,-0.761037,-0.761037,-0.761037,-4.195989,-4.195989,-4.195989
OSS,0.720341,0.494894,0.525337,0.158974,0.340764,0.926952,-2.3472,-0.06386,-0.314658,-0.311412,...,0.246457,0.243745,-0.786478,0.751844,-0.202065,-0.202065,-0.202065,9.271936,9.271936,9.271936


## Save outputs

In [20]:
# Gather the label lists and their sizes into a single dictionary.
all_metadata = dict(
    modules=modules,
    n_modules=n_modules,
    drugs=drugs,
    n_drugs=n_drugs,
    genes=genes,
    n_genes=n_genes,
    exp_ids=exp_ids,
    n_experiments=n_experiments,
)

print(all_metadata)

# Write the dictionary to the output folder with the highest available pickle protocol.
metadata_path = os.path.join(out_dir, "metadata.pickle")
with open(metadata_path, "wb") as handle:
    pickle.dump(all_metadata, handle, protocol=pickle.HIGHEST_PROTOCOL)

{'modules': ['CDK1', 'CDK2', 'PAK', 'PI3K', 'ERK', 'BET', 'Aurora', 'TGFbR', 'VEGFR', 'PKC'], 'n_modules': 10, 'drugs': ['CGP-60474', 'dinaciclib', 'PHA-767491', 'roscovitine', 'PF-03758309', 'PF-04691502', 'XL-147', 'GSK-2334470', 'A-66', 'BGT-226', 'BX-795', 'AZ-628', 'FR-180204', 'GDC-0879', 'GW-5074', 'PD-0325901', 'PD-184352', 'TAK-733', 'dabrafenib', 'refametinib', 'RAF-265', 'I-BET-762', 'I-BET-151', 'JQ-1-R', 'PFI-1', 'GSK-1070916', 'MLN-8054', 'ZM-447439', 'alisertib', 'SB-525334', 'rebastinib', 'lenvatinib', 'orantinib', 'foretinib', 'cediranib', 'sunitinib', 'tivozanib', 'enzastaurin'], 'n_drugs': 38, 'genes': ['AARS', 'ABCB6', 'ABCC5', 'ABCF1', 'ABCF3', 'ABHD4', 'ABHD6', 'ABL1', 'ACAA1', 'ACAT2', 'ACBD3', 'ACD', 'ACLY', 'ACOT9', 'ADAM10', 'ADAT1', 'ADGRE5', 'ADGRG1', 'ADH5', 'ADI1', 'ADO', 'ADRB2', 'AGL', 'AKAP8', 'AKAP8L', 'AKR7A2', 'AKT1', 'ALAS1', 'ALDH7A1', 'ALDOA', 'ALDOC', 'AMDHD2', 'ANKRD10', 'ANO10', 'ANXA7', 'APBB2', 'APOE', 'APP', 'APPBP2', 'ARFIP2', 'ARHGAP1', 'A

In [21]:
# Export concentrations, IC50 values and the perturbation matrix, each to its own CSV file.
for frame, filename in (
    (inhib_conc_df, "inhib_conc_annotated.csv"),
    (ic50_df, "ic50_annotated.csv"),
    (pert_df, "pert_annotated.csv"),
):
    frame.to_csv(os.path.join(out_dir, filename))

In [22]:
# Export the combined data matrix.
Data_df.to_csv(path_or_buf=os.path.join(out_dir, "DATA.csv"))

In [23]:
# Export the DPD-only R_global matrix.
R_global_DPD_df.to_csv(path_or_buf=os.path.join(out_dir, "R_global_DPDonly_annotated.csv"))