In [2]:
import pandas as pd
from scipy.stats import pearsonr
import pathlib
import sys
import numpy as np

sys.path.append("../")
from utils import load_utils

In [3]:
# Load PRISM data
top_dir = "../5.drug-dependency"
data_dir = "data"

prism_df, prism_cell_df, prism_trt_df = load_utils.load_prism(
    top_dir=top_dir,
    data_dir=data_dir,
    secondary_screen=False,
    load_cell_info=True,
    load_treatment_info=True,
)

# Reset the index and name it ModelID
prism_df.reset_index(inplace=True)
prism_df.rename(columns={'index': 'ModelID'}, inplace=True)

# Check the result
print(prism_df.shape)
prism_df.head(3)


(578, 4687)


Unnamed: 0,ModelID,BRD-A00077618-236-07-6::2.5::HTS,BRD-A00100033-001-08-9::2.5::HTS,BRD-A00147595-001-01-5::2.5::HTS,BRD-A00218260-001-03-4::2.5::HTS,BRD-A00376169-001-01-6::2.5::HTS,BRD-A00520476-001-07-4::2.5::HTS,BRD-A00546892-001-02-6::2.5::HTS,BRD-A00578795-001-04-3::2.5::HTS,BRD-A00758722-001-04-9::2.5::HTS,...,BRD-K98557884-001-01-6::2.5::MTS004,BRD-K99077012-001-01-9::2.332734192::MTS004,BRD-K99199077-001-16-1::2.603211317::MTS004,BRD-K99431849-001-01-7::2.500018158::MTS004,BRD-K99447003-335-04-1::2.37737659::MTS004,BRD-K99506538-001-03-8::2.5::MTS004,BRD-K99616396-001-05-1::2.499991421::MTS004,BRD-K99879819-001-02-1::2.5187366::MTS004,BRD-K99919177-001-01-3::2.5::MTS004,BRD-M63173034-001-03-6::2.64076472::MTS004
0,ACH-000001,-0.015577,-0.449332,0.489379,0.206675,0.27273,0.021036,-0.02546,0.467158,-0.736306,...,0.429238,0.204841,0.150055,-0.575404,-0.101247,0.399233,-0.127658,-0.141651,-1.153652,0.510464
1,ACH-000007,-0.09573,0.257943,0.772349,-0.438502,-0.732832,0.779201,0.426523,-1.288508,-0.476133,...,-0.471486,0.212998,-0.12323,0.625527,0.383198,0.212031,0.349225,-0.387439,-0.831461,0.323558
2,ACH-000008,0.37948,-0.596132,0.548056,0.422269,-0.216986,0.081866,0.145335,-0.570841,-0.512119,...,-0.111951,0.534787,0.206642,-0.410153,-0.560722,-0.036088,0.158071,0.171043,-3.94709,0.09931


In [4]:
#Load reactome pathways
pathway_dir = pathlib.Path("../3.analysis/results/significant_gsea_results.parquet.gz").resolve()
pathway_df = pd.read_parquet(pathway_dir)
pathway_df.head()

Unnamed: 0,Term,es,nes,pval,sidak,fdr,geneset_size,leading_edge,z_dim,source
0,mRNA Splicing R-HSA-72172,-0.654698,-7.429723,1.088256e-13,1.12961e-10,5.853937e-11,109,"DHX9,DHX38,DHX15,CDC5L,PCF11,PLRG1,LSM7,PRPF6,...",z_1,real
1,mRNA Splicing - Major Pathway R-HSA-72163,-0.676886,-7.371983,1.681083e-13,1.744964e-10,5.853937e-11,104,"DHX9,DHX38,DHX15,CDC5L,PCF11,PLRG1,LSM7,PRPF6,...",z_1,real
2,Processing Of Capped Intron-Containing Pre-mRN...,-0.60507,-7.371129,1.691889e-13,1.756181e-10,5.853937e-11,141,"DHX9,DHX38,DHX15,CDC5L,PCF11,PLRG1,LSM7,PRPF6,...",z_1,real
13,APC/C-mediated Degradation Of Cell Cycle Prote...,-0.723782,-6.112642,9.799483e-10,1.017186e-06,7.265617e-08,61,"PSMB3,ANAPC4,CDK1,PSMB1,RB1,PSMB5,PSMD11,PSMD1...",z_1,real
14,Antigen Processing: Ubiquitination And Proteas...,-0.585487,-6.063218,1.334247e-09,1.384948e-06,9.232992e-08,101,"PSMB3,LRR1,PSMD9,ANAPC4,PSMB1,KEAP1,HECTD1,PSM...",z_1,real


In [5]:
# load the latent dim matrix 
latent_dir = pathlib.Path("../2.train-VAE/results/latent_df.parquet").resolve()
latent_df = pd.read_parquet(latent_dir)
latent_df.head()

Unnamed: 0,ModelID,0,1,2,3,4,5,6,7,8,...,26,27,28,29,30,31,32,33,34,35
0,ACH-000748,0.0,0.0,2.931751,1.300346,0.968795,0.0,0.079798,2.580413,0.24233,...,1.407413,0.0,0.0,0.0,0.0,0.68958,1.444408,0.17097,3.589293,1.664138
1,ACH-000645,0.0,0.0,1.865084,0.886528,0.897159,0.0,0.156185,2.27979,0.061815,...,0.817676,0.0,0.0,0.0,0.0,0.0,1.385956,0.009952,1.823041,1.082538
2,ACH-001301,0.0,0.0,2.396594,1.361056,0.183786,0.0,0.0,2.563383,0.0,...,0.618336,0.0,0.0,0.0,0.0,0.823913,1.231781,0.0,2.180012,0.911842
3,ACH-000361,0.0,0.0,2.906988,1.973957,0.100008,0.0,0.0,2.699383,0.0,...,0.911914,0.0,0.0,0.0,0.0,1.754643,1.337547,0.0,2.75127,1.274395
4,ACH-000325,0.0,0.0,3.764669,3.333713,0.0,0.0,0.0,4.411053,0.0,...,0.045819,0.0,0.0,0.0,0.0,3.796674,1.755739,0.0,2.519099,0.682146


In [6]:
# Ensure ModelID is the index for both dataframes to align 
latent_df.set_index('ModelID', inplace=True)
prism_df.set_index('ModelID', inplace=True)
latent_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
ModelID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-000748,0.0,0.0,2.931751,1.300346,0.968795,0.0,0.079798,2.580413,0.24233,3.798377,...,1.407413,0.0,0.0,0.0,0.0,0.68958,1.444408,0.17097,3.589293,1.664138
ACH-000645,0.0,0.0,1.865084,0.886528,0.897159,0.0,0.156185,2.27979,0.061815,3.558936,...,0.817676,0.0,0.0,0.0,0.0,0.0,1.385956,0.009952,1.823041,1.082538
ACH-001301,0.0,0.0,2.396594,1.361056,0.183786,0.0,0.0,2.563383,0.0,3.603009,...,0.618336,0.0,0.0,0.0,0.0,0.823913,1.231781,0.0,2.180012,0.911842
ACH-000361,0.0,0.0,2.906988,1.973957,0.100008,0.0,0.0,2.699383,0.0,3.965211,...,0.911914,0.0,0.0,0.0,0.0,1.754643,1.337547,0.0,2.75127,1.274395
ACH-000325,0.0,0.0,3.764669,3.333713,0.0,0.0,0.0,4.411053,0.0,6.069576,...,0.045819,0.0,0.0,0.0,0.0,3.796674,1.755739,0.0,2.519099,0.682146


In [7]:
# Align both dataframes based on the ModelID
common_model_ids = latent_df.index.intersection(prism_df.index)

In [8]:
# Filter both dataframes to keep only common ModelIDs
latent_df_filtered = latent_df.loc[common_model_ids]
prism_df_filtered = prism_df.loc[common_model_ids]

In [9]:
# Check the variance of each latent dimension and drug response column
latent_variance = latent_df_filtered.var()
prism_variance = prism_df_filtered.var()

# Filter out constant columns (variance == 0)
latent_df_filtered = latent_df_filtered.loc[:, latent_variance != 0]
prism_df_filtered = prism_df_filtered.loc[:, prism_variance != 0]

In [10]:
latent_df_filtered.head()

Unnamed: 0_level_0,1,2,3,4,6,7,8,9,11,12,...,24,25,26,27,28,31,32,33,34,35
ModelID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-000748,0.0,2.931751,1.300346,0.968795,0.079798,2.580413,0.24233,3.798377,4.988098,0.0,...,0.0,4.071278,1.407413,0.0,0.0,0.68958,1.444408,0.17097,3.589293,1.664138
ACH-000645,0.0,1.865084,0.886528,0.897159,0.156185,2.27979,0.061815,3.558936,3.101457,0.0,...,0.0,2.596276,0.817676,0.0,0.0,0.0,1.385956,0.009952,1.823041,1.082538
ACH-000361,0.0,2.906988,1.973957,0.100008,0.0,2.699383,0.0,3.965211,3.971841,0.0,...,0.0,3.320359,0.911914,0.0,0.0,1.754643,1.337547,0.0,2.75127,1.274395
ACH-000500,0.0,2.159487,1.75294,0.288381,0.0,2.658966,0.0,3.791974,3.448829,0.0,...,0.0,2.962499,0.582971,0.0,0.0,1.086582,1.592717,0.0,2.102342,0.958153
ACH-000305,0.0,2.629987,1.629254,0.0,0.0,3.175623,0.0,4.790333,3.812537,0.0,...,0.0,3.345766,0.533255,0.0,0.0,0.751871,1.744751,0.0,2.371026,1.104819


In [11]:
prism_df_filtered.head()

Unnamed: 0_level_0,BRD-A00077618-236-07-6::2.5::HTS,BRD-A00100033-001-08-9::2.5::HTS,BRD-A00147595-001-01-5::2.5::HTS,BRD-A00218260-001-03-4::2.5::HTS,BRD-A00376169-001-01-6::2.5::HTS,BRD-A00520476-001-07-4::2.5::HTS,BRD-A00546892-001-02-6::2.5::HTS,BRD-A00578795-001-04-3::2.5::HTS,BRD-A00758722-001-04-9::2.5::HTS,BRD-A00827783-001-24-6::2.5::HTS,...,BRD-K98557884-001-01-6::2.5::MTS004,BRD-K99077012-001-01-9::2.332734192::MTS004,BRD-K99199077-001-16-1::2.603211317::MTS004,BRD-K99431849-001-01-7::2.500018158::MTS004,BRD-K99447003-335-04-1::2.37737659::MTS004,BRD-K99506538-001-03-8::2.5::MTS004,BRD-K99616396-001-05-1::2.499991421::MTS004,BRD-K99879819-001-02-1::2.5187366::MTS004,BRD-K99919177-001-01-3::2.5::MTS004,BRD-M63173034-001-03-6::2.64076472::MTS004
ModelID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-000748,0.040994,0.043226,0.608362,0.621063,0.433496,0.10563,0.640146,0.174177,-0.578185,0.169207,...,-0.194259,0.127125,-0.710559,-0.51799,0.428975,0.288549,-0.331999,-0.129989,-1.289889,-0.12792
ACH-000645,0.132239,-0.017053,-0.453666,0.717919,-0.933121,0.562395,0.032053,-0.860867,-0.767628,-0.413682,...,0.500183,-0.111305,-0.143309,0.249487,0.130374,-0.222888,0.453176,-0.150609,-3.006035,0.012878
ACH-000361,-0.604188,-0.284897,-0.55033,0.025452,-0.279372,0.391857,-0.050825,-0.587276,-0.403876,0.351209,...,0.328679,-0.471446,-0.448718,-0.02685,-0.044571,-0.130468,0.920305,-1.593728,-3.038775,-0.620405
ACH-000500,-0.084264,0.299285,-0.030174,0.455913,0.366639,0.356831,-0.427765,-1.202408,-0.729004,0.373132,...,0.179556,0.055173,-0.333961,-0.143009,-0.40688,-0.129199,-0.006238,-0.592128,-3.924667,-0.120793
ACH-000305,0.594885,-0.26372,-0.259307,0.376236,0.369383,-0.374505,-0.255021,0.466388,-0.37355,0.459996,...,0.562821,-0.023984,0.000466,-0.920437,0.135617,0.635887,-0.594428,-0.128707,-3.178385,0.161976


In [12]:
# Create a dataframe to store the Pearson correlation results
correlation_results = []

# Iterate over each latent dimension and drug column
for latent_col in latent_df_filtered.columns:
    for drug_col in prism_df_filtered.columns:
        latent_values = latent_df_filtered[latent_col]
        drug_values = prism_df_filtered[drug_col]

        # Check if either column is constant
        if latent_values.nunique() <= 1 or drug_values.nunique() <= 1:
            corr = np.nan
        else:
            # Drop missing values for both columns
            valid_data = pd.concat([latent_values, drug_values], axis=1).dropna()
            latent_values_valid = valid_data[latent_col]
            drug_values_valid = valid_data[drug_col]

            if len(latent_values_valid) > 1 and len(drug_values_valid) > 1:
                # Calculate Pearson correlation
                corr, _ = pearsonr(latent_values_valid, drug_values_valid)
            else:
                corr = np.nan
                print("nan")
        
        # Store the result
        correlation_results.append({
            'latent_dimension': latent_col,
            'drug': drug_col,
            'correlation': corr
        })

# Convert the results to a dataframe for easier analysis
correlation_df = pd.DataFrame(correlation_results)

# Display the correlation dataframe
correlation_df.sort_values(by='correlation', key=abs, ascending=False).head(50)


  corr, _ = pearsonr(latent_values_valid, drug_values_valid)


Unnamed: 0,latent_dimension,drug,correlation
111864,32,BRD-K98572433-001-02-9::2.5::HTS,-0.424845
112240,32,BRD-K50010139-001-02-3::2.5::MTS004,-0.41208
110728,32,BRD-K64052750-001-17-5::2.5::HTS,-0.396887
109458,32,BRD-K26603252-003-04-9::2.5::HTS,-0.3937
110934,32,BRD-K70301465-001-02-6::2.5::HTS,-0.393304
111035,32,BRD-K73309154-003-02-8::2.5::HTS,-0.392537
111126,32,BRD-K76239644-001-01-8::2.5::HTS,-0.384671
110108,32,BRD-K46386702-001-02-1::2.5::HTS,-0.373803
110046,32,BRD-K44844162-001-01-6::2.5::HTS,-0.373797
109346,32,BRD-K23190681-001-01-1::2.5::HTS,-0.371871


In [17]:
#Sort pathways by NES score (ascending order)
pathway_df.sort_values(by='nes', ascending=True)

ranked_gsea = pathway_df.sort_values(by='nes', key=abs, ascending=False)

#Group by 'z_dim' and aggregate 'Term' into a list of associated pathways
grouped_pathway_df = ranked_gsea.groupby('z_dim').apply(lambda x: x.nlargest(10, 'nes')['Term'].tolist()).reset_index(drop=False)

# remove z_
grouped_pathway_df['z_dim'] = grouped_pathway_df['z_dim'].str.replace('z_', '', regex=False)

grouped_pathway_df.columns = ['latent dimension', 'Associated Pathways']

grouped_pathway_df.head()


  grouped_pathway_df = ranked_gsea.groupby('z_dim').apply(lambda x: x.nlargest(10, 'nes')['Term'].tolist()).reset_index(drop=False)


Unnamed: 0,latent dimension,Associated Pathways
0,1,"[rRNA Processing R-HSA-72312, rRNA Processing ..."
1,10,[Formation Of ATP By Chemiosmotic Coupling R-H...
2,11,[rRNA Processing In Nucleus And Cytosol R-HSA-...
3,12,"[S Phase R-HSA-69242, DNA Replication R-HSA-69..."
4,13,"[Leishmania Infection R-HSA-9658195, Fcgamma R..."


In [18]:
# Assuming 'drug_column_name' is the column in prism_trt_df that matches the 'drug' column in correlation_df
prism_trt_df_filtered = prism_trt_df[['column_name', 'name', 'moa', 'target']]

# Merge correlation_df with prism_trt_df based on the 'drug' column in correlation_df and the matching column in prism_trt_df
correlation_df_merged1 = pd.merge(correlation_df, prism_trt_df_filtered, how='left', left_on='drug', right_on='column_name')

# Drop the redundant drug_column_name column after the merge if needed
correlation_df_merged1 = correlation_df_merged1.drop(columns=['column_name'])

# Merge correlation_df with prism_trt_df based on the 'drug' column in correlation_df and the matching column in prism_trt_df
correlation_df_merged = pd.merge(correlation_df_merged1, grouped_pathway_df, how='left', left_on='latent_dimension', right_on='latent dimension')

# Drop the redundant drug_column_name column after the merge if needed
correlation_df_merged = correlation_df_merged.drop(columns=['latent dimension'])

significant_corr_df = correlation_df_merged[
    (correlation_df_merged['correlation'].abs() > 0.1)
]
# saving results as single output file
correlation_dir = pathlib.Path("./results/drug_correlation.csv")
significant_corr_df.to_csv(correlation_dir)

# Display the updated dataframe with the new columns
correlation_df_merged.sort_values(by='correlation', key=abs, ascending=False).head(50)


Unnamed: 0,latent_dimension,drug,correlation,name,moa,target,Associated Pathways
111864,32,BRD-K98572433-001-02-9::2.5::HTS,-0.424845,AZD8931,EGFR inhibitor,"EGFR, ERBB2, ERBB3","[Cilium Assembly R-HSA-5617833, Regulation Of ..."
112240,32,BRD-K50010139-001-02-3::2.5::MTS004,-0.41208,poziotinib,EGFR inhibitor,"EGFR, ERBB2, ERBB4","[Cilium Assembly R-HSA-5617833, Regulation Of ..."
110728,32,BRD-K64052750-001-17-5::2.5::HTS,-0.396887,gefitinib,EGFR inhibitor,EGFR,"[Cilium Assembly R-HSA-5617833, Regulation Of ..."
109458,32,BRD-K26603252-003-04-9::2.5::HTS,-0.3937,PD-153035,EGFR inhibitor,"EGFR, KDR","[Cilium Assembly R-HSA-5617833, Regulation Of ..."
110934,32,BRD-K70301465-001-02-6::2.5::HTS,-0.393304,ibrutinib,Bruton's tyrosine kinase (BTK) inhibitor,"BLK, BMX, BTK","[Cilium Assembly R-HSA-5617833, Regulation Of ..."
111035,32,BRD-K73309154-003-02-8::2.5::HTS,-0.392537,OSI-420,EGFR inhibitor,EGFR,"[Cilium Assembly R-HSA-5617833, Regulation Of ..."
111126,32,BRD-K76239644-001-01-8::2.5::HTS,-0.384671,BMS-690514,"EGFR inhibitor, VEGFR inhibitor","EGFR, ERBB2, FLT3, KDR","[Cilium Assembly R-HSA-5617833, Regulation Of ..."
110108,32,BRD-K46386702-001-02-1::2.5::HTS,-0.373803,ARRY-334543,EGFR inhibitor,ERBB2,"[Cilium Assembly R-HSA-5617833, Regulation Of ..."
110046,32,BRD-K44844162-001-01-6::2.5::HTS,-0.373797,taselisib,PI3K inhibitor,PIK3CA,"[Cilium Assembly R-HSA-5617833, Regulation Of ..."
109346,32,BRD-K23190681-001-01-1::2.5::HTS,-0.371871,AV-412,protein tyrosine kinase inhibitor,"EGFR, ERBB2","[Cilium Assembly R-HSA-5617833, Regulation Of ..."
