In [4]:
import pandas as pd
from scipy.stats import pearsonr
import pathlib
import sys
import numpy as np

sys.path.append("../")
from utils import load_utils

In [5]:
# Read the PRISM matrix plus its cell-line and treatment tables via the project loader
top_dir = "../5.drug-dependency"
data_dir = "data"

prism_df, prism_cell_df, prism_trt_df = load_utils.load_prism(
    top_dir=top_dir,
    data_dir=data_dir,
    secondary_screen=False,
    load_cell_info=True,
    load_treatment_info=True,
)

# Turn the row index into an ordinary column; an unnamed index is emitted
# as "index", which is then relabelled ModelID
prism_df = prism_df.reset_index().rename(columns={'index': 'ModelID'})

# Report the dimensions and preview the first three rows
print(prism_df.shape)
prism_df.head(3)


(578, 4687)


Unnamed: 0,ModelID,BRD-A00077618-236-07-6::2.5::HTS,BRD-A00100033-001-08-9::2.5::HTS,BRD-A00147595-001-01-5::2.5::HTS,BRD-A00218260-001-03-4::2.5::HTS,BRD-A00376169-001-01-6::2.5::HTS,BRD-A00520476-001-07-4::2.5::HTS,BRD-A00546892-001-02-6::2.5::HTS,BRD-A00578795-001-04-3::2.5::HTS,BRD-A00758722-001-04-9::2.5::HTS,...,BRD-K98557884-001-01-6::2.5::MTS004,BRD-K99077012-001-01-9::2.332734192::MTS004,BRD-K99199077-001-16-1::2.603211317::MTS004,BRD-K99431849-001-01-7::2.500018158::MTS004,BRD-K99447003-335-04-1::2.37737659::MTS004,BRD-K99506538-001-03-8::2.5::MTS004,BRD-K99616396-001-05-1::2.499991421::MTS004,BRD-K99879819-001-02-1::2.5187366::MTS004,BRD-K99919177-001-01-3::2.5::MTS004,BRD-M63173034-001-03-6::2.64076472::MTS004
0,ACH-000001,-0.015577,-0.449332,0.489379,0.206675,0.27273,0.021036,-0.02546,0.467158,-0.736306,...,0.429238,0.204841,0.150055,-0.575404,-0.101247,0.399233,-0.127658,-0.141651,-1.153652,0.510464
1,ACH-000007,-0.09573,0.257943,0.772349,-0.438502,-0.732832,0.779201,0.426523,-1.288508,-0.476133,...,-0.471486,0.212998,-0.12323,0.625527,0.383198,0.212031,0.349225,-0.387439,-0.831461,0.323558
2,ACH-000008,0.37948,-0.596132,0.548056,0.422269,-0.216986,0.081866,0.145335,-0.570841,-0.512119,...,-0.111951,0.534787,0.206642,-0.410153,-0.560722,-0.036088,0.158071,0.171043,-3.94709,0.09931


In [6]:
# Read the significant GSEA pathway results written by the analysis step
pathway_dir = pathlib.Path("../3.analysis/results/significant_gsea_results.parquet.gz")
pathway_dir = pathway_dir.resolve()
pathway_df = pd.read_parquet(path=pathway_dir)
pathway_df.head(n=5)

Unnamed: 0,Term,es,nes,pval,sidak,fdr,geneset_size,leading_edge,z_dim,source
0,Metabolism Of Nucleotides R-HSA-15869,-0.846917,-4.694658,3e-06,0.002768,0.00213,24,"AK2,DHODH,ADSS2,DTYMK,COASY,ATIC,UMPS,DUT,GART...",z_1,real
6,Metabolism Of Vitamins And Cofactors R-HSA-196854,-0.69271,-4.179989,2.9e-05,0.029807,0.004323,31,"PPCS,MTR,PDSS1,FPGS,SHMT2,NAMPT,NMNAT1,LRP8,CO...",z_1,real
8,Metabolism Of Water-Soluble Vitamins And Cofac...,-0.82351,-4.033644,5.5e-05,0.055412,0.006334,19,"PPCS,MTR,SHMT2,FPGS,NAMPT,NMNAT1,COASY,TPK1,RF...",z_1,real
9,Leishmania Infection R-HSA-9658195,0.636204,3.94847,7.9e-05,0.0784,0.008164,34,"PTK2,GRB2,CRK,TXN2,JUN,CDC42,RELA,SUGT1,MYO10,...",z_1,real
11,Nucleotide Biosynthesis R-HSA-8956320,-0.980421,-3.831108,0.000128,0.12403,0.011035,11,"DHODH,ADSS2,COASY,ATIC,UMPS,GART,GMPS,ADSL,PAI...",z_1,real


In [7]:
# Load the latent dimension matrix
latent_dir = pathlib.Path("../2.train-VAE/results/latent_df.parquet").resolve()
latent_df = pd.read_parquet(latent_dir)

# Print the shape first so the preview can be the cell's last expression:
# a bare `.head()` in the middle of a cell is evaluated but never displayed.
print(latent_df.shape)
latent_df.head()

(958, 37)


In [8]:
# Use ModelID as the row index of both dataframes so they can be aligned on it.
# Guarding on the index name keeps this cell re-runnable: once ModelID has been
# moved into the index the column no longer exists, and a second set_index
# would raise KeyError. A frame that has no ModelID at all still fails loudly.
if latent_df.index.name != 'ModelID':
    latent_df = latent_df.set_index('ModelID')
if prism_df.index.name != 'ModelID':
    prism_df = prism_df.set_index('ModelID')
latent_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
ModelID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-000499,0.116793,0.0,3.545689,0.0,5.529771,0.0,0.0,0.0,0.0,0.0,...,0.0,4.155077,0.0,1.326112,0.0,4.327004,2.289593,2.347978,0.0,2.050914
ACH-001668,0.0,0.0,3.077717,0.0,4.795898,0.0,0.0,0.0,0.0,0.0,...,0.0,3.540287,0.0,1.125518,0.0,3.794856,2.156937,1.999025,0.0,1.775838
ACH-001211,0.153375,0.0,3.289188,0.0,4.810534,0.0,0.0,0.0,0.0,0.0,...,0.0,3.672348,0.0,1.189166,0.0,4.018786,2.493559,2.113322,0.0,2.088114
ACH-000997,0.981759,0.0,3.474591,0.0,5.523706,0.0,0.0,0.0,0.0,0.0,...,0.038345,3.702794,0.0,0.9579,0.0,4.090379,2.449086,2.523274,0.0,2.623637
ACH-001745,0.718717,0.0,2.982288,0.0,4.48022,0.0,0.0,0.0,0.0,0.0,...,0.0,3.362561,0.0,1.117164,0.0,3.605763,2.393928,2.12423,0.0,2.599002


In [9]:
# ModelIDs that appear in the index of both the latent and the PRISM dataframe
common_model_ids = latent_df.index.intersection(other=prism_df.index)

In [10]:
# Restrict each dataframe to the shared ModelIDs, selected in the order of common_model_ids
latent_df_filtered = latent_df.loc[common_model_ids, :]
prism_df_filtered = prism_df.loc[common_model_ids, :]

In [11]:
# Per-column variance of the latent dimensions and of the drug response columns
latent_variance = latent_df_filtered.var()
prism_variance = prism_df_filtered.var()

# Keep only the columns whose variance is not exactly zero (drops constant columns)
latent_df_filtered = latent_df_filtered.loc[:, latent_variance.ne(0)]
prism_df_filtered = prism_df_filtered.loc[:, prism_variance.ne(0)]

In [12]:
# Preview the latent matrix after the ModelID and variance filters
latent_df_filtered.head(n=5)

Unnamed: 0_level_0,0,2,3,4,5,8,9,11,13,14,...,24,26,27,29,30,31,32,33,34,35
ModelID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-000499,0.116793,3.545689,0.0,5.529771,0.0,0.0,0.0,1.127391,0.318939,0.245627,...,2.25422,0.0,4.155077,1.326112,0.0,4.327004,2.289593,2.347978,0.0,2.050914
ACH-001211,0.153375,3.289188,0.0,4.810534,0.0,0.0,0.0,1.084301,0.418505,0.153598,...,2.421721,0.0,3.672348,1.189166,0.0,4.018786,2.493559,2.113322,0.0,2.088114
ACH-000997,0.981759,3.474591,0.0,5.523706,0.0,0.0,0.0,1.7848,1.187676,0.466309,...,2.797282,0.038345,3.702794,0.9579,0.0,4.090379,2.449086,2.523274,0.0,2.623637
ACH-000991,0.74239,2.701471,0.0,4.14587,0.0,0.0,0.0,1.51413,0.967802,0.0,...,2.194706,0.0,2.864799,0.652992,0.0,2.979443,2.134192,1.711026,0.0,2.095009
ACH-000976,0.408425,2.288068,0.0,3.632316,0.0,0.0,0.0,1.324106,0.707384,0.0,...,1.824299,0.0,2.853092,0.639549,0.0,2.789583,1.861289,1.238621,0.0,1.944306


In [13]:
# Preview the drug response matrix after the ModelID and variance filters
prism_df_filtered.head(n=5)

Unnamed: 0_level_0,BRD-A00077618-236-07-6::2.5::HTS,BRD-A00100033-001-08-9::2.5::HTS,BRD-A00147595-001-01-5::2.5::HTS,BRD-A00218260-001-03-4::2.5::HTS,BRD-A00376169-001-01-6::2.5::HTS,BRD-A00520476-001-07-4::2.5::HTS,BRD-A00546892-001-02-6::2.5::HTS,BRD-A00578795-001-04-3::2.5::HTS,BRD-A00758722-001-04-9::2.5::HTS,BRD-A00827783-001-24-6::2.5::HTS,...,BRD-K98557884-001-01-6::2.5::MTS004,BRD-K99077012-001-01-9::2.332734192::MTS004,BRD-K99199077-001-16-1::2.603211317::MTS004,BRD-K99431849-001-01-7::2.500018158::MTS004,BRD-K99447003-335-04-1::2.37737659::MTS004,BRD-K99506538-001-03-8::2.5::MTS004,BRD-K99616396-001-05-1::2.499991421::MTS004,BRD-K99879819-001-02-1::2.5187366::MTS004,BRD-K99919177-001-01-3::2.5::MTS004,BRD-M63173034-001-03-6::2.64076472::MTS004
ModelID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-000499,0.054812,0.243269,-0.500022,-0.36086,0.162478,0.136255,0.305524,-0.069632,-0.565769,-0.134557,...,0.070188,0.212325,0.024884,-0.118871,0.063809,0.139679,-0.665313,-0.141676,-2.69336,0.307056
ACH-001211,0.574641,0.432232,-0.036033,0.33423,0.023207,0.555496,0.550459,-0.314658,-0.842464,0.707809,...,0.491068,-0.154228,-0.153672,-0.280917,-0.373173,-0.185357,0.115126,0.12048,-2.032993,-0.25549
ACH-000997,0.303937,-0.512432,0.506612,0.404795,-0.028118,0.660699,0.28222,0.284187,-1.189394,-0.006098,...,0.498206,0.046906,-0.024142,-0.047734,0.492969,-0.005077,0.156377,0.356507,-1.93771,0.6645
ACH-000991,0.987977,-0.731795,0.758242,0.154601,-0.26767,0.64587,0.569782,-0.410402,-0.529644,-0.193156,...,0.749307,-0.322657,-0.343749,-0.622262,0.330623,0.136813,0.107156,-0.136725,-1.394123,0.501798
ACH-000976,-0.117612,-0.331519,0.017683,0.446033,-0.125278,0.369145,0.153985,-0.355988,-1.139236,-0.23039,...,0.388903,-0.019051,-0.02356,-0.253268,-0.080192,0.058574,0.166287,-0.382119,-3.20213,0.119646


In [14]:
# Pearson correlation between every (latent dimension, drug) pair.
# Each result is collected as one record and turned into a dataframe at the end.
correlation_results = []

for latent_col in latent_df_filtered.columns:
    # Invariant across the inner loop, so look it up once per latent dimension
    latent_values = latent_df_filtered[latent_col]

    for drug_col in prism_df_filtered.columns:
        drug_values = prism_df_filtered[drug_col]

        # Keep only the rows where both the latent value and the drug response are present
        valid_data = pd.concat([latent_values, drug_values], axis=1).dropna()
        latent_values_valid = valid_data.iloc[:, 0]
        drug_values_valid = valid_data.iloc[:, 1]

        # Test for too few or constant values AFTER dropping missing rows: a column
        # that varies overall can still be constant on the rows that survive, and
        # Pearson correlation is undefined for constant input.
        if (
            len(valid_data) < 2
            or latent_values_valid.nunique() <= 1
            or drug_values_valid.nunique() <= 1
        ):
            corr = np.nan
        else:
            corr, _ = pearsonr(latent_values_valid, drug_values_valid)

        # Store the result
        correlation_results.append({
            'latent_dimension': latent_col,
            'drug': drug_col,
            'correlation': corr
        })

# Convert the results to a dataframe for easier analysis; naming the columns
# explicitly keeps the sort below valid even if no pairs were produced
correlation_df = pd.DataFrame(
    correlation_results, columns=['latent_dimension', 'drug', 'correlation']
)

# Display the 50 pairs with the largest absolute correlation
correlation_df.sort_values(by='correlation', key=abs, ascending=False).head(50)


  corr, _ = pearsonr(latent_values_valid, drug_values_valid)


Unnamed: 0,latent_dimension,drug,correlation
87438,23,BRD-K68395654-001-03-1::2.5::HTS,-0.36469
64008,18,BRD-K68395654-001-03-1::2.5::HTS,-0.362985
68768,19,BRD-K70463136-001-01-5::2.5::HTS,-0.358689
87166,23,BRD-K60443845-001-08-6::2.5::HTS,-0.357909
17148,4,BRD-K68395654-001-03-1::2.5::HTS,-0.354071
54636,16,BRD-K68395654-001-03-1::2.5::HTS,-0.35362
16876,4,BRD-K60443845-001-08-6::2.5::HTS,-0.35305
124926,33,BRD-K68395654-001-03-1::2.5::HTS,-0.351516
54364,16,BRD-K60443845-001-08-6::2.5::HTS,-0.348948
7504,2,BRD-K60443845-001-08-6::2.5::HTS,-0.346419


In [15]:
# Rank pathways by absolute NES, largest magnitude first
ranked_gsea = pathway_df.sort_values(by='nes', key=abs, ascending=False)

# For each z_dim, collect the Terms of the (up to) 10 rows with the largest NES.
# The needed columns are selected before apply so the grouping column is not part
# of the frame passed to the lambda, which recent pandas versions warn about.
# NOTE(review): nlargest uses the signed NES, so strongly negative pathways are
# never listed even though the ranking above is by |NES| - confirm which is intended.
grouped_pathway_df = (
    ranked_gsea.groupby('z_dim')[['nes', 'Term']]
    .apply(lambda group: group.nlargest(10, 'nes')['Term'].tolist())
    .reset_index(drop=False)
)

# Strip the 'z_' prefix so only the dimension label remains.
# NOTE(review): this label is later joined against the latent dataframe's column
# names - confirm both use the same (0- or 1-based) numbering.
grouped_pathway_df['z_dim'] = grouped_pathway_df['z_dim'].str.replace('z_', '', regex=False)

grouped_pathway_df.columns = ['latent dimension', 'Associated Pathways']

grouped_pathway_df.head()


  grouped_pathway_df = ranked_gsea.groupby('z_dim').apply(lambda x: x.nlargest(10, 'nes')['Term'].tolist()).reset_index(drop=False)


Unnamed: 0,latent dimension,Associated Pathways
0,1,"[Leishmania Infection R-HSA-9658195, FCGR3A-me..."
1,10,"[rRNA Processing R-HSA-72312, rRNA Processing ..."
2,11,"[rRNA Processing R-HSA-72312, rRNA Processing ..."
3,12,"[Leishmania Infection R-HSA-9658195, MET Promo..."
4,13,[Citric Acid (TCA) Cycle And Respiratory Elect...


In [16]:
# Treatment annotations to attach to each drug; 'column_name' is the key that is
# matched against the 'drug' column of correlation_df
prism_trt_df_filtered = prism_trt_df[['column_name', 'name', 'moa', 'target', 'indication', 'phase']]

# Annotate every correlation row with its drug metadata, then drop the
# now-redundant join key
correlation_df_merged1 = pd.merge(
    correlation_df,
    prism_trt_df_filtered,
    how='left',
    left_on='drug',
    right_on='column_name',
).drop(columns=['column_name'])

# Attach the per-latent-dimension pathway lists, then drop the duplicated key column
correlation_df_merged = pd.merge(
    correlation_df_merged1,
    grouped_pathway_df,
    how='left',
    left_on='latent_dimension',
    right_on='latent dimension',
).drop(columns=['latent dimension'])

# Keep only the pairs whose absolute correlation exceeds the cutoff
MIN_ABS_CORRELATION = 0.1
significant_corr_df = correlation_df_merged[
    correlation_df_merged['correlation'].abs() > MIN_ABS_CORRELATION
]

# Save the filtered results as a single output file. The output directory is
# created first so the write does not fail when ./results does not exist yet.
# NOTE(review): no compression argument is passed, so the '.gz' suffix alone may
# not mean the file is gzip-compressed - confirm against the pandas default.
correlation_dir = pathlib.Path("./results/drug_correlation.parquet.gz")
correlation_dir.parent.mkdir(parents=True, exist_ok=True)
significant_corr_df.to_parquet(correlation_dir)

# Display the 50 annotated pairs with the largest absolute correlation
# (taken from the full merged table, not only the rows that were saved)
correlation_df_merged.sort_values(by='correlation', key=abs, ascending=False).head(50)


Unnamed: 0,latent_dimension,drug,correlation,name,moa,target,indication,phase,Associated Pathways
87438,23,BRD-K68395654-001-03-1::2.5::HTS,-0.36469,EVP4593,NFkB pathway inhibitor,,,Preclinical,"[FCGR3A-mediated Phagocytosis R-HSA-9664422, F..."
64008,18,BRD-K68395654-001-03-1::2.5::HTS,-0.362985,EVP4593,NFkB pathway inhibitor,,,Preclinical,"[Leishmania Infection R-HSA-9658195, FCGR3A-me..."
68768,19,BRD-K70463136-001-01-5::2.5::HTS,-0.358689,BAY-87-2243,hypoxia inducible factor inhibitor,HIF1A,,Phase 1,"[Diseases Of Glycosylation R-HSA-3781865, Ubiq..."
87166,23,BRD-K60443845-001-08-6::2.5::HTS,-0.357909,chlormidazole,fungal lanosterol demethylase inhibitor,,fungal infection,Launched,"[FCGR3A-mediated Phagocytosis R-HSA-9664422, F..."
17148,4,BRD-K68395654-001-03-1::2.5::HTS,-0.354071,EVP4593,NFkB pathway inhibitor,,,Preclinical,[Citric Acid (TCA) Cycle And Respiratory Elect...
54636,16,BRD-K68395654-001-03-1::2.5::HTS,-0.35362,EVP4593,NFkB pathway inhibitor,,,Preclinical,"[Respiratory Electron Transport, ATP Synthesis..."
16876,4,BRD-K60443845-001-08-6::2.5::HTS,-0.35305,chlormidazole,fungal lanosterol demethylase inhibitor,,fungal infection,Launched,[Citric Acid (TCA) Cycle And Respiratory Elect...
124926,33,BRD-K68395654-001-03-1::2.5::HTS,-0.351516,EVP4593,NFkB pathway inhibitor,,,Preclinical,[APC/C-mediated Degradation Of Cell Cycle Prot...
54364,16,BRD-K60443845-001-08-6::2.5::HTS,-0.348948,chlormidazole,fungal lanosterol demethylase inhibitor,,fungal infection,Launched,"[Respiratory Electron Transport, ATP Synthesis..."
7504,2,BRD-K60443845-001-08-6::2.5::HTS,-0.346419,chlormidazole,fungal lanosterol demethylase inhibitor,,fungal infection,Launched,[Citric Acid (TCA) Cycle And Respiratory Elect...
