In [1]:
import pathlib
import sys
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import ttest_ind, f_oneway

sys.path.append("../")
from utils import load_utils

In [2]:
# Load the PRISM drug-response matrix together with its cell-line and
# treatment annotation tables.
top_dir = "../5.drug-dependency"
data_dir = "data"

# Options forwarded to the project loader; secondary_screen is switched off
# and both annotation tables are requested.
prism_load_options = {
    "top_dir": top_dir,
    "data_dir": data_dir,
    "secondary_screen": False,
    "load_cell_info": True,
    "load_treatment_info": True,
}
prism_df, prism_cell_df, prism_trt_df = load_utils.load_prism(**prism_load_options)

# Quick sanity check: matrix dimensions followed by a short preview
print(prism_df.shape)
prism_df.head(3)

(578, 4686)


Unnamed: 0,BRD-A00077618-236-07-6::2.5::HTS,BRD-A00100033-001-08-9::2.5::HTS,BRD-A00147595-001-01-5::2.5::HTS,BRD-A00218260-001-03-4::2.5::HTS,BRD-A00376169-001-01-6::2.5::HTS,BRD-A00520476-001-07-4::2.5::HTS,BRD-A00546892-001-02-6::2.5::HTS,BRD-A00578795-001-04-3::2.5::HTS,BRD-A00758722-001-04-9::2.5::HTS,BRD-A00827783-001-24-6::2.5::HTS,...,BRD-K98557884-001-01-6::2.5::MTS004,BRD-K99077012-001-01-9::2.332734192::MTS004,BRD-K99199077-001-16-1::2.603211317::MTS004,BRD-K99431849-001-01-7::2.500018158::MTS004,BRD-K99447003-335-04-1::2.37737659::MTS004,BRD-K99506538-001-03-8::2.5::MTS004,BRD-K99616396-001-05-1::2.499991421::MTS004,BRD-K99879819-001-02-1::2.5187366::MTS004,BRD-K99919177-001-01-3::2.5::MTS004,BRD-M63173034-001-03-6::2.64076472::MTS004
ACH-000001,-0.015577,-0.449332,0.489379,0.206675,0.27273,0.021036,-0.02546,0.467158,-0.736306,0.644137,...,0.429238,0.204841,0.150055,-0.575404,-0.101247,0.399233,-0.127658,-0.141651,-1.153652,0.510464
ACH-000007,-0.09573,0.257943,0.772349,-0.438502,-0.732832,0.779201,0.426523,-1.288508,-0.476133,-0.277105,...,-0.471486,0.212998,-0.12323,0.625527,0.383198,0.212031,0.349225,-0.387439,-0.831461,0.323558
ACH-000008,0.37948,-0.596132,0.548056,0.422269,-0.216986,0.081866,0.145335,-0.570841,-0.512119,0.452698,...,-0.111951,0.534787,0.206642,-0.410153,-0.560722,-0.036088,0.158071,0.171043,-3.94709,0.09931


In [3]:
# Load the model annotation table (it provides ModelID and
# OncotreePrimaryDisease, which are used further down).
data_dir = pathlib.Path("../0.data-download/data")
model_input_file = data_dir / "Model.parquet"
model_df = pd.read_parquet(model_input_file)

# Dimensions and a short preview
print(model_df.shape)
model_df.head(3)

(1959, 43)


Unnamed: 0,ModelID,PatientID,CellLineName,StrippedCellLineName,DepmapModelType,OncotreeLineage,OncotreePrimaryDisease,OncotreeSubtype,OncotreeCode,LegacyMolecularSubtype,...,EngineeredModel,TissueOrigin,ModelDerivationMaterial,PublicComments,CCLEName,HCMIID,WTSIMasterCellID,SangerModelID,COSMICID,DateSharedIndbGaP
0,ACH-000001,PT-gj46wT,NIH:OVCAR-3,NIHOVCAR3,HGSOC,Ovary/Fallopian Tube,Ovarian Epithelial Tumor,High-Grade Serous Ovarian Cancer,HGSOC,,...,,,,,NIHOVCAR3_OVARY,,2201.0,SIDM00105,905933.0,
1,ACH-000002,PT-5qa3uk,HL-60,HL60,AML,Myeloid,Acute Myeloid Leukemia,Acute Myeloid Leukemia,AML,,...,,,,,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,55.0,SIDM00829,905938.0,
2,ACH-000003,PT-puKIyc,CACO2,CACO2,COAD,Bowel,Colorectal Adenocarcinoma,Colon Adenocarcinoma,COAD,,...,,,,,CACO2_LARGE_INTESTINE,,,SIDM00891,,


In [4]:
# Load the drug correlation table stored in the 5.drug-dependency results
# directory.
correlation_data_dir = pathlib.Path("../5.drug-dependency/results/")
correlation_input_file = correlation_data_dir / "drug_correlation.parquet.gz"
correlation_df = pd.read_parquet(correlation_input_file)

# Dimensions and a short preview
print(correlation_df.shape)
correlation_df.head(3)

(17120, 9)


Unnamed: 0,latent_dimension,drug,correlation,name,moa,target,indication,phase,Associated Pathways
7,0,BRD-A00578795-001-04-3::2.5::HTS,-0.108747,zaltoprofen,cyclooxygenase inhibitor,,"fever, pain relief",Launched,
8,0,BRD-A00758722-001-04-9::2.5::HTS,-0.104911,noretynodrel,progestogen hormone,PGR,contraceptive,Launched,
15,0,BRD-A01593789-001-03-1::2.5::HTS,-0.106124,chlormadinone-acetate,5 alpha reductase inhibitor,PGR,"hypermenorrhea, amenorrhea, oligomenorrhea, po...",Launched,


In [5]:
# Attach each cell line's OncotreePrimaryDisease to its PRISM drug responses.
# prism_df is matched on its index against model_df's ModelID column; the
# default inner join keeps only cell lines present in both tables.
disease_label_df = model_df[['ModelID', 'OncotreePrimaryDisease']]
drug_df_with_disease = pd.merge(
    prism_df,
    disease_label_df,
    left_index=True,
    right_on='ModelID',
)

In [6]:
# Differential drug response per disease: each OncotreePrimaryDisease is
# compared against all remaining cell lines.
#   1. Screen every drug with a two-sample t-test (NaNs omitted per drug).
#   2. Re-test the nominally significant drugs with a one-way ANOVA and record
#      which group has the higher mean response.
#
# NOTE(review): with exactly two groups, a one-way ANOVA should be equivalent
# to the pooled-variance t-test used in step 1 (F = t**2), so step 2 is not an
# independent confirmation - confirm this is intended.
# NOTE(review): no multiple-testing correction is applied across the
# drug x disease comparisons - confirm the raw threshold is acceptable.

ANNOTATION_COLUMNS = ['ModelID', 'OncotreePrimaryDisease']
RESULT_COLUMNS = ['OncotreePrimaryDisease', 'Drug', 'F-statistic', 'p-value', 'Higher in']
P_VALUE_THRESHOLD = 0.05
MIN_GROUP_SIZE = 2  # a variance estimate needs more than one observation

# The numeric drug matrix and the label vector do not depend on the disease
# being tested, so build them once instead of twice per loop iteration.
drug_matrix = drug_df_with_disease.drop(columns=ANNOTATION_COLUMNS)
disease_labels = drug_df_with_disease['OncotreePrimaryDisease']

# One dict per (disease, drug) pair that reaches the ANOVA step
ttest_results = []

# dropna(): a missing label can never match `== disease`, so it would only
# ever produce an empty group. Rows with a missing label still fall into the
# "other" group below, exactly as a `!= disease` comparison would put them.
for disease in disease_labels.dropna().unique():
    print(f"Processing {disease}...")

    is_disease = disease_labels == disease
    disease_drug_df = drug_matrix[is_disease]
    other_drug_df = drug_matrix[~is_disease]

    # Groups this small cannot pass the per-drug size check further down, so
    # skip them before calling ttest_ind on them.
    if len(disease_drug_df) < MIN_GROUP_SIZE or len(other_drug_df) < MIN_GROUP_SIZE:
        continue

    # Perform t-tests comparing current disease vs the rest for each drug
    t_test_results = ttest_ind(disease_drug_df, other_drug_df, axis=0, nan_policy='omit')
    t_test_results_df = pd.DataFrame({
        # Label with the columns of the matrix that was actually tested
        "drug": drug_matrix.columns,
        "t_stat": t_test_results.statistic,
        "p_value": t_test_results.pvalue,
    })

    # Keep drugs that are nominally significant in the t-test screen
    significant_drugs = t_test_results_df[t_test_results_df['p_value'] < P_VALUE_THRESHOLD]

    # Perform ANOVA on these drugs
    for drug in significant_drugs['drug'].unique():
        # Non-missing responses of each group for the current drug
        disease_drug_responses = disease_drug_df[drug].dropna()
        other_drug_responses = other_drug_df[drug].dropna()

        # Both groups need more than one observation for this drug
        if len(disease_drug_responses) > 1 and len(other_drug_responses) > 1:
            f_statistic, p_value = f_oneway(disease_drug_responses, other_drug_responses)
            higher_group = disease if disease_drug_responses.mean() > other_drug_responses.mean() else "Other Types"

            # Store the results
            ttest_results.append({
                'OncotreePrimaryDisease': disease,
                'Drug': drug,
                'F-statistic': f_statistic,
                'p-value': p_value,
                'Higher in': higher_group
            })

# Passing the columns explicitly keeps the schema intact even when no
# (disease, drug) pair was recorded, so the filter below cannot raise KeyError.
ttest_results_df = pd.DataFrame(ttest_results, columns=RESULT_COLUMNS)

# Apply the significance threshold to the ANOVA p-values
significant_ttest_results_df = ttest_results_df[ttest_results_df['p-value'] < P_VALUE_THRESHOLD]

# Display the top 50 significant results based on F-statistic
significant_ttest_results_df.sort_values(by='F-statistic', key=abs, ascending=False).head(50)

Processing Ovarian Epithelial Tumor...
Processing Colorectal Adenocarcinoma...
Processing Melanoma...
Processing Bladder Urothelial Carcinoma...
Processing Non-Small Cell Lung Cancer...
Processing Invasive Breast Carcinoma...
Processing Pancreatic Adenocarcinoma...
Processing Diffuse Glioma...
Processing Sarcoma, NOS...


  t_test_results = ttest_ind(disease_drug_df, other_drug_df, axis=0, nan_policy='omit')


Processing Ewing Sarcoma...
Processing Renal Cell Carcinoma...
Processing Esophagogastric Adenocarcinoma...
Processing Rhabdomyosarcoma...
Processing Fibrosarcoma...


  t_test_results = ttest_ind(disease_drug_df, other_drug_df, axis=0, nan_policy='omit')


Processing Neuroblastoma...
Processing Osteosarcoma...
Processing Pleural Mesothelioma...
Processing Prostate Adenocarcinoma...
Processing Rhabdoid Cancer...
Processing Adenosquamous Carcinoma of the Pancreas...
Processing Non-Cancerous...
Processing Intracholecystic Papillary Neoplasm...


  t_test_results = ttest_ind(disease_drug_df, other_drug_df, axis=0, nan_policy='omit')


Processing Head and Neck Squamous Cell Carcinoma...
Processing Anaplastic Thyroid Cancer...
Processing Ampullary Carcinoma...
Processing Endometrial Carcinoma...
Processing Intraductal Papillary Neoplasm of the Bile Duct...
Processing Embryonal Tumor...
Processing Hepatocellular Carcinoma...
Processing Lung Neuroendocrine Tumor...
Processing Esophageal Squamous Cell Carcinoma...
Processing Pancreatic Neuroendocrine Tumor...


  t_test_results = ttest_ind(disease_drug_df, other_drug_df, axis=0, nan_policy='omit')


Processing Chondrosarcoma...
Processing Uterine Sarcoma/Mesenchymal...
Processing Poorly Differentiated Thyroid Cancer...
Processing Leiomyosarcoma...
Processing Breast Ductal Carcinoma In Situ...
Processing Hepatoblastoma...
Processing Well-Differentiated Thyroid Cancer...
Processing Undifferentiated Pleomorphic Sarcoma/Malignant Fibrous Histiocytoma/High-Grade Spindle Cell Sarcoma...


  t_test_results = ttest_ind(disease_drug_df, other_drug_df, axis=0, nan_policy='omit')


Processing Bladder Squamous Cell Carcinoma...
Processing Urethral Cancer...


  t_test_results = ttest_ind(disease_drug_df, other_drug_df, axis=0, nan_policy='omit')


Processing Medullary Thyroid Cancer...


  t_test_results = ttest_ind(disease_drug_df, other_drug_df, axis=0, nan_policy='omit')


Unnamed: 0,OncotreePrimaryDisease,Drug,F-statistic,p-value,Higher in
956,Melanoma,BRD-A56085258-001-01-8::2.5::HTS,189.489352,2.411596e-37,Other Types
1245,Melanoma,BRD-K56343971-001-10-6::2.5::HTS,145.21212,6.609393e-30,Other Types
1008,Melanoma,BRD-K05804044-001-06-0::2.5::HTS,126.643434,1.2948029999999999e-26,Other Types
1490,Melanoma,BRD-K38527262-300-01-0::2.300987371::MTS004,119.990736,2.408021e-25,Other Types
1032,Melanoma,BRD-K09951645-001-06-8::2.5::HTS,112.044959,6.445948e-24,Other Types
1465,Melanoma,BRD-A75975749-001-01-4::2.5::MTS004,105.713314,9.263587000000001e-23,Other Types
1366,Melanoma,BRD-K78809024-001-05-7::2.5::HTS,101.865824,3.991097e-22,Other Types
1301,Melanoma,BRD-K67578145-001-12-1::2.5::HTS,92.554326,2.302241e-20,Other Types
1063,Melanoma,BRD-K16478699-001-09-2::2.5::HTS,91.742105,3.273193e-20,Other Types
1300,Melanoma,BRD-K67578145-001-09-7::2.38::HTS,90.803766,5.726354e-20,Other Types


In [7]:
# Treatment annotation columns to attach to every significant drug;
# 'column_name' is the key matched against the 'Drug' column of the results.
prism_trt_df_filtered = prism_trt_df.loc[:, ['column_name', 'name', 'moa', 'target']]

# Left-join the annotations onto the significant disease-vs-rest results
# (drugs without a matching annotation row are kept), then drop the join key,
# which duplicates 'Drug'.
merged_df = (
    significant_ttest_results_df
    .merge(prism_trt_df_filtered, how='left', left_on='Drug', right_on='column_name')
    .drop(columns=['column_name'])
)

# Persist the annotated results as a parquet file
merged_df.to_parquet("../5.drug-dependency/results/drug_diff_results.parquet", index=False)

In [8]:
# Build a per-disease PDF report of drugs whose response is higher in that
# disease and that also correlate with a latent dimension, and export the
# Diffuse Glioma subset as a parquet file.
# Requires merged_df and correlation_df from the cells above.

# Reporting cutoffs applied to every disease page
F_STATISTIC_MIN = 7          # keep rows with F-statistic strictly above this
ABS_CORRELATION_MIN = 0.13   # keep rows with |correlation| strictly above this

correlation_df_filtered = correlation_df[['drug', 'latent_dimension', 'correlation', 'indication', 'phase']]

# Every group that appears in "Higher in", except the pooled "Other Types"
higher_in_values = [
    value for value in merged_df["Higher in"].unique() if value != "Other Types"
]

# Filter for 'Diffuse Glioma' in the 'Higher in' column
diffuse_glioma_df = merged_df[merged_df["Higher in"] == "Diffuse Glioma"]

# Merge with correlation_df on the "Drug" column for Diffuse Glioma
diffuse_glioma_final_df = pd.merge(diffuse_glioma_df, correlation_df_filtered, left_on="Drug", right_on="drug", how="inner")

# Save the Diffuse Glioma DataFrame to a Parquet file
diffuse_glioma_parquet_path = pathlib.Path('../5.drug-dependency/results/Diffuse_Glioma_Analysis.parquet').resolve()
diffuse_glioma_final_df.to_parquet(diffuse_glioma_parquet_path)

# Path for PDF file
pdf_path = pathlib.Path('../5.drug-dependency/results/Individual_Drug_Analysis.pdf').resolve()

# Create a PDF file to save the plots
with PdfPages(pdf_path) as pdf:
    for value in higher_in_values:
        # Rows where the current group has the higher mean response
        filtered_df = merged_df[merged_df["Higher in"] == value]

        # Attach the latent-dimension correlations of each drug
        final_df = pd.merge(filtered_df, correlation_df_filtered, left_on="Drug", right_on="drug", how="inner")

        # Apply the F-statistic and absolute-correlation cutoffs
        passes_cutoffs = (final_df['F-statistic'] > F_STATISTIC_MIN) & (
            final_df['correlation'].abs() > ABS_CORRELATION_MIN
        )
        final_df = final_df[passes_cutoffs]

        # Sort by absolute correlation, strongest first
        sorted_df = final_df.sort_values(by="correlation", key=abs, ascending=False)

        # Nothing to report for this group
        if sorted_df.empty:
            continue

        # Page 1: the filtered table; height grows with the number of rows
        fig, ax = plt.subplots(figsize=(10, sorted_df.shape[0] * 0.5))
        ax.axis('tight')
        ax.axis('off')

        table = ax.table(cellText=sorted_df.values, colLabels=sorted_df.columns, cellLoc='center', loc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(8)
        table.scale(5, 5)  # Adjust the scale of the table for better readability

        ax.set_title(f'Analysis for {value}', fontsize=10)
        print(f'Analysis for {value}')

        pdf.savefig(fig, bbox_inches='tight')
        # Close this specific figure so figures do not accumulate in the loop
        plt.close(fig)

        # Page 2: the list of unique drug names. Names can be missing because
        # merged_df was built with a left join, and str.join would raise
        # TypeError on NaN, so drop missing names first.
        unique_drugs = sorted_df['name'].dropna().astype(str).unique()

        # max(..., 1) avoids a zero-height figure when every name is missing
        fig, ax = plt.subplots(figsize=(10, max(len(unique_drugs), 1) * 0.25))
        ax.axis('off')  # No axes

        # Display the unique drugs as text
        drug_list_text = '\n'.join(unique_drugs)
        ax.text(0.5, 0.95, drug_list_text, va='top', ha='center', fontsize=15)

        ax.set_title(f'Unique Drugs for {value}', fontsize=25)

        pdf.savefig(fig, bbox_inches='tight')
        plt.close(fig)

Analysis for Ovarian Epithelial Tumor
Analysis for Colorectal Adenocarcinoma
Analysis for Melanoma
Analysis for Bladder Urothelial Carcinoma
Analysis for Non-Small Cell Lung Cancer
Analysis for Invasive Breast Carcinoma
Analysis for Pancreatic Adenocarcinoma
Analysis for Diffuse Glioma
Analysis for Ewing Sarcoma
Analysis for Renal Cell Carcinoma
Analysis for Esophagogastric Adenocarcinoma
Analysis for Rhabdomyosarcoma
Analysis for Neuroblastoma
Analysis for Osteosarcoma
Analysis for Pleural Mesothelioma
Analysis for Prostate Adenocarcinoma
Analysis for Rhabdoid Cancer
Analysis for Adenosquamous Carcinoma of the Pancreas
Analysis for Head and Neck Squamous Cell Carcinoma
Analysis for Endometrial Carcinoma
Analysis for Intraductal Papillary Neoplasm of the Bile Duct
Analysis for Hepatocellular Carcinoma
Analysis for Lung Neuroendocrine Tumor
Analysis for Esophageal Squamous Cell Carcinoma
Analysis for Chondrosarcoma
Analysis for Uterine Sarcoma/Mesenchymal
Analysis for Hepatoblastoma
Ana