### Import Libraries


In [1]:
import sys
import pathlib

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from ccc.coef import ccc
from scipy.spatial.distance import squareform

sys.path.append("../utils")
import validate_utils


#### Set load/save paths


In [2]:
# external path to be set
classification_profiles_save_dir = pathlib.Path(
    "/media/roshankern/63af2010-c376-459e-a56e-576b170133b6/data/cell-health-plate-classification-profiles"
)

MCM_classification_profiles_save_dir = pathlib.Path(
    f"{classification_profiles_save_dir}/multi_class_models/"
)
SCM_classification_profiles_save_dir = pathlib.Path(
    f"{classification_profiles_save_dir}/single_class_models/"
)

tidy_long_corrs_save_dir = pathlib.Path("validations")
tidy_long_corrs_save_dir.mkdir(exist_ok=True, parents=True)


### Load Cell Health Profile Labels


In [3]:
cell_health_hash = "30ea5de393eb9cfc10b575582aa9f0f857b44c59"
cell_health_labels_link = f"https://raw.github.com/broadinstitute/cell-health/{cell_health_hash}/1.generate-profiles/data/consensus/cell_health_median.tsv.gz"

cell_health_labels = pd.read_csv(cell_health_labels_link, compression="gzip", sep="\t")
cell_health_labels

Unnamed: 0,Metadata_profile_id,Metadata_pert_name,Metadata_cell_line,cc_all_high_h2ax,cc_all_large_notround_polynuclear_mean,cc_all_large_round_polyploid_mean,cc_all_n_objects,cc_all_n_spots_h2ax_mean,cc_all_n_spots_h2ax_per_nucleus_area_mean,cc_all_nucleus_area_mean,...,vb_num_live_cells,vb_percent_all_apoptosis,vb_percent_caspase_dead_only,vb_percent_dead,vb_percent_dead_only,vb_percent_early_apoptosis,vb_percent_late_apoptosis,vb_percent_live,vb_ros_back_mean,vb_ros_mean
0,profile_0,AKT1-1,A549,0.008156,0.587977,0.018820,0.381501,0.176564,0.187675,-0.170616,...,0.399842,0.000000,-0.118976,-0.132871,-0.121090,0.000000,0.000000,0.132882,0.806970,1.293984
1,profile_1,AKT1-2,A549,0.056667,1.264627,0.241450,0.568443,0.235304,0.372684,-0.276888,...,0.101670,0.318027,0.621374,0.100032,0.074036,0.132751,0.467027,-0.099917,0.558041,1.151867
2,profile_2,ARID1B-1,A549,0.111163,1.092964,0.151393,0.290203,0.402121,0.481700,-0.276980,...,0.080701,0.339100,0.598093,0.055951,0.042014,0.165161,0.247058,-0.055920,-0.393937,0.103202
3,profile_3,ARID1B-2,A549,-0.061528,0.320829,-0.091007,0.141819,-0.378769,-0.288693,-0.108741,...,0.265754,0.098699,0.371930,-0.063935,-0.055160,0.138654,0.000000,0.063946,0.210005,0.055291
4,profile_4,ATF4-1,A549,3.967818,0.003400,3.268615,-2.246887,2.891737,2.878938,2.853995,...,-2.343919,0.000000,-0.089544,0.141535,0.131393,0.000000,0.000000,-0.141397,-0.631390,0.106477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,profile_352,TXN-2,HCC44,-0.342210,0.215623,0.375192,0.900283,-0.116349,-0.083793,0.207844,...,1.568620,-0.409888,0.148385,-0.519448,-0.372714,-0.136141,-0.502358,0.520090,0.182738,-0.116806
353,profile_353,XBP-1,HCC44,-0.247848,0.564827,-0.422787,0.780852,-0.074603,0.000000,-0.123090,...,0.344139,0.172909,0.839550,-0.212904,-0.245979,0.417994,0.028582,0.213691,0.245257,0.105633
354,profile_354,XBP-2,HCC44,-0.349572,-0.303116,-0.458782,0.434831,-0.157653,-0.058808,-0.404611,...,1.341318,-0.069684,0.540557,-0.146842,-0.288259,-0.179506,0.047976,0.147173,0.744648,0.023499
355,profile_355,YAP1-1,HCC44,-0.240422,0.408734,-0.351545,0.387100,-0.132059,0.002797,-0.059520,...,0.091803,-0.155056,-0.025451,-0.097891,-0.080691,-0.227172,-0.114292,0.098480,0.451868,0.127519


### Derive classification profile and cell health label correlations (multi-class models)


In [4]:
print("Deriving multi-class model correlations...")
# list for compiling tidy long correlation data
compiled_tidy_long_corrs = []
for classification_profiles_path in MCM_classification_profiles_save_dir.iterdir():

    # get information about the current model
    model_type = classification_profiles_path.name.split("__")[0]
    feature_type = classification_profiles_path.name.split("__")[1]

    print(
        f"Deriving correlations for {model_type} model with {feature_type} features..."
    )

    # load classification profiles
    classification_profiles = pd.read_csv(classification_profiles_path, sep="\t")

    # combine cell health label profiles and classification profiles on perturbation and cell line
    final_profile_dataframe = pd.merge(
        cell_health_labels,
        classification_profiles,
        on=["Metadata_pert_name", "Metadata_cell_line"],
    )

    # get tidy long correlations for this model's predictions
    model_tidy_long_corrs = validate_utils.get_tidy_long_corrs(final_profile_dataframe)

    # add model metadata
    model_tidy_long_corrs["model_type"] = model_type
    model_tidy_long_corrs["feature_type"] = feature_type

    # add correlations to compilation list
    compiled_tidy_long_corrs.append(model_tidy_long_corrs)


# compile and save tidy long data
compiled_tidy_long_corrs = pd.concat(compiled_tidy_long_corrs)
compiled_tidy_long_corrs = compiled_tidy_long_corrs.reset_index(drop=True)
compiled_tidy_long_corrs.to_csv(
    f"{tidy_long_corrs_save_dir}/compiled_correlations__MCM.tsv", sep="\t"
)

# preview tidy data
compiled_tidy_long_corrs

Deriving multi-class model correlations...
Deriving correlations for final model with CP features...
Deriving correlations for shuffled_baseline model with CP_and_DP features...
Deriving correlations for final model with DP features...
Deriving correlations for shuffled_baseline model with DP features...
Deriving correlations for shuffled_baseline model with CP features...
Deriving correlations for final model with CP_and_DP features...


Unnamed: 0,phenotypic_class,cell_line,corr_type,cell_health_indicator,corr_value,model_type,feature_type
0,ADCCM,all,pearson,cc_all_high_h2ax,-0.282781,final,CP
1,Anaphase,all,pearson,cc_all_high_h2ax,-0.070543,final,CP
2,Apoptosis,all,pearson,cc_all_high_h2ax,0.148195,final,CP
3,Binuclear,all,pearson,cc_all_high_h2ax,-0.099941,final,CP
4,Elongated,all,pearson,cc_all_high_h2ax,0.155004,final,CP
...,...,...,...,...,...,...,...
8395,MetaphaseAlignment,HCC44,ccc,vb_ros_mean,0.026201,final,CP_and_DP
8396,OutOfFocus,HCC44,ccc,vb_ros_mean,0.105229,final,CP_and_DP
8397,Polylobed,HCC44,ccc,vb_ros_mean,0.040965,final,CP_and_DP
8398,Prometaphase,HCC44,ccc,vb_ros_mean,0.029666,final,CP_and_DP


### Derive classification profile and cell health label correlations (single-class models)


In [5]:
print("Deriving single-class model correlations...")
# list for compiling tidy long correlation data
compiled_tidy_long_corrs = []
for phenotypic_class_path in SCM_classification_profiles_save_dir.iterdir():
    for classification_profiles_path in phenotypic_class_path.iterdir():

        # get information about the current model
        phenotypic_class = phenotypic_class_path.name
        model_type = classification_profiles_path.name.split("__")[0]
        feature_type = classification_profiles_path.name.split("__")[1]

        print(
            f"Deriving correlations for {model_type}, {phenotypic_class} model with {feature_type} features..."
        )

        # load classification profiles
        classification_profiles = pd.read_csv(classification_profiles_path, sep="\t")

        # combine cell health label profiles and classification profiles on perturbation and cell line
        final_profile_dataframe = pd.merge(
            cell_health_labels,
            classification_profiles,
            on=["Metadata_pert_name", "Metadata_cell_line"],
        )

        # get tidy long correlations for this model's predictions
        model_tidy_long_corrs = validate_utils.get_tidy_long_corrs(
            final_profile_dataframe
        )

        # add model metadata
        model_tidy_long_corrs["model_type"] = model_type
        model_tidy_long_corrs["feature_type"] = feature_type

        # add correlations to compilation list
        compiled_tidy_long_corrs.append(model_tidy_long_corrs)


# compile and save tidy long data
compiled_tidy_long_corrs = pd.concat(compiled_tidy_long_corrs)
compiled_tidy_long_corrs = compiled_tidy_long_corrs.reset_index(drop=True)
compiled_tidy_long_corrs.to_csv(
    f"{tidy_long_corrs_save_dir}/compiled_correlations__SCM.tsv", sep="\t"
)

# preview tidy data
compiled_tidy_long_corrs

Deriving single-class model correlations...
Deriving correlations for final, OutOfFocus model with CP features...
Deriving correlations for shuffled_baseline, OutOfFocus model with CP_and_DP features...
Deriving correlations for final, OutOfFocus model with DP features...
Deriving correlations for shuffled_baseline, OutOfFocus model with DP features...
Deriving correlations for shuffled_baseline, OutOfFocus model with CP features...
Deriving correlations for final, OutOfFocus model with CP_and_DP features...
Deriving correlations for final, Apoptosis model with CP features...
Deriving correlations for shuffled_baseline, Apoptosis model with CP_and_DP features...
Deriving correlations for final, Apoptosis model with DP features...
Deriving correlations for shuffled_baseline, Apoptosis model with DP features...
Deriving correlations for shuffled_baseline, Apoptosis model with CP features...
Deriving correlations for final, Apoptosis model with CP_and_DP features...
Deriving correlations 

Unnamed: 0,phenotypic_class,cell_line,corr_type,cell_health_indicator,corr_value,model_type,feature_type
0,OutOfFocus,all,pearson,cc_all_high_h2ax,0.288700,final,CP
1,OutOfFocus Negative,all,pearson,cc_all_high_h2ax,-0.288700,final,CP
2,OutOfFocus,all,ccc,cc_all_high_h2ax,0.030427,final,CP
3,OutOfFocus Negative,all,ccc,cc_all_high_h2ax,0.032504,final,CP
4,OutOfFocus,A549,pearson,cc_all_high_h2ax,0.273260,final,CP
...,...,...,...,...,...,...,...
1115,Elongated Negative,ES2,ccc,vb_ros_mean,0.048834,final,CP_and_DP
1116,Elongated,HCC44,pearson,vb_ros_mean,-0.000785,final,CP_and_DP
1117,Elongated Negative,HCC44,pearson,vb_ros_mean,0.000785,final,CP_and_DP
1118,Elongated,HCC44,ccc,vb_ros_mean,0.027207,final,CP_and_DP
