In [1]:
import gc
import pathlib

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import toml

## Morphology Feature space stats

In [2]:
# set paths to data
norm_data_path = pathlib.Path("../../../data/PBMC_sc_norm.parquet").resolve(strict=True)

# fs data
norm_fs_data_path = pathlib.Path(
    "../../../data/PBMC_preprocessed_sc_norm.parquet"
).resolve(strict=True)

## Check Raw features shape

In [None]:
# load in the normalized data
norm_schema = pq.read_schema(norm_data_path)

# get a list of column names
norm_cols = [col.name for col in norm_schema]
print(len(norm_cols))
# get columns that contain Metadata
metadata_cols = [col for col in norm_cols if "Metadata" in col]
# remove metadata columns from the list of columns
data_cols = [col for col in norm_cols if col not in metadata_cols]

print(f"There are {len(data_cols)} data columns")
print(f"There are {len(metadata_cols)} metadata columns")

2926
There are 2907 data columns
There are 19 metadata columns


## Check feature selected shape

In [4]:
norm_fs_schema = pq.read_schema(norm_fs_data_path)

# get a list of column names
norm_cols = [col.name for col in norm_schema]
print(len(norm_cols))
# get columns that contain Metadata
metadata_cols = [col for col in norm_cols if "Metadata" in col]
# remove metadata columns from the list of columns
data_cols = [col for col in norm_cols if col not in metadata_cols]

print(f"There are {len(data_cols)} data columns")
print(f"There are {len(metadata_cols)} metadata columns")

# get columns that contain Metadata

norm_fs_df_subset = pd.read_parquet(
    norm_fs_data_path,
    columns=["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
)
print(norm_fs_df_subset.shape)
norm_fs_df_subset.head()

2926
There are 2907 data columns
There are 19 metadata columns
(8318724, 2)


Unnamed: 0,Metadata_Well,oneb_Metadata_Treatment_Dose_Inhibitor_Dose
0,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%
1,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%
2,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%
3,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%
4,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%


In [None]:
# path to the ground truth file
ground_truth_file_path = pathlib.Path(
    "../../../4.sc_Morphology_Neural_Network_MLP_Model/MLP_utils/ground_truth.toml"
).resolve(strict=True)
# read in the ground truth toml file
ground_truth = toml.load(ground_truth_file_path)
# get information from toml files
apoptosis_groups_list = ground_truth["Apoptosis"]["apoptosis_groups_list"]
pyroptosis_groups_list = ground_truth["Pyroptosis"]["pyroptosis_groups_list"]
healthy_groups_list = ground_truth["Healthy"]["healthy_groups_list"]
# add apoptosis, pyroptosis and healthy columns to dataframe
norm_fs_df_subset["apoptosis"] = norm_fs_df_subset.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in apoptosis_groups_list,
    axis=1,
)
norm_fs_df_subset["pyroptosis"] = norm_fs_df_subset.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in pyroptosis_groups_list,
    axis=1,
)
norm_fs_df_subset["healthy"] = norm_fs_df_subset.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in healthy_groups_list,
    axis=1,
)

# merge apoptosis, pyroptosis, and healthy columns into one column
norm_fs_df_subset["labels"] = norm_fs_df_subset.apply(
    lambda row: "apoptosis"
    if row["apoptosis"]
    else "pyroptosis"
    if row["pyroptosis"]
    else "healthy",
    axis=1,
)
# drop apoptosis, pyroptosis, and healthy columns
norm_fs_df_subset.drop(columns=["apoptosis", "pyroptosis", "healthy"], inplace=True)

In [6]:
# print the number of samples in each class
print(norm_fs_df_subset["labels"].value_counts())

healthy       4301036
pyroptosis    3578372
apoptosis      439316
Name: labels, dtype: int64


## Stats for the Elastic Net models

In [9]:
# set path for models performances
model_performances_path = pathlib.Path(
    "../../../6.bulk_Morphology_Elastic_Network/4.model_performance/results/regression/PBMC/all_model_performance.csv"
).resolve(strict=True)
# load in the model performances
model_performances = pd.read_csv(model_performances_path)

In [10]:
# drop uneeded columns
columns_to_drop = [
    "feature_names",
    "coefficients",
    "cell_type",
    "alpha",
    "l1_ratio",
]
model_performances.drop(columns=columns_to_drop, inplace=True)
# drop duplicates
print(model_performances.shape)
model_performances.drop_duplicates(inplace=True)
print(model_performances.shape)
model_performances.head()

(448426, 3)
(374, 3)


Unnamed: 0,secreted_proteins,shuffle,r2
0,CXCL17 [NSU],final,0.301493
1199,IL-7 [NSU],shuffled,-0.206494
2398,CXCL7 [NSU],shuffled,-0.001184
3597,CCL20 [NSU],final,0.957082
4796,IL-31 [NSU],final,-0.171904


In [11]:
# split the shuffled and final model performances
suffled_models = model_performances.loc[model_performances["shuffle"] == "shuffled"]
final_models = model_performances.loc[model_performances["shuffle"] == "final"]
print(suffled_models.shape)
print(final_models.shape)

(187, 3)
(187, 3)


In [12]:
# sort the final models by r2 score
final_models.sort_values(by="r2", ascending=False, inplace=True)
final_models.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_models.sort_values(by="r2", ascending=False, inplace=True)


Unnamed: 0,secreted_proteins,shuffle,r2
402864,TNF alpha [NSU],final,0.981769
86328,IL-1 beta [NSU],final,0.979441
275770,IL-6 [NSU],final,0.977715
394471,CCL4 [NSU],final,0.970054
196636,CCL3 [NSU],final,0.965891


In [13]:
# get the percentage of models that are above the threshold
threshold = 0.8
final_models_above_threshold = final_models.loc[final_models["r2"] >= threshold]
print(
    f"Percentage of models with r2 score above {threshold}: "
    f"{(final_models_above_threshold.shape[0] / final_models.shape[0]) * 100}",
    f"\n"
    f"The total number of models above the threshold is: {final_models_above_threshold.shape[0]}",
)

Percentage of models with r2 score above 0.8: 21.390374331550802 
The total number of models above the threshold is: 40


In [14]:
# sort the shuffled models by r2 score from low to high
final_models.sort_values(by="r2", ascending=True, inplace=True)
final_models.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_models.sort_values(by="r2", ascending=True, inplace=True)


Unnamed: 0,secreted_proteins,shuffle,r2
426844,FGF-21 [NSU],final,-0.331974
437635,IL-17C [NSU],final,-0.306111
326128,CX3CL1 [NSU],final,-0.293323
153472,IL-11 [NSU],final,-0.266779
74338,AITRL (GITR Ligand) [NSU],final,-0.230527


## LOCO ENET stats

In [58]:
# set path for models performances
model_performances_path = pathlib.Path(
    "../../../11.bulk_Morphology_Elastic_Network_LOCO/2.test_models/results/regression/PBMC_aggregated_with_nomic/model_stats.csv"
).resolve(strict=True)

variance_r2_stats_path = pathlib.Path(
    "../../../11.bulk_Morphology_Elastic_Network_LOCO/2.test_models/results/regression/PBMC_aggregated_with_nomic/variance_r2_stats.csv"
).resolve(strict=True)

model_performances = pd.read_csv(model_performances_path)
print(model_performances.shape)
model_performances.head()

(1843072, 15)


Unnamed: 0,explained_variance,neg_mean_absolute_error,neg_mean_squared_error,well,treatment,r2,cytokine,data_split,shuffle,predicted_value,actual_value,log10_neg_mean_absolute_error,log10_neg_mean_squared_error,log10_explained_variance,channel_feature_combinations_key
0,-0.244014,-0.043214,-0.043214,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%,-0.269853,HVEM,test,final,0.4492,0.483963,1.36438,1.36438,,CorrDNA_CorrPM_CorrER
1,-0.244014,-0.043214,-0.043214,B03,LPS_0.010_ug_per_ml_DMSO_0.025_%,-0.269853,HVEM,test,final,0.42098,0.7367,1.36438,1.36438,,CorrDNA_CorrPM_CorrER
2,-0.244014,-0.043214,-0.043214,B04,LPS_Nigericin_100.000_ug_per_ml_1.000_uM_DMSO_...,-0.269853,HVEM,test,final,0.463055,0.684621,1.36438,1.36438,,CorrDNA_CorrPM_CorrER
3,-0.244014,-0.043214,-0.043214,B05,LPS_Nigericin_100.000_ug_per_ml_1.000_uM_DMSO_...,-0.269853,HVEM,test,final,0.452338,0.159467,1.36438,1.36438,,CorrDNA_CorrPM_CorrER
4,-0.244014,-0.043214,-0.043214,B06,DMSO_0.100_%_DMSO_0.025_%,-0.269853,HVEM,test,final,0.359926,0.470715,1.36438,1.36438,,CorrDNA_CorrPM_CorrER


In [59]:
variance_r2_stats = pd.read_csv(variance_r2_stats_path)
print(variance_r2_stats.shape)
variance_r2_stats.head()

(23936, 7)


Unnamed: 0,cytokine,data_split,shuffle,channel_feature_combinations_key,predicted_value,actual_value,r2
0,FGF-19,test,shuffled_baseline,CorrMito_CorrER,0.000606,0.039204,[-0.0412176]
1,IFgamma,train,final,CorrDNA_CorrMito,0.085494,0.100815,[0.93373251]
2,BMP9,test,final,No_channels,0.004012,0.031773,[0.15761338]
3,TRAIL,train,shuffled_baseline,CorrDNA_CorrMito,0.001925,0.019566,[-0.01720808]
4,TPO(Thrombopoietin),train,shuffled_baseline,CorrDNA_CorrPM_CorrMito_CorrER,0.0,0.024622,[0.]


In [60]:
# get only select keys
model_performances = model_performances.loc[
    model_performances["channel_feature_combinations_key"].isin(
        [
            "All_channels",
            "CorrDNA_CorrGasdermin_CorrMito_CorrER",
            "CorrDNA_CorrPM_CorrGasdermin_CorrER",
            "CorrDNA_CorrPM_CorrGasdermin_CorrMito",
            "CorrDNA_CorrPM_CorrMito_CorrER",
            "CorrPM_CorrGasdermin_CorrMito_CorrER",
        ]
    )
]
# replace string values with more readable names
model_performances["channel_feature_combinations_key"] = model_performances[
    "channel_feature_combinations_key"
].replace(
    {
        "All_channels": "All channels",
        "CorrDNA_CorrGasdermin_CorrMito_CorrER": "PM removed",
        "CorrDNA_CorrPM_CorrGasdermin_CorrER": "Mito removed",
        "CorrDNA_CorrPM_CorrGasdermin_CorrMito": "ER removed",
        "CorrDNA_CorrPM_CorrMito_CorrER": "Gasdermin removed",
        "CorrPM_CorrGasdermin_CorrMito_CorrER": "DNA removed",
    }
)
model_performances["channel_feature_combinations_key"].unique()


# get only select keys
variance_r2_stats = variance_r2_stats.loc[
    variance_r2_stats["channel_feature_combinations_key"].isin(
        [
            "All_channels",
            "CorrDNA_CorrGasdermin_CorrMito_CorrER",
            "CorrDNA_CorrPM_CorrGasdermin_CorrER",
            "CorrDNA_CorrPM_CorrGasdermin_CorrMito",
            "CorrDNA_CorrPM_CorrMito_CorrER",
            "CorrPM_CorrGasdermin_CorrMito_CorrER",
        ]
    )
]

# replace string values with more readable names
variance_r2_stats["channel_feature_combinations_key"] = variance_r2_stats[
    "channel_feature_combinations_key"
].replace(
    {
        "All_channels": "All channels",
        "CorrDNA_CorrGasdermin_CorrMito_CorrER": "PM removed",
        "CorrDNA_CorrPM_CorrGasdermin_CorrER": "Mito removed",
        "CorrDNA_CorrPM_CorrGasdermin_CorrMito": "ER removed",
        "CorrDNA_CorrPM_CorrMito_CorrER": "Gasdermin removed",
        "CorrPM_CorrGasdermin_CorrMito_CorrER": "DNA removed",
    }
)

# drop the shuffled models
model_performances = model_performances.loc[model_performances["shuffle"] == "final"]
variance_r2_stats = variance_r2_stats.loc[variance_r2_stats["shuffle"] == "final"]
print(model_performances.shape)

print(variance_r2_stats.shape)

(172788, 15)
(2244, 7)


In [None]:
model_performances
# get the explained variance, MSE, R2 for each cytokine, data split, channel combination
model_performances_grouped = model_performances.groupby(
    ["cytokine", "data_split", "channel_feature_combinations_key"]
).agg(
    {
        "explained_variance": "mean",
        "neg_mean_squared_error": "mean",
        "r2": "mean",
    }
)
model_performances_grouped.reset_index(inplace=True)
print(model_performances_grouped.shape)

(2244, 6)


## Stats for 11A-C

In [None]:
# get the global average of neg mean squared error, explained variance, and r2 for each channel combination
channel_feature_combinations_key_global_avg = model_performances_grouped.groupby(
    "channel_feature_combinations_key"
).agg(
    {
        "explained_variance": "mean",
        "neg_mean_squared_error": "mean",
        "r2": "mean",
    }
)

# get the average of the explained variance, MSE, and R2 for each channel combination
channel_feature_combinations_key_global_avg.reset_index(inplace=True)
channel_feature_combinations_key_global_avg[
    "percent_change_in_negMSE_compared_to_all_channels"
] = (
    (
        channel_feature_combinations_key_global_avg["neg_mean_squared_error"]
        - channel_feature_combinations_key_global_avg.loc[
            channel_feature_combinations_key_global_avg[
                "channel_feature_combinations_key"
            ]
            == "All channels",
            "neg_mean_squared_error",
        ].values[0]
    )
    / channel_feature_combinations_key_global_avg.loc[
        channel_feature_combinations_key_global_avg["channel_feature_combinations_key"]
        == "All channels",
        "neg_mean_squared_error",
    ].values[0]
    * 100
)
channel_feature_combinations_key_global_avg[
    "percent_change_in_explained_variance_compared_to_all_channels"
] = (
    (
        channel_feature_combinations_key_global_avg["explained_variance"]
        - channel_feature_combinations_key_global_avg.loc[
            channel_feature_combinations_key_global_avg[
                "channel_feature_combinations_key"
            ]
            == "All channels",
            "explained_variance",
        ].values[0]
    )
    / channel_feature_combinations_key_global_avg.loc[
        channel_feature_combinations_key_global_avg["channel_feature_combinations_key"]
        == "All channels",
        "explained_variance",
    ].values[0]
    * 100
)
channel_feature_combinations_key_global_avg[
    "percent_change_in_r2_compared_to_all_channels"
] = (
    (
        channel_feature_combinations_key_global_avg["r2"]
        - channel_feature_combinations_key_global_avg.loc[
            channel_feature_combinations_key_global_avg[
                "channel_feature_combinations_key"
            ]
            == "All channels",
            "r2",
        ].values[0]
    )
    / channel_feature_combinations_key_global_avg.loc[
        channel_feature_combinations_key_global_avg["channel_feature_combinations_key"]
        == "All channels",
        "r2",
    ].values[0]
    * 100
)
channel_feature_combinations_key_global_avg

Unnamed: 0,channel_feature_combinations_key,explained_variance,neg_mean_squared_error,r2,percent_change_in_negMSE_compared_to_all_channels,percent_change_in_explained_variance_compared_to_all_channels,percent_change_in_r2_compared_to_all_channels
0,All channels,0.400583,-0.019796,0.392975,-0.0,0.0,0.0
1,DNA removed,0.390905,-0.020333,0.383783,2.71393,-2.415817,-2.339112
2,ER removed,0.392863,-0.020191,0.385536,1.994671,-1.927113,-1.892882
3,Gasdermin removed,0.395321,-0.020094,0.38819,1.508699,-1.313463,-1.21775
4,Mito removed,0.393283,-0.020171,0.386,1.894805,-1.822374,-1.775029
5,PM removed,0.390773,-0.02035,0.383315,2.800117,-2.448902,-2.458193


In [64]:
# get the min and max r2 values for each channel combination
channel_feature_combinations_key_min_max = model_performances_grouped.groupby(
    "channel_feature_combinations_key"
).agg(
    {
        "r2": ["min", "max"],
    }
)
channel_feature_combinations_key_min_max.reset_index(inplace=True)
channel_feature_combinations_key_min_max

Unnamed: 0_level_0,channel_feature_combinations_key,r2,r2
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max
0,All channels,-0.364949,0.995381
1,DNA removed,-0.260378,0.988398
2,ER removed,-0.33922,0.991563
3,Gasdermin removed,-0.270777,0.992087
4,Mito removed,-0.356742,0.989602
5,PM removed,-0.382889,0.990629


In [None]:
# subset for IL-1beta across all channel combinations
IL1beta_model_performances = model_performances_grouped.loc[
    model_performances_grouped["cytokine"] == "IL-1beta"
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  IL1beta_model_performances['percent_change_in_negMSE_compared_to_all_channels'] = (IL1beta_model_performances['neg_mean_squared_error'] - IL1beta_model_performances.loc[IL1beta_model_performances['channel_feature_combinations_key'] == 'All channels', 'neg_mean_squared_error'].values[0]) / IL1beta_model_performances.loc[IL1beta_model_performances['channel_feature_combinations_key'] == 'All channels', 'neg_mean_squared_error'].values[0] * 100


Unnamed: 0,cytokine,data_split,channel_feature_combinations_key,explained_variance,neg_mean_squared_error,r2,percent_change_in_negMSE_compared_to_all_channels
1392,IL-1beta,test,All channels,0.978165,-0.004284,0.97777,-0.0
1393,IL-1beta,test,DNA removed,0.965716,-0.006714,0.965159,56.727855
1394,IL-1beta,test,ER removed,0.97133,-0.005732,0.970258,33.791763
1395,IL-1beta,test,Gasdermin removed,0.971927,-0.005515,0.971384,28.728512
1396,IL-1beta,test,Mito removed,0.962667,-0.007258,0.962335,69.432179
1397,IL-1beta,test,PM removed,0.967527,-0.006346,0.96707,48.133154
1398,IL-1beta,train,All channels,0.994245,-0.001062,0.994245,-75.212591
1399,IL-1beta,train,DNA removed,0.985091,-0.002751,0.985091,-35.782925
1400,IL-1beta,train,ER removed,0.987214,-0.002359,0.987214,-44.929671
1401,IL-1beta,train,Gasdermin removed,0.988139,-0.002189,0.988139,-48.912062
