In [1]:
import gc
import pathlib

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import toml

## Morphology Feature space stats

In [2]:
# set paths to data
norm_data_path = pathlib.Path("../../../data/PBMC_sc_norm.parquet").resolve(strict=True)

# fs data
norm_fs_data_path = pathlib.Path(
    "../../../data/PBMC_preprocessed_sc_norm.parquet"
).resolve(strict=True)

## Check Raw features shape

In [3]:
# load in the normalized data
# norm_data = pd.read_parquet(norm_data_path)

norm_schema = pq.read_schema(norm_data_path)

# get a list of column names
norm_cols = [col.name for col in norm_schema]
print(len(norm_cols))
# get columns that contain Metadata
metadata_cols = [col for col in norm_cols if "Metadata" in col]
# remove metadata columns from the list of columns
data_cols = [col for col in norm_cols if col not in metadata_cols]

print(f"There are {len(data_cols)} data columns")
print(f"There are {len(metadata_cols)} metadata columns")

2926
There are 2907 data columns
There are 19 metadata columns


## Check feature selected shape

In [4]:
norm_fs_schema = pq.read_schema(norm_fs_data_path)

# get a list of column names
norm_cols = [col.name for col in norm_schema]
print(len(norm_cols))
# get columns that contain Metadata
metadata_cols = [col for col in norm_cols if "Metadata" in col]
# remove metadata columns from the list of columns
data_cols = [col for col in norm_cols if col not in metadata_cols]

print(f"There are {len(data_cols)} data columns")
print(f"There are {len(metadata_cols)} metadata columns")

# get columns that contain Metadata

norm_fs_df_subset = pd.read_parquet(
    norm_fs_data_path,
    columns=["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
)
print(norm_fs_df_subset.shape)
norm_fs_df_subset.head()

2926
There are 2907 data columns
There are 19 metadata columns
(8318724, 2)


Unnamed: 0,Metadata_Well,oneb_Metadata_Treatment_Dose_Inhibitor_Dose
0,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%
1,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%
2,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%
3,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%
4,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%


In [5]:
# paht to the ground truth file
ground_truth_file_path = pathlib.Path(
    "../../../4.sc_Morphology_Neural_Network_MLP_Model/MLP_utils/ground_truth.toml"
).resolve(strict=True)
# read in the ground truth toml file
ground_truth = toml.load(ground_truth_file_path)
# get information from toml files
apoptosis_groups_list = ground_truth["Apoptosis"]["apoptosis_groups_list"]
pyroptosis_groups_list = ground_truth["Pyroptosis"]["pyroptosis_groups_list"]
healthy_groups_list = ground_truth["Healthy"]["healthy_groups_list"]
# add apoptosis, pyroptosis and healthy columns to dataframe
norm_fs_df_subset["apoptosis"] = norm_fs_df_subset.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in apoptosis_groups_list,
    axis=1,
)
norm_fs_df_subset["pyroptosis"] = norm_fs_df_subset.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in pyroptosis_groups_list,
    axis=1,
)
norm_fs_df_subset["healthy"] = norm_fs_df_subset.apply(
    lambda row: row["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
    in healthy_groups_list,
    axis=1,
)

# merge apoptosis, pyroptosis, and healthy columns into one column
norm_fs_df_subset["labels"] = norm_fs_df_subset.apply(
    lambda row: "apoptosis"
    if row["apoptosis"]
    else "pyroptosis"
    if row["pyroptosis"]
    else "healthy",
    axis=1,
)
# drop apoptosis, pyroptosis, and healthy columns
norm_fs_df_subset.drop(columns=["apoptosis", "pyroptosis", "healthy"], inplace=True)

In [6]:
# print the number of samples in each class
print(norm_fs_df_subset["labels"].value_counts())

healthy       4301036
pyroptosis    3578372
apoptosis      439316
Name: labels, dtype: int64


## Stats for the Elastic Net models

In [9]:
# set path for models performances
model_performances_path = pathlib.Path(
    "../../../6.bulk_Morphology_Elastic_Network/4.model_performance/results/regression/PBMC/all_model_performance.csv"
).resolve(strict=True)
# load in the model performances
model_performances = pd.read_csv(model_performances_path)

In [10]:
# drop uneeded columns
columns_to_drop = [
    "feature_names",
    "coefficients",
    "cell_type",
    "alpha",
    "l1_ratio",
]
model_performances.drop(columns=columns_to_drop, inplace=True)
# drop duplicates
print(model_performances.shape)
model_performances.drop_duplicates(inplace=True)
print(model_performances.shape)
model_performances.head()

(448426, 3)
(374, 3)


Unnamed: 0,secreted_proteins,shuffle,r2
0,CXCL17 [NSU],final,0.301493
1199,IL-7 [NSU],shuffled,-0.206494
2398,CXCL7 [NSU],shuffled,-0.001184
3597,CCL20 [NSU],final,0.957082
4796,IL-31 [NSU],final,-0.171904


In [11]:
# split the shuffled and final model performances
suffled_models = model_performances.loc[model_performances["shuffle"] == "shuffled"]
final_models = model_performances.loc[model_performances["shuffle"] == "final"]
print(suffled_models.shape)
print(final_models.shape)

(187, 3)
(187, 3)


In [12]:
# sort the final models by r2 score
final_models.sort_values(by="r2", ascending=False, inplace=True)
final_models.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_models.sort_values(by="r2", ascending=False, inplace=True)


Unnamed: 0,secreted_proteins,shuffle,r2
402864,TNF alpha [NSU],final,0.981769
86328,IL-1 beta [NSU],final,0.979441
275770,IL-6 [NSU],final,0.977715
394471,CCL4 [NSU],final,0.970054
196636,CCL3 [NSU],final,0.965891


In [13]:
# get the percentage of models that are above the threshold
threshold = 0.8
final_models_above_threshold = final_models.loc[final_models["r2"] >= threshold]
print(
    f"Percentage of models with r2 score above {threshold}: "
    f"{(final_models_above_threshold.shape[0] / final_models.shape[0]) * 100}",
    f"\n"
    f"The total number of models above the threshold is: {final_models_above_threshold.shape[0]}",
)

Percentage of models with r2 score above 0.8: 21.390374331550802 
The total number of models above the threshold is: 40


In [14]:
# sort the shuffled models by r2 score from low to high
final_models.sort_values(by="r2", ascending=True, inplace=True)
final_models.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_models.sort_values(by="r2", ascending=True, inplace=True)


Unnamed: 0,secreted_proteins,shuffle,r2
426844,FGF-21 [NSU],final,-0.331974
437635,IL-17C [NSU],final,-0.306111
326128,CX3CL1 [NSU],final,-0.293323
153472,IL-11 [NSU],final,-0.266779
74338,AITRL (GITR Ligand) [NSU],final,-0.230527


## LOCO ENET stats

In [None]:
# set path for models performances
model_performances_path = pathlib.Path(
    "../../../11.bulk_Morphology_Elastic_Network_LOCO/2.test_models/results/regression/PBMC_aggregated_with_nomic"
).resolve(strict=True)
#

In [17]:
# get a list of all the model performances
model_performances_list = list(model_performances_path.glob("*.csv"))
print(len(model_performances_list))
# load in the model performances to a df
model_performances = pd.concat(
    [pd.read_csv(model) for model in model_performances_list]
)
model_performances.head()

47874


KeyboardInterrupt: 