In [1]:
import logging
import pathlib
import sys
from typing import Optional

import numpy as np
import pandas as pd
import toml
from copairs.map import run_pipeline
from pycytominer import feature_select

# imports src
sys.path.append("../")
from src import utils

# configure logging once for the notebook: DEBUG and above go to a log file
logging.basicConfig(
    format="%(levelname)s:%(asctime)s:%(name)s:%(message)s",
    level=logging.DEBUG,
    filename="map_analysis_testing.log",
)

## Helper functions
Helper functions used later in the notebook to shuffle metadata labels and feature values.

In [2]:
## Helper function


def shuffle_meta_labels(
    dataset: pd.DataFrame, target_col: str, seed: Optional[int] = 0
) -> pd.DataFrame:
    """Shuffle the values of one column while leaving every other column intact.

    The input dataframe is not modified; the shuffle is applied to a copy,
    which is returned.

    Parameters
    ----------
    dataset : pd.DataFrame
        dataframe containing the dataset

    target_col : str
        name of the column whose values are shuffled

    seed : Optional[int]
        seed passed to ``np.random.seed`` before shuffling, by default 0

    Returns
    -------
    pd.DataFrame
        copy of ``dataset`` with the values of ``target_col`` permuted

    Raises
    ------
    TypeError
        raised if ``target_col`` is not a string or ``dataset`` is not a
        pandas dataframe
    KeyError
        raised by pandas if ``target_col`` is not a column of ``dataset``
    """
    # type checking first, so invalid input does not touch the global random state
    if not isinstance(target_col, str):
        raise TypeError("'target_col' must be a string type")
    if not isinstance(dataset, pd.DataFrame):
        raise TypeError("'dataset' must be a pandas dataframe")

    # seeding numpy's global generator keeps the permutation reproducible
    np.random.seed(seed)

    # work on a copy so the caller's dataframe is never overwritten
    shuffled = dataset.copy()
    shuffled[target_col] = np.random.permutation(shuffled[target_col].values)
    return shuffled


def shuffle_features(feature_vals: np.ndarray, seed: Optional[int] = 0) -> np.ndarray:
    """Shuffle the values within each column of a 2-dimensional feature array.

    Every column is permuted independently of the others. The input array is
    not modified; the shuffle is applied to a copy, which is returned.

    Parameters
    ----------
    feature_vals : np.ndarray
        2-dimensional array of feature values to shuffle

    seed : Optional[int]
        seed passed to ``np.random.seed`` before shuffling, by default 0

    Returns
    -------
    np.ndarray
        array of the same shape with the values of each column permuted

    Raises
    ------
    TypeError
        Raised if a numpy array is not provided, or if the array is not
        2-dimensional
    """
    # validate input before touching the global random state
    if not isinstance(feature_vals, np.ndarray):
        raise TypeError("'feature_vals' must be a numpy array")
    if feature_vals.ndim != 2:
        # TypeError (rather than ValueError) is kept for existing callers
        raise TypeError("'feature_vals' must be a 2-dimensional array")

    # seeding numpy's global generator keeps the permutation reproducible
    np.random.seed(seed)

    # copy so the caller's array is never overwritten
    shuffled = np.copy(feature_vals)

    # permute each column on its own, one draw from the generator per column
    for col_idx in range(shuffled.shape[1]):
        shuffled[:, col_idx] = np.random.permutation(shuffled[:, col_idx])

    return shuffled

## Setting up Paths and loading data

In [3]:
# locate the ground-truth treatment groups file
# (resolve(strict=True) raises FileNotFoundError if the file is missing)
ground_truth_path = pathlib.Path(
    "../../4.sc_Morphology_Neural_Network_MLP_Model/MLP_utils/ground_truth.toml"
).resolve(strict=True)

# parse the TOML file; the path keeps its own name so `ground_truth` is only
# ever the parsed mapping
ground_truth = toml.load(ground_truth_path)

# treatment lists for each class
apoptosis_ground_truth = ground_truth["Apoptosis"]["apoptosis_groups_list"]
pyroptosis_ground_truth = ground_truth["Pyroptosis"]["pyroptosis_groups_list"]
control_ground_truth = ground_truth["Healthy"]["healthy_groups_list"]

In [4]:
# path to the input parquet file; resolve(strict=True) fails right away if
# the file does not exist
single_cell_data = pathlib.Path(
    "../../data/PBMC_preprocessed_sc_norm_aggregated_nomic.parquet"
).resolve(strict=True)

# load the dataset
df = pd.read_parquet(single_cell_data)

In [5]:
# directory that receives the mAP score tables (created if it does not exist)
map_out_dir = pathlib.Path("../data/processed/mAP_scores/secretome/")
map_out_dir.mkdir(parents=True, exist_ok=True)

# one CSV per analysis:
# non-shuffled data
regular_feat_map_path = map_out_dir / "mAP_scores_regular_treatment.csv"

# shuffled treatment labels
shuffled_feat_map_path = map_out_dir / "mAP_scores_shuffled_treatment.csv"

# shuffled feature space
shuffled_feat_space_map_path = (
    map_out_dir / "mAP_scores_shuffled_feature_space_treatment.csv"
)

### Clean up data

In [6]:
# label every row by looking its treatment up in the ground-truth lists
# (vectorised membership test instead of a row-wise apply)
treatment_values = df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]

# conditions are evaluated in order, so Apoptosis takes precedence over
# Pyroptosis if a treatment appears in both lists
# NOTE(review): treatments found in neither list fall through to "Control"
# without being checked against `control_ground_truth` — confirm this is intended
df["Metadata_labels"] = np.select(
    [
        treatment_values.isin(apoptosis_ground_truth),
        treatment_values.isin(pyroptosis_ground_truth),
    ],
    ["Apoptosis", "Pyroptosis"],
    default="Control",
)

In [7]:
# keep only the columns whose name contains "Metadata" or "NSU"
df = df.filter(regex="Metadata|NSU")

# inconsistent treatment names (old name -> harmonised name); no harmonised
# name is itself an old name, so one pass over the mapping is enough
treatment_name_fixes = {
    "Flagellin_0.100_ug_per_ml_DMSO_0.000_%": "Flagellin_0.100_ug_per_ml_DMSO_0.025_%",
    "Flagellin_1.000_0_DMSO_0.025_%": "Flagellin_1.000_ug_per_ml_DMSO_0.025_%",
    "Flagellin_1.000_ug_per_ml_DMSO_0.000_%": "Flagellin_1.000_ug_per_ml_DMSO_0.025_%",
    "media_ctr_0.0_0_Media_0_0": "Media",
    "media_ctr_0.0_0_Media_ctr_0.0_0": "Media",
    "Flagellin_1.000_0_Disulfiram_1.000_uM": "Flagellin_1.000_ug_per_ml_Disulfiram_1.000_uM",
}

# replace values in the oneb_Metadata_Treatment_Dose_Inhibitor_Dose column
df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] = df[
    "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"
].replace(treatment_name_fixes)

# number of distinct treatments after harmonisation
len(df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].unique())

37

In [8]:
# output directories
# NOTE(review): this rebinds `map_out_dir` to the parent of the "secretome"
# directory set earlier. The CSV output paths were already built from the
# earlier value, so they are not affected by this assignment — confirm the
# rebinding is still needed.
map_out_dir = pathlib.Path("../data/processed/mAP_scores/")
# create the directory (and any missing parents); no error if it already exists
map_out_dir.mkdir(parents=True, exist_ok=True)

### mAP Pipeline Parameters

The null size must be chosen before running the mAP pipeline. It is the size of the null distribution against which each score is compared; below, it is set to the smallest number of wells found for any single treatment.

In [9]:
# count the non-null entries of every column within each treatment group
tmp = df.groupby(["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]).count()

# bring the treatment back as a column and keep only the well count beside it
tmp = tmp.reset_index()
tmp = tmp[["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]]

# smallest number of wells found for any single treatment
min_count = tmp["Metadata_Well"].min()
print(min_count)

2


Unnamed: 0,Metadata_Well,oneb_Metadata_Treatment_Dose_Inhibitor_Dose
0,8,DMSO_0.100_%_DMSO_0.025_%
1,4,DMSO_0.100_%_DMSO_1.000_%
2,4,DMSO_0.100_%_Z-VAD-FMK_100.000_uM
3,4,DMSO_0.100_%_Z-VAD-FMK_30.000_uM
4,4,Disulfiram_0.100_uM_DMSO_0.025_%
5,4,Disulfiram_1.000_uM_DMSO_0.025_%
6,4,Disulfiram_2.500_uM_DMSO_0.025_%
7,3,Flagellin_0.100_ug_per_ml_DMSO_0.025_%
8,2,Flagellin_1.000_ug_per_ml_DMSO_0.025_%
9,3,Flagellin_1.000_ug_per_ml_Disulfiram_1.000_uM


In [10]:
# grouping columns handed to run_pipeline under the same argument names
# (presumably: positive pairs share a treatment but come from different wells,
# negative pairs come from different treatments — confirm against copairs docs)
pos_sameby = ["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
pos_diffby = ["Metadata_Well"]

neg_sameby = []
neg_diffby = ["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]

# null size is the smallest per-treatment well count computed above
null_size = min_count
batch_size = 1

# number of resampling
# NOTE(review): not referenced by any of the pipeline calls visible in this notebook
n_resamples = 10

### mAP analysis for non-shuffled data

In [11]:
# splitting metadata and raw feature values
logging.info("splitting data set into metadata and raw feature values")
df_meta, df_feats = utils.split_data(df)
df_feats = np.array(df_feats)

# stays None if the pipeline fails, so a failed run neither raises a NameError
# below nor saves a stale `result` left over from an earlier execution
result = None
try:
    # run the mAP pipeline on the non-shuffled data
    logging.info("Running pipeline on CP features using phenotype")
    result = run_pipeline(
        meta=df_meta,
        feats=df_feats,
        pos_sameby=pos_sameby,
        pos_diffby=pos_diffby,
        neg_sameby=neg_sameby,
        neg_diffby=neg_diffby,
        batch_size=batch_size,
        null_size=null_size,
    )

    # tag the scores so they can be told apart from the shuffled runs
    result["shuffled"] = "non-shuffled"

except ZeroDivisionError as e:
    logging.warning(f"{e} captured on phenotye:. Skipping")

# save the scores only when the pipeline actually produced them
if result is not None:
    result.to_csv(regular_feat_map_path, index=False)

# preview the scores (nothing is displayed when the run was skipped)
result.head() if result is not None else None

  0%|          | 0/237 [00:00<?, ?it/s]

  0%|          | 0/10789 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,Metadata_Well,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,Metadata_labels,average_precision,p_value,n_pos_pairs,n_total_pairs,shuffled
0,B02,LPS_0.010_ug_per_ml_DMSO_0.025_%,Pyroptosis,0.276772,0.333333,3,148,non-shuffled
1,B03,LPS_0.010_ug_per_ml_DMSO_0.025_%,Pyroptosis,0.347619,0.333333,3,148,non-shuffled
2,B04,LPS_Nigericin_100.000_ug_per_ml_1.000_uM_DMSO_...,Pyroptosis,1.0,0.333333,3,148,non-shuffled
3,B05,LPS_Nigericin_100.000_ug_per_ml_1.000_uM_DMSO_...,Pyroptosis,1.0,0.333333,3,148,non-shuffled
4,B06,DMSO_0.100_%_DMSO_0.025_%,Control,0.1551,0.333333,7,148,non-shuffled


### mAP analysis for shuffled data (Phenotype)

In [12]:
logging.info("Running mAP pipeline with shuffled phenotype labeled data")
seed = 0

# shuffle the treatment labels, then split into metadata and raw feature values
# NOTE(review): `df` is rebound to the shuffled frame, so every later cell that
# reads `df` sees shuffled treatment labels, and re-running this cell shuffles
# the already-shuffled labels again — confirm this is intended
logging.info("splitting shuffled data set into metadata and raw feature values")
df = shuffle_meta_labels(
    dataset=df, target_col="oneb_Metadata_Treatment_Dose_Inhibitor_Dose", seed=seed
)
df_meta, df_feats = utils.split_data(df)
df_feats = np.array(df_feats)

# stays None if the pipeline fails, so a failed run neither raises a NameError
# below nor saves a stale `shuffled_result` left over from an earlier execution
shuffled_result = None
try:
    # run the mAP pipeline on the label-shuffled data
    logging.info(
        "Running pipeline on CP features using  phenotype, data is shuffled by phenoptype labels"
    )
    shuffled_result = run_pipeline(
        meta=df_meta,
        feats=df_feats,
        pos_sameby=pos_sameby,
        pos_diffby=pos_diffby,
        neg_sameby=neg_sameby,
        neg_diffby=neg_diffby,
        batch_size=batch_size,
        null_size=null_size,
    )

    # adding shuffle label column
    shuffled_result["shuffled"] = "phenotype_shuffled"

except ZeroDivisionError as e:
    logging.warning(f"{e} captured on phenotye: Skipping")

# save the scores only when the pipeline actually produced them
if shuffled_result is not None:
    shuffled_result.to_csv(shuffled_feat_map_path, index=False)

# display the scores (nothing is displayed when the run was skipped)
shuffled_result

  0%|          | 0/237 [00:00<?, ?it/s]

  0%|          | 0/10789 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,Metadata_Well,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,Metadata_labels,average_precision,p_value,n_pos_pairs,n_total_pairs,shuffled
0,B02,Topotecan_10.000_nM_DMSO_0.025_%,Pyroptosis,0.036077,0.666667,3,148,phenotype_shuffled
1,B03,DMSO_0.100_%_Z-VAD-FMK_100.000_uM,Pyroptosis,0.066230,0.333333,3,148,phenotype_shuffled
2,B04,Topotecan_10.000_nM_DMSO_0.025_%,Pyroptosis,0.033492,0.666667,3,148,phenotype_shuffled
3,B05,H2O2_100.000_uM_DMSO_0.025_%,Pyroptosis,0.059661,0.333333,3,148,phenotype_shuffled
4,B06,LPS_0.010_ug_per_ml_DMSO_0.025_%,Control,0.068855,0.333333,3,148,phenotype_shuffled
...,...,...,...,...,...,...,...,...
144,O07,LPS_Nigericin_100.000_ug_per_ml_1.000_uM_DMSO_...,Control,0.044226,0.666667,3,148,phenotype_shuffled
145,O08,Thapsigargin_1.000_uM_DMSO_0.025_%,Control,0.026177,0.666667,3,148,phenotype_shuffled
146,O09,LPS_10.000_ug_per_ml_Disulfiram_2.500_uM,Control,0.017773,1.000000,3,148,phenotype_shuffled
147,O10,Flagellin_0.100_ug_per_ml_DMSO_0.025_%,Control,0.508403,0.333333,2,148,phenotype_shuffled


### mAP analysis for shuffled data (Feature space)

In [13]:
seed = 0

# split the current `df` into metadata and raw feature values
logging.info("splitting shuffled data set into metadata and raw feature values")
df_meta, df_feats = utils.split_data(df)
df_feats = np.array(df_feats)

# replace the feature matrix with a copy whose columns are each shuffled
df_feats = shuffle_features(feature_vals=df_feats, seed=seed)

# stays None if the pipeline fails, so a failed run neither raises a NameError
# below nor saves a stale `shuffled_feat_map` left over from an earlier execution
shuffled_feat_map = None
try:
    # run the mAP pipeline on the feature-shuffled data
    logging.info(
        "Running pipeline on CP features using phenotype, feature space is shuffled"
    )
    shuffled_feat_map = run_pipeline(
        meta=df_meta,
        feats=df_feats,
        pos_sameby=pos_sameby,
        pos_diffby=pos_diffby,
        neg_sameby=neg_sameby,
        neg_diffby=neg_diffby,
        batch_size=batch_size,
        null_size=null_size,
    )

    # adding shuffle label column
    shuffled_feat_map["shuffled"] = "features_shuffled"

except ZeroDivisionError as e:
    logging.warning(f"{e} captured on phenotype:  Skipping")

# save the scores only when the pipeline actually produced them
if shuffled_feat_map is not None:
    shuffled_feat_map.to_csv(shuffled_feat_space_map_path, index=False)

# display the scores (nothing is displayed when the run was skipped)
shuffled_feat_map

  0%|          | 0/237 [00:00<?, ?it/s]

  0%|          | 0/10789 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,Metadata_Well,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,Metadata_labels,average_precision,p_value,n_pos_pairs,n_total_pairs,shuffled
0,B02,Topotecan_10.000_nM_DMSO_0.025_%,Pyroptosis,0.081604,0.333333,3,148,features_shuffled
1,B03,DMSO_0.100_%_Z-VAD-FMK_100.000_uM,Pyroptosis,0.022795,0.666667,3,148,features_shuffled
2,B04,Topotecan_10.000_nM_DMSO_0.025_%,Pyroptosis,0.024916,0.666667,3,148,features_shuffled
3,B05,H2O2_100.000_uM_DMSO_0.025_%,Pyroptosis,0.023977,0.666667,3,148,features_shuffled
4,B06,LPS_0.010_ug_per_ml_DMSO_0.025_%,Control,0.063995,0.333333,3,148,features_shuffled
...,...,...,...,...,...,...,...,...
144,O07,LPS_Nigericin_100.000_ug_per_ml_1.000_uM_DMSO_...,Control,0.023721,0.666667,3,148,features_shuffled
145,O08,Thapsigargin_1.000_uM_DMSO_0.025_%,Control,0.025479,0.666667,3,148,features_shuffled
146,O09,LPS_10.000_ug_per_ml_Disulfiram_2.500_uM,Control,0.024267,0.666667,3,148,features_shuffled
147,O10,Flagellin_0.100_ug_per_ml_DMSO_0.025_%,Control,0.077425,0.333333,2,148,features_shuffled
