In [1]:
import pathlib
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
from copairs.map import aggregate

warnings.filterwarnings("ignore")

In [2]:
# Directories
processed_data_dir = pathlib.Path("../data/processed/")
sc_ap_scores_dir = (processed_data_dir / "mAP_scores/morphology").resolve()
agg_sc_ap_scores_dir = (processed_data_dir / "aggregate_mAPs/morphology").resolve()
agg_sc_ap_scores_dir.mkdir(parents=True, exist_ok=True)

## Preparing the dataset


In [3]:
all_files = list(sc_ap_scores_dir.glob("*.csv"))
# get the files that contain the string class
class_files = [file for file in all_files if "class" in file.stem]
mAPs = []
for file in class_files:
    df = pd.read_csv(file)
    df["file"] = file.stem
    mAPs.append(df)
# single-cell mAP scores
mAPs = pd.concat(mAPs)
mAPs.head()
mAPs["comparison"].unique()

array(['Control_vs_Apoptosis', 'Pyroptosis_vs_Apoptosis',
       'Pyroptosis_vs_Control'], dtype=object)

In [4]:
# grabbing all cp features (regular, feature shuffled and labeled shuffled)
reg_sc_mAPs = mAPs.loc[mAPs["shuffled"] == "non-shuffled"]
shuffled_feat_sc_mAPs = mAPs.loc[mAPs["shuffled"] == "features_shuffled"]

In [5]:
# Generating sampling_error df
# This table will be used to merge with the aggregate table to get the sampling error a specific category.
merged_sc_ap_scores_df = pd.concat(
    [
        reg_sc_mAPs,
        shuffled_feat_sc_mAPs,
    ]
)

# grouping dataframe based on phenotype levels, feature and feature types
df_group = merged_sc_ap_scores_df.groupby(
    by=["Metadata_labels", "shuffled", "comparison"]
)

# calculating sampling error
sampling_error_df = []
for name, df in df_group:
    pheno, shuffled_type, comparison = name

    # caclulating sampling error
    avg_percision = df["average_precision"].values
    sampling_error = np.std(avg_percision) / np.sqrt(len(avg_percision))

    sampling_error_df.append([pheno, shuffled_type, sampling_error, comparison])
cols = ["Metadata_labels", "shuffled", "sampling_error", "comparison"]
sampling_error_df = pd.DataFrame(sampling_error_df, columns=cols)


sampling_error_df.head()

Unnamed: 0,Metadata_labels,shuffled,sampling_error,comparison
0,Apoptosis,non-shuffled,0.085021,Control_vs_Apoptosis
1,Control,non-shuffled,0.011942,Control_vs_Apoptosis
2,Pyroptosis,non-shuffled,0.017691,Control_vs_Apoptosis


In [6]:
mAPs["comparison"].unique()

array(['Control_vs_Apoptosis', 'Pyroptosis_vs_Apoptosis',
       'Pyroptosis_vs_Control'], dtype=object)

In [7]:
# aggregate single cells scores with cell labels
data = tuple(mAPs.groupby(by=["Metadata_labels", "shuffled", "comparison"]))
columns = mAPs.columns
agg_sc_ap_scores_df = []
for cell_id, df1 in data:
    for shuffle_type, df2 in df1.groupby(by="shuffled"):
        aggregated_ap_score = df2["average_precision"].mean()

        # select a single row since all the metadata is the same
        selected_row = df2.iloc[0]

        # update the average precision score of the single row
        selected_row["average_precision"] = aggregated_ap_score
        agg_sc_ap_scores_df.append(selected_row.values.tolist())

# saving into the results repo
agg_sc_ap_scores_df = pd.DataFrame(data=agg_sc_ap_scores_df, columns=columns)
agg_sc_ap_scores_df.to_csv(
    sc_ap_scores_dir / "merged_sc_agg_ap_scores_class.csv", index=False
)
agg_sc_ap_scores_df.head()

Unnamed: 0,Metadata_Well,Metadata_labels,average_precision,p_value,n_pos_pairs,n_total_pairs,shuffled,comparison,file
0,D06,Apoptosis,0.640446,0.863636,7,153,non-shuffled,Control_vs_Apoptosis,merged_sc_agg_ap_scores_class
1,D06,Apoptosis,0.063021,0.378788,7,153,shuffled_baseline,Control_vs_Apoptosis,merged_sc_agg_ap_scores_class
2,D06,Apoptosis,0.063021,0.378788,7,153,shuffled_baseline,Pyroptosis_vs_Apoptosis,merged_sc_agg_ap_scores_class
3,D06,Apoptosis,0.063021,0.378788,7,153,shuffled_baseline,Pyroptosis_vs_Control,merged_sc_agg_ap_scores_class
4,B06,Control,0.603318,0.015152,80,153,non-shuffled,Control_vs_Apoptosis,merged_sc_agg_ap_scores_class


In [8]:
agg_sc_ap_scores_df["comparison"].unique()

array(['Control_vs_Apoptosis', 'Pyroptosis_vs_Apoptosis',
       'Pyroptosis_vs_Control'], dtype=object)

In [9]:
# Generating aggregate scores with a threshold p-value of 0.05
mAP_dfs = []
for name, df in tuple(
    agg_sc_ap_scores_df.groupby(by=["Metadata_labels", "shuffled", "comparison"])
):
    agg_df = aggregate(df, sameby=["Metadata_labels"], threshold=0.05)
    agg_df["Metadata_labels"] = name[0]
    agg_df["shuffled"] = name[1]
    agg_df["comparison"] = name[2]
    mAP_dfs.append(agg_df)

mAP_dfs = pd.concat(mAP_dfs)
mAP_dfs.to_csv(agg_sc_ap_scores_dir / "mAP_scores_class.csv", index=False)
mAP_dfs.head()

Unnamed: 0,Metadata_labels,mean_average_precision,nlog10pvalue,q_value,nlog10qvalue,above_p_threshold,above_q_threshold,shuffled,comparison
0,Apoptosis,0.640446,0.063669,0.863636,0.063669,False,False,non-shuffled,Control_vs_Apoptosis
0,Apoptosis,0.063021,0.421604,0.378788,0.421604,False,False,shuffled_baseline,Control_vs_Apoptosis
0,Apoptosis,0.063021,0.421604,0.378788,0.421604,False,False,shuffled_baseline,Pyroptosis_vs_Apoptosis
0,Apoptosis,0.063021,0.421604,0.378788,0.421604,False,False,shuffled_baseline,Pyroptosis_vs_Control
0,Control,0.603318,1.819544,0.015152,1.819544,True,True,non-shuffled,Control_vs_Apoptosis


In [10]:
mAP_dfs["comparison"].unique()
mAP_dfs["shuffled"].unique()

array(['non-shuffled', 'shuffled_baseline'], dtype=object)

## Forming bar plots


### Forming bar plots with CP Features


In [11]:
mAP_dfs

Unnamed: 0,Metadata_labels,mean_average_precision,nlog10pvalue,q_value,nlog10qvalue,above_p_threshold,above_q_threshold,shuffled,comparison
0,Apoptosis,0.640446,0.063669,0.863636,0.063669,False,False,non-shuffled,Control_vs_Apoptosis
0,Apoptosis,0.063021,0.421604,0.378788,0.421604,False,False,shuffled_baseline,Control_vs_Apoptosis
0,Apoptosis,0.063021,0.421604,0.378788,0.421604,False,False,shuffled_baseline,Pyroptosis_vs_Apoptosis
0,Apoptosis,0.063021,0.421604,0.378788,0.421604,False,False,shuffled_baseline,Pyroptosis_vs_Control
0,Control,0.603318,1.819544,0.015152,1.819544,True,True,non-shuffled,Control_vs_Apoptosis
0,Control,0.535067,0.176091,0.666667,0.176091,False,False,shuffled_baseline,Control_vs_Apoptosis
0,Control,0.535067,0.176091,0.666667,0.176091,False,False,shuffled_baseline,Pyroptosis_vs_Apoptosis
0,Control,0.535067,0.176091,0.666667,0.176091,False,False,shuffled_baseline,Pyroptosis_vs_Control
0,Pyroptosis,0.652992,1.819544,0.015152,1.819544,True,True,non-shuffled,Control_vs_Apoptosis
0,Pyroptosis,0.428223,0.404571,0.393939,0.404571,False,False,shuffled_baseline,Control_vs_Apoptosis


In [12]:
# phenotypes
df = mAP_dfs.reset_index().drop("index", axis=1)[
    ["Metadata_labels", "mean_average_precision", "shuffled", "comparison"]
]


fig = px.bar(
    df,
    x="Metadata_labels",
    y="mean_average_precision",
    color="comparison",
    barmode="group",
    title="Mean Average Precision for each Cell Death Phenotype",
    labels={
        "mean_average_precision": "Mean Average Precision",
        "Metadata_labels": "Cell Death Phenotypes",
    },
)
fig

## Generating box plots of single cell ap scores

In [13]:
all_df = pd.concat(
    [
        reg_sc_mAPs,
        shuffled_feat_sc_mAPs,
    ]
)

In [14]:
# Assuming all_cp_df, all_dp_df, and all_cp_dp_df are your DataFrames
categories_order = all_df["Metadata_labels"].unique()

# Create individual figures with the same category order
fig1 = px.box(
    all_df,
    x="Metadata_labels",
    y="average_precision",
    color="shuffled",
    title="Single Well Average Percision Scores",
    category_orders={"Metadata_labels": categories_order},
    labels={
        "average_precision": "Average Precision Scores",
        "Metadata_labels": "Cell Death Class",
    },
)
fig1

In [15]:
all_df["comparison"].unique()

array(['Control_vs_Apoptosis'], dtype=object)