In [1]:
import pathlib
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
from copairs.map import aggregate

# Silence every warning category for the remainder of the session.
# NOTE(review): this blanket filter also hides deprecation and runtime
# warnings raised by the libraries used below -- consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
# Input/output locations, all rooted at the processed-data folder
processed_data_dir = pathlib.Path("../data/processed/")

# single-cell AP score CSVs are read from this folder
sc_ap_scores_dir = processed_data_dir.joinpath("mAP_scores", "morphology").resolve()

# aggregated mAP tables are written to this folder; create it if missing
agg_sc_ap_scores_dir = processed_data_dir.joinpath(
    "aggregate_mAPs", "morphology"
).resolve()
agg_sc_ap_scores_dir.mkdir(parents=True, exist_ok=True)

## Preparing the dataset


In [3]:
# Collect the per-class single-cell AP score files. Sorting makes the load
# order (and therefore the row order of the combined frame) reproducible,
# because glob yields paths in arbitrary filesystem order.
all_files = sorted(sc_ap_scores_dir.glob("*.csv"))
# keep only the files whose stem contains the string "class"
class_files = [file for file in all_files if "class" in file.stem]
if not class_files:
    # fail with an actionable message instead of pd.concat's
    # "No objects to concatenate"
    raise FileNotFoundError(
        f"No '*class*.csv' score files found in {sc_ap_scores_dir}"
    )

# load each file and tag its rows with the stem of the file they came from
class_score_dfs = [
    pd.read_csv(file).assign(file=file.stem) for file in class_files
]

# single-cell mAP scores
mAPs = pd.concat(class_score_dfs)

# comparisons present in the loaded scores
mAPs["comparison"].unique()

array(['Pyroptosis_vs_Control', 'Pyroptosis_vs_Apoptosis',
       'Control_vs_Apoptosis'], dtype=object)

In [4]:
# Split the single-cell scores by shuffling status.
reg_sc_mAPs = mAPs.loc[mAPs["shuffled"] == "non-shuffled"]

# The recorded outputs of this notebook list only the labels "non-shuffled"
# and "shuffled" in the `shuffled` column, so comparing against
# "features_shuffled" alone selects no rows. Both spellings are accepted so
# score files written with either label still populate this frame.
# NOTE(review): presumably "shuffled" denotes the feature-shuffled control --
# confirm against the code that writes the score files.
shuffled_feat_sc_mAPs = mAPs.loc[
    mAPs["shuffled"].isin(["shuffled", "features_shuffled"])
]

In [5]:
# Sampling error of the single-cell average-precision scores, computed for
# each (Metadata_labels, shuffled, comparison) group.
group_keys = ["Metadata_labels", "shuffled", "comparison"]
df_group = mAPs.groupby(by=group_keys)


def _sampling_error(avg_precision):
    """Return std(values) / sqrt(n) for one group's average-precision scores."""
    values = avg_precision.values
    # NOTE(review): np.std defaults to ddof=0 (population standard deviation);
    # confirm that is intended rather than the sample estimate (ddof=1).
    return np.std(values) / np.sqrt(len(values))


# one row per group, with columns in the order used by the rest of the notebook
cols = ["Metadata_labels", "shuffled", "sampling_error", "comparison"]
sampling_error_df = (
    df_group["average_precision"]
    .agg(_sampling_error)
    .rename("sampling_error")
    .reset_index()[cols]
)

sampling_error_df.head()

Unnamed: 0,Metadata_labels,shuffled,sampling_error,comparison
0,Apoptosis,non-shuffled,0.08918,Control_vs_Apoptosis
1,Apoptosis,non-shuffled,0.090958,Pyroptosis_vs_Apoptosis
2,Apoptosis,shuffled,0.027081,Control_vs_Apoptosis
3,Apoptosis,shuffled,0.018228,Pyroptosis_vs_Apoptosis
4,Control,non-shuffled,0.003623,Control_vs_Apoptosis


In [6]:
# Generating aggregate (mAP) scores for every
# (Metadata_labels, shuffled, comparison) group.
P_VALUE_THRESHOLD = 0.05  # value passed to `aggregate` as `threshold`
group_keys = ["Metadata_labels", "shuffled", "comparison"]

agg_frames = []
for (label, shuffled_type, comparison), group_df in mAPs.groupby(by=group_keys):
    agg_df = aggregate(
        group_df, sameby=["Metadata_labels"], threshold=P_VALUE_THRESHOLD
    )
    # attach the group keys so each aggregated row stays identifiable
    agg_frames.append(
        agg_df.assign(
            Metadata_labels=label,
            shuffled=shuffled_type,
            comparison=comparison,
        )
    )

# Every per-group result carries the same index label (the recorded output
# shows 0 on each row); renumber so the combined frame has a unique index.
mAP_dfs = pd.concat(agg_frames, ignore_index=True)
mAP_dfs.to_csv(agg_sc_ap_scores_dir / "mAP_scores_class.csv", index=False)
mAP_dfs.head()

Unnamed: 0,Metadata_labels,mean_average_precision,nlog10pvalue,q_value,nlog10qvalue,above_p_threshold,above_q_threshold,shuffled,comparison
0,Apoptosis,0.666299,1.633124,0.023274,1.633124,True,True,non-shuffled,Control_vs_Apoptosis
0,Apoptosis,0.697376,1.592101,0.02558,1.592101,True,True,non-shuffled,Pyroptosis_vs_Apoptosis
0,Apoptosis,0.121844,0.464301,0.34332,0.464301,False,False,shuffled,Control_vs_Apoptosis
0,Apoptosis,0.150701,0.425776,0.375167,0.425776,False,False,shuffled,Pyroptosis_vs_Apoptosis
0,Control,0.908511,0.374607,0.422078,0.374607,False,False,non-shuffled,Control_vs_Apoptosis
