In [1]:
import pathlib
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
from copairs.map import aggregate

warnings.filterwarnings("ignore")

In [2]:
# Directories
processed_data_dir = pathlib.Path("../data/processed/")
sc_ap_scores_dir = (processed_data_dir / "mAP_scores/secretome").resolve()
agg_sc_ap_scores_dir = (processed_data_dir / "aggregate_mAPs/secretome").resolve()
agg_sc_ap_scores_dir.mkdir(parents=True, exist_ok=True)

## Preparing the dataset


In [3]:
all_files = list(sc_ap_scores_dir.glob("*.csv"))
# get the files that contain the string class
class_files = [file for file in all_files if "class" in file.stem]
mAPs = []
for file in class_files:
    df = pd.read_csv(file)
    df["file"] = file.stem
    mAPs.append(df)
# single-cell mAP scores
mAPs = pd.concat(mAPs)
mAPs.head()

Unnamed: 0,Metadata_Well,Metadata_labels,average_precision,p_value,n_pos_pairs,n_total_pairs,shuffled,comparison,file
0,D06,Apoptosis,0.86576,0.015152,7.0,72.0,non-shuffled,Pyroptosis_vs_Apoptosis,merged_sc_agg_ap_scores_class
1,B06,Control,0.956508,0.015152,75.0,140.0,non-shuffled,Pyroptosis_vs_Control,merged_sc_agg_ap_scores_class
2,B02,Pyroptosis,0.928684,0.015152,64.0,140.0,non-shuffled,Pyroptosis_vs_Control,merged_sc_agg_ap_scores_class
0,B02,Pyroptosis,0.387611,1.0,64.0,140.0,shuffled,Pyroptosis_vs_Control,mAP_scores_shuffled_feature_space_class
1,B03,Pyroptosis,0.455559,0.742424,64.0,140.0,shuffled,Pyroptosis_vs_Control,mAP_scores_shuffled_feature_space_class


In [4]:
# grabbing all cp features (regular, feature shuffled and labeled shuffled)
reg_sc_mAPs = mAPs.loc[mAPs["shuffled"] == "non-shuffled"]
shuffled_feat_sc_mAPs = mAPs.loc[mAPs["shuffled"] == "features_shuffled"]
shuffled_pheno_sc_mAPs = mAPs.loc[mAPs["shuffled"] == "phenotype_shuffled"]

In [5]:
# calculating sampling error
# grouping dataframe based on phenotype levels, feature and feature types
df_group = mAPs.groupby(by=["Metadata_labels", "shuffled"])


sampling_error_df = []
for name, df in df_group:
    pheno, shuffled_type = name

    # caclulating sampling error
    avg_percision = df["average_precision"].values
    sampling_error = np.std(avg_percision) / np.sqrt(len(avg_percision))

    sampling_error_df.append([pheno, shuffled_type, sampling_error])
cols = ["Metadata_labels", "shuffled", "sampling_error"]
sampling_error_df = pd.DataFrame(sampling_error_df, columns=cols)


sampling_error_df.head()

Unnamed: 0,Metadata_labels,shuffled,sampling_error
0,Apoptosis,non-shuffled,0.054908
1,Apoptosis,shuffled,0.017341
2,Control,non-shuffled,0.006999
3,Control,shuffled,0.013836
4,Pyroptosis,non-shuffled,0.006264


In [6]:
# Generating aggregate scores with a threshold p-value of 0.05
mAP_dfs = []
for name, df in tuple(mAPs.groupby(by=["Metadata_labels", "shuffled"])):
    agg_df = aggregate(df, sameby=["Metadata_labels"], threshold=0.05)
    agg_df["Metadata_labels"] = name[0]
    agg_df["shuffled"] = name[1]
    mAP_dfs.append(agg_df)

mAP_dfs = pd.concat(mAP_dfs)
mAP_dfs.to_csv(agg_sc_ap_scores_dir / "mAP_scores_class.csv", index=False)
mAP_dfs.head()

Unnamed: 0,Metadata_labels,mean_average_precision,nlog10pvalue,q_value,nlog10qvalue,above_p_threshold,above_q_threshold,shuffled
0,Apoptosis,0.86576,1.791478,0.016163,1.791478,True,True,non-shuffled
0,Apoptosis,0.149142,0.464571,0.343107,0.464571,False,False,shuffled
0,Control,0.956508,1.718933,0.019101,1.718933,True,True,non-shuffled
0,Control,0.736023,0.4204,0.379839,0.4204,False,False,shuffled
0,Pyroptosis,0.928684,1.81265,0.015394,1.81265,True,True,non-shuffled
