In [1]:
import pathlib
import pandas as pd

from copairs.map import aggregate

In [2]:
# Directories
processed_data_dir = pathlib.Path("../data/processed/")

# Input: single-cell mAP scores read by this notebook. The folder must already
# exist, so resolve strictly and fail fast when it is missing.
mAP_scores_dir = (processed_data_dir / "mAP_scores").resolve(strict=True)

# Output: folder that receives the aggregated scores written by this notebook.
# Create it when absent so a fresh checkout runs without manual setup.
# `parents` is left at its default (False) on purpose: the parent folder was
# just validated above, and a wrong working directory should still raise
# instead of silently creating a stray directory tree.
agg_mAP_scores_dir = processed_data_dir / "aggregate_mAPs"
agg_mAP_scores_dir.mkdir(exist_ok=True)
agg_mAP_scores_dir = agg_mAP_scores_dir.resolve(strict=True)

## Preparing the dataset 
In the next block, we load the single-cell mAP scores generated by the previous notebook.
These scores are loaded into dataframes and separated by feature type (CP, DP, and CP_DP).

We further divide the feature space based on the type of shuffling methods applied to it.
Then, we calculate their aggregated average scores using the `copairs` aggregate function.


In [3]:
# Collect every single-cell mAP score file found in the scores directory.
# glob() yields paths in arbitrary, filesystem-dependent order; sorting keeps
# the row order of the concatenated frames reproducible across machines.
all_files = sorted(mAP_scores_dir.glob("*.csv"))

# Route each file to its feature type using the file-name prefix.
# "cp_dp" has to be tested before "cp_", because every "cp_dp..." name also
# starts with "cp_". Files matching none of the prefixes are ignored.
cp_sc_mAPs = []
dp_sc_mAPs = []
cp_dp_sc_mAPs = []
for _file in all_files:
    if _file.name.startswith("cp_dp"):
        cp_dp_sc_mAPs.append(pd.read_csv(_file))
    elif _file.name.startswith("cp_"):
        # CP files belong in the CP list (previously they were appended to DP)
        cp_sc_mAPs.append(pd.read_csv(_file))
    elif _file.name.startswith("dp_"):
        # DP files belong in the DP list (previously they were appended to CP)
        dp_sc_mAPs.append(pd.read_csv(_file))

# single-cell mAP scores: one concatenated frame per feature type
cp_sc_mAPs = pd.concat(cp_sc_mAPs)
dp_sc_mAPs = pd.concat(dp_sc_mAPs)
cp_dp_sc_mAPs = pd.concat(cp_dp_sc_mAPs)

In [4]:
# Split each feature-type frame (CP, DP, CP_DP) by the value of its
# "shuffled" column. Three labels are selected:
# - "non-shuffled": selected into the reg_* frames.
# - "features_shuffled": feature values within the feature space are shuffled.
# - "phenotype_shuffled": phenotypic labels are shuffled.
def _rows_with_shuffle_label(sc_mAPs, label):
    """Return the rows of `sc_mAPs` whose "shuffled" column equals `label`."""
    return sc_mAPs.loc[sc_mAPs["shuffled"] == label]


# CP features
reg_cp_sc_mAPs = _rows_with_shuffle_label(cp_sc_mAPs, "non-shuffled")
shuffled_feat_cp_sc_mAPs = _rows_with_shuffle_label(cp_sc_mAPs, "features_shuffled")
shuffled_pheno_cp_sc_mAPs = _rows_with_shuffle_label(cp_sc_mAPs, "phenotype_shuffled")

# DP features
reg_dp_sc_mAPs = _rows_with_shuffle_label(dp_sc_mAPs, "non-shuffled")
shuffled_feat_dp_sc_mAPs = _rows_with_shuffle_label(dp_sc_mAPs, "features_shuffled")
shuffled_pheno_dp_sc_mAPs = _rows_with_shuffle_label(dp_sc_mAPs, "phenotype_shuffled")

# CP_DP features
reg_cp_dp_sc_mAPs = _rows_with_shuffle_label(cp_dp_sc_mAPs, "non-shuffled")
shuffled_feat_cp_dp_sc_mAPs = _rows_with_shuffle_label(cp_dp_sc_mAPs, "features_shuffled")
shuffled_pheno_cp_dp_sc_mAPs = _rows_with_shuffle_label(cp_dp_sc_mAPs, "phenotype_shuffled")

In [5]:
# Generating aggregate scores with a threshold p-value of 0.05
def _aggregate_by_phenotype(sc_mAPs):
    """Run the copairs `aggregate` call shared by every subset below.

    Each call passes sameby=["Mitocheck_Phenotypic_Class"] and threshold=0.05.
    A new `sameby` list is built per call, as in the original inline calls.
    """
    return aggregate(sc_mAPs, sameby=["Mitocheck_Phenotypic_Class"], threshold=0.05)


# CP features
agg_reg_cp_sc_mAPs = _aggregate_by_phenotype(reg_cp_sc_mAPs)
agg_shuffled_feat_cp_sc_mAPs = _aggregate_by_phenotype(shuffled_feat_cp_sc_mAPs)
agg_shuffled_pheno_cp_sc_mAPs = _aggregate_by_phenotype(shuffled_pheno_cp_sc_mAPs)

# DP features
agg_reg_dp_sc_mAPs = _aggregate_by_phenotype(reg_dp_sc_mAPs)
agg_shuffled_feat_dp_sc_mAPs = _aggregate_by_phenotype(shuffled_feat_dp_sc_mAPs)
agg_shuffled_pheno_dp_sc_mAPs = _aggregate_by_phenotype(shuffled_pheno_dp_sc_mAPs)

# CP_DP features
agg_reg_cp_dp_sc_mAPs = _aggregate_by_phenotype(reg_cp_dp_sc_mAPs)
agg_shuffled_feat_cp_dp_sc_mAPs = _aggregate_by_phenotype(shuffled_feat_cp_dp_sc_mAPs)
agg_shuffled_pheno_cp_dp_sc_mAPs = _aggregate_by_phenotype(shuffled_pheno_cp_dp_sc_mAPs)

In [6]:
# Write every aggregated score frame to its own CSV inside agg_mAP_scores_dir.
# NOTE(review): save_dir is not used by any write in this cell (they all target
# agg_mAP_scores_dir) -- confirm whether a later cell needs it before removing.
save_dir = mAP_scores_dir / "aggregated_mAPs"

# Output file name -> aggregated frame. Dict order is the write order.
_agg_outputs = {
    # CP features
    "agg_reg_cp_sc_mAPs.csv": agg_reg_cp_sc_mAPs,
    "agg_shuffled_feat_cp_sc_mAPs.csv": agg_shuffled_feat_cp_sc_mAPs,
    "agg_shuffled_pheno_cp_sc_mAPs.csv": agg_shuffled_pheno_cp_sc_mAPs,
    # DP features
    "agg_reg_dp_sc_mAPs.csv": agg_reg_dp_sc_mAPs,
    "agg_shuffled_feat_dp_sc_mAPs.csv": agg_shuffled_feat_dp_sc_mAPs,
    "agg_shuffled_pheno_dp_sc_mAPs.csv": agg_shuffled_pheno_dp_sc_mAPs,
    # CP_DP features
    "agg_reg_cp_dp_sc_mAPs.csv": agg_reg_cp_dp_sc_mAPs,
    "agg_shuffled_feat_cp_dp_sc_mAPs.csv": agg_shuffled_feat_cp_dp_sc_mAPs,
    "agg_shuffled_pheno_cp_dp_sc_mAPs.csv": agg_shuffled_pheno_cp_dp_sc_mAPs,
}

for _csv_name, _agg_scores in _agg_outputs.items():
    # index=False: the frame index is not written to the file
    _agg_scores.to_csv(agg_mAP_scores_dir / _csv_name, index=False)

# Forming bar plots 