In [1]:
import pathlib
import pandas as pd

from copairs.map import aggregate

In [2]:
# Input directory holding the mAP score CSVs. strict=True makes the
# notebook fail right here if the directory is missing.
MAP_SCORES_DIR = pathlib.Path(
    "../data/processed/mAP_scores/"
).resolve(strict=True)

# Output directory for everything this notebook writes. exist_ok=True keeps
# the cell re-runnable once the directory has been created.
AGG_OUT_DIR = pathlib.Path("../data/processed/aggregate_mAPs")
AGG_OUT_DIR.mkdir(parents=False, exist_ok=True)

In [3]:
# Bucket every mAP score CSV by the feature set named in its file name.
all_mAP_score_path = list(MAP_SCORES_DIR.glob("*.csv"))

cp_mAP_score_paths = []
dp_mAP_score_paths = []
cp_dp_mAP_score_paths = []

# Order matters: "_cp_dp_" also contains "_cp_" and "_dp_", so the combined
# tag has to be tested first. A file goes into the first bucket whose tag
# matches; files matching no tag are ignored.
feature_tag_buckets = (
    ("_cp_dp_", cp_dp_mAP_score_paths),
    ("_cp_", cp_mAP_score_paths),
    ("_dp_", dp_mAP_score_paths),
)
for score_path in all_mAP_score_path:
    for feature_tag, bucket in feature_tag_buckets:
        if feature_tag in score_path.name:
            bucket.append(score_path)
            break



In [4]:
def _read_and_stack(csv_paths):
    """Read each CSV in ``csv_paths`` and stack the rows into one dataframe.

    The per-file row indices are kept as-is (no ``ignore_index``).
    """
    frames = [pd.read_csv(csv_path) for csv_path in csv_paths]
    return pd.concat(frames)


# One combined dataframe per feature type
all_cp_mAP_df = _read_and_stack(cp_mAP_score_paths)
all_dp_mAP_df = _read_and_stack(dp_mAP_score_paths)
all_cp_dp_mAP_df = _read_and_stack(cp_dp_mAP_score_paths)

In [5]:
# Split each feature type's scores on the "shuffled" column: rows where it
# is True go to the shuffled table, all remaining rows to the non-shuffled one.
cp_is_shuffled = all_cp_mAP_df["shuffled"]
cp_mAP_df = all_cp_mAP_df.loc[~cp_is_shuffled]
shuffled_cp_mAP_df = all_cp_mAP_df.loc[cp_is_shuffled]

dp_is_shuffled = all_dp_mAP_df["shuffled"]
dp_mAP_df = all_dp_mAP_df.loc[~dp_is_shuffled]
shuffled_dp_mAP_df = all_dp_mAP_df.loc[dp_is_shuffled]

cp_dp_is_shuffled = all_cp_dp_mAP_df["shuffled"]
cp_dp_mAP_df = all_cp_dp_mAP_df.loc[~cp_dp_is_shuffled]
shuffled_cp_dp_mAP_df = all_cp_dp_mAP_df.loc[cp_dp_is_shuffled]

In [11]:
# Each feature type (CP, DP, CP_DP) has a non-shuffled and a shuffled table,
# so six aggregations are run. All six must use the same grouping column and
# the same threshold, otherwise the shuffled baselines are not comparable to
# the real scores. The shuffled CP call previously passed threshold=0.5
# while every other call used 0.05.
AGG_SAMEBY = ["Mitocheck_Phenotypic_Class"]
# Forwarded to copairs' aggregate() as `threshold`.
# NOTE(review): presumably the p-value significance cutoff; confirm against
# the copairs documentation.
AGG_THRESHOLD = 0.05


def _aggregate_by_phenotype(mAP_df, shuffled, feature_type):
    """Aggregate mAP scores per phenotype and tag the result.

    Parameters
    ----------
    mAP_df : pd.DataFrame
        Score table for one feature type and one shuffle status.
    shuffled : bool
        Value written to the "shuffled" column of the result.
    feature_type : str
        Value written to the "feature_type" column of the result.

    Returns
    -------
    pd.DataFrame
        Output of ``aggregate`` with "shuffled" and "feature_type" columns
        added.
    """
    aggregated_df = aggregate(mAP_df, sameby=AGG_SAMEBY, threshold=AGG_THRESHOLD)
    aggregated_df["shuffled"] = shuffled
    aggregated_df["feature_type"] = feature_type
    return aggregated_df


aggregated_cp_aMP_scores = _aggregate_by_phenotype(cp_mAP_df, False, "CP")
aggregated_shuffled_cp_aMP_scores = _aggregate_by_phenotype(shuffled_cp_mAP_df, True, "CP")

aggregated_dp_aMP_scores = _aggregate_by_phenotype(dp_mAP_df, False, "DP")
aggregated_shuffled_dp_aMP_scores = _aggregate_by_phenotype(shuffled_dp_mAP_df, True, "DP")

aggregated_cp_dp_aMP_scores = _aggregate_by_phenotype(cp_dp_mAP_df, False, "CP_DP")
# This variable uses the prefix "shuffle_aggregated_" rather than
# "aggregated_shuffled_"; the name is kept because the save cell refers to it.
shuffle_aggregated_cp_dp_aMP_scores = _aggregate_by_phenotype(shuffled_cp_dp_mAP_df, True, "CP_DP")

In [20]:
# Write every table to AGG_OUT_DIR as CSV without the index column.
# File names are spelled out in full so each output is easy to grep for.

# Non-aggregated score tables, one file per feature type and shuffle status.
# The shuffled CP+DP table used to be written to
# "shuffled_cp_dp_aMP_scores_notshuffled.csv", a name that contradicted its
# contents. It now follows the same pattern as the other files; any
# downstream reader of the old name must be updated.
non_aggregated_outputs = {
    "cp_aMP_scores_notshuffled.csv": cp_mAP_df,
    "cp_aMP_scores_shuffled.csv": shuffled_cp_mAP_df,
    "dp_aMP_scores_notshuffled.csv": dp_mAP_df,
    "dp_aMP_scores_shuffled.csv": shuffled_dp_mAP_df,
    "cp_dp_aMP_scores_notshuffled.csv": cp_dp_mAP_df,
    "cp_dp_aMP_scores_shuffled.csv": shuffled_cp_dp_mAP_df,
}

# Aggregated score tables.
# The shuffled DP aggregate used to be written to
# "aggregated_cp_aMP_shuffled_scores.csv", which overwrote the shuffled CP
# aggregate and left no DP shuffled file on disk. Each table now has its own
# file.
aggregated_outputs = {
    "aggregated_cp_aMP_notshuffled_scores.csv": aggregated_cp_aMP_scores,
    "aggregated_cp_aMP_shuffled_scores.csv": aggregated_shuffled_cp_aMP_scores,
    "aggregated_dp_aMP_notshuffled_scores.csv": aggregated_dp_aMP_scores,
    "aggregated_dp_aMP_shuffled_scores.csv": aggregated_shuffled_dp_aMP_scores,
    "aggregated_cp_dp_aMP_notshuffled_scores.csv": aggregated_cp_dp_aMP_scores,
    "aggregated_cp_dp_aMP_shuffled_scores.csv": shuffle_aggregated_cp_dp_aMP_scores,
}

for output_group in (non_aggregated_outputs, aggregated_outputs):
    # A repeated file name would make one table silently overwrite another.
    assert len(set(output_group)) == len(output_group), "duplicate output file name"
    for file_name, table in output_group.items():
        table.to_csv(AGG_OUT_DIR / file_name, index=False)