In [1]:
import argparse
import pathlib
import random
import sys

import numpy as np
import pandas as pd

# Detect the execution context: `get_ipython` is only defined when running
# under IPython/Jupyter, so a NameError means this is a plain Python script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True
import warnings

import pycytominer.aggregate
import tqdm

# Suppress all RuntimeWarnings
warnings.filterwarnings("ignore", category=RuntimeWarning)


sys.path.append("../utils")
from mAP_utils import run_mAP_across_time

In [2]:
if not in_notebook:
    # Running as a script: take the run parameters from the command line.
    parser = argparse.ArgumentParser(
        description="Generate a map for differing cell counts"
    )

    # Default of 1 mirrors the notebook branch below. Without a default the
    # value would be None, which pandas' groupby sample treats as one item per
    # group while the output file would be misleadingly named "None_...".
    parser.add_argument(
        "--number_of_cells",
        type=int,
        default=1,
        help="Number of cells to sample per well and time point (default: 1)",
    )
    parser.add_argument(
        "--shuffle", action="store_true", help="Shuffle the order of the wells"
    )
    # parse the arguments
    args = parser.parse_args()
    number_of_cells = args.number_of_cells
    shuffle = args.shuffle
else:
    # Interactive defaults used when the cells are run inside a notebook.
    number_of_cells = 1
    shuffle = False

# One output file per (number_of_cells, shuffle) combination.
output_file = pathlib.Path(
    f"../results/mAP_cell_number_subsampled/{number_of_cells}_{shuffle}.parquet"
)
# Make sure the results directory exists before anything is written to it.
output_file.parent.mkdir(exist_ok=True, parents=True)

In [3]:
# Locate the combined feature table; strict resolution raises immediately if
# the file is missing instead of failing later inside the parquet reader.
data_file_path = pathlib.Path(
    "../../data/CP_scDINO_features/combined_CP_scDINO_norm_fs.parquet"
)
data_file_path = data_file_path.resolve(strict=True)
df = pd.read_parquet(data_file_path)

df.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,101,...,0.018387,-0.178214,1.589703,0.313944,1.126927,-0.143103,0.241127,-0.293259,-0.283715,1.434163
1,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,111,...,-1.811176,-0.059895,-1.208776,0.10275,0.845704,0.08393,-1.990931,-0.030848,-1.033722,-0.942127
2,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,11,...,-0.731998,0.654253,-0.075728,0.810937,0.30094,-0.22878,1.782329,0.153739,-0.763335,0.725093
3,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,128,...,1.339222,1.203907,0.509754,-0.711263,0.067196,-0.149771,1.40565,0.063245,2.16211,3.187469
4,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,132,...,-0.41992,-0.29961,0.344723,-0.25113,-1.851114,0.669517,-0.439855,1.576201,0.747753,0.895601


In [4]:
# Subsample the single-cell table: draw `number_of_cells` rows, without
# replacement and with a fixed random state, from every
# (Metadata_Well, Metadata_Time) group.
cells_by_well_and_time = df.groupby(["Metadata_Well", "Metadata_Time"])
subset_df = cells_by_well_and_time.sample(
    n=number_of_cells, replace=False, random_state=0
)

In [5]:
# Give the sampled rows a fresh 0..n-1 index.
subset_df.reset_index(drop=True, inplace=True)
if shuffle:
    # Build the shuffled baseline: every column is permuted independently,
    # except the well, time, and dose columns, which are left untouched.
    # The generator is seeded (0, the same fixed seed used elsewhere in this
    # notebook) so a shuffled run is reproducible rather than depending on
    # NumPy's global random state.
    rng = np.random.default_rng(0)
    fixed_cols = {"Metadata_Time", "Metadata_dose", "Metadata_Well"}
    for col in subset_df.columns:
        if col in fixed_cols:
            continue
        subset_df[col] = rng.permutation(subset_df[col].to_numpy())

# Columns are split purely by name: anything containing "Metadata" is
# metadata, everything else is treated as a feature.
metadata_cols = [col for col in subset_df.columns if "Metadata" in col]
features_cols = [col for col in subset_df.columns if "Metadata" not in col]

# Collapse the sampled cells to one median profile per well and time point.
aggregate_df = pycytominer.aggregate(
    population_df=subset_df,
    strata=["Metadata_Well", "Metadata_Time"],
    features=features_cols,
    operation="median",
)

In [6]:
# Keep one metadata row per (well, time) pair -- the first one encountered --
# and attach it to the aggregated feature profiles.
well_time_keys = ["Metadata_Well", "Metadata_Time"]
metadata_df = (
    subset_df[metadata_cols]
    .drop_duplicates(subset=well_time_keys)
    .reset_index(drop=True)
)
aggregate_df = metadata_df.merge(aggregate_df, on=well_time_keys)
aggregate_df.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,1,C-02,183,Staurosporine,0.0,negative,1,2,0.0,119,...,1.538906,-1.451373,0.689326,1.015235,-0.054164,-1.370754,0.353132,-0.218096,-0.669798,-0.720105
1,1,C-02,189,Staurosporine,0.0,negative,2,2,1.0,25,...,0.923269,-0.754951,-0.449772,1.627492,0.648229,-1.23335,0.019822,-1.030158,0.543025,0.571881
2,1,C-02,180,Staurosporine,0.0,negative,11,1,10.0,165,...,0.610443,-0.340604,1.409342,-0.129539,0.440814,0.943133,-0.095659,-0.347217,-1.130622,0.081055
3,1,C-02,194,Staurosporine,0.0,negative,12,2,11.0,69,...,-0.738119,-0.848537,0.716184,0.161059,-0.364249,0.948706,-0.091702,-0.433425,-0.820682,0.999267
4,1,C-02,194,Staurosporine,0.0,negative,13,2,12.0,133,...,-0.607471,0.609924,-0.263886,-0.117806,-0.393199,0.352603,-0.019827,0.286804,-0.042885,-0.23054


In [7]:
# Run the mAP helper on the aggregated profiles, using the lowest dose as the
# reference group.
dict_of_map_dfs = run_mAP_across_time(
    aggregate_df,
    seed=0,
    time_column="Metadata_Time",
    reference_column_name="Metadata_dose",
    reference_group=aggregate_df["Metadata_dose"].min(),
)

# Stack the per-key frames into one table. The dictionary keys become the
# outermost index level, which is surfaced as the "Metadata_Time" column
# (presumably the keys are the time points -- confirm against the helper).
# The run parameters are then recorded alongside every row.
output_df = (
    pd.concat(dict_of_map_dfs.values(), keys=dict_of_map_dfs.keys())
    .reset_index()
    .rename(columns={"level_0": "Metadata_Time"})
    .assign(number_of_cells=number_of_cells, seed=0, shuffle=shuffle)
    .reset_index(drop=True)
)
output_df.to_parquet(output_file)
output_df.head()

UnpairedException: Unable to find negative pairs.