In [1]:
import argparse
import pathlib
import random

import numpy as np
import pandas as pd
from copairs import map
from copairs.matching import assign_reference_index

# Decide whether this code is being driven by an IPython/Jupyter session.
# Plain Python has no `get_ipython` name, so the lookup raises NameError there.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True
import warnings

import pycytominer.aggregate
import tqdm

# Suppress all RuntimeWarnings
# (the filter is process-wide, so it also hides RuntimeWarnings raised by
# every library used in the cells that follow)
warnings.filterwarnings("ignore", category=RuntimeWarning)

import sys

# Make the `utils` directory one level up importable. The entry is a relative
# path, so it is resolved against the current working directory: run this from
# the directory that contains the notebook/script.
sys.path.append("../utils")
# Must come after the sys.path change above, otherwise mAP_utils is not found.
from mAP_utils import run_mAP_across_time

In [2]:
# Resolve the three run parameters (sampling fraction, seed, shuffle flag).
# As a script they come from the command line; inside a notebook fixed
# defaults are used so the cells can be executed interactively.
if not in_notebook:
    # setup the argument parser
    parser = argparse.ArgumentParser(
        description="Generate a map for differing cell counts"
    )

    # Defaults mirror the notebook-mode values below. Without them an omitted
    # flag yields None, which makes DataFrame.sample(frac=None) silently draw a
    # single row per group and names the output file "None_None_...".
    parser.add_argument(
        "--percentage",
        type=float,
        default=0.1,
        help="Fraction (0 < value <= 1) of cells to sample per well and time point",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Seed for the random number generator",
    )
    parser.add_argument(
        "--shuffle", action="store_true", help="Shuffle the order of the wells"
    )
    # parse the arguments
    args = parser.parse_args()

    # The value is passed to DataFrame.sample(frac=...); anything outside
    # (0, 1] either produces empty groups or fails later with a less helpful
    # error, so reject it here (the chained comparison also rejects NaN).
    if not 0 < args.percentage <= 1:
        parser.error(
            f"--percentage must be a fraction in (0, 1], got {args.percentage}"
        )

    percentage = args.percentage
    set_seed = args.seed
    shuffle = args.shuffle
else:
    percentage = 0.1
    set_seed = 0
    shuffle = False

# One result file per (fraction, seed, shuffle) combination.
output_file = pathlib.Path(
    f"../results/mAP_cell_percentages/{percentage}_{set_seed}_{shuffle}.parquet"
)
output_file.parent.mkdir(exist_ok=True, parents=True)

In [3]:
# Load the feature-selected profiles. Resolving with strict=True raises
# FileNotFoundError right away if the parquet file is missing.
profile_location = pathlib.Path(
    "../../data/CP_feature_select/profiles/features_selected_profile.parquet"
)
data_file_path = profile_location.resolve(strict=True)
df = pd.read_parquet(data_file_path)

df.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_DifferenceVariance_CL_488_1_3_00_256,Nuclei_Texture_DifferenceVariance_CL_488_2_3_01_256,Nuclei_Texture_DifferenceVariance_CL_561_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CL_488_1_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CL_561_3_01_256,Nuclei_Texture_InverseDifferenceMoment_DNA_3_00_256,Nuclei_Texture_SumAverage_CL_488_1_3_03_256,Nuclei_Texture_SumAverage_CL_488_2_3_03_256,Nuclei_Texture_SumAverage_CL_561_3_02_256,Nuclei_Texture_SumAverage_DNA_3_00_256
0,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,2,...,1.439092,0.891046,1.500406,0.587907,0.617193,-3.721488,-0.232102,0.381277,-0.044157,-3.411162
1,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,23,...,-1.048757,0.629287,0.600013,-0.178073,0.46453,0.297776,0.356853,0.381277,0.040152,0.158581
2,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,31,...,0.480107,0.891046,0.380827,0.448919,0.390155,0.297776,-0.232102,0.381277,0.078288,0.158581
3,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,35,...,-0.616834,0.891046,0.863686,0.217271,0.565594,0.297776,-0.030175,0.381277,-0.002435,0.158581
4,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,40,...,1.439092,-1.464788,-0.659766,0.587907,0.10422,0.297776,-0.232102,-1.424332,0.471232,0.158581


In [4]:
random.seed(set_seed)

# Draw the same fraction of rows from every (time point, well) group.
# The groups are iterated explicitly instead of going through
# `groupby(...).apply(..., include_groups=True)`: applying a function to the
# grouping columns is deprecated in pandas and emits a warning on every run.
# Iteration always yields the complete sub-frame (grouping columns included),
# and each group is still sampled with `random_state=set_seed`, so the selected
# rows are the same as before.
sampled_by_group = {
    group_key: group.sample(frac=percentage, random_state=set_seed)
    for group_key, group in df.groupby(["Metadata_Time", "Metadata_Well"])
}
# Concatenating the dict rebuilds the (time, well, original row) index that the
# apply-based version produced, so downstream cells see the same frame.
subset_df = pd.concat(sampled_by_group, names=["Metadata_Time", "Metadata_Well"])

  subset_df = df.groupby(["Metadata_Time", "Metadata_Well"]).apply(


In [5]:
# Flatten the index left over from the per-group sampling step.
subset_df = subset_df.reset_index(drop=True)
if shuffle:
    # Shuffled baseline: permute every column independently of the others.
    # A generator seeded with `set_seed` is used because the seed is recorded
    # in the output file name and columns; the global `np.random` state is
    # never seeded here (only Python's `random` is), so relying on it made the
    # shuffled runs irreproducible.
    shuffle_rng = np.random.default_rng(set_seed)
    for col in subset_df.columns:
        subset_df[col] = shuffle_rng.permutation(subset_df[col].to_numpy())

# Split the columns by naming convention: anything containing "Metadata" is
# metadata, everything else is treated as a feature.
metadata_cols = [name for name in subset_df.columns if "Metadata" in name]
features_cols = [name for name in subset_df.columns if "Metadata" not in name]
# The single-cell count is aggregated together with the features.
features_cols = features_cols + ["Metadata_number_of_singlecells"]

# Collapse the sampled rows to one median profile per (well, time point).
aggregate_df = pycytominer.aggregate(
    population_df=subset_df,
    strata=["Metadata_Well", "Metadata_Time"],
    features=features_cols,
    operation="median",
)

In [6]:
# Re-attach the metadata to the aggregated profiles. For every
# (well, time point) pair the first row found in `subset_df` supplies the
# metadata values. `Metadata_number_of_singlecells` is present on both sides
# of the join, so pandas suffixes the two copies with `_x` and `_y`.
well_time_keys = ["Metadata_Well", "Metadata_Time"]
metadata_df = (
    subset_df[metadata_cols]
    .drop_duplicates(subset=well_time_keys)
    .reset_index(drop=True)
)
aggregate_df = metadata_df.merge(aggregate_df, on=well_time_keys)
aggregate_df.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells_x,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_DifferenceVariance_CL_488_2_3_01_256,Nuclei_Texture_DifferenceVariance_CL_561_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CL_488_1_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CL_561_3_01_256,Nuclei_Texture_InverseDifferenceMoment_DNA_3_00_256,Nuclei_Texture_SumAverage_CL_488_1_3_03_256,Nuclei_Texture_SumAverage_CL_488_2_3_03_256,Nuclei_Texture_SumAverage_CL_561_3_02_256,Nuclei_Texture_SumAverage_DNA_3_00_256,Metadata_number_of_singlecells_y
0,1,C-02,163,Staurosporine,0.0,negative,1,4,0.0,50,...,0.212609,-0.375083,0.154952,0.171849,0.297776,-0.099287,0.381277,0.201143,0.158581,183.0
1,1,C-03,185,Staurosporine,0.61,test,1,1,0.0,43,...,0.103955,0.409223,0.381574,0.455023,0.297776,-0.159184,0.381277,0.037473,0.158581,191.0
2,1,C-04,180,Staurosporine,1.22,test,1,3,0.0,21,...,-0.210237,0.346267,0.157277,0.459976,0.297776,-0.213688,0.381277,0.011786,0.158581,180.0
3,1,C-05,187,Staurosporine,2.44,test,1,2,0.0,86,...,-0.485484,0.703336,0.275076,0.524607,0.297776,-0.158529,0.381277,0.016623,0.158581,175.0
4,1,C-06,154,Staurosporine,4.88,test,1,4,0.0,124,...,0.495685,0.457185,0.462031,0.491852,0.297776,-0.214945,0.381277,-0.006228,0.158581,169.0


In [None]:
# Compute mAP per time point, using the lowest dose as the reference group.
lowest_dose = aggregate_df["Metadata_dose"].min()
dict_of_map_dfs = run_mAP_across_time(
    aggregate_df,
    seed=set_seed,
    time_column="Metadata_Time",
    reference_column_name="Metadata_dose",
    reference_group=lowest_dose,
)

# Stack the per-time-point results; the dict keys become the outer index
# level, which is turned into a column and named "Metadata_Time".
output_df = (
    pd.concat(dict_of_map_dfs.values(), keys=dict_of_map_dfs.keys())
    .reset_index()
    .rename(columns={"level_0": "Metadata_Time"})
)

# Record the run parameters alongside the results.
output_df = output_df.assign(
    percentage_of_cells=percentage,
    seed=set_seed,
    shuffle=shuffle,
).reset_index(drop=True)

output_df.to_parquet(output_file)
output_df.head()