# Annotate merged single cells with metadata from platemap file

## Import libraries

In [1]:
import pathlib
import sys

import pandas as pd
from pycytominer import annotate
from pycytominer.cyto_utils import output

## Set paths and variables

In [2]:
# directory that holds the platemap CSV files (resolved to an absolute path);
# the individual platemap files are picked per run further down
platemap_path = pathlib.Path("../../data/").resolve()

# directory where parquet files are located
data_dir = pathlib.Path("../data/converted_data")

# directory where the annotated parquet files are saved to
output_dir = pathlib.Path("../data/annotated_data")
# parents=True so the cell also works when ../data has not been created yet
output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# one row per imaging run: (run key, converted parquet file, platemap CSV file)
run_files = [
    (
        "run_20230920ChromaLiveTL_24hr4ch_MaxIP",
        "20230920ChromaLiveTL_24hr4ch_MaxIP.parquet",
        "platemap_24h.csv",
    ),
    (
        "run_20231017ChromaLive_6hr_4ch_MaxIP",
        "20231017ChromaLive_6hr_4ch_MaxIP.parquet",
        "platemap_6hr_4ch.csv",
    ),
    (
        "run_20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP",
        "20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP.parquet",
        "platemap_AnnexinV_2ch.csv",
    ),
]

# map each run key to its resolved input paths; strict=True makes a missing
# parquet or platemap file raise FileNotFoundError while the dict is built
dict_of_inputs = {
    run_name: {
        "source_path": (data_dir / parquet_name).resolve(strict=True),
        "platemap_path": (platemap_path / platemap_name).resolve(strict=True),
    }
    for run_name, parquet_name, platemap_name in run_files
}

## Annotate merged single cells

In [4]:
# Annotate every run listed in dict_of_inputs: merge the platemap metadata onto
# the single-cell profiles, prefix selected columns with "Metadata_", sanity
# check the well assignment against the image file name, and save as parquet.
for data_run, info in dict_of_inputs.items():
    # load in converted parquet file and platemap as dfs to use in annotate function
    single_cell_df = pd.read_parquet(info["source_path"])
    platemap_df = pd.read_csv(info["platemap_path"])
    output_file = str(pathlib.Path(f"{output_dir}/{data_run}_sc.parquet"))
    print(f"Adding annotations to merged single cells for {data_run}!")

    # add metadata from platemap file to extracted single cell features
    annotated_df = annotate(
        profiles=single_cell_df,
        platemap=platemap_df,
        join_on=["Metadata_well", "Image_Metadata_Well"],
    )

    # move metadata well and single cell count to the front of the df (for easy visualization in python)
    well_column = annotated_df.pop("Metadata_Well")
    singlecell_column = annotated_df.pop("Metadata_number_of_singlecells")
    # re-insert them as the second and third columns of the dataframe
    annotated_df.insert(1, "Metadata_Well", well_column)
    annotated_df.insert(2, "Metadata_number_of_singlecells", singlecell_column)

    # find columns that have FileName or PathName in the name
    file_cols = [col for col in single_cell_df.columns if "FileName" in col]
    path_cols = [col for col in single_cell_df.columns if "PathName" in col]
    # get the cols that contain BoundingBox
    bounding_box_cols = [
        "Cells_AreaShape_BoundingBoxMinimum_X",
        "Cells_AreaShape_BoundingBoxMinimum_Y",
        "Cells_AreaShape_BoundingBoxMaximum_X",
        "Cells_AreaShape_BoundingBoxMaximum_Y",
    ]
    center_cols = [
        "Nuclei_Location_Center_X",
        "Nuclei_Location_Center_Y",
    ]
    # Merging columns
    merging_cols = ["Metadata_ImageNumber", "Image_Metadata_Well", "Image_Metadata_FOV"]
    # add all lists of columns together
    cols_to_add = file_cols + path_cols + bounding_box_cols + center_cols + merging_cols
    print(cols_to_add)

    # every requested column must exist in the input profiles; checking the names
    # directly keeps the KeyError without copying a slice of the dataframe
    missing_cols = [col for col in cols_to_add if col not in single_cell_df.columns]
    if missing_cols:
        raise KeyError(f"Columns not found in {data_run}: {missing_cols}")

    # add "Metadata_" to the beginning of each column in cols_to_add that is present
    # in the annotated df and does not already contain "Metadata_" anywhere in its name
    rename_map = {
        col: f"Metadata_{col}"
        for col in cols_to_add
        if col in annotated_df.columns and "Metadata_" not in col
    }
    annotated_df = annotated_df.rename(columns=rename_map)

    # ensure that the well values match the well token at the start of the DNA file name
    # this should be true for all rows but checking to make sure as there were issues with this in the past
    # (vectorized: compare whole columns at once and only loop over the mismatches)
    file_name_col = annotated_df["Metadata_Image_FileName_DNA"]
    file_name_well = file_name_col.astype(str).str.split("_", n=1).str[0]
    mismatched = annotated_df["Metadata_Well"] != file_name_well
    for index, well, file_name in zip(
        annotated_df.index[mismatched],
        annotated_df.loc[mismatched, "Metadata_Well"],
        file_name_col[mismatched],
    ):
        print(f"Row {index} does not match well values in file name")
        print(f"Well value: {well}")
        print(f"File name: {file_name}")

    # save annotated df as parquet file
    output(
        df=annotated_df,
        output_filename=output_file,
        output_type="parquet",
    )
    print(f"Annotations have been added to {data_run} and saved!")
    print(annotated_df.shape)

# check last annotated df to see if it has been annotated correctly
# (must be the final expression of the cell, outside the loop, to be displayed)
annotated_df.head()

Adding annotations to merged single cells for run_20230920ChromaLiveTL_24hr4ch_MaxIP!
['Image_FileName_488_1', 'Image_FileName_488_2', 'Image_FileName_561', 'Image_FileName_DNA', 'Image_PathName_488_2', 'Image_PathName_561', 'Image_PathName_DNA', 'Cells_AreaShape_BoundingBoxMinimum_X', 'Cells_AreaShape_BoundingBoxMinimum_Y', 'Cells_AreaShape_BoundingBoxMaximum_X', 'Cells_AreaShape_BoundingBoxMaximum_Y', 'Nuclei_Location_Center_X', 'Nuclei_Location_Center_Y', 'Metadata_ImageNumber', 'Image_Metadata_Well', 'Image_Metadata_FOV']
Annotations have been added to run_20230920ChromaLiveTL_24hr4ch_MaxIP and saved!
(128058, 1939)
Adding annotations to merged single cells for run_20231017ChromaLive_6hr_4ch_MaxIP!
['Image_FileName_488_1', 'Image_FileName_488_2', 'Image_FileName_561', 'Image_FileName_DNA', 'Image_PathName_488_2', 'Image_PathName_561', 'Image_PathName_DNA', 'Cells_AreaShape_BoundingBoxMinimum_X', 'Cells_AreaShape_BoundingBoxMinimum_Y', 'Cells_AreaShape_BoundingBoxMaximum_X', 'Cells_