# Annotate merged single cells with metadata from platemap file

## Import libraries

In [1]:
import argparse
import pathlib
import sys

import lancedb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pycytominer import annotate
from pycytominer.cyto_utils import output

# Detect whether this code runs inside IPython/Jupyter: `get_ipython` is only
# defined there, so a NameError means we are running as a plain script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def euclidean_coordinate_fuzzy_match(
    df_left: pd.DataFrame,
    df_right: pd.DataFrame,
    left_on: list,
    right_on: list,
    coordinate_column: str,
    unique_image_column: str,
    pixel_cutt_off: int = 5,
) -> pd.DataFrame:
    """
    Fuzzy merge two dataframes on the nearest coordinates within each image.

    For every row of ``df_left`` the row of ``df_right`` from the same image
    with the smallest euclidean distance between the values in
    ``coordinate_column`` is looked up. The pair is kept only if that distance
    is within the cut off and the two rows also agree on the
    ``left_on`` / ``right_on`` key columns (inner merge).

    Parameters
    ----------
    df_left : pd.DataFrame
        left dataframe to merge
    df_right : pd.DataFrame
        right dataframe to merge
    left_on : list
        Left dataframe columns to match on
    right_on : list
        Right dataframe columns to match on
    coordinate_column : str
        The column name that contains the coordinates to match on
        Note that the coordinates should be in a tuple format
        The column must be present under this name in both dataframes
    unique_image_column : str
        The column name that contains the unique image identifier to split the dataframes on
        This ensures that coordinates from other images are not matched together
    pixel_cutt_off : int, optional
        Maximum allowed offset in pixels along each axis, by default 5.
        The accepted euclidean distance is the length of the vector
        (pixel_cutt_off, pixel_cutt_off).

    Returns
    -------
    pd.DataFrame
        A merged dataframe of the two input dataframes based on the euclidean distance between the coordinates,
        with an extra "distance" column holding the distance of each matched pair.
        An empty dataframe is returned when no pair could be matched.
    """
    # the cut off does not depend on the data, so compute it once up front
    euclidean_cut_off = np.linalg.norm(
        np.array([0, 0]) - np.array([pixel_cutt_off, pixel_cutt_off])
    )

    merged_df_list = []  # one merged frame per matched pair
    total_CP_cells = 0  # total number of cells in the left dataframe
    total_annotated_cells = 0  # total number of cells that were annotated

    # only coordinates that belong to the same image may be matched together
    for image in df_left[unique_image_column].unique():
        subset_df_left = df_left[df_left[unique_image_column] == image]
        subset_df_right = df_right[df_right[unique_image_column] == image]
        total_CP_cells += subset_df_left.shape[0]

        # (index label, coordinate) pairs of the right frame, extracted once per
        # image instead of rebuilding a row Series for every left/right pair
        right_coordinates = list(subset_df_right[coordinate_column].items())

        for index1, coord1 in subset_df_left[coordinate_column].items():
            dist = np.inf
            coord2_index = None
            # find the closest coordinate set of the right frame
            for index2, coord2 in right_coordinates:
                try:
                    temp_dist = np.linalg.norm(np.array(coord1) - np.array(coord2))
                except (TypeError, ValueError):
                    # malformed coordinates (e.g. None, strings, mismatched
                    # shapes) can never be the nearest neighbour
                    temp_dist = np.inf
                if temp_dist <= dist:
                    dist = temp_dist
                    coord2_index = index2

            # no candidate at all, or the nearest candidate is too far away
            if coord2_index is None or dist > euclidean_cut_off:
                continue

            temp_merged_df = pd.merge(
                subset_df_left.loc[[index1]],
                subset_df_right.loc[[coord2_index]],
                how="inner",
                left_on=left_on,
                right_on=right_on,
            )
            # the nearest neighbour may still disagree on the key columns
            if temp_merged_df.empty:
                continue

            # store the distance per pair so it always lines up with its row(s)
            temp_merged_df["distance"] = dist
            total_annotated_cells += temp_merged_df.shape[0]
            merged_df_list.append(temp_merged_df)

    if not merged_df_list:
        return pd.DataFrame()
    merged_df = pd.concat(merged_df_list)
    print(f"Percentage of annotated cells: {total_annotated_cells/total_CP_cells*100}%")
    return merged_df

## Set paths and variables

In [3]:
# directory that holds the platemap csv files
platemap_path = pathlib.Path("../../data/").resolve()

# directory where parquet files are located
data_dir = pathlib.Path("../data/converted_data").resolve()

# directory where the annotated parquet files are saved to
output_dir = pathlib.Path("../data/annotated_data")
# parents=True so a missing intermediate directory does not abort the run
output_dir.mkdir(exist_ok=True, parents=True)

if not in_notebook:
    print("Running as script")
    # set up arg parser
    parser = argparse.ArgumentParser(description="Single cell extraction")

    # required: without it well_fov would be None and the path join below
    # would fail with an unhelpful TypeError
    parser.add_argument(
        "--well_fov",
        type=str,
        required=True,
        help="Well and FOV identifier of the parquet files to annotate (e.g. C-02_F0001)",
    )

    args = parser.parse_args()
    well_fov = args.well_fov
    # strict=True raises if data_dir/well_fov does not exist
    # NOTE: images_dir is only defined on this (script) branch
    images_dir = pathlib.Path(data_dir / well_fov).resolve(strict=True)
else:
    print("Running in a notebook")
    # default well/FOV used for interactive runs
    well_fov = "C-02_F0001"

Running in a notebook


In [4]:
# Input and output locations for each imaging run of the selected well/FOV.
# Source parquet and platemap paths are resolved strictly, so a missing file
# raises here rather than later in the annotation loop.
timelapse_run_paths = dict(
    source_path=pathlib.Path(f"{data_dir}/timelapse/{well_fov}.parquet").resolve(
        strict=True
    ),
    platemap_path=pathlib.Path(f"{platemap_path}/platemap_6hr_4ch.csv").resolve(
        strict=True
    ),
    output_file=pathlib.Path(f"{output_dir}/timelapse/{well_fov}_sc.parquet").resolve(),
)
endpoint_run_paths = dict(
    source_path=pathlib.Path(f"{data_dir}/endpoint/{well_fov}.parquet").resolve(
        strict=True
    ),
    platemap_path=pathlib.Path(f"{platemap_path}/platemap_AnnexinV_2ch.csv").resolve(
        strict=True
    ),
    output_file=pathlib.Path(f"{output_dir}/endpoint/{well_fov}_sc.parquet").resolve(),
)

# run name -> paths; insertion order is the processing order
dict_of_inputs = {
    "run_20231017ChromaLive_6hr_4ch_MaxIP": timelapse_run_paths,
    "20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP": endpoint_run_paths,
}

## Annotate merged single cells

In [5]:
for data_run, info in dict_of_inputs.items():
    # read the converted single-cell profiles for this run
    single_cell_df = pd.read_parquet(info["source_path"])
    print(single_cell_df.shape)

    # give the image-level FOV and time columns their Metadata_ names
    image_metadata_names = {
        "Image_Metadata_FOV": "Metadata_FOV",
        "Image_Metadata_Time": "Metadata_Time",
    }
    single_cell_df = single_cell_df.rename(columns=image_metadata_names)
    platemap_df = pd.read_csv(info["platemap_path"])

    print(f"Adding annotations to merged single cells for {data_run}!")

    # join the platemap metadata onto the single-cell features by well
    # NOTE(review): the recorded output of the endpoint run shows the row count
    # growing from 155 to 1860 after this call; confirm the platemap has a
    # single row per well
    annotated_df = annotate(
        profiles=single_cell_df,
        platemap=platemap_df,
        join_on=["Metadata_well", "Image_Metadata_Well"],
    )
    print(annotated_df.shape)

    # pull both columns out first, then re-insert them at positions 1 and 2 so
    # the well and the single-cell count sit at the front of the dataframe
    front_columns = {
        name: annotated_df.pop(name)
        for name in ("Metadata_Well", "Metadata_number_of_singlecells")
    }
    for position, (name, values) in enumerate(front_columns.items(), start=1):
        annotated_df.insert(position, name, values)

    # nuclei centre coordinates are metadata, as is every column whose name
    # contains a file name, path name or tracking marker
    metadata_markers = ("Image_FileName", "Image_PathName", "TrackObjects")
    columns_to_rename = {
        "Nuclei_Location_Center_Y": "Metadata_Nuclei_Location_Center_Y",
        "Nuclei_Location_Center_X": "Metadata_Nuclei_Location_Center_X",
    }
    columns_to_rename.update(
        {
            col: f"Metadata_{col}"
            for col in annotated_df.columns
            if any(marker in col for marker in metadata_markers)
        }
    )
    annotated_df = annotated_df.rename(columns=columns_to_rename)

    # make sure the destination directory exists before writing
    info["output_file"].parent.mkdir(exist_ok=True, parents=True)

    # write the annotated profiles as a parquet file
    output(
        df=annotated_df,
        output_filename=info["output_file"],
        output_type="parquet",
    )
    print(
        f"Annotations have been added to {data_run} and saved to {info['output_file']}"
    )
    # final shape check; head() is evaluated but not displayed inside a loop
    print(annotated_df.shape)
    annotated_df.head()
# free the last annotated dataframe
del annotated_df

(2309, 2318)
Adding annotations to merged single cells for run_20231017ChromaLive_6hr_4ch_MaxIP!
(2309, 2322)
Annotations have been added to run_20231017ChromaLive_6hr_4ch_MaxIP and saved to /home/lippincm/Documents/live_cell_timelapse_apoptosis/5.process_CP_features/data/annotated_data/timelapse/C-02_F0001_sc.parquet
(2309, 2322)
(155, 1202)
Adding annotations to merged single cells for 20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP!
(1860, 1206)
Annotations have been added to 20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP and saved to /home/lippincm/Documents/live_cell_timelapse_apoptosis/5.process_CP_features/data/annotated_data/endpoint/C-02_F0001_sc.parquet
(1860, 1206)
