# Annotate single-cell profiles with cell-tracking metadata

## Import libraries

In [1]:
import argparse
import pathlib
import sys

import lancedb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from pycytominer import annotate
from pycytominer.cyto_utils import output

# `get_ipython` only exists when an IPython kernel injected it into the
# namespace, so a NameError means this is running as a plain Python script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

  from .autonotebook import tqdm as notebook_tqdm


## Set paths and variables

In [2]:
# directory expected to contain the platemap file(s); only the path is built
# here, nothing is read from it in this cell
platemap_path = pathlib.Path("../../data/").resolve()

# directory where parquet files are located
data_dir = pathlib.Path("../data/1.annotated_data").resolve()

# directory where the annotated parquet files are saved to
profiles_output_dir = pathlib.Path(
    "../data/2.sc_tracks_annotated_data/profiles/"
).resolve()
stats_output_dir = pathlib.Path("../data/2.sc_tracks_annotated_data/stats/").resolve()

# create the output directories up front so later writes cannot fail on a
# missing folder
profiles_output_dir.mkdir(exist_ok=True, parents=True)
stats_output_dir.mkdir(exist_ok=True, parents=True)

if not in_notebook:
    print("Running as script")
    # set up arg parser
    parser = argparse.ArgumentParser(description="Single cell extraction")

    # `required=True`: without a value `well_fov` would be None and the path
    # arithmetic below would fail with an opaque TypeError instead of a usage
    # message
    parser.add_argument(
        "--well_fov",
        type=str,
        required=True,
        help="Well and field of view identifier to process, e.g. C-02_F0001",
    )

    args = parser.parse_args()
    well_fov = args.well_fov
    # strict resolution raises FileNotFoundError if this directory is missing
    # NOTE(review): `images_dir` is not used by any cell visible in this
    # notebook - confirm it is still needed
    images_dir = pathlib.Path(data_dir / well_fov).resolve(strict=True)
else:
    print("Running in a notebook")
    # default well/FOV used for interactive runs
    well_fov = "C-02_F0001"

Running in a notebook


In [3]:
# Load the tracking results and the single-cell profiles for this well/FOV.
# `resolve(strict=True)` raises immediately if either parquet file is missing.
tracks = pd.read_parquet(
    pathlib.Path(f"../../4.cell_tracking/results/{well_fov}_tracks.parquet").resolve(
        strict=True
    )
)
profiles = pd.read_parquet(
    pathlib.Path(f"../data/1.annotated_data/timelapse/{well_fov}_sc.parquet").resolve(
        strict=True
    )
)

# prepend NMF_ to the tracks columns
tracks.columns = [f"NMF_{col}" for col in tracks.columns]

# store each object's (x, y) position as a single tuple column in both frames
tracks["NMF_coordinates"] = [
    (x, y) for x, y in zip(tracks["NMF_x"], tracks["NMF_y"])
]
profiles["NMF_coordinates"] = [
    (x, y)
    for x, y in zip(
        profiles["Nuclei_AreaShape_Center_X"], profiles["Nuclei_AreaShape_Center_Y"]
    )
]

# shift the profile time by one after casting to float
# (presumably to line it up with the track time `NMF_t` it is matched against
# later - confirm against the tracking output)
profiles["Metadata_Time"] = profiles["Metadata_Time"].astype(float) - 1

In [4]:
# --- matching parameters ---
# per-axis pixel tolerance; the matching cell turns it into a Euclidean cut off
pixel_cutt_off = 5
# name of the (x, y) tuple column in the left (profiles) and right (tracks) frames
coordinate_column_left = coordinate_column_right = "NMF_coordinates"
# time columns used as merge keys for the left and right frames
left_on, right_on = ["Metadata_Time"], ["NMF_t"]

# --- accumulators filled while matching ---
# merged dataframes, and the distance of every accepted match
merged_df_list, distances = [], []
# number of cells in the left dataframe / number of cells that were annotated
total_CP_cells = total_annotated_cells = 0

In [5]:
def _match_cells_to_tracks(cell_xy, track_xy, cut_off):
    """Pair each cell with its nearest track point and keep the close pairs.

    Parameters
    ----------
    cell_xy : np.ndarray
        Array of shape (n_cells, 2) holding the cell coordinates.
    track_xy : np.ndarray
        Array of shape (n_tracks, 2) holding the track coordinates.
    cut_off : float
        Exclusive upper bound on the Euclidean distance of an accepted pair.

    Returns
    -------
    tuple of np.ndarray
        ``(cell_positions, track_positions, pair_distances)``: positional
        indices into ``cell_xy`` / ``track_xy`` of every accepted pair and the
        distance between the two points. All three are empty when either
        input is empty or nothing is close enough.
    """
    if cell_xy.shape[0] == 0 or track_xy.shape[0] == 0:
        no_idx = np.empty(0, dtype=int)
        return no_idx, no_idx, np.empty(0, dtype=float)

    # (n_cells, n_tracks) matrix of Euclidean distances
    pairwise = np.linalg.norm(cell_xy[:, None, :] - track_xy[None, :, :], axis=2)
    # a distance that cannot be computed (NaN coordinate) must never match
    pairwise = np.where(np.isnan(pairwise), np.inf, pairwise)

    # argmin over the reversed columns selects the LAST track among exact
    # ties, which is what the previous `<=` comparison loop selected
    n_tracks = pairwise.shape[1]
    nearest = n_tracks - 1 - np.argmin(pairwise[:, ::-1], axis=1)
    nearest_dist = pairwise[np.arange(pairwise.shape[0]), nearest]

    accepted = nearest_dist < cut_off
    return np.flatnonzero(accepted), nearest[accepted], nearest_dist[accepted]


# set cut off of 5,5 pixel in the euclidean distance
# (loop invariant: computed once so it is always defined, even for a time
# point that has no tracks at all)
euclidean_cut_off = np.linalg.norm(
    np.array([0, 0]) - np.array([pixel_cutt_off, pixel_cutt_off])
)
# temporary key that ties every accepted cell to exactly one track row
pair_key = "_pair_id"

for time in profiles["Metadata_Time"].unique():
    # boolean indexing already returns new frames, no full-frame copy needed
    df_left = profiles.loc[profiles["Metadata_Time"] == time]
    df_right = tracks.loc[tracks["NMF_t"] == time]

    total_CP_cells += df_left.shape[0]

    # unpack the (x, y) tuples into (n, 2) float arrays; the reshape keeps
    # the 2-D shape when a frame is empty
    cell_xy = np.array(df_left[coordinate_column_left].tolist(), dtype=float).reshape(
        -1, 2
    )
    track_xy = np.array(
        df_right[coordinate_column_right].tolist(), dtype=float
    ).reshape(-1, 2)

    cell_pos, track_pos, pair_dist = _match_cells_to_tracks(
        cell_xy, track_xy, euclidean_cut_off
    )
    if cell_pos.size == 0:
        continue

    # one merge per time point: the pair key restricts the join to the
    # accepted (cell, track) pairs, the time keys are kept as before so the
    # resulting columns (including the _x/_y suffixes) are unchanged
    pair_ids = np.arange(cell_pos.size)
    temp_merged_df = pd.merge(
        df_left.iloc[cell_pos].assign(**{pair_key: pair_ids}),
        df_right.iloc[track_pos].assign(**{pair_key: pair_ids}),
        how="inner",
        left_on=left_on + [pair_key],
        right_on=right_on + [pair_key],
        validate="one_to_one",
    ).drop(columns=pair_key)

    distances.extend(pair_dist.tolist())
    total_annotated_cells += temp_merged_df.shape[0]
    merged_df_list.append(temp_merged_df)

if len(merged_df_list) == 0:
    merged_df_list.append(pd.DataFrame())
# ignore_index gives the rows a unique 0..n-1 index instead of repeated labels
merged_df = pd.concat(merged_df_list, ignore_index=True)
assert len(distances) == len(merged_df), "one distance is expected per merged row"
merged_df["NMF_distance"] = distances

# replace Metadata string in column names with NMF (Non Morphology Features)
merged_df.columns = [col.replace("Metadata_", "NMF_") for col in merged_df.columns]

# the percentage is undefined when there are no cells at all
percentage_annotated = (
    total_annotated_cells / total_CP_cells * 100 if total_CP_cells else float("nan")
)
print(f"Annotated cells: {total_annotated_cells} out of {total_CP_cells}")
print(f"Percentage of annotated cells: {percentage_annotated}%")
print(merged_df.shape)
merged_df.to_parquet(profiles_output_dir / f"{well_fov}_annotated_tracks.parquet")
merged_df.head()

Annotated cells: 1700 out of 2309
Percentage of annotated cells: 73.6249458640104%
(1700, 2332)


Unnamed: 0,NMF_plate,NMF_Well,NMF_number_of_singlecells,NMF_compound,NMF_dose,NMF_control,NMF_ImageNumber,NMF_FOV,NMF_Time,NMF_Cells_Number_Object_Number,...,NMF_coordinates_x,NMF_track_id,NMF_t,NMF_y,NMF_x,NMF_id,NMF_parent_track_id,NMF_parent_id,NMF_coordinates_y,NMF_distance
0,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,6,...,"(1272.3892376681615, 16.955156950672645)",1,0.0,17.0,1272.0,1000004.0,-1,-1.0,"(1272.0, 17.0)",0.391812
0,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,7,...,"(1503.8977788995458, 28.629227662796566)",2,0.0,29.0,1504.0,1000005.0,-1,-1.0,"(1504.0, 29.0)",0.384605
0,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,9,...,"(1894.5, 16.607142857142858)",3,0.0,17.0,1894.0,1000009.0,-1,-1.0,"(1894.0, 17.0)",0.635875
0,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,11,...,"(1204.1402042711236, 61.70612813370474)",5,0.0,62.0,1204.0,1000012.0,-1,-1.0,"(1204.0, 62.0)",0.325604
0,1,C-02,180,Staurosporine,0.0,negative,1,1,0.0,12,...,"(1380.8329879817654, 45.818897637795274)",4,0.0,46.0,1381.0,1000011.0,-1,-1.0,"(1381.0, 46.0)",0.246356


In [6]:
# get the number of tracks for each track length
# a single groupby counts the rows of every track in one pass instead of
# re-scanning the whole frame once per track; sort=False keeps the tracks in
# order of first appearance (rows without a track id are not counted)
list_of_track_lengths = (
    merged_df.groupby("NMF_track_id", sort=False).size().tolist()
)
list_of_track_lengths_df = pd.DataFrame(list_of_track_lengths, columns=["track_length"])
# collapse to one row per distinct track length with its number of tracks
list_of_track_lengths_df = (
    list_of_track_lengths_df.value_counts().to_frame().reset_index()
)
list_of_track_lengths_df["well_fov"] = well_fov
# save the list of track lengths to a parquet file
list_of_track_lengths_df.to_parquet(
    stats_output_dir / f"{well_fov}_track_lengths.parquet"
)

In [7]:
# share of profiled cells that were matched to a track; undefined (NaN) when
# the well/FOV has no cells, which would otherwise raise ZeroDivisionError
percentage_annotated_cells = (
    total_annotated_cells / total_CP_cells * 100 if total_CP_cells else float("nan")
)
# one-row summary of the matching for this well/FOV
well_fov_stats_df = pd.DataFrame(
    {
        "well_fov": [well_fov],
        "total_CP_cells": [total_CP_cells],
        "total_annotated_cells": [total_annotated_cells],
        "percentage_annotated_cells": [percentage_annotated_cells],
    }
)
# save the stats to a parquet file
well_fov_stats_df.to_parquet(stats_output_dir / f"{well_fov}_stats.parquet")
# last expression of the cell so the notebook actually renders the summary
well_fov_stats_df