# Annotate merged single cells with metadata from platemap file

## Import libraries

In [1]:
import pathlib
import sys

import lancedb
import pandas as pd
from pycytominer import annotate
from pycytominer.cyto_utils import output

  from .autonotebook import tqdm as notebook_tqdm


## Set paths and variables

In [2]:
# directory that holds the platemap CSV files (resolved to an absolute path);
# the individual platemaps are read later, per run
platemap_path = pathlib.Path("../../data/").resolve()

# directory where parquet files are located
data_dir = pathlib.Path("../data/converted_data")

# directory where the annotated parquet files are saved to
output_dir = pathlib.Path("../data/annotated_data")
# parents=True so a missing intermediate directory does not raise FileNotFoundError
output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# one entry per imaging run, keyed by run name:
#   source_path   -> converted single-cell parquet file for the run
#   platemap_path -> platemap CSV for the run
# strict resolution raises immediately if either file is missing
dict_of_inputs = {
    "run_20231017ChromaLive_6hr_4ch_MaxIP": dict(
        source_path=pathlib.Path(
            data_dir, "20231017ChromaLive_6hr_4ch_MaxIP.parquet"
        ).resolve(strict=True),
        platemap_path=pathlib.Path(platemap_path, "platemap_6hr_4ch.csv").resolve(
            strict=True
        ),
    ),
    # endpoint run is currently disabled
    # "20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP": dict(
    #     source_path=pathlib.Path(
    #         data_dir, "20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP.parquet"
    #     ).resolve(strict=True),
    #     platemap_path=pathlib.Path(
    #         platemap_path, "platemap_AnnexinV_2ch.csv"
    #     ).resolve(strict=True),
    # ),
}

## Annotate merged single cells

In [4]:
# any column whose name contains one of these substrings is given a
# "Metadata_" prefix after annotation
metadata_like_tags = ("Image_FileName", "Image_PathName", "TrackObjects")

for data_run, info in dict_of_inputs.items():
    # read the converted single-cell profiles for this run
    single_cell_df = pd.read_parquet(info["source_path"])
    print(single_cell_df.shape)
    # FOV and timepoint are renamed to the Metadata_ prefix before annotation
    single_cell_df = single_cell_df.rename(
        columns={
            "Image_Metadata_FOV": "Metadata_FOV",
            "Image_Metadata_Time": "Metadata_Time",
        }
    )
    platemap_df = pd.read_csv(info["platemap_path"])
    output_file = str(pathlib.Path(output_dir, f"{data_run}_sc.parquet"))
    print(f"Adding annotations to merged single cells for {data_run}!")

    # join the platemap onto the profiles: platemap "Metadata_well" is matched
    # against the profile column "Image_Metadata_Well"
    annotated_df = annotate(
        profiles=single_cell_df,
        platemap=platemap_df,
        join_on=["Metadata_well", "Image_Metadata_Well"],
    )
    print(annotated_df.shape)

    # place the well and the single-cell count at positions 1 and 2;
    # both columns are removed first and then re-inserted in order
    front_names = ["Metadata_Well", "Metadata_number_of_singlecells"]
    front_values = [annotated_df.pop(name) for name in front_names]
    for position, (name, values) in enumerate(zip(front_names, front_values), start=1):
        annotated_df.insert(position, name, values)

    # build the rename map: the nuclei centroid columns plus every
    # file-name / path-name / tracking column get the Metadata_ prefix
    columns_to_rename = {
        "Nuclei_Location_Center_Y": "Metadata_Nuclei_Location_Center_Y",
        "Nuclei_Location_Center_X": "Metadata_Nuclei_Location_Center_X",
    }
    columns_to_rename.update(
        {
            col: f"Metadata_{col}"
            for col in annotated_df.columns
            if any(tag in col for tag in metadata_like_tags)
        }
    )
    annotated_df = annotated_df.rename(columns=columns_to_rename)

    # write the annotated profiles for this run to parquet
    output(
        df=annotated_df,
        output_filename=output_file,
        output_type="parquet",
    )
    print(f"Annotations have been added to {data_run} and saved!")
    # report the final shape as a quick sanity check on the annotation
    print(annotated_df.shape)

(19382, 2321)
Adding annotations to merged single cells for run_20231017ChromaLive_6hr_4ch_MaxIP!
(19382, 2325)
Annotations have been added to run_20231017ChromaLive_6hr_4ch_MaxIP and saved!
(19382, 2325)


### Merge the terminal and single cell data

## Add the object tracking from SAM2

In [5]:
# resolve the on-disk location of the objects database to an absolute path
objects_db_dir = pathlib.Path("../../data", "objects_db")
uri = objects_db_dir.resolve()
# open a LanceDB connection on that location
db = lancedb.connect(uri)

In [6]:
# query the table names in the database (the result is not used further)
db.table_names()
# fetch the masked-images table and materialise it as a pandas dataframe
table_name = "1.masked_images"
table = db[table_name]
location_metadata_df = table.to_pandas()
print(location_metadata_df.shape)
location_metadata_df.head()

(15557, 8)


Unnamed: 0,image_set_name,frame,object_id,x,y,mask_path,mask_file_name,mask_file_path
0,E-11_F0002,0,1,120.634918,15.555555,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...
1,E-11_F0002,0,2,293.525635,23.012821,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...
2,E-11_F0002,0,3,1115.543457,20.380434,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...
3,E-11_F0002,0,4,46.547619,38.988094,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...
4,E-11_F0002,0,5,1820.597778,37.445652,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...


In [7]:
# the tracking table stores the timepoint as "frame"; use the single-cell name
location_metadata_df = location_metadata_df.rename(columns={"frame": "Metadata_Time"})
# shift by one so the values line up with the timepoints in the single cell
# data, then zero-pad to a width of 4 characters (e.g. 1 -> "0001")
shifted_time = location_metadata_df["Metadata_Time"] + 1
location_metadata_df["Metadata_Time"] = shifted_time.apply("{:04}".format)
# per-object name: "<image_set_name>_<object_id>"
object_id_text = location_metadata_df["object_id"].astype(str)
location_metadata_df["Metadata_unique_cell_name"] = (
    location_metadata_df["image_set_name"] + "_" + object_id_text
)
print(location_metadata_df.shape)
location_metadata_df.head()

(15557, 9)


Unnamed: 0,image_set_name,Metadata_Time,object_id,x,y,mask_path,mask_file_name,mask_file_path,Metadata_unique_cell_name
0,E-11_F0002,1,1,120.634918,15.555555,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,E-11_F0002_1
1,E-11_F0002,1,2,293.525635,23.012821,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,E-11_F0002_2
2,E-11_F0002,1,3,1115.543457,20.380434,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,E-11_F0002_3
3,E-11_F0002,1,4,46.547619,38.988094,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,E-11_F0002_4
4,E-11_F0002,1,5,1820.597778,37.445652,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,E-11_F0002_5


### Loop through the saved annotated dfs and add the object tracking

In [8]:
def _move_columns_to_front(df, front_columns):
    """Return ``df`` with ``front_columns`` first, in the order given.

    All remaining columns keep their relative order. One column reindex is
    used instead of repeated ``pop``/``insert`` calls, which can raise pandas'
    fragmentation PerformanceWarning on very wide frames.
    """
    front_columns = list(front_columns)
    remaining = [col for col in df.columns if col not in front_columns]
    return df.reindex(columns=front_columns + remaining)


# Cast the tracked centroids to whole pixels once, before the loop. astype
# returns a new frame, so nothing is assigned onto a filtered slice.
location_metadata_df = location_metadata_df.astype({"x": int, "y": int})
# timepoint "0014" is the endpoint timepoint
is_endpoint_timepoint = location_metadata_df["Metadata_Time"] == "0014"

for data_run in dict_of_inputs:
    # Endpoint runs keep only the endpoint timepoint; every other run drops it.
    # The filter goes into a per-run frame so that it never leaks into the next
    # run (overwriting location_metadata_df here would leave later runs with an
    # already-filtered table). As a result location_metadata_df itself stays
    # unfiltered after the loop.
    if "endpoint" in data_run:
        run_location_df = location_metadata_df[is_endpoint_timepoint]
    else:
        run_location_df = location_metadata_df[~is_endpoint_timepoint]

    # load the annotated single-cell profiles saved for this run
    annotated_df = pd.read_parquet(
        pathlib.Path(f"{output_dir}/{data_run}_sc.parquet").resolve(strict=True)
    )
    print(f"Oringinal shape of {data_run} is {annotated_df.shape}")
    print(f"Adding location metadata to single cells for {data_run}!")

    # image set name is "<well>_F<FOV>", the same key used by the tracking table
    annotated_df["Metadata_image_set_name"] = (
        annotated_df["Metadata_Well"].astype(str)
        + "_F"
        + annotated_df["Metadata_FOV"].astype(str)
    )
    # bring the merge keys to the front for easier inspection
    annotated_df = _move_columns_to_front(
        annotated_df,
        [
            "Metadata_image_set_name",
            "Metadata_Time",
            "Metadata_Nuclei_Location_Center_X",
            "Metadata_Nuclei_Location_Center_Y",
        ],
    )

    # rows without a nuclei centroid cannot be matched to a tracked object
    num_cells_over_time = annotated_df.shape[0]
    annotated_df = annotated_df.dropna(
        subset=[
            "Metadata_Nuclei_Location_Center_X",
            "Metadata_Nuclei_Location_Center_Y",
        ]
    )
    print(
        f"There were {num_cells_over_time - annotated_df.shape[0]} NaN values in the centroid columns"
    )

    # truncate the centroids to whole pixels so they can be compared exactly
    # with the integer x / y of the tracking table
    annotated_df = annotated_df.astype(
        {
            "Metadata_Nuclei_Location_Center_X": int,
            "Metadata_Nuclei_Location_Center_Y": int,
        }
    )

    # NOTE(review): this is an exact match on integer pixel coordinates,
    # timepoint and image set. The recorded run kept only 3 rows, so the two
    # coordinate frames may differ (a "1900 - coordinate" flip was being
    # explored) -- TODO confirm before relying on merged_df.
    merged_df = pd.merge(
        annotated_df,
        run_location_df,
        how="inner",
        left_on=[
            "Metadata_Nuclei_Location_Center_X",
            "Metadata_Nuclei_Location_Center_Y",
            "Metadata_Time",
            "Metadata_image_set_name",
        ],
        right_on=["x", "y", "Metadata_Time", "image_set_name"],
    )
    merged_df = merged_df.sort_values(by=["Metadata_image_set_name"])

    # identifiers and both coordinate pairs first, everything else after
    merged_df = _move_columns_to_front(
        merged_df,
        [
            "Metadata_image_set_name",
            "Metadata_unique_cell_name",
            "object_id",
            "x",
            "y",
            "Metadata_Nuclei_Location_Center_X",
            "Metadata_Nuclei_Location_Center_Y",
        ],
    )
    print(f"The final merged shape of {data_run} is {merged_df.shape}")
    # saving the merged frame is currently disabled
    # output(
    #     df=merged_df,
    #     output_filename=output_file,
    #     output_type="parquet",
    # )

Oringinal shape of run_20231017ChromaLive_6hr_4ch_MaxIP is (19382, 2325)
Adding location metadata to single cells for run_20231017ChromaLive_6hr_4ch_MaxIP!
There were 13466 NaN values in the centroid columns
The final merged shape of run_20231017ChromaLive_6hr_4ch_MaxIP is (3, 2334)


  annotated_df.insert(2, "Metadata_Nuclei_Location_Center_X", x_coord)
  annotated_df.insert(3, "Metadata_Nuclei_Location_Center_Y", Y_coord)


In [9]:
# display the merged frame left over from the last run processed by the loop above
merged_df

Unnamed: 0,Metadata_image_set_name,Metadata_unique_cell_name,object_id,x,y,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Time,Metadata_plate,Metadata_Well,...,Nuclei_Texture_Variance_CL_561_3_02_256,Nuclei_Texture_Variance_CL_561_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256,image_set_name,mask_path,mask_file_name,mask_file_path
0,C-02_F0003,C-02_F0003_156,156,1529,1657,1529,1657,4,1,C-02,...,0.109375,0.0,0.0,0.0,0.0,0.0,C-02_F0003,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,3.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...
1,E-11_F0001,E-11_F0001_16,16,1185,209,1185,209,3,1,E-11,...,1.305257,1.135521,0.0,0.0,0.0,0.0,E-11_F0001,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,2.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...
2,E-11_F0004,E-11_F0004_44,44,963,519,963,519,2,1,E-11,...,0.214167,0.207064,0.001592,0.001658,0.001639,0.001862,E-11_F0004,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,1.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...


In [10]:
# order rows by image set, then timepoint, then nuclei centroid (X before Y),
# and renumber the index from 0 afterwards
annotated_sort_keys = [
    "Metadata_image_set_name",
    "Metadata_Time",
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
]
annotated_df = annotated_df.sort_values(by=annotated_sort_keys).reset_index(drop=True)
annotated_df.head()

Unnamed: 0,Metadata_image_set_name,Metadata_Time,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,...,Nuclei_Texture_Variance_CL_488_2_3_02_256,Nuclei_Texture_Variance_CL_488_2_3_03_256,Nuclei_Texture_Variance_CL_561_3_00_256,Nuclei_Texture_Variance_CL_561_3_01_256,Nuclei_Texture_Variance_CL_561_3_02_256,Nuclei_Texture_Variance_CL_561_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256
0,C-02_F0001,1,52,1789,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C-02_F0001,1,82,1478,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C-02_F0001,1,85,635,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C-02_F0001,1,119,1644,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.488674,0.513299,0.471069,0.447777,0.0,0.0,0.0,0.0
4,C-02_F0001,1,129,829,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# mirror the Y centroid: new_y = 1900 - old_y
# NOTE(review): 1900 is presumably the image edge length in pixels -- TODO confirm
center_y_column = "Metadata_Nuclei_Location_Center_Y"
annotated_df[center_y_column] = 1900 - annotated_df[center_y_column]
annotated_df.head()

Unnamed: 0,Metadata_image_set_name,Metadata_Time,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,...,Nuclei_Texture_Variance_CL_488_2_3_02_256,Nuclei_Texture_Variance_CL_488_2_3_03_256,Nuclei_Texture_Variance_CL_561_3_00_256,Nuclei_Texture_Variance_CL_561_3_01_256,Nuclei_Texture_Variance_CL_561_3_02_256,Nuclei_Texture_Variance_CL_561_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256
0,C-02_F0001,1,52,111,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C-02_F0001,1,82,422,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C-02_F0001,1,85,1265,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C-02_F0001,1,119,256,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.488674,0.513299,0.471069,0.447777,0.0,0.0,0.0,0.0
4,C-02_F0001,1,129,1071,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# mirror the X centroid: new_x = 1900 - old_x
# NOTE(review): 1900 is presumably the image edge length in pixels -- TODO confirm
center_x_column = "Metadata_Nuclei_Location_Center_X"
annotated_df[center_x_column] = 1900 - annotated_df[center_x_column]
annotated_df.head()

Unnamed: 0,Metadata_image_set_name,Metadata_Time,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,...,Nuclei_Texture_Variance_CL_488_2_3_02_256,Nuclei_Texture_Variance_CL_488_2_3_03_256,Nuclei_Texture_Variance_CL_561_3_00_256,Nuclei_Texture_Variance_CL_561_3_01_256,Nuclei_Texture_Variance_CL_561_3_02_256,Nuclei_Texture_Variance_CL_561_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256
0,C-02_F0001,1,1848,111,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C-02_F0001,1,1818,422,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C-02_F0001,1,1815,1265,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C-02_F0001,1,1781,256,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.488674,0.513299,0.471069,0.447777,0.0,0.0,0.0,0.0
4,C-02_F0001,1,1771,1071,1,C-02,162,Staurosporine,0.0,negative,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# order the tracking table by image set, timepoint and then x / y,
# and renumber the index from 0 afterwards
location_sort_keys = ["image_set_name", "Metadata_Time", "x", "y"]
location_metadata_df = location_metadata_df.sort_values(
    by=location_sort_keys
).reset_index(drop=True)
location_metadata_df.head(25)

Unnamed: 0,image_set_name,Metadata_Time,object_id,x,y,mask_path,mask_file_name,mask_file_path,Metadata_unique_cell_name
0,C-02_F0001,1,128,15,1483,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,C-02_F0001_128
1,C-02_F0001,1,81,18,850,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,C-02_F0001_81
2,C-02_F0001,1,8,22,45,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,C-02_F0001_8
3,C-02_F0001,1,96,22,1054,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,C-02_F0001_96
4,C-02_F0001,1,148,72,1687,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,C-02_F0001_148
5,C-02_F0001,1,75,85,773,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,C-02_F0001_75
6,C-02_F0001,1,105,88,1165,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,C-02_F0001_105
7,C-02_F0001,1,169,90,1886,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,C-02_F0001_169
8,C-02_F0001,1,32,105,291,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,C-02_F0001_32
9,C-02_F0001,1,11,107,65,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,0.png,/gpfs/alpine1/scratch/mlippincott@xsede.org/li...,C-02_F0001_11


In [14]:
# print(annotated_df.shape)
# annotated_df["Metadata_image_set_name"] = (
#     annotated_df["Metadata_Well"].astype(str)
#     + "_"
#     + "F"
#     + annotated_df["Metadata_FOV"].astype(str)
# )
# image_set_names = annotated_df.pop("Metadata_image_set_name")
# # move to front
# annotated_df.insert(0, "Metadata_image_set_name", image_set_names)
# time = annotated_df.pop("Metadata_Time")
# annotated_df.insert(1, "Metadata_Time", time)
# x_coord = annotated_df.pop("Metadata_Nuclei_Location_Center_X")
# Y_coord = annotated_df.pop("Metadata_Nuclei_Location_Center_Y")
# annotated_df.insert(2, "Metadata_Nuclei_Location_Center_X", x_coord)
# annotated_df.insert(3, "Metadata_Nuclei_Location_Center_Y", Y_coord)
# annotated_df.head()

In [15]:
# # drop NaN values in the centroid columns from annotated_df
# print(annotated_df.shape)
# annotated_df = annotated_df.dropna(
#     subset=["Metadata_Nuclei_Location_Center_X", "Metadata_Nuclei_Location_Center_Y"]
# )
# print(annotated_df.shape)
# print(location_metadata_df.shape)
# # match the x and y coordinates to the image set name in the location metadata df
# annotated_df["Metadata_Nuclei_Location_Center_X"] = annotated_df[
#     "Metadata_Nuclei_Location_Center_X"
# ].astype(int)
# annotated_df["Metadata_Nuclei_Location_Center_Y"] = annotated_df[
#     "Metadata_Nuclei_Location_Center_Y"
# ].astype(int)
# location_metadata_df["x"] = location_metadata_df["x"].astype(int)
# location_metadata_df["y"] = location_metadata_df["y"].astype(int)

# merged_df = annotated_df.merge(
#     location_metadata_df,
#     how="left",
#     left_on=[
#         "Metadata_Nuclei_Location_Center_X",
#         "Metadata_Nuclei_Location_Center_Y",
#         "Metadata_Time",
#         "Metadata_image_set_name",
#     ],
#     right_on=["x", "y", "Metadata_Time", "image_set_name"],
# )
# print(merged_df.shape)
# # sort by image_set_name and Metadata_Time
# merged_df = merged_df.sort_values(by=["Metadata_image_set_name", "Metadata_Time"])
# # drop right columns
# merged_df = merged_df.drop(
#     columns=[
#         "image_set_name",
#         "object_id",
#         "x",
#         "y",
#         "mask_path",
#         "mask_file_name",
#         "mask_file_path",
#     ]
# )
# merged_df.head()

In [16]:
# # save annotated df as parquet file
# output(
#     df=merged_df,
#     output_filename=output_file,
#     output_type="parquet",
# )