# Merge single cells from CellProfiler outputs using CytoTable

In [1]:
import argparse
import pathlib
import pprint
import sys

import pandas as pd
from cytotable import convert, presets

sys.path.append("../../utils")
import sc_extraction_utils as sc_utils
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

# check if in a jupyter notebook
try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

## Set paths and variables

All paths must be string but we use pathlib to show which variables are paths

In [None]:
# type of file output from CytoTable (currently only parquet)
dest_datatype = "parquet"

# s1lite directory
source_dir = pathlib.Path(
    "../../4.cellprofiler_analysis/analysis_output/test/"
).resolve(strict=True)
# directory where parquet files are saved to
output_dir = pathlib.Path("../data/0.converted_data")
output_dir.mkdir(exist_ok=True, parents=True)

if not in_notebook:
    print("Running as script")
    # set up arg parser
    parser = argparse.ArgumentParser(description="Single cell extraction")

    parser.add_argument(
        "--well_fov",
        type=str,
        help="Path to the input directory containing the tiff images",
    )

    args = parser.parse_args()
    well_fov = args.well_fov
else:
    print("Running in a notebook")
    well_fov = "C-02_F0001"

Running in a notebook


## set config joins for each preset

In [3]:
# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"

In [4]:
dict_of_inputs = {
    "20231017ChromaLive_6hr_4ch_MaxIP": {
        "source_path": pathlib.Path(
            f"{source_dir}/timelapse/{well_fov}/timelapse_4ch_analysis.sqlite"
        ).resolve(strict=True),
        "dest_path": pathlib.Path(
            f"{output_dir}/timelapse/{well_fov}.parquet"
        ).resolve(),
        "preset": """WITH Per_Image_Filtered AS (
                SELECT
                    Metadata_ImageNumber,
                    Image_Metadata_Well,
                    Image_Metadata_FOV,
                    Image_Metadata_Time,
                    Image_PathName_CL_488_1
                    Image_PathName_CL_488_2,
                    Image_PathName_CL_561,
                    Image_FileName_CL_488_1,
                    Image_FileName_CL_488_2,
                    Image_FileName_CL_561,
                    Image_FileName_DNA,

                FROM
                    read_parquet('per_image.parquet')
                )
            SELECT
                *
            FROM
                Per_Image_Filtered AS per_image
            LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
                per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
            LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
                per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
                AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
            LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
                per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
                AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
                """,
    },
    "20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP": {
        "source_path": pathlib.Path(
            f"{source_dir}/endpoint/{well_fov}/timelapse_2ch_analysis.sqlite"
        ).resolve(),
        "dest_path": pathlib.Path(
            f"{output_dir}/endpoint/{well_fov}.parquet"
        ).resolve(),
        "preset": """WITH Per_Image_Filtered AS (
                SELECT
                    Metadata_ImageNumber,
                    Image_Metadata_Well,
                    Image_Metadata_FOV,
                    Image_Metadata_Time,
                    Image_PathName_AnnexinV,
                    Image_PathName_DNA,
                    Image_FileName_AnnexinV,
                    Image_FileName_DNA


                FROM
                    read_parquet('per_image.parquet')
                )
            SELECT
                *
            FROM
                Per_Image_Filtered AS per_image
            LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
                per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
            LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
                per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
                AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
            LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
                per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
                AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
                """,
    },
    "20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP_whole_image": {
        "source_path": pathlib.Path(
            f"{source_dir}/endpoint/{well_fov}/timelapse_2ch_analysis.sqlite"
        ).resolve(),
        "dest_path": pathlib.Path(
            f"{output_dir}/endpoint/{well_fov}_whole_image.parquet"
        ).resolve(),
        "preset": """WITH Per_Image_Filtered AS (
                SELECT
                    Metadata_ImageNumber,
                    Image_Metadata_Well,
                    Image_Metadata_FOV,
                    Image_Metadata_Time,
                    Image_PathName_AnnexinV,
                    Image_PathName_DNA,
                    Image_FileName_AnnexinV,
                    Image_FileName_DNA


                FROM
                    read_parquet('per_image.parquet')
                )
            SELECT
                *
            FROM
                Per_Image_Filtered AS per_image
            LEFT JOIN read_parquet('per_cytoplasm.parquet') AS per_cytoplasm ON
                per_cytoplasm.Metadata_ImageNumber = per_image.Metadata_ImageNumber
            LEFT JOIN read_parquet('per_cells.parquet') AS per_cells ON
                per_cells.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
                AND per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells
            LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
                per_nuclei.Metadata_ImageNumber = per_cytoplasm.Metadata_ImageNumber
                AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
                """,
    },
}

## Convert SQLite file and merge single cells into parquet file

This was not run to completion as we use the nbconverted python file for full run.

In [5]:
# run through each run with each set of paths based on dictionary
for sqlite_file, info in dict_of_inputs.items():
    source_path = info["source_path"]
    dest_path = info["dest_path"]
    presets.config["cellprofiler_sqlite_pycytominer"]["CONFIG_JOINS"] = info["preset"]
    print(f"Performing merge single cells and conversion on {sqlite_file}!")
    print(f"Source path: {source_path}")
    print(f"Destination path: {dest_path}")
    # merge single cells and output as parquet file
    convert(
        source_path=source_path,
        dest_path=dest_path,
        dest_datatype=dest_datatype,
        preset=preset,
        parsl_config=Config(
            executors=[HighThroughputExecutor()],
        ),
        chunk_size=10000,
    )
    print(f"Merged and converted {pathlib.Path(dest_path).name}!")
    df = pd.read_parquet(dest_path)
    print(f"Shape of {pathlib.Path(dest_path).name}: {df.shape}")
    # add single cell count per well as metadata column to parquet file and save back to same path
    sc_utils.add_sc_count_metadata_file(
        data_path=dest_path,
        well_column_name="Metadata_ImageNumber",
        file_type="parquet",
    )
    # read the parquet file to check if metadata was added
    df1 = pd.read_parquet(dest_path)
    print(f"Shape of {pathlib.Path(dest_path).name}: {df.shape}")
    print(f"Added single cell count as metadata to {pathlib.Path(dest_path).name}!")

Performing merge single cells and conversion on 20231017ChromaLive_6hr_4ch_MaxIP!
Source path: /home/lippincm/Documents/live_cell_timelapse_apoptosis/4.cellprofiler_analysis/analysis_output/test/timelapse/C-02_F0001/timelapse_4ch_analysis.sqlite
Destination path: /home/lippincm/Documents/live_cell_timelapse_apoptosis/5.process_CP_features/data/0.converted_data/timelapse/C-02_F0001.parquet
Merged and converted C-02_F0001.parquet!
Shape of C-02_F0001.parquet: (2309, 2317)


Reusing previously loaded Parsl configuration.


Shape of C-02_F0001.parquet: (2309, 2317)
Added single cell count as metadata to C-02_F0001.parquet!
Performing merge single cells and conversion on 20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP!
Source path: /home/lippincm/Documents/live_cell_timelapse_apoptosis/4.cellprofiler_analysis/analysis_output/test/endpoint/C-02_F0001/timelapse_2ch_analysis.sqlite
Destination path: /home/lippincm/Documents/live_cell_timelapse_apoptosis/5.process_CP_features/data/0.converted_data/endpoint/C-02_F0001.parquet
Merged and converted C-02_F0001.parquet!
Shape of C-02_F0001.parquet: (155, 1201)


Reusing previously loaded Parsl configuration.


Shape of C-02_F0001.parquet: (155, 1201)
Added single cell count as metadata to C-02_F0001.parquet!
Performing merge single cells and conversion on 20231017ChromaLive_endpoint_w_AnnexinV_2ch_MaxIP_whole_image!
Source path: /home/lippincm/Documents/live_cell_timelapse_apoptosis/4.cellprofiler_analysis/analysis_output/test/endpoint/C-02_F0001/timelapse_2ch_analysis.sqlite
Destination path: /home/lippincm/Documents/live_cell_timelapse_apoptosis/5.process_CP_features/data/0.converted_data/endpoint/C-02_F0001_whole_image.parquet
Merged and converted C-02_F0001_whole_image.parquet!
Shape of C-02_F0001_whole_image.parquet: (155, 1201)
Shape of C-02_F0001_whole_image.parquet: (155, 1201)
Added single cell count as metadata to C-02_F0001_whole_image.parquet!


In [6]:
df1.head()

Unnamed: 0,Metadata_ImageNumber,Image_Metadata_FOV,Metadata_number_of_singlecells,Image_Metadata_Time,Image_Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_ImageNumber_1,Metadata_ImageNumber_2,...,Nuclei_Texture_SumVariance_DNA_3_02_256,Nuclei_Texture_SumVariance_DNA_3_03_256,Nuclei_Texture_Variance_AnnexinV_3_00_256,Nuclei_Texture_Variance_AnnexinV_3_01_256,Nuclei_Texture_Variance_AnnexinV_3_02_256,Nuclei_Texture_Variance_AnnexinV_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256
0,1,1,155,,C-02,1,1,2,1,1,...,0.0,0.0,0.033796,0.032255,0.032193,0.026836,0.0,0.0,0.0,0.0
1,1,1,155,,C-02,2,2,4,1,1,...,0.0,0.0,0.02886,0.032965,0.041805,0.029744,0.0,0.0,0.0,0.0
2,1,1,155,,C-02,3,3,3,1,1,...,0.0,0.0,0.381156,0.403686,0.536844,0.11,0.0,0.0,0.0,0.0
3,1,1,155,,C-02,4,4,6,1,1,...,1.66911,1.61948,0.189713,0.184953,0.187761,0.179701,0.584543,0.576708,0.575668,0.584256
4,1,1,155,,C-02,5,5,5,1,1,...,0.0,0.0,0.216543,0.230878,0.214638,0.208162,0.0,0.0,0.0,0.0
