# Merge single cells from CellProfiler outputs using CytoTable

## Import libraries

In [1]:
import sys
import pathlib
import yaml
import pprint
import pandas as pd

# cytotable will merge objects from SQLite file into single cells and save as parquet file
from cytotable import convert, presets

# import utility to use function that will add single-cell count per well as a metadata column
sys.path.append("../utils")
import extraction_utils as sc_utils

## Set paths and variables

In [2]:
# type of file output from CytoTable (currently only parquet)
dest_datatype = "parquet"

# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"

# update preset to include site metadata and cell counts
joins = presets.config["cellprofiler_sqlite_pycytominer"]["CONFIG_JOINS"].replace(
    "Image_Metadata_Well,",
    "Image_Metadata_Well, Image_Metadata_Site,",
)

# Add the PathName columns separately
joins = joins.replace(
    "COLUMNS('Image_FileName_.*'),",
    "COLUMNS('Image_FileName_.*'),\n COLUMNS('Image_PathName_.*'),"
)

# set main output dir for all parquet files
output_dir = pathlib.Path("./data/converted_data/")
output_dir.mkdir(exist_ok=True)

# directory where SQLite files are located
sqlite_dir = pathlib.Path("../2.cellprofiler_analysis/analysis_output/")

# list for plate names based on folders to use to create dictionary
plate_names = []

# iterate through 0.download_data and append plate names from folder names
# that contain image data from that plate
# (Note, you must first run `0.download_data/download_plates.ipynb`)
for file_path in pathlib.Path("../0.download_data/").iterdir():
    if str(file_path.stem).startswith("Plate"):
        plate_names.append(str(file_path.stem))

print(plate_names)

['Plate_4', 'Plate_2', 'Plate_3_prime', 'Plate_6', 'Plate_5', 'Plate_1', 'Plate_3']


## Create dictionary with info for each plate

**Note:** All paths must be string to use with CytoTable.

In [3]:
# create plate info dictionary with all parts of the CellProfiler CLI command to run in parallel
plate_info_dictionary = {
    name: {
        "source_path": str(
            pathlib.Path(
                list(sqlite_dir.rglob(f"{name}_nf1_analysis.sqlite"))[0]
            ).resolve(strict=True)
        ),
        "dest_path": str(pathlib.Path(f"{output_dir}/{name}.parquet")),
    }
    for name in plate_names
    if not pathlib.Path(
        f"{output_dir}/{name}.parquet"
    ).exists()  # skip if parquet file exists
}

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'Plate_3': {   'dest_path': 'data/converted_data/Plate_3.parquet',
                   'source_path': '/media/18tbdrive/1.Github_Repositories/nf1_schwann_cell_painting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},
    'Plate_3_prime': {   'dest_path': 'data/converted_data/Plate_3_prime.parquet',
                         'source_path': '/media/18tbdrive/1.Github_Repositories/nf1_schwann_cell_painting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},
    'Plate_5': {   'dest_path': 'data/converted_data/Plate_5.parquet',
                   'source_path': '/media/18tbdrive/1.Github_Repositories/nf1_schwann_cell_painting_data/2.cellprofiler_analysis/analysis_output/Plate_5/Plate_5_nf1_analysis.sqlite'},
    'Plate_6': {   'dest_path': 'data/converted_data/Plate_6.parquet',
                   'source_path': '/media/18tbdrive/1.Github_Repositories/nf1_schwann_cell_painting_data/2.cellprofiler_analysis/analy

## Merge objects to single cells and convert SQLite to parquet file + add single cell metadata

In [4]:
# run through each run with each set of paths based on dictionary
for plate, info in plate_info_dictionary.items():
    source_path = info["source_path"]
    dest_path = info["dest_path"]

    print(f"Performing merge single cells and conversion on {plate}!")

    # merge single cells and output as parquet file
    convert(
        source_path=source_path,
        dest_path=dest_path,
        dest_datatype=dest_datatype,
        preset=preset,
        joins=joins,
    )
    print(f"Merged and converted {pathlib.Path(dest_path).name}!")

    # add single cell count per well as metadata column to parquet file and save back to same path
    sc_utils.add_sc_count_metadata_file(
        data_path=dest_path, well_column_name="Image_Metadata_Well", file_type="parquet"
    )

    print(f"Added single cell count as metadata to {pathlib.Path(dest_path).name}!")

Performing merge single cells and conversion on Plate_3_prime!
Merged and converted Plate_3_prime.parquet!


Reusing previously loaded Parsl configuration.


Added single cell count as metadata to Plate_3_prime.parquet!
Performing merge single cells and conversion on Plate_6!
Merged and converted Plate_6.parquet!


Reusing previously loaded Parsl configuration.


Added single cell count as metadata to Plate_6.parquet!
Performing merge single cells and conversion on Plate_5!
Merged and converted Plate_5.parquet!


Reusing previously loaded Parsl configuration.


Added single cell count as metadata to Plate_5.parquet!
Performing merge single cells and conversion on Plate_3!
Merged and converted Plate_3.parquet!
Added single cell count as metadata to Plate_3.parquet!


## Update the files to remove NA rows added as artifacts of CytoTable

In [5]:
for plate, info in plate_info_dictionary.items():
    file_path = pathlib.Path(info["dest_path"])

    # Load the DataFrame from the Parquet file
    df = pd.read_parquet(file_path)

    # assert that there are column names with PathName and FileName in the dataset
    assert any("PathName" in col or "FileName" in col for col in df.columns)

    # Check for NaNs in "Metadata_ImageNumber" column
    if df["Metadata_ImageNumber"].isna().any():
        print(f"NaNs found in 'Metadata_ImageNumber' column for {plate}")
        # If any, drop rows where "Metadata_ImageNumber" is NaN (artifact of cytotable)
        df = df.dropna(subset=["Metadata_ImageNumber"])
    else:
        print(f"No NaNs found in 'Metadata_ImageNumber' column for {plate}")

    # Columns to rename with Metadata prefix
    columns_to_rename = [
        "Nuclei_Location_Center_X",
        "Nuclei_Location_Center_Y",
        "Cells_Location_Center_X",
        "Cells_Location_Center_Y",
    ]

    # Rename columns with "Metadata_" prefix
    df = df.rename(
        columns=lambda col: "Metadata_" + col if col in columns_to_rename else col
    )

    # Move all columns that start with "Image_" or "Metadata_" to the front
    metadata_columns = [col for col in df.columns if col.startswith("Image_") or col.startswith("Metadata_")]
    other_columns = [col for col in df.columns if col not in metadata_columns]
    df = df[metadata_columns + other_columns]

    # Save the processed DataFrame as Parquet in the same path
    df.to_parquet(file_path, index=False)

No NaNs found in 'Metadata_ImageNumber' column for Plate_3_prime
No NaNs found in 'Metadata_ImageNumber' column for Plate_6
No NaNs found in 'Metadata_ImageNumber' column for Plate_5
No NaNs found in 'Metadata_ImageNumber' column for Plate_3


### Check if converted data looks correct

In [6]:
# Automatically select one plate from the current dictionary
selected_plate = next(iter(plate_info_dictionary))
print(f"Selected plate: {selected_plate}")

# Load the DataFrame from the Parquet file of the selected plate
converted_df = pd.read_parquet(plate_info_dictionary[selected_plate]["dest_path"])

# Print the shape and head of the DataFrame
print(converted_df.shape)
converted_df.head()

Selected plate: Plate_3_prime
(7300, 2321)


Unnamed: 0,Metadata_ImageNumber,Image_Metadata_Plate,Metadata_number_of_singlecells,Image_Metadata_Site,Image_Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Nuclei_Number_Object_Number,Image_FileName_CY5,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,183,Plate_3_prime,321,15,C12,1,1,8,8,C12_01_3_15_CY5_001_illumcorrect.tiff,...,2666.512416,2725.83354,569.550302,545.735441,549.774822,573.410232,206.586243,202.444229,200.314037,210.27864
1,65,Plate_3_prime,76,20,B3,1,1,6,6,B3_01_3_20_CY5_001_illumcorrect.tiff,...,734.772898,667.564667,184.936092,169.177476,173.206454,170.684616,474.013885,468.030715,516.87258,490.639492
2,84,Plate_3_prime,219,8,B4,1,1,6,6,B4_01_3_8_CY5_001_illumcorrect.tiff,...,1643.702342,1669.490817,228.022375,234.970245,233.153803,236.468362,216.805122,218.543485,214.610375,210.813835
3,179,Plate_3_prime,321,10,C12,1,1,6,6,C12_01_3_10_CY5_001_illumcorrect.tiff,...,820.456512,804.943815,65.933688,64.745949,63.288276,64.09995,145.008956,144.465398,144.19284,144.019745
4,234,Plate_3_prime,137,14,C3,1,1,6,6,C3_01_3_14_CY5_001_illumcorrect.tiff,...,1059.576249,975.720272,79.432881,78.691527,79.371986,79.677796,254.487,254.277954,248.561145,245.343853


## Write dictionary to yaml file for use in downstream steps

In [7]:
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
with open(dictionary_path, "w") as file:
    yaml.dump(plate_info_dictionary, file)