# Merge single cells from CellProfiler outputs using CytoTable

## Import libraries

In [1]:
import sys
import pathlib
import yaml
import pprint
import pandas as pd

# cytotable will merge objects from SQLite file into single cells and save as parquet file
from cytotable import convert, presets

# import utility to use function that will add single-cell count per well as a metadata column
sys.path.append("../utils")
import extraction_utils as sc_utils

## Set paths and variables

In [2]:
# type of file output from CytoTable (currently only parquet)
dest_datatype = "parquet"

# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"

# update preset to include site metadata and cell counts
joins = presets.config["cellprofiler_sqlite_pycytominer"]["CONFIG_JOINS"].replace(
    "Image_Metadata_Well,",
    "Image_Metadata_Well, Image_Metadata_Site,",
)

# set main output dir for all parquet files
output_dir = pathlib.Path("./data/converted_data/")
output_dir.mkdir(exist_ok=True)

# directory where SQLite files are located
sqlite_dir = pathlib.Path("../2.cellprofiler_analysis/analysis_output/")

# list for plate names based on folders to use to create dictionary
plate_names = []

# iterate through 0.download_data and append plate names from folder names
# that contain image data from that plate
# (Note, you must first run `0.download_data/download_plates.ipynb`)
for file_path in pathlib.Path("../0.download_data/").iterdir():
    if str(file_path.stem).startswith("Plate"):
        plate_names.append(str(file_path.stem))

print(plate_names)

['Plate_4', 'Plate_2', 'Plate_3_prime', 'Plate_6', 'Plate_5', 'Plate_1', 'Plate_3']


## Create dictionary with info for each plate

**Note:** All paths must be string to use with CytoTable.

In [3]:
# create plate info dictionary with all parts of the CellProfiler CLI command to run in parallel
plate_info_dictionary = {
    name: {
        "source_path": str(
            pathlib.Path(
                list(sqlite_dir.rglob(f"{name}_nf1_analysis.sqlite"))[0]
            ).resolve(strict=True)
        ),
        "dest_path": str(pathlib.Path(f"{output_dir}/{name}.parquet")),
    }
    for name in plate_names
    if not pathlib.Path(
        f"{output_dir}/{name}.parquet"
    ).exists()  # skip if parquet file exists
}

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'Plate_6': {   'dest_path': 'data/converted_data/Plate_6.parquet',
                   'source_path': '/media/18tbdrive/1.Github_Repositories/nf1_schwann_cell_painting_data/2.cellprofiler_analysis/analysis_output/Plate_6/Plate_6_nf1_analysis.sqlite'}}


## Merge objects to single cells and convert SQLite to parquet file + add single cell metadata

In [4]:
# run through each run with each set of paths based on dictionary
for plate, info in plate_info_dictionary.items():
    source_path = info["source_path"]
    dest_path = info["dest_path"]

    print(f"Performing merge single cells and conversion on {plate}!")

    # merge single cells and output as parquet file
    convert(
        source_path=source_path,
        dest_path=dest_path,
        dest_datatype=dest_datatype,
        preset=preset,
        joins=joins,
    )
    print(f"Merged and converted {pathlib.Path(dest_path).name}!")

    # add single cell count per well as metadata column to parquet file and save back to same path
    sc_utils.add_sc_count_metadata_file(
        data_path=dest_path, well_column_name="Image_Metadata_Well", file_type="parquet"
    )

    print(f"Added single cell count as metadata to {pathlib.Path(dest_path).name}!")

Performing merge single cells and conversion on Plate_6!
Merged and converted Plate_6.parquet!
Added single cell count as metadata to Plate_6.parquet!


## Update the files to remove NA rows added as artifacts of CytoTable

In [5]:
for plate, info in plate_info_dictionary.items():
    file_path = pathlib.Path(info["dest_path"])

    # Load the DataFrame from the Parquet file
    df = pd.read_parquet(file_path)

    # Update Image_Metadata_Plate column if plate is Plate_6
    # TODO: Remove this once the metadata is fixed in the CellProfiler pipeline
    if plate == "Plate_6":
        df["Image_Metadata_Plate"] = "Plate_6"

    # Check for NaNs in "Metadata_ImageNumber" column
    if df["Metadata_ImageNumber"].isna().any():
        print(f"NaNs found in 'Metadata_ImageNumber' column for {plate}")
        # If any, drop rows where "Metadata_ImageNumber" is NaN (artifact of cytotable)
        df = df.dropna(subset=["Metadata_ImageNumber"])
    else:
        print(f"No NaNs found in 'Metadata_ImageNumber' column for {plate}")

    # Columns to rename with Metadata prefix
    columns_to_rename = [
        "Nuclei_Location_Center_X",
        "Nuclei_Location_Center_Y",
        "Cells_Location_Center_X",
        "Cells_Location_Center_Y",
    ] + [col for col in df.columns if col.startswith("Image_FileName_")]

    # Rename columns with "Metadata_" prefix
    df = df.rename(
        columns=lambda col: "Metadata_" + col if col in columns_to_rename else col
    )

    # Move all columns that start with "Image_" or "Metadata_" to the front
    metadata_columns = [col for col in df.columns if col.startswith("Image_") or col.startswith("Metadata_")]
    other_columns = [col for col in df.columns if col not in metadata_columns]
    df = df[metadata_columns + other_columns]

    # Save the processed DataFrame as Parquet in the same path
    df.to_parquet(file_path, index=False)

No NaNs found in 'Metadata_ImageNumber' column for Plate_6


### Check if converted data looks correct

In [6]:
# Automatically select one plate from the current dictionary
selected_plate = next(iter(plate_info_dictionary))
print(f"Selected plate: {selected_plate}")

# Load the DataFrame from the Parquet file of the selected plate
converted_df = pd.read_parquet(plate_info_dictionary[selected_plate]["dest_path"])

# Print the shape and head of the DataFrame
print(converted_df.shape)
converted_df.head()

Selected plate: Plate_6
(7383, 2317)


Unnamed: 0,Metadata_ImageNumber,Image_Metadata_Plate,Metadata_number_of_singlecells,Image_Metadata_Site,Image_Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Nuclei_Number_Object_Number,Metadata_Image_FileName_CY5,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,2,Plate_6,124,11,B10,1,1,1,1,B10_01_3_11_CY5_001_illumcorrect.tiff,...,248.758586,245.22973,36.001012,36.064029,40.51267,36.898446,321.90949,330.231591,313.973255,310.467536
1,3,Plate_6,124,12,B10,1,1,1,1,B10_01_3_12_CY5_001_illumcorrect.tiff,...,1328.555897,1308.333437,183.548209,222.173254,192.569512,176.849752,264.202176,259.868814,261.484819,263.934095
2,7,Plate_6,124,16,B10,1,1,1,1,B10_01_3_16_CY5_001_illumcorrect.tiff,...,1484.411027,1425.21442,416.393911,416.781146,462.623,401.407614,416.022241,413.578555,418.742383,415.707715
3,16,Plate_6,124,26,B10,1,1,1,1,B10_01_3_26_CY5_001_illumcorrect.tiff,...,3267.665061,3096.539506,3858.123818,3832.541827,3840.092495,3816.091328,1875.794483,1870.784173,1918.501351,1877.024963
4,26,Plate_6,124,3,B10,1,1,1,1,B10_01_3_3_CY5_001_illumcorrect.tiff,...,694.924763,706.675431,130.956197,118.199854,129.221522,134.114864,183.456251,180.68815,184.538725,182.622372


## Write dictionary to yaml file for use in downstream steps

In [7]:
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
with open(dictionary_path, "w") as file:
    yaml.dump(plate_info_dictionary, file)