# Merge single cells from CellProfiler outputs using CytoTable

## Import libraries

In [1]:
import sys
import pathlib
import yaml
import pprint
import pandas as pd

# cytotable will merge objects from SQLite file into single cells and save as parquet file
from cytotable import convert, presets

# import utility to use function that will add single-cell count per well as a metadata column
sys.path.append("../utils")
import extraction_utils as sc_utils

## Set paths and variables

In [2]:
# Pilot plates, kept as a list so they can be skipped
# NOTE(review): this list is not referenced by any later cell shown in this notebook — confirm it is still needed
pilot_plates = ["Plate_1", "Plate_2"]

# type of file output from CytoTable (currently only parquet)
dest_datatype = "parquet"

# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"

# extend the preset's CONFIG_JOINS string so the site metadata column is selected
# alongside the well column; the preset is looked up through the `preset` variable
# so the joins always belong to the same preset that is passed to `convert`
joins = presets.config[preset]["CONFIG_JOINS"].replace(
    "Image_Metadata_Well,",
    "Image_Metadata_Well, Image_Metadata_Site,",
)

# set main output dir for all parquet files
output_dir = pathlib.Path("./data/converted_data/")
# parents=True: also create ./data when it does not exist yet (e.g. on a fresh checkout)
output_dir.mkdir(parents=True, exist_ok=True)

# directory where SQLite files are located
sqlite_dir = pathlib.Path("../2.cellprofiler_analysis/analysis_output/")

# collect plate names from the entries of 0.download_data whose name starts with "Plate"
# (Note, you must first run `0.download_data/download_plates.ipynb`)
# sorted so the list does not depend on the filesystem's iteration order
plate_names = sorted(
    str(file_path.stem)
    for file_path in pathlib.Path("../0.download_data/").iterdir()
    if str(file_path.stem).startswith("Plate")
)

print(plate_names)

['Plate_4', 'Plate_1', 'Plate_3', 'Plate_2', 'Plate_5', 'Plate_3_prime']


## Create dictionary with info for each plate

**Note:** All paths must be strings to use with CytoTable.

In [3]:
# plates to process in this run (focus on Plate_3_prime)
plates_to_process = ["Plate_3_prime"]


def _find_plate_sqlite(plate_name: str) -> str:
    """Return the resolved path (as a string) of a plate's CellProfiler SQLite file.

    Searches `sqlite_dir` recursively for `<plate_name>_nf1_analysis.sqlite`.
    When several files match, the first one in sorted order is used so the
    choice is deterministic.

    Raises:
        FileNotFoundError: if no matching SQLite file exists under `sqlite_dir`.
    """
    sqlite_name = f"{plate_name}_nf1_analysis.sqlite"
    matches = sorted(sqlite_dir.rglob(sqlite_name))
    if not matches:
        raise FileNotFoundError(
            f"No file named {sqlite_name} found under {sqlite_dir.resolve()}"
        )
    return str(matches[0].resolve(strict=True))


# create plate info dictionary with the source (SQLite) and destination (parquet)
# paths that CytoTable needs for each plate; all paths are stored as strings
plate_info_dictionary = {
    name: {
        "source_path": _find_plate_sqlite(name),
        "dest_path": str(output_dir / f"{name}.parquet"),
    }
    for name in plate_names
    if name in plates_to_process
}

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'Plate_3_prime': {   'dest_path': 'data/converted_data/Plate_3_prime.parquet',
                         'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'}}


## Merge objects to single cells and convert SQLite to parquet file + add single cell metadata

In [4]:
# process every plate in the dictionary: convert its SQLite file, then annotate the result
for plate, plate_paths in plate_info_dictionary.items():
    sqlite_file, parquet_file = plate_paths["source_path"], plate_paths["dest_path"]
    # file name only, used in the progress messages below
    parquet_name = pathlib.Path(parquet_file).name

    print(f"Performing merge single cells and conversion on {plate}!")

    # CytoTable merges the objects into single cells and writes the parquet file
    convert(
        source_path=sqlite_file,
        dest_path=parquet_file,
        dest_datatype=dest_datatype,
        preset=preset,
        joins=joins,
    )
    print(f"Merged and converted {parquet_name}!")

    # add the per-well single cell count as a metadata column; the helper is given
    # the same path that was just written
    sc_utils.add_sc_count_metadata_file(
        data_path=parquet_file,
        well_column_name="Image_Metadata_Well",
        file_type="parquet",
    )

    print(f"Added single cell count as metadata to {parquet_name}!")

Performing merge single cells and conversion on Plate_3_prime!
Merged and converted Plate_3_prime.parquet!
Added single cell count as metadata to Plate_3_prime.parquet!


## Update the files to remove NA rows added as artifacts of CytoTable

In [5]:
# location columns to move to the front and tag with the "Metadata_" prefix
location_columns = [
    "Nuclei_Location_Center_X",
    "Nuclei_Location_Center_Y",
    "Cells_Location_Center_X",
    "Cells_Location_Center_Y",
]
# raw column name -> "Metadata_"-prefixed column name
prefixed_names = {col: "Metadata_" + col for col in location_columns}

# work on exactly the parquet files listed in the plate dictionary, rather than
# matching on file stems inside the output directory
for plate, info in plate_info_dictionary.items():
    parquet_path = pathlib.Path(info["dest_path"])

    # Load the DataFrame from the Parquet file
    plate_df = pd.read_parquet(parquet_path)

    # If any, drop rows where "Metadata_ImageNumber" is NaN (artifact of cytotable)
    plate_df = plate_df.dropna(subset=["Metadata_ImageNumber"])

    # Each location column must be present either raw (first run) or already
    # prefixed (this cell was re-run on a file it had already processed)
    absent = [
        raw
        for raw, prefixed in prefixed_names.items()
        if raw not in plate_df.columns and prefixed not in plate_df.columns
    ]
    if absent:
        raise KeyError(
            f"{parquet_path.name} is missing expected location columns: {absent}"
        )

    # rename ignores keys that are not present, so this is a no-op on a re-run
    plate_df = plate_df.rename(columns=prefixed_names)

    # move the prefixed location columns to the front, keeping the other columns in order
    front_columns = list(prefixed_names.values())
    plate_df = plate_df[
        front_columns + [col for col in plate_df.columns if col not in front_columns]
    ]

    # Save the processed DataFrame as Parquet in the same path
    plate_df.to_parquet(parquet_path, index=False)

### Check if converted data looks correct

In [6]:
# reload the processed parquet file for Plate_3_prime from the path stored in the dictionary
plate_3_prime_parquet = plate_info_dictionary["Plate_3_prime"]["dest_path"]
converted_df = pd.read_parquet(plate_3_prime_parquet)

# report (rows, columns), then display the first rows for a visual check
print(converted_df.shape)
converted_df.head()

(7300, 2313)


Unnamed: 0,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_ImageNumber,Image_Metadata_Plate,Metadata_number_of_singlecells,Image_Metadata_Site,Image_Metadata_Well,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,630.039298,194.277705,672.953278,195.334634,1.0,Plate_3_prime,49,10,B10,2.0,...,1412.627307,1404.933055,149.716689,150.886582,154.244496,151.510768,342.967657,338.556766,341.530361,330.824752
1,271.331928,823.547418,278.99483,810.260558,1.0,Plate_3_prime,49,10,B10,4.0,...,1356.919019,1324.314619,113.719984,109.054486,124.799321,117.871515,288.029402,274.018166,275.104416,270.291488
2,931.918006,131.105817,971.745661,125.590698,2.0,Plate_3_prime,49,11,B10,1.0,...,3026.827811,2949.357004,496.145914,559.053617,587.016082,489.578883,1410.009546,1418.808846,1367.820689,1263.050543
3,854.100219,147.853779,781.667548,166.423687,2.0,Plate_3_prime,49,11,B10,2.0,...,2861.810145,2816.424948,378.68961,446.540605,418.564662,374.514184,868.289657,844.368547,896.472545,875.062262
4,1084.351386,330.845583,1076.326717,313.344874,2.0,Plate_3_prime,49,11,B10,3.0,...,1271.606076,1292.4147,255.795051,248.275737,242.725696,235.926697,872.258402,854.590638,836.385298,845.351825


## Write dictionary to yaml file for use in downstream steps

In [7]:
# the plate dictionary (source and destination paths per plate) is saved next to this notebook
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")

# serialize the dictionary as YAML directly into the opened file
with dictionary_path.open("w") as yaml_file:
    yaml.dump(plate_info_dictionary, yaml_file)