# Merge single cells from CellProfiler outputs using CytoTable

## Import libraries

In [1]:
import sys
import pathlib
import yaml
import pprint
import pandas as pd

# cytotable will merge objects from SQLite file into single cells and save as parquet file
from cytotable import convert, presets

# import utility to use function that will add single-cell count per well as a metadata column
sys.path.append("../utils")
import extraction_utils as sc_utils

## Set paths and variables

In [2]:
# type of file output from CytoTable (currently only parquet)
dest_datatype = "parquet"

# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"

# update preset to include site metadata and cell counts
joins = presets.config["cellprofiler_sqlite_pycytominer"]["CONFIG_JOINS"].replace(
    "Image_Metadata_Well,",
    "Image_Metadata_Well, Image_Metadata_Site,",
)

# set main output dir for all parquet files
output_dir = pathlib.Path("./data/converted_data/")
output_dir.mkdir(exist_ok=True)

# directory where SQLite files are located
sqlite_dir = pathlib.Path("../2.cellprofiler_analysis/analysis_output/")

# list for plate names based on folders to use to create dictionary
plate_names = []

# iterate through 0.download_data and append plate names from folder names
# that contain image data from that plate
# (Note, you must first run `0.download_data/download_plates.ipynb`)
for file_path in pathlib.Path("../0.download_data/").iterdir():
    if str(file_path.stem).startswith("Plate"):
        plate_names.append(str(file_path.stem))
        
print(plate_names)

['Plate_4', 'Plate_1', 'Plate_3', 'Plate_2', 'Plate_5', 'Plate_3_prime']


## Create dictionary with info for each plate

**Note:** All paths must be string to use with CytoTable.

In [3]:
# create plate info dictionary with all parts of the CellProfiler CLI command to run in parallel
plate_info_dictionary = {
    name: {
        "source_path": str(pathlib.Path(
            list(sqlite_dir.rglob(f"{name}_nf1_analysis.sqlite"))[0]
        ).resolve(strict=True)),
        "dest_path": str(pathlib.Path(f"{output_dir}/{name}.parquet")),
    }
    for name in plate_names if not name in ["Plate_1", "Plate_2"]  # focus on non-pilot plates
}

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'Plate_3': {   'dest_path': 'data/converted_data/Plate_3.parquet',
                   'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3/Plate_3_nf1_analysis.sqlite'},
    'Plate_3_prime': {   'dest_path': 'data/converted_data/Plate_3_prime.parquet',
                         'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_3_prime/Plate_3_prime_nf1_analysis.sqlite'},
    'Plate_4': {   'dest_path': 'data/converted_data/Plate_4.parquet',
                   'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_4/Plate_4_nf1_analysis.sqlite'},
    'Plate_5': {   'dest_path': 'data/converted_data/Plate_5.parquet',
                   'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_5/Plate_5_nf1_analysis.sqlite'}}


## Merge objects to single cells and convert SQLite to parquet file + add single cell metadata

In [4]:
# run through each run with each set of paths based on dictionary
for plate, info in plate_info_dictionary.items():
    source_path = info["source_path"]
    dest_path = info["dest_path"]
    
    print(f"Performing merge single cells and conversion on {plate}!")

    # merge single cells and output as parquet file
    convert(
        source_path=source_path,
        dest_path=dest_path,
        dest_datatype=dest_datatype,
        preset=preset,
        joins=joins,
    )
    print(f"Merged and converted {pathlib.Path(dest_path).name}!")

    # add single cell count per well as metadata column to parquet file and save back to same path
    sc_utils.add_sc_count_metadata_file(
        data_path=dest_path, well_column_name="Image_Metadata_Well", file_type="parquet"
    )
    
    print(f"Added single cell count as metadata to {pathlib.Path(dest_path).name}!")

Performing merge single cells and conversion on Plate_4!
Merged and converted Plate_4.parquet!


Reusing previously loaded Parsl configuration.


Added single cell count as metadata to Plate_4.parquet!
Performing merge single cells and conversion on Plate_3!
Merged and converted Plate_3.parquet!


Reusing previously loaded Parsl configuration.


Added single cell count as metadata to Plate_3.parquet!
Performing merge single cells and conversion on Plate_5!
Merged and converted Plate_5.parquet!


Reusing previously loaded Parsl configuration.


Added single cell count as metadata to Plate_5.parquet!
Performing merge single cells and conversion on Plate_3_prime!
Merged and converted Plate_3_prime.parquet!
Added single cell count as metadata to Plate_3_prime.parquet!


## Update the files to remove NA rows added as artifacts of CytoTable

In [5]:
for file_path in output_dir.iterdir():
    if file_path.stem not in ["Plate_1", "Plate_2"]:
        # Load the DataFrame from the Parquet file
        df = pd.read_parquet(file_path)

        # If any, drop rows where "Metadata_ImageNumber" is NaN (artifact of cytotable)
        df = df.dropna(subset=["Metadata_ImageNumber"])

        # Columns to move to the front
        columns_to_move = ['Nuclei_Location_Center_X', 'Nuclei_Location_Center_Y', 'Cells_Location_Center_X', 'Cells_Location_Center_Y']

        # Rearrange columns and add "Metadata" prefix in one line
        df = (df[columns_to_move + [col for col in df.columns if col not in columns_to_move]]
                    .rename(columns=lambda col: 'Metadata_' + col if col in columns_to_move else col))

        # Save the processed DataFrame as Parquet in the same path
        df.to_parquet(file_path, index=False)

### Check if converted data looks correct

In [6]:
converted_df = pd.read_parquet(plate_info_dictionary["Plate_4"]["dest_path"])

print(converted_df.shape)
converted_df.head()

(7308, 2313)


Unnamed: 0,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_ImageNumber,Image_Metadata_Plate,Metadata_number_of_singlecells,Image_Metadata_Site,Image_Metadata_Well,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,993.576031,631.322487,1001.623945,583.874355,1.0,Plate_4,83,10,B10,4.0,...,1281.874186,1257.435761,65.965695,52.068222,50.44578,51.851812,425.319446,409.351012,418.021018,425.06852
1,851.475825,698.344141,854.813596,712.334404,1.0,Plate_4,83,10,B10,5.0,...,1085.75046,1113.144205,139.037112,140.802921,141.819546,149.091779,512.879573,499.756267,513.44706,507.419635
2,592.680236,384.974721,595.042565,410.149194,2.0,Plate_4,83,11,B10,2.0,...,633.124457,642.170387,190.690537,173.126428,170.503677,178.200219,401.039364,412.623493,420.041994,402.738604
3,1046.634989,478.314255,1058.060509,512.748667,2.0,Plate_4,83,11,B10,3.0,...,894.732816,829.273862,142.997128,131.232052,126.981214,128.412295,357.660331,351.831903,351.38658,357.795596
4,723.749813,597.963351,735.217695,601.070032,2.0,Plate_4,83,11,B10,4.0,...,2054.657442,2025.967932,2151.383916,1980.111635,2106.078983,2153.802496,1115.200657,1076.123098,1048.7071,1022.242934


## Write dictionary to yaml file for use in downstream steps

In [7]:
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
with open(dictionary_path, 'w') as file:
    yaml.dump(plate_info_dictionary, file)