# Merge single cells from CellProfiler outputs using CytoTable

## Import libraries

In [1]:
# Modified Jenna's nf1_ic.ipynb file from the Cellpainting repo
# https://github.com/WayScience/nf1_cellpainting_data/blob/main/3.processing_features/0.merge_sc_cytotable.ipynb

import pprint
import sys

# cytotable will merge objects from SQLite file into single cells and save as parquet file
from cytotable import convert, presets
import pandas as pd
import pathlib
import yaml

# import utility to use function that will add single-cell count per well as a metadata column
sys.path.append("../../utils/")
import extraction_utils as sc_utils

## Set paths and variables

In [2]:
# type of file output from CytoTable (currently only parquet)
dest_datatype = "parquet"

# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"

# update the preset's CONFIG_JOINS string so the site metadata column is selected next to the well
# (looked up through `preset` so the joins always belong to the preset passed to convert)
joins = presets.config[preset]["CONFIG_JOINS"].replace(
    "Image_Metadata_Well,",
    "Image_Metadata_Well, Image_Metadata_Site,",
)

# set main output dir for all parquet files
output_dir = pathlib.Path("../outputs/converted_data/")
# parents=True so a fresh checkout without the ../outputs folder does not raise FileNotFoundError
output_dir.mkdir(parents=True, exist_ok=True)

# directory where SQLite files are located
sqlite_dir = pathlib.Path("../../2.cellprofiler_analysis/outputs/SQLites/")

# directory with the pipeline files whose names identify each run
pipeline_dir = pathlib.Path("../../2.cellprofiler_analysis/pipelines/IC_function_apply_pipelines")

# collect the stem of every file in the pipeline directory that starts with "pipeline_"
pipeline_names = [
    file_path.stem
    for file_path in pipeline_dir.iterdir()
    if file_path.stem.startswith("pipeline_")
]

print(pipeline_names)

['pipeline_2_IC', 'pipeline_1_IC']


## Create dictionary with info for each plate

**Note:** All paths must be strings to use with CytoTable.

In [3]:
# build the plate info dictionary: for every pipeline name, store the parquet file to write
# (dest_path) and the SQLite file to read (source_path); both are stored as strings
plate_info_dictionary = {
    name: {
        "dest_path": str(output_dir / f"{name}.parquet"),
        # reuse sqlite_dir instead of repeating the hard-coded SQLite folder
        "source_path": str(sqlite_dir / f"{name}.sqlite"),
    }
    for name in pipeline_names
}

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'pipeline_1_IC': {   'dest_path': '../outputs/converted_data/pipeline_1_IC.parquet',
                         'source_path': '../../2.cellprofiler_analysis/outputs/SQLites/pipeline_1_IC.sqlite'},
    'pipeline_2_IC': {   'dest_path': '../outputs/converted_data/pipeline_2_IC.parquet',
                         'source_path': '../../2.cellprofiler_analysis/outputs/SQLites/pipeline_2_IC.sqlite'}}


## Merge objects to single cells and convert SQLite to parquet file + add single cell metadata

In [4]:
# process every entry of the plate info dictionary: convert its SQLite file to parquet,
# then call the single-cell count utility on the parquet file that was written
for plate, plate_paths in plate_info_dictionary.items():
    sqlite_file, parquet_file = plate_paths["source_path"], plate_paths["dest_path"]
    # file name only (no directories), used in the progress messages below
    parquet_name = pathlib.Path(parquet_file).name

    print(f"Performing merge single cells and conversion on {plate}!")

    # merge single cells and output as parquet file
    convert(
        source_path=sqlite_file,
        dest_path=parquet_file,
        dest_datatype=dest_datatype,
        preset=preset,
        joins=joins,
    )
    print(f"Merged and converted {parquet_name}!")

    # add single cell count per well as metadata column to the parquet file (same path is passed back in)
    sc_utils.add_sc_count_metadata_file(
        data_path=parquet_file,
        well_column_name="Image_Metadata_Well",
        file_type="parquet",
    )

    print(f"Added single cell count as metadata and removed NAN ImageNumber rows to {parquet_name}!")

Performing merge single cells and conversion on pipeline_2_IC!
Merged and converted pipeline_2_IC.parquet!


Reusing previously loaded Parsl configuration.


Added single cell count as metadata and removed NAN ImageNumber rows to pipeline_2_IC.parquet!
Performing merge single cells and conversion on pipeline_1_IC!
Merged and converted pipeline_1_IC.parquet!
Added single cell count as metadata and removed NAN ImageNumber rows to pipeline_1_IC.parquet!


### Check if converted data looks correct

In [5]:
# load the parquet file written for pipeline_2_IC to inspect the converted output
example_parquet_path = plate_info_dictionary["pipeline_2_IC"]["dest_path"]
converted_df = pd.read_parquet(example_parquet_path)

# report the shape, then show the first rows as the cell output
print(converted_df.shape)
converted_df.head()

(23, 2481)


Unnamed: 0,Metadata_ImageNumber,Image_Metadata_Plate,Metadata_number_of_singlecells,Image_Metadata_Site,Image_Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Nuclei_Number_Object_Number,Cytoplasm_AreaShape_Area,...,Nuclei_Texture_Variance_DAPI_corrected_3_02_256,Nuclei_Texture_Variance_DAPI_corrected_3_03_256,Nuclei_Texture_Variance_GFP_corrected_3_00_256,Nuclei_Texture_Variance_GFP_corrected_3_01_256,Nuclei_Texture_Variance_GFP_corrected_3_02_256,Nuclei_Texture_Variance_GFP_corrected_3_03_256,Nuclei_Texture_Variance_RFP_correcteed_3_00_256,Nuclei_Texture_Variance_RFP_correcteed_3_01_256,Nuclei_Texture_Variance_RFP_correcteed_3_02_256,Nuclei_Texture_Variance_RFP_correcteed_3_03_256
0,2.0,Plate_3,6,2,B1,1.0,1.0,2.0,2.0,30110.0,...,1198.430369,1294.494956,240.688672,233.346807,219.733566,212.346865,397.418083,338.769971,382.593808,471.265107
1,2.0,Plate_3,6,2,B1,2.0,2.0,3.0,3.0,22132.0,...,1809.785919,1860.93215,186.68736,152.142095,145.528073,145.369445,113.220227,107.579641,107.373415,108.878103
2,3.0,Plate_3,6,3,B1,2.0,2.0,2.0,2.0,27963.0,...,1220.955114,1235.403337,373.586884,322.12923,341.651856,452.241851,62.183944,60.897613,61.645972,64.371103
3,3.0,Plate_3,6,3,B1,1.0,1.0,1.0,1.0,22323.0,...,1133.642833,1105.069444,183.865979,183.429892,203.985271,172.848928,169.92244,169.115257,185.677553,173.760943
4,,Plate_3,6,1,B1,,,,,,...,,,,,,,,,,


## Write dictionary to yaml file for use in downstream steps

In [6]:
# save the plate info dictionary as YAML in the outputs folder
dictionary_path = pathlib.Path("../outputs/plate_info_dictionary.yaml")
with dictionary_path.open("w") as yaml_file:
    yaml.dump(plate_info_dictionary, stream=yaml_file)