# Merge single cells from CellProfiler outputs using CytoTable

## Import libraries

In [1]:
import sys
import pathlib
import yaml
import pprint
import pandas as pd

# cytotable will merge objects from SQLite file into single cells and save as parquet file
from cytotable import convert, presets

# import utility to use function that will add single-cell count per well as a metadata column
sys.path.append("../utils")
import extraction_utils as sc_utils

## Set paths and variables

In [2]:
# type of file output from CytoTable (currently only parquet)
dest_datatype = "parquet"

# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"

# update preset to include site metadata and cell counts
joins = presets.config["cellprofiler_sqlite_pycytominer"]["CONFIG_JOINS"].replace(
    "Image_Metadata_Well,",
    "Image_Metadata_Well, Image_Metadata_Site,",
)

# set main output dir for all parquet files
output_dir = pathlib.Path("./data/converted_data/")
output_dir.mkdir(exist_ok=True)

# directory where SQLite files are located
sqlite_dir = pathlib.Path("../2.cellprofiler_analysis/analysis_output/")

# list for plate names based on folders to use to create dictionary
plate_names = []

# iterate through 0.download_data and append plate names from folder names
# that contain image data from that plate
# (Note, you must first run `0.download_data/download_plates.ipynb`)
for file_path in pathlib.Path("../0.download_data/").iterdir():
    if str(file_path.stem).startswith("Plate"):
        plate_names.append(str(file_path.stem))
        
print(plate_names)

['Plate_4', 'Plate_1', 'Plate_3', 'Plate_2', 'Plate_5', 'Plate_3_prime']


## Create dictionary with info for each plate

**Note:** All paths must be string to use with CytoTable.

In [3]:
# create plate info dictionary with all parts of the CellProfiler CLI command to run in parallel
plate_info_dictionary = {
    name: {
        "source_path": str(pathlib.Path(
            list(sqlite_dir.rglob(f"{name}_nf1_analysis.sqlite"))[0]
        ).resolve(strict=True)),
        "dest_path": str(pathlib.Path(f"{output_dir}/{name}.parquet")),
    }
    for name in plate_names if name=="Plate_5" # focus on plate 5
}

# view the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'Plate_5': {   'dest_path': 'data/converted_data/Plate_5.parquet',
                   'source_path': '/home/jenna/nf1_cellpainting_data/2.cellprofiler_analysis/analysis_output/Plate_5/Plate_5_nf1_analysis.sqlite'}}


## Merge objects to single cells and convert SQLite to parquet file + add single cell metadata

In [4]:
# run through each run with each set of paths based on dictionary
for plate, info in plate_info_dictionary.items():
    source_path = info["source_path"]
    dest_path = info["dest_path"]
    
    print(f"Performing merge single cells and conversion on {plate}!")

    # merge single cells and output as parquet file
    convert(
        source_path=source_path,
        dest_path=dest_path,
        dest_datatype=dest_datatype,
        preset=preset,
        joins=joins,
    )
    print(f"Merged and converted {pathlib.Path(dest_path).name}!")

    # add single cell count per well as metadata column to parquet file and save back to same path
    sc_utils.add_sc_count_metadata_file(
        data_path=dest_path, well_column_name="Image_Metadata_Well", file_type="parquet"
    )
    
    print(f"Added single cell count as metadata to {pathlib.Path(dest_path).name}!")

Performing merge single cells and conversion on Plate_5!
Merged and converted Plate_5.parquet!
Added single cell count as metadata to Plate_5.parquet!


### Check if converted data looks correct

In [5]:
converted_df = pd.read_parquet(plate_info_dictionary["Plate_5"]["dest_path"])

print(converted_df.shape)
converted_df.head()

(8110, 2313)


Unnamed: 0,Metadata_ImageNumber,Image_Metadata_Plate,Metadata_number_of_singlecells,Image_Metadata_Site,Image_Metadata_Well,Metadata_Cells_Number_Object_Number,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Nuclei_Number_Object_Number,Cytoplasm_AreaShape_Area,...,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,1.0,Plate_5,155,10,B10,2.0,2.0,5.0,5.0,12763.0,...,525.6018,509.244793,308.397866,252.279696,253.409204,259.079831,240.377519,230.940172,222.310125,223.219007
1,1.0,Plate_5,155,10,B10,5.0,5.0,8.0,8.0,17757.0,...,1258.575123,1229.316284,97.734734,93.697297,89.753958,95.378803,153.527549,157.166351,150.835644,144.821095
2,1.0,Plate_5,155,10,B10,6.0,6.0,10.0,10.0,19239.0,...,714.894588,686.388996,89.261052,82.305568,84.354155,79.063655,172.147203,176.700007,173.560984,166.67877
3,1.0,Plate_5,155,10,B10,9.0,9.0,13.0,13.0,9650.0,...,342.062745,342.639164,351.243622,336.968977,333.950037,352.265877,97.741996,95.961809,95.940242,97.952776
4,2.0,Plate_5,155,11,B10,2.0,2.0,5.0,5.0,21363.0,...,737.851028,713.088598,80.762441,88.167968,77.893016,76.807025,84.503041,81.467462,83.746863,83.686748


## Write dictionary to yaml file for use in downstream steps

In [6]:
dictionary_path = pathlib.Path("./plate_info_dictionary.yaml")
with open(dictionary_path, 'w') as file:
    yaml.dump(plate_info_dictionary, file)