# Convert SQLite output(s) to parquet files with CytoTable

## Import libraries

In [1]:
import argparse
import logging
import pathlib

import pandas as pd

# cytotable will merge objects from SQLite file into single cells and save as parquet file
from cytotable import convert, presets

# Only surface ERROR-level (and above) records on the root logger so that the
# config-file noise emitted by the convert function stays out of the output.
logging.getLogger().setLevel(logging.ERROR)

# `get_ipython` is only defined when running under IPython/Jupyter; a NameError
# here therefore means the code is being executed as a plain script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

In [2]:
# Resolve which patient to process: from the command line when run as a script,
# or a fixed default when run interactively in a notebook.
if in_notebook:
    print("Running in a notebook")
    patient = "NF0014"
else:
    print("Running as script")
    # set up arg parser
    parser = argparse.ArgumentParser(
        description="Convert CellProfiler SQLite output(s) to parquet files with CytoTable"
    )

    # Required: without it `patient` would be None and every path below would
    # silently point at a "None" directory.
    parser.add_argument(
        "--patient",
        type=str,
        required=True,
        help="Patient ID",
    )

    args = parser.parse_args()
    patient = args.patient

# Input directories holding the CellProfiler SQLite outputs for this patient.
# strict=True raises FileNotFoundError immediately if a directory is missing.
middle_slice_input = pathlib.Path(
    f"../../data/{patient}/cellprofiler_middle_slice_output/"
).resolve(strict=True)
max_projected_input = pathlib.Path(
    f"../../data/{patient}/cellprofiler_zmax_proj_output/"
).resolve(strict=True)

Running in a notebook


## Set paths and variables

In [3]:
# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"

# update preset to include site metadata and cell counts
default_joins = presets.config[preset]["CONFIG_JOINS"]
well_column_anchor = "Image_Metadata_Well,"
if well_column_anchor not in default_joins:
    # str.replace would silently do nothing and the extra columns that later
    # cells rely on (e.g. Image_Count_Cells) would be missing from the output.
    raise ValueError(
        f"Could not find '{well_column_anchor}' in the CONFIG_JOINS of preset "
        f"'{preset}'; the join statement cannot be extended."
    )
joins = default_joins.replace(
    well_column_anchor,
    "Image_Metadata_Well, Image_Metadata_Site, Image_Count_Cells,",
)

# type of file output from cytotable (currently only parquet)
dest_datatype = "parquet"

# directory for processed data
output_dir = pathlib.Path(f"../../data/{patient}/0.converted/").resolve()
output_dir.mkdir(parents=True, exist_ok=True)

# batch_names = []

# for file_path in sqlite_dir.iterdir():
#     batch_names.append(file_path.stem)

# # print the plate names and how many plates there are (confirmation)
# print(f"There are {len(batch_names)} plate(s) in this dataset.")
# Map featurization type -> well_fov -> {"image_path": SQLite file,
# "output_dir": destination parquet file} for every SQLite file found.
well_fov_dict = {}
for sqlite_dir in [middle_slice_input, max_projected_input]:
    # e.g. "cellprofiler_middle_slice_output" -> "middle_slice"
    featurization_type = sqlite_dir.name.split("_out")[0].split("cellprofiler_")[1]
    well_fov_dict[featurization_type] = {}

    # Both input directories contain the same well_fov names, so each
    # featurization type gets its own sub-directory; otherwise the second
    # conversion would overwrite the parquet file written by the first.
    featurization_output_dir = output_dir / featurization_type
    featurization_output_dir.mkdir(parents=True, exist_ok=True)

    # sort to ensure consistent order
    sqlites = sorted(sqlite_dir.rglob("*sqlite"))
    for file_path in sqlites:
        # the well_fov is taken from the directory that holds the SQLite file
        well_fov = file_path.parent.stem
        well_fov_dict[featurization_type][well_fov] = {
            "image_path": file_path,
            "output_dir": featurization_output_dir / f"{well_fov}.parquet",
        }

In [4]:
# Show the unmodified join statement shipped with the selected preset
presets.config[preset]["CONFIG_JOINS"]

"\n            SELECT\n                per_image.Metadata_ImageNumber,\n                per_image.Image_Metadata_Well,\n                per_image.Image_Metadata_Plate,\n                COLUMNS('Image_FileName_.*'),\n                per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),\n                per_cells.* EXCLUDE (Metadata_ImageNumber),\n                per_nuclei.* EXCLUDE (Metadata_ImageNumber)\n            FROM\n                read_parquet('per_cytoplasm.parquet') AS per_cytoplasm\n            LEFT JOIN read_parquet('per_cells.parquet') AS per_cells USING (Metadata_ImageNumber)\n            LEFT JOIN read_parquet('per_nuclei.parquet') AS per_nuclei USING (Metadata_ImageNumber)\n            LEFT JOIN read_parquet('per_image.parquet') AS per_image USING (Metadata_ImageNumber)\n            WHERE\n                per_cells.Metadata_Cells_Number_Object_Number = per_cytoplasm.Metadata_Cytoplasm_Parent_Cells\n                AND per_nuclei.Metadata_Nuclei_Number_Object_Number = per_cyto

In [6]:
import uuid

from parsl.config import Config
from parsl.executors import HighThroughputExecutor

In [7]:
# Convert a single well_fov on its own (useful for debugging one file).
# Paths are derived from the patient-specific directories defined earlier
# instead of being hard-coded to one patient.
debug_well_fov = "C11-2"
df = convert(
    str(middle_slice_input / debug_well_fov / "gff_extracted_features.sqlite"),
    preset=preset,
    joins=joins,
    chunk_size=500,
    dest_datatype=dest_datatype,
    parsl_config=Config(
        executors=[HighThroughputExecutor()],
        # unique run directory so repeated runs do not share Parsl run info
        run_dir=f"cytotable_runinfo_{uuid.uuid4().hex}",
    ),
    dest_path=str(output_dir / f"{debug_well_fov}.parquet"),
)

IndexError: list index out of range

## Convert SQLite to parquet file(s) for single-cell profiles

In [None]:
# One entry per featurization type, each starting with an empty list that the
# conversion loop fills in.
output_dict_of_dfs = {}
for sqlite_dir in (middle_slice_input, max_projected_input):
    # e.g. "cellprofiler_middle_slice_output" -> "middle_slice"
    featurization_type = sqlite_dir.name.split("_out")[0].split("cellprofiler_")[1]
    output_dict_of_dfs[featurization_type] = {"df_list": []}
output_dict_of_dfs

{'middle_slice': {'df_list': []}, 'zmax_proj': {'df_list': []}}

In [None]:
# Convert every SQLite file, grouped by featurization type. A failure on one
# file is reported and the loop moves on to the next file.
for featurization_type, wells in well_fov_dict.items():
    for well_fov, file_info in wells.items():
        sqlite_file = file_info["image_path"]
        print(f"Processing {sqlite_file} for {featurization_type} and {well_fov}")
        try:
            df = convert(
                sqlite_file,
                preset=preset,
                joins=joins,
                chunk_size=1,
                dest_datatype=dest_datatype,
                # "output_dir" holds the destination parquet file for this well_fov
                dest_path=file_info["output_dir"],
            )
            output_dict_of_dfs[featurization_type]["df_list"].append(df)
        except Exception as e:
            print(f"Error processing {sqlite_file}: {e}")
            continue

Processing /home/lippincm/4TB_A/NF1_2D_organoid_profiling_pipeline/data/NF0014/cellprofiler_middle_slice_output/C10-1/gff_extracted_features.sqlite for middle_slice and C10-1
Processing /home/lippincm/4TB_A/NF1_2D_organoid_profiling_pipeline/data/NF0014/cellprofiler_middle_slice_output/C10-2/gff_extracted_features.sqlite for middle_slice and C10-2
Processing /home/lippincm/4TB_A/NF1_2D_organoid_profiling_pipeline/data/NF0014/cellprofiler_middle_slice_output/C11-1/gff_extracted_features.sqlite for middle_slice and C11-1
Processing /home/lippincm/4TB_A/NF1_2D_organoid_profiling_pipeline/data/NF0014/cellprofiler_middle_slice_output/C11-2/gff_extracted_features.sqlite for middle_slice and C11-2


IndexError: list index out of range

In [None]:
# Concatenate all dataframes for each featurization type
for featurization_type, converted in output_dict_of_dfs.items():
    if isinstance(converted, pd.DataFrame):
        # Already concatenated by an earlier run of this cell; re-running must
        # not fail on the missing "df_list" key.
        continue

    # NOTE(review): CytoTable documents convert() as returning the path of the
    # joined parquet file rather than a DataFrame - confirm for the installed
    # version. Anything that is not already a DataFrame is loaded from disk.
    frames = [
        item if isinstance(item, pd.DataFrame) else pd.read_parquet(item)
        for item in converted["df_list"]
    ]

    # pd.concat raises on an empty list (e.g. when every conversion failed),
    # so fall back to an empty frame in that case.
    output_dict_of_dfs[featurization_type] = (
        pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    )
    # Define the list of columns to prioritize and prefix
    # prioritized_columns = [
    #     "Nuclei_Location_Center_X",
    #     "Nuclei_Location_Center_Y",
    #     "Cells_Location_Center_X",
    #     "Cells_Location_Center_Y",
    #     "Image_Count_Cells"
    # ]

    # for file_path in converted_dir.iterdir():
    #     # Load the DataFrame from the Parquet file
    #     df = pd.read_parquet(file_path)

    #     # If any, drop rows where "Metadata_ImageNumber" is NaN (artifact of cytotable)
    #     df = df.dropna(subset=["Metadata_ImageNumber"])

    #     # Rearrange columns and add "Metadata" prefix in one line
    #     df = df[
    #         prioritized_columns + [col for col in df.columns if col not in prioritized_columns]
    #     ].rename(
    #         columns=lambda col: "Metadata_" + col if col in prioritized_columns else col
    #     )
output_dict_of_dfs

In [None]:
# for file_path in sqlite_dir.iterdir():
#     output_path = pathlib.Path(
#         f"{output_dir}/converted_profiles/{file_path.stem}_sc_converted.parquet"
#     )
#     print("Starting conversion with cytotable for plate:", file_path.stem)
#     # Merge single cells and output as parquet file
#     convert(
#         source_path=str(file_path),
#         dest_path=str(output_path),
#         dest_datatype=dest_datatype,
#         preset=preset,
#         joins=joins,
#         chunk_size=500
#     )

# print("All plates have been converted with cytotable!")

Starting conversion with cytotable for plate: NF0014
All plates have been converted with cytotable!


## Load in converted profiles to update

In [None]:
# Directory with converted profiles
converted_dir = pathlib.Path(f"{output_dir}/converted_profiles")

# Define the list of columns to prioritize and prefix
prioritized_columns = [
    "Nuclei_Location_Center_X",
    "Nuclei_Location_Center_Y",
    "Cells_Location_Center_X",
    "Cells_Location_Center_Y",
    "Image_Count_Cells",
]

for file_path in converted_dir.iterdir():
    # Load the DataFrame from the Parquet file
    df = pd.read_parquet(file_path)

    # If any, drop rows where "Metadata_ImageNumber" is NaN (artifact of cytotable)
    df = df.dropna(subset=["Metadata_ImageNumber"])

    # Rearrange columns and add "Metadata" prefix in one line
    df = df[
        prioritized_columns
        + [col for col in df.columns if col not in prioritized_columns]
    ].rename(
        columns=lambda col: "Metadata_" + col if col in prioritized_columns else col
    )

    # Save the processed DataFrame as Parquet in the same path
    df.to_parquet(file_path, index=False)

## Check output to confirm process worked

To confirm the number of single cells is correct, please use any database browser software to see if the number of rows in the "Per_Cells" compartment matches the number of rows in the data frame.

In [None]:
# Load the first single-cell profile from the directory the converted profiles
# are written to. This replaces the undefined `plate_names` and the relative
# "./data" path, which did not match the output location.
sc_profile_paths = sorted(
    (output_dir / "converted_profiles").glob("*_sc_converted.parquet")
)
if not sc_profile_paths:
    raise FileNotFoundError(
        f"No '*_sc_converted.parquet' files found in {output_dir / 'converted_profiles'}"
    )
converted_df = pd.read_parquet(sc_profile_paths[0])

print(converted_df.shape)
converted_df.head()

(2142, 2910)


Unnamed: 0,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,Metadata_ImageNumber,Image_Metadata_Plate,Image_Metadata_Site,Image_Metadata_Well,Image_Metadata_ZSlice,...,Nuclei_Texture_Variance_ER_3_02_256,Nuclei_Texture_Variance_ER_3_03_256,Nuclei_Texture_Variance_Hoechst_3_00_256,Nuclei_Texture_Variance_Hoechst_3_01_256,Nuclei_Texture_Variance_Hoechst_3_02_256,Nuclei_Texture_Variance_Hoechst_3_03_256,Nuclei_Texture_Variance_Mito_3_00_256,Nuclei_Texture_Variance_Mito_3_01_256,Nuclei_Texture_Variance_Mito_3_02_256,Nuclei_Texture_Variance_Mito_3_03_256
0,881.879955,915.249151,897.356882,897.287549,1,45,NF0014,2,E10,ZS012,...,8.812134,8.467408,108.839056,107.088383,110.573107,104.59252,6.244812,6.511083,6.112579,6.104619
1,582.367521,1021.031674,583.273637,1065.691171,7,1,NF0014,1,C10,ZS044,...,3.08566,3.01623,53.810984,53.505842,52.039812,49.243901,109.565486,109.867762,109.500629,112.251717
2,543.574542,916.39475,570.123399,929.378641,9,2,NF0014,2,C10,ZS018,...,1.461975,1.549177,1186.463306,1215.197101,1182.212837,1157.999266,12.582731,12.280363,12.860938,13.047288
3,468.02568,679.002266,473.275746,734.014534,16,3,NF0014,1,C11,ZS030,...,1.184913,1.206024,1516.60756,1472.995128,1531.257012,1522.003404,10.054106,9.948455,10.055683,10.099761
4,1130.959659,811.376705,1140.627994,775.280979,18,6,NF0014,2,C2,ZS006,...,371.230006,374.10762,631.486059,624.97038,613.39891,606.612988,82.094469,82.24912,82.790418,83.836445


## Extract organoid only profiles

In [None]:
# Join statement pairing every organoid row with the image-level row that
# shares its Metadata_ImageNumber.
organoid_joins = """
        SELECT
            *
        FROM
            read_parquet('per_image.parquet') as per_image
        INNER JOIN read_parquet('per_organoids.parquet') AS per_organoids ON
            per_organoids.Metadata_ImageNumber = per_image.Metadata_ImageNumber
        """

# NOTE(review): `sqlite_dir` is not defined in this cell; it is whatever value
# an earlier `for sqlite_dir in ...` loop left behind - confirm this is the
# intended input directory.
for file_path in sqlite_dir.iterdir():
    output_path = pathlib.Path(
        f"{output_dir}/converted_profiles/{file_path.stem}_organoid_converted.parquet"
    )
    print("Starting conversion with cytotable for plate:", file_path.stem)

    # Extract the organoid compartment joined with image metadata and write it
    # out as a parquet file
    convert(
        source_path=str(file_path),
        dest_path=str(output_path),
        dest_datatype=dest_datatype,
        joins=organoid_joins,
        metadata=["image"],
        compartments=["organoids"],
        identifying_columns=["ImageNumber"],
        page_keys={
            "image": "ImageNumber",
            "organoids": "Organoids_Number_Object_Number",
            "join": "Organoids_Number_Object_Number",
        },
        chunk_size=10,
    )

print("All plates have been converted with cytotable!")

Starting conversion with cytotable for plate: NF0014
All plates have been converted with cytotable!


In [None]:
# Load the first organoid profile from the directory the organoid conversion
# writes to. This replaces the undefined `plate_names` and the relative
# "./data" path, which did not match the output location.
organoid_profile_paths = sorted(
    (output_dir / "converted_profiles").glob("*_organoid_converted.parquet")
)
if not organoid_profile_paths:
    raise FileNotFoundError(
        "No '*_organoid_converted.parquet' files found in "
        f"{output_dir / 'converted_profiles'}"
    )
converted_df = pd.read_parquet(organoid_profile_paths[0])

print(converted_df.shape)
converted_df.head()

(152, 1499)


Unnamed: 0,Metadata_ImageNumber,Image_ExecutionTime_02Metadata,Image_Metadata_Channel,Image_Metadata_FileLocation,Image_Metadata_Frame,Image_Metadata_Plate,Image_Metadata_Series,Image_Metadata_Site,Image_Metadata_Well,Image_Metadata_ZSlice,...,Organoids_Texture_Variance_ER_3_02_256,Organoids_Texture_Variance_ER_3_03_256,Organoids_Texture_Variance_Hoechst_3_00_256,Organoids_Texture_Variance_Hoechst_3_01_256,Organoids_Texture_Variance_Hoechst_3_02_256,Organoids_Texture_Variance_Hoechst_3_03_256,Organoids_Texture_Variance_Mito_3_00_256,Organoids_Texture_Variance_Mito_3_01_256,Organoids_Texture_Variance_Mito_3_02_256,Organoids_Texture_Variance_Mito_3_03_256
0,31,0.0,,,0,NF0014,0,1,D4,ZS058,...,1.783403,1.785666,507.320199,507.366403,507.537522,507.186298,9.442713,9.441585,9.441276,9.43941
1,11,0.0,,,0,NF0014,0,1,C5,ZS052,...,1.947012,1.942769,285.620945,285.249286,285.787073,285.224915,5.527802,5.517586,5.527372,5.499532
2,12,0.0,,,0,NF0014,0,2,C5,ZS030,...,7.812409,7.735139,585.425367,586.077236,585.362315,586.266941,81.750646,81.673493,81.765727,81.668038
3,13,0.0,,,0,NF0014,0,1,C6,ZS034,...,2.741727,2.767367,263.209426,266.067862,262.416556,262.829797,22.845596,22.953855,22.800617,22.943567
4,14,0.0,,,0,NF0014,0,2,C6,ZS054,...,1.265346,1.263935,329.987455,329.667528,329.939192,329.671474,27.272309,27.162311,27.285292,27.181438
