# Convert SQLite output files from CellProfiler into parquet files using CytoTable

## Import libraries

In [1]:
import logging
import pathlib

import pandas as pd

# cytotable will merge objects from SQLite file into single cells and save as parquet file
from cytotable import convert, presets

# Only let ERROR-and-above records through the root logger so the config-file
# messages emitted during the convert call do not clutter the notebook output
logging.root.setLevel(logging.ERROR)

## Set paths and variables

In [2]:
# type of file output for CytoTable
dest_datatype = "parquet"

# set main output dir for all parquet files (created if missing, including any parents)
output_dir = pathlib.Path("./data")
output_dir.mkdir(parents=True, exist_ok=True)

# directory where SQLite files are located
# (strict=True makes resolve() raise if the directory does not exist)
sqlite_dir = pathlib.Path("../3.cp_analysis/analysis_output").resolve(strict=True)

# Set converted parquet dir
parquet_dir = output_dir / "converted_profiles"
parquet_dir.mkdir(parents=True, exist_ok=True)

# directory with plate maps
platemap_dir = pathlib.Path("../0.download_data/metadata/platemaps")

# list for plate names based on metadata files to use to create dictionary
plate_names = []
# iterate through metadata dir and append plate names from metadata files
for file_path in platemap_dir.iterdir():
    # Skip sub-directories and hidden entries (e.g. ".DS_Store", ".ipynb_checkpoints");
    # otherwise they would be counted as plates with an empty or bogus name
    if not file_path.is_file() or file_path.name.startswith("."):
        continue
    # The plate name is the part of the file name before the first "."
    filename = file_path.stem
    first_index = filename.split(".")[0]
    plate_names.append(first_index)

# print the plate names and how many plates there are (confirmation)
print(f"There are {len(plate_names)} plates in this dataset. Below are the names:")
for name in plate_names:
    print(name)

There are 4 plates in this dataset. Below are the names:
slide2
slide1
slide4
slide3


## Run CytoTable convert to merge nuclei and image features into one parquet file per plate

In [3]:
# Walk through every entry of the directory holding the SQLite outputs
for plate_folder in sqlite_dir.iterdir():
    # Entries whose name is not a known plate (from the platemap files) are ignored
    if plate_folder.name in plate_names:
        plate = plate_folder.stem

        # Destination: <parquet_dir>/<plate>/<plate>_converted.parquet
        output_path = parquet_dir / plate / f"{plate}_converted.parquet"

        print("Starting conversion with cytotable for plate:", plate)

        # Join the per-image and per-nuclei tables on the image number and
        # write the merged result out as a parquet file
        convert(
            source_path=str(plate_folder),
            dest_path=str(output_path),
            dest_datatype=dest_datatype,
            metadata=["image"],
            compartments=["nuclei"],
            identifying_columns=["ImageNumber"],
            joins="""
            SELECT
                *
            FROM
                read_parquet('per_image.parquet') as per_image
            INNER JOIN read_parquet('per_nuclei.parquet') AS per_nuclei ON
                per_nuclei.Metadata_ImageNumber = per_image.Metadata_ImageNumber
            """,
            chunk_size=10000,
        )

        print("Conversion finished for plate:", plate)

Starting conversion with cytotable for plate: slide4
Conversion finished for plate: slide4
Starting conversion with cytotable for plate: slide2
Conversion finished for plate: slide2
Starting conversion with cytotable for plate: slide3
Conversion finished for plate: slide3
Starting conversion with cytotable for plate: slide1
Conversion finished for plate: slide1


## Remove unwanted image + metadata columns and split the bulk and single-cell data from the main parquet file

In [4]:
# path to unwanted image cols text file
unwanted_list_path = pathlib.Path("./unwanted_image_cols.txt")
# Load the list of columns to remove from the text file
# (one column name per line; blank lines are skipped so they never become an empty column name)
with open(unwanted_list_path, "r") as file:
    columns_to_remove = [line.strip() for line in file if line.strip()]

# Metadata columns kept alongside the nuclei features.
# The list is the same for every plate, so it is defined once outside the loop.
metadata_columns = [
    "Metadata_ImageNumber",
    "Image_Metadata_Plate",
    "Image_Metadata_Site",
    "Image_Metadata_Well",
    "Image_Count_Nuclei",
]

# Iterate through directory with converted outputs
for plate_folder in parquet_dir.iterdir():
    # Only process the files that are in the plate names list
    if plate_folder.name in plate_names:
        # Read in each file as data frame
        plate_df = pd.read_parquet(
            pathlib.Path(f"{plate_folder}/{plate_folder.stem}_converted.parquet")
        )
        print(
            "Starting to edit image and nuclei data frames for plate:",
            plate_folder.stem,
        )

        # Drop the specified columns (ignore error if a column isn't there)
        plate_df = plate_df.drop(columns=columns_to_remove, errors="ignore")

        # Fail early with a readable message if a required metadata column is absent
        # (e.g. because it was listed in the unwanted columns file); without this check
        # the selection below raises a less informative KeyError
        missing_metadata = [
            col for col in metadata_columns if col not in plate_df.columns
        ]
        if missing_metadata:
            raise KeyError(
                f"Plate {plate_folder.stem} is missing required metadata columns: {missing_metadata}"
            )

        # Create nuclei (single-cell) data frame: metadata columns + all Nuclei_ features
        nuclei_df = plate_df[
            metadata_columns
            + [col for col in plate_df.columns if col.startswith("Nuclei_")]
        ]

        # Create image (bulk) data frame: image number + all Image_ columns
        image_df = plate_df[
            ["Metadata_ImageNumber"]
            + [col for col in plate_df.columns if col.startswith("Image_")]
        ]
        # Drop duplicate images in the image data frame since each image will have the same values even if the row is repeated
        image_df = image_df.drop_duplicates(subset="Metadata_ImageNumber")

        # Save nuclei and image data frames to the same folder as the plate
        nuclei_df.to_parquet(f"{plate_folder}/per_nuclei.parquet", index=False)
        image_df.to_parquet(f"{plate_folder}/per_image.parquet", index=False)

        # Print the shape of both data frames to check that everything looks correct
        print("Shape of nuclei data frame", nuclei_df.shape)
        print("Shape of image data frame", image_df.shape)

Starting to edit image and nuclei data frames for plate: slide4
Shape of nuclei data frame (65351, 584)
Shape of image data frame (407, 889)
Starting to edit image and nuclei data frames for plate: slide2
Shape of nuclei data frame (53637, 584)
Shape of image data frame (288, 889)
Starting to edit image and nuclei data frames for plate: slide3
Shape of nuclei data frame (58106, 584)
Shape of image data frame (382, 889)
Starting to edit image and nuclei data frames for plate: slide1
Shape of nuclei data frame (71962, 584)
Shape of image data frame (325, 889)


In [8]:
# Preview the first 50 rows of nuclei_df; after the loop above this variable
# holds the data frame of the last plate that was processed
nuclei_df.head(n=50)

Unnamed: 0,Metadata_ImageNumber,Image_Metadata_Plate,Image_Metadata_Site,Image_Metadata_Well,Image_Count_Nuclei,Nuclei_AreaShape_Area,Nuclei_AreaShape_BoundingBoxArea,Nuclei_AreaShape_BoundingBoxMaximum_X,Nuclei_AreaShape_BoundingBoxMaximum_Y,Nuclei_AreaShape_BoundingBoxMinimum_X,...,Nuclei_Texture_Variance_A647_3_02_256,Nuclei_Texture_Variance_A647_3_03_256,Nuclei_Texture_Variance_DAPI_3_00_256,Nuclei_Texture_Variance_DAPI_3_01_256,Nuclei_Texture_Variance_DAPI_3_02_256,Nuclei_Texture_Variance_DAPI_3_03_256,Nuclei_Texture_Variance_GOLD_3_00_256,Nuclei_Texture_Variance_GOLD_3_01_256,Nuclei_Texture_Variance_GOLD_3_02_256,Nuclei_Texture_Variance_GOLD_3_03_256
0,1,slide1,M10,A1,25,1483.0,2550.0,53.0,201.0,23.0,...,14.731878,13.950219,2.508336,2.4868,2.539577,2.484139,2.586139,2.6852,2.649136,2.497674
1,1,slide1,M10,A1,25,1378.0,1974.0,2239.0,403.0,2192.0,...,50.715492,50.584647,11.958722,11.757222,11.748338,12.026326,8.242161,8.072166,8.239189,8.313167
2,1,slide1,M10,A1,25,1345.0,1974.0,2073.0,477.0,2026.0,...,121.489707,122.096126,50.735363,51.805624,50.60842,51.418746,28.538125,29.204032,28.407457,28.739371
3,1,slide1,M10,A1,25,1403.0,5130.0,2013.0,592.0,1959.0,...,0.0,0.0,47.224646,50.686761,46.432874,47.049561,0.0,0.0,0.0,0.0
4,1,slide1,M10,A1,25,1157.0,1950.0,1805.0,609.0,1766.0,...,197.081725,197.985838,116.485446,121.235388,115.894563,117.135912,40.604219,41.407578,40.21409,41.106731
5,1,slide1,M10,A1,25,1307.0,2024.0,2107.0,612.0,2061.0,...,62.931137,61.236397,25.143734,25.835207,25.285304,25.37158,12.703798,12.279735,12.622825,12.863864
6,1,slide1,M10,A1,25,1779.0,3904.0,259.0,690.0,198.0,...,126.946915,126.84839,22.801861,22.884667,22.920218,23.930735,34.804544,35.542534,34.971966,35.936627
7,1,slide1,M10,A1,25,1712.0,2583.0,2177.0,690.0,2114.0,...,46.875692,44.519211,2.955106,3.039135,2.974103,2.973642,4.032585,4.006469,3.922182,3.824892
8,1,slide1,M10,A1,25,1617.0,2350.0,1862.0,701.0,1815.0,...,87.614591,84.514297,8.386882,8.607762,8.641871,8.355843,13.396074,13.376742,13.453367,13.330475
9,1,slide1,M10,A1,25,3167.0,6890.0,1975.0,734.0,1869.0,...,171.406249,169.776471,150.233805,150.818605,149.670198,150.652194,46.092773,46.875176,46.309744,46.363151
