# Convert SQLite outputs to parquet files with cytotable

In [1]:
# Default plate to process when the notebook is run as-is.
# papermill can override this value by injecting a new `plate_id` assignment
# in a later cell (presumably this cell carries the "parameters" tag —
# confirm in the notebook metadata).
plate_id = "BR00143976"

In [2]:
# Parameters
# This assignment replaces the default `plate_id` set in the previous cell, so
# it is the plate that the remaining cells convert and process.
# NOTE(review): looks like a papermill-injected cell left over from an
# executed run — confirm before editing it by hand.
plate_id = "BR00143978"


## Import libraries

In [3]:
# Standard library
import logging
import pathlib

# Third-party
import pandas as pd

# cytotable merges the objects stored in a SQLite file into single cells and
# saves the result as a parquet file
from cytotable import convert, presets

# Raise the root logger threshold to ERROR to cut down on the unnecessary log
# output produced by the config file used inside the convert function
logging.getLogger().setLevel(logging.ERROR)

## Set paths and variables

In [4]:
# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"

# Extra Image-table columns to add to the preset's join SQL.
# Each key is an anchor expected in the preset's CONFIG_JOINS string; the value
# is the text that replaces it (the anchor is kept and the new columns follow).
join_additions = {
    # site metadata, cell counts, and well row/column
    "Image_Metadata_Well,": (
        "Image_Metadata_Well, Image_Metadata_Site, Image_Count_Cells, "
        "Image_Metadata_Row, Image_Metadata_Col, "
    ),
    # PathName columns (added next to the FileName columns)
    "COLUMNS('Image_FileName_.*'),": (
        "COLUMNS('Image_FileName_.*'),\n COLUMNS('Image_PathName_.*'),"
    ),
}

# Start from the join SQL of the selected preset and apply each addition
joins = presets.config[preset]["CONFIG_JOINS"]
for anchor, replacement in join_additions.items():
    # str.replace does nothing when the anchor is absent, which would only be
    # noticed after the (long) conversion has finished, so fail early instead
    if anchor not in joins:
        raise ValueError(
            f"Expected {anchor!r} in CONFIG_JOINS of preset {preset!r}; "
            "the preset may differ in the installed version of cytotable."
        )
    joins = joins.replace(anchor, replacement)

# type of file output from cytotable (currently only parquet)
dest_datatype = "parquet"

# set path to directory with SQLite files
sqlite_dir = pathlib.Path("../2.feature_extraction/sqlite_outputs")

# directory for processed data
output_dir = pathlib.Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

# Each entry in sqlite_dir is named after its plate. Hidden entries
# (e.g. .DS_Store) are skipped so they are not counted as plates, and the names
# are sorted because iterdir() yields entries in arbitrary order.
plate_names = sorted(
    entry.stem for entry in sqlite_dir.iterdir() if not entry.name.startswith(".")
)

# print the plate names and how many plates there are (confirmation)
print(f"There are {len(plate_names)} plates in this dataset. Below are the names:")
for name in plate_names:
    print(name)

There are 6 plates in this dataset. Below are the names:
BR00143980
BR00143977
BR00143976
BR00143979
BR00143981
BR00143978


## Convert SQLite to parquet files

In [5]:
# Source for this plate inside the SQLite directory, and the parquet file that
# cytotable should write for it
file_path = sqlite_dir / plate_id
output_path = output_dir / "converted_profiles" / f"{plate_id}_converted.parquet"

# Arguments for cytotable's convert, gathered in one place
conversion_kwargs = dict(
    source_path=str(file_path),
    dest_path=str(output_path),
    dest_datatype=dest_datatype,
    preset=preset,
    joins=joins,
    chunk_size=15000,
)

print("Starting conversion with cytotable for plate:", plate_id)

# Merge single cells and output as parquet file
convert(**conversion_kwargs)

print(f"Plate {plate_id} has been converted with cytotable!")

Starting conversion with cytotable for plate: BR00143978


Plate BR00143978 has been converted with cytotable!


## Load the converted profiles and update the columns

In [6]:
# Directory with converted profiles
converted_dir = output_dir / "converted_profiles"

# Columns to move to the front of the data frame and prefix with "Metadata_"
prioritized_columns = [
    "Nuclei_Location_Center_X",
    "Nuclei_Location_Center_Y",
    "Cells_Location_Center_X",
    "Cells_Location_Center_Y",
    "Image_Count_Cells",
]
prefixed_columns = [f"Metadata_{col}" for col in prioritized_columns]

# Load the DataFrame from the Parquet file
file_path = converted_dir / f"{plate_id}_converted.parquet"
converted_df = pd.read_parquet(file_path)

# If any, drop rows where "Metadata_ImageNumber" is NaN (artifact of cytotable)
converted_df = converted_df.dropna(subset=["Metadata_ImageNumber"])

# The file is overwritten in place at the end of this cell, so when the cell is
# re-run the loaded frame already has its columns moved and prefixed. Only
# reorder and rename when that has not been done yet.
if not set(prefixed_columns).issubset(converted_df.columns):
    missing_columns = [
        col for col in prioritized_columns if col not in converted_df.columns
    ]
    if missing_columns:
        raise KeyError(
            f"Converted profile {file_path} is missing expected columns: "
            f"{missing_columns}"
        )

    # Prioritized columns first, every other column after in its original order
    remaining_columns = [
        col for col in converted_df.columns if col not in prioritized_columns
    ]
    converted_df = converted_df[prioritized_columns + remaining_columns].rename(
        columns=dict(zip(prioritized_columns, prefixed_columns))
    )

# assert that there are column names with PathName in the dataset
assert any(
    "PathName" in col for col in converted_df.columns
), "No PathName columns found in the converted profile"

# Assert that Image_Metadata_Row and Image_Metadata_Col are present for downstream QC
assert {"Image_Metadata_Row", "Image_Metadata_Col"}.issubset(
    converted_df.columns
), "Missing required Metadata columns: Row and/or Col"

# Save the processed DataFrame as Parquet in the same path
converted_df.to_parquet(file_path, index=False)

# print shape and head of dataset
print(converted_df.shape)
converted_df.head()

(465740, 3035)


Unnamed: 0,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,Metadata_ImageNumber,Image_Metadata_Col,Image_Metadata_Plate,Image_Metadata_Row,Image_Metadata_Site,...,Nuclei_Texture_Variance_CorrER_3_02_256,Nuclei_Texture_Variance_CorrER_3_03_256,Nuclei_Texture_Variance_CorrMito_3_00_256,Nuclei_Texture_Variance_CorrMito_3_01_256,Nuclei_Texture_Variance_CorrMito_3_02_256,Nuclei_Texture_Variance_CorrMito_3_03_256,Nuclei_Texture_Variance_CorrRNA_3_00_256,Nuclei_Texture_Variance_CorrRNA_3_01_256,Nuclei_Texture_Variance_CorrRNA_3_02_256,Nuclei_Texture_Variance_CorrRNA_3_03_256
0,515.69191,43.549053,506.037214,46.875598,112,1474,16,BR00143978,10,7,...,465.796553,448.696644,1955.198799,1973.297713,2061.520233,1972.107898,91.32999,87.737347,94.916564,93.063563
1,575.096369,44.695531,572.045509,43.98615,224,40,3,BR00143978,7,4,...,46.820266,49.394058,58.144693,57.474273,66.693041,62.292138,37.371225,37.090304,36.530281,39.298212
2,450.244898,9.789116,449.138686,12.135036,265,971,11,BR00143978,14,8,...,17.891513,18.2866,29.254078,30.443866,33.275295,33.563375,65.907876,68.209417,71.767377,70.538919
3,471.215548,11.583039,470.462209,12.043605,429,972,11,BR00143978,14,9,...,21.068466,19.689976,21.537735,19.801042,21.246854,20.987656,68.253157,69.145802,70.611565,68.065586
4,320.918429,30.383686,319.547059,28.95098,188,973,12,BR00143978,3,1,...,117.770368,111.713883,188.65911,241.283696,191.381148,183.899075,30.514193,35.904959,31.09089,29.246951


**To confirm that the number of single cells shown above is correct, open the SQLite file in any database browser and check that the number of rows in the "Per_Cells" compartment table matches the number of rows in the data frame.**