# Convert SQLite outputs to parquet files with cytotable

In [1]:
# Set parameter for papermill to use for processing
plate_id = "BR00145816"

In [2]:
# Parameters
plate_id = "BR00147495"


## Import libraries

In [3]:
import pathlib
import pandas as pd

# cytotable will merge objects from SQLite file into single cells and save as parquet file
from cytotable import convert, presets

import logging

# Set the logging level to a higher level to avoid outputting unnecessary errors from config file in convert function
logging.getLogger().setLevel(logging.ERROR)

## Set paths and variables

In [4]:
# preset configurations based on typical CellProfiler outputs
preset = "cellprofiler_sqlite_pycytominer"

# update preset to include both the site metadata, cell counts, and PathName columns
joins = presets.config["cellprofiler_sqlite_pycytominer"]["CONFIG_JOINS"].replace(
    "Image_Metadata_Well,",
    "Image_Metadata_Well, Image_Metadata_Site, Image_Count_Cells, Image_Metadata_Row, Image_Metadata_Col, ",
)

# Add the PathName columns separately
joins = joins.replace(
    "COLUMNS('Image_FileName_.*'),",
    "COLUMNS('Image_FileName_.*'),\n COLUMNS('Image_PathName_.*'),",
)

# type of file output from cytotable (currently only parquet)
dest_datatype = "parquet"

# set the round of data that will be processed
round_id = "Round_3_data"

# set path to directory with SQLite files
sqlite_dir = pathlib.Path(f"../2.feature_extraction/sqlite_outputs/{round_id}")

# directory for processed data
output_dir = pathlib.Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

plate_names = []

for file_path in sqlite_dir.iterdir():
    plate_names.append(file_path.stem)

# print the plate names and how many plates there are (confirmation)
print(f"There are {len(plate_names)} plates in this dataset. Below are the names:")
for name in plate_names:
    print(name)

There are 12 plates in this dataset. Below are the names:
BR00146998
BR00147496
BR00147000
BR00147002
BR00147261
BR00147262
BR00147263
BR00146999
BR00147003
BR00147001
BR00147497
BR00147495


## Convert SQLite to parquet files

In [5]:
file_path = sqlite_dir / plate_id
output_path = pathlib.Path(
    f"{output_dir}/converted_profiles/{round_id}/{plate_id}_converted.parquet"
)

print("Starting conversion with cytotable for plate:", plate_id)
# Merge single cells and output as parquet file
convert(
    source_path=str(file_path),
    dest_path=str(output_path),
    dest_datatype=dest_datatype,
    preset=preset,
    joins=joins,
    chunk_size=15000,
)

print(f"Plate {plate_id} has been converted with cytotable!")

Starting conversion with cytotable for plate: BR00147495


Plate BR00147495 has been converted with cytotable!


# Load in converted profiles to update

In [6]:
# Directory with converted profiles
converted_dir = pathlib.Path(f"{output_dir}/converted_profiles/{round_id}")

# Define the list of columns to prioritize and prefix
prioritized_columns = [
    "Nuclei_Location_Center_X",
    "Nuclei_Location_Center_Y",
    "Cells_Location_Center_X",
    "Cells_Location_Center_Y",
    "Image_Count_Cells",
]

# Load the DataFrame from the Parquet file
file_path = converted_dir / f"{plate_id}_converted.parquet"
converted_df = pd.read_parquet(file_path)

# If any, drop rows where "Metadata_ImageNumber" is NaN (artifact of cytotable)
converted_df = converted_df.dropna(subset=["Metadata_ImageNumber"])

# Rearrange columns and add "Metadata" prefix in one line
converted_df = converted_df[
    prioritized_columns
    + [col for col in converted_df.columns if col not in prioritized_columns]
].rename(columns=lambda col: "Metadata_" + col if col in prioritized_columns else col)

# assert that there are column names with PathName in the dataset
assert any("PathName" in col for col in converted_df.columns)

# Assert that Metadata_Row and Metadata_Col are present for downstream QC
assert {"Image_Metadata_Row", "Image_Metadata_Col"}.issubset(
    converted_df.columns
), "Missing required Metadata columns: Row and/or Col"

# Save the processed DataFrame as Parquet in the same path
converted_df.to_parquet(file_path, index=False)

# print shape and head of dataset
print(converted_df.shape)
converted_df.head()

(124555, 3035)


Unnamed: 0,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,Metadata_ImageNumber,Image_Metadata_Col,Image_Metadata_Plate,Image_Metadata_Row,Image_Metadata_Site,...,Nuclei_Texture_Variance_CorrER_3_02_256,Nuclei_Texture_Variance_CorrER_3_03_256,Nuclei_Texture_Variance_CorrMito_3_00_256,Nuclei_Texture_Variance_CorrMito_3_01_256,Nuclei_Texture_Variance_CorrMito_3_02_256,Nuclei_Texture_Variance_CorrMito_3_03_256,Nuclei_Texture_Variance_CorrRNA_3_00_256,Nuclei_Texture_Variance_CorrRNA_3_01_256,Nuclei_Texture_Variance_CorrRNA_3_02_256,Nuclei_Texture_Variance_CorrRNA_3_03_256
0,938.863462,131.438462,938.353249,131.154088,41,1,3,BR00147495,3,1,...,638.665357,671.191789,119.518551,123.104904,118.302733,122.888871,597.725691,597.581346,610.53826,622.40211
1,146.716837,19.992347,145.310764,18.748264,30,3,3,BR00147495,3,3,...,476.185139,502.580704,2963.294867,2537.848737,2720.618897,2887.21187,174.556281,186.874975,183.180312,189.617032
2,294.131455,39.464789,294.136488,39.274795,44,4,3,BR00147495,3,4,...,429.19846,416.479645,514.354842,471.27328,464.041991,488.135889,234.239572,234.179651,247.430135,238.883069
3,176.803082,64.486301,178.063584,64.343353,26,5,3,BR00147495,3,5,...,367.085049,363.381763,1320.432965,1203.494435,1171.203814,1187.947528,192.10765,179.286782,170.484665,168.16028
4,58.291024,40.892112,53.443288,43.095734,57,6,3,BR00147495,3,6,...,123.273895,121.762792,403.319917,407.691252,436.34664,393.387957,89.649707,91.43974,88.492812,89.172983


**To confirm the number of single cells is correct above, please use any database browser software to see if the number of rows in the "Per_Cells" compartment matches the number of rows in the data frame.**