## Convert SQLite files to parquet

1. Load SQLite manifest
2. Use CytoTable to merge the single-cell data and convert it to parquet
3. Save converted files to disk

In [1]:
import pathlib
import pandas as pd

import cytotable
from parsl.config import Config
from parsl.executors import HighThroughputExecutor

In [2]:
# Relative location of the TSV manifest that is loaded in the next step
manifest_file = pathlib.Path("metadata") / "jump_sqlite_aws_file_locations.tsv"

In [3]:
# The manifest is tab-delimited, which is the default separator of read_table
manifest_df = pd.read_table(manifest_file)

# Report (rows, columns), then preview the first few rows
print(manifest_df.shape)
manifest_df.head()

(2380, 5)


Unnamed: 0,source,batch,plate,sqlite_file,Metadata_PlateType
0,source_1,Batch1_20221004,UL000109,s3://cellpainting-gallery/cpg0016-jump/source_...,COMPOUND_EMPTY
1,source_1,Batch1_20221004,UL001641,s3://cellpainting-gallery/cpg0016-jump/source_...,COMPOUND
2,source_1,Batch1_20221004,UL001643,s3://cellpainting-gallery/cpg0016-jump/source_...,COMPOUND
3,source_1,Batch1_20221004,UL001645,s3://cellpainting-gallery/cpg0016-jump/source_...,COMPOUND
4,source_1,Batch1_20221004,UL001651,s3://cellpainting-gallery/cpg0016-jump/source_...,COMPOUND


In [4]:
# Text before the final "/" of the sqlite_file value in manifest row 2,
# i.e. the location that contains that SQLite file
manifest_df.loc[2, "sqlite_file"].rpartition("/")[0]

's3://cellpainting-gallery/cpg0016-jump/source_1/workspace/backend/Batch1_20221004/UL001643'

In [5]:
# Parsl configuration with a single HighThroughputExecutor left at its default settings;
# this object is handed to cytotable.convert below
parsl_config = Config(executors=[HighThroughputExecutor()])

In [None]:
%%time
what = cytotable.convert(
    source_path="/".join(manifest_df.sqlite_file[2].split("/")[0:-1]),
    dest_path="test2.parquet",
    dest_datatype="parquet",
    chunk_size=150000,
    parsl_config=parsl_config,
    preset="cellprofiler_sqlite_pycytominer"
)