# coSMicQC Demonstration with JUMP Plate BR00117006

In [58]:
import json
import pathlib

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from cloudpathlib import S3Client

In [79]:
# reference the file without reading it entirely
target_file = pq.ParquetFile("data/plates/BR00117006/BR00117006.parquet")

# names of the images whose path and file name columns we want to inspect
target_image_names = [
    "CellOutlines",
    "NucleiOutlines",
    "OrigAGP",
    "OrigDNA",
    "OrigRNA",
]

# pair each image's path column with its file name column
target_image_column_groups = [
    (f"Image_PathName_{name}", f"Image_FileName_{name}") for name in target_image_names
]

# flatten the (path column, file name column) pairs into one list of names
# note: this must iterate over target_image_column_groups, the list defined
# directly above, so the cell also works on a fresh kernel
target_flattened_columns = [
    col for colgroup in target_image_column_groups for col in colgroup
]

# show the paired column names
print(json.dumps(target_image_column_groups, indent=4))

# check which schema columns match the image columns we are targeting
# (substring match, listed in the order the parquet schema reports them)
print(
    json.dumps(
        [
            col
            for col in target_file.schema.names
            if any(target in col for target in target_flattened_columns)
        ],
        indent=4,
    )
)

[
    [
        "Image_PathName_CellOutlines",
        "Image_FileName_CellOutlines"
    ],
    [
        "Image_PathName_NucleiOutlines",
        "Image_FileName_NucleiOutlines"
    ],
    [
        "Image_PathName_OrigAGP",
        "Image_FileName_OrigAGP"
    ],
    [
        "Image_PathName_OrigDNA",
        "Image_FileName_OrigDNA"
    ],
    [
        "Image_PathName_OrigRNA",
        "Image_FileName_OrigRNA"
    ]
]
[
    "Image_FileName_CellOutlines",
    "Image_FileName_NucleiOutlines",
    "Image_FileName_OrigAGP",
    "Image_FileName_OrigDNA",
    "Image_FileName_OrigRNA",
    "Image_PathName_CellOutlines",
    "Image_PathName_NucleiOutlines",
    "Image_PathName_OrigAGP",
    "Image_PathName_OrigDNA",
    "Image_PathName_OrigRNA"
]


In [71]:
# show first row of output to help determine where files are located
# note: we do this to avoid reading the full dataset, which is > 20 GB.
# Only the targeted image columns are requested from the parquet file so the
# other columns in the first batch do not need to be read.
first_row_batch = next(
    target_file.iter_batches(batch_size=1, columns=target_flattened_columns)
)

# the explicit column selection keeps the columns in the order of
# target_flattened_columns
df_example = pa.Table.from_batches([first_row_batch]).to_pandas()[
    target_flattened_columns
]

# print the dictionary with indentation from the json module
print(json.dumps(df_example.to_dict(orient="records"), indent=4))

[
    {
        "Image_PathName_CellOutlines": "/home/ubuntu/local_output/BR00117006/analysis/BR00117006-A01-1/outlines",
        "Image_FileName_CellOutlines": "A01_s1--cell_outlines.png",
        "Image_PathName_NucleiOutlines": "/home/ubuntu/local_output/BR00117006/analysis/BR00117006-A01-1/outlines",
        "Image_FileName_NucleiOutlines": "A01_s1--nuclei_outlines.png",
        "Image_PathName_OrigAGP": "/home/ubuntu/local_input/projects/2019_07_11_JUMP-CP/2020_11_04_CPJUMP1/images/BR00117006__2020-11-02T19_54_45-Measurement1/Images",
        "Image_FileName_OrigAGP": "r01c01f01p01-ch2sk1fk1fl1.tiff",
        "Image_PathName_OrigDNA": "/home/ubuntu/local_input/projects/2019_07_11_JUMP-CP/2020_11_04_CPJUMP1/images/BR00117006__2020-11-02T19_54_45-Measurement1/Images",
        "Image_FileName_OrigDNA": "r01c01f01p01-ch5sk1fk1fl1.tiff",
        "Image_PathName_OrigRNA": "/home/ubuntu/local_input/projects/2019_07_11_JUMP-CP/2020_11_04_CPJUMP1/images/BR00117006__2020-11-02T19_54_45-Meas

In [76]:
# create inferred AWS S3 paths for all images using the example
s3_columns = []

for pathname_col, filename_col in target_image_column_groups:

    # form a column name for the s3 path
    s3_column_name = f"Image_S3Path_{filename_col.replace('Image_FileName_', '')}"

    # choose the local prefix to swap for an S3 prefix; the outlines are
    # stored separately from the originals, so each kind has its own mapping
    if "Outlines" in filename_col:
        local_prefix = "/home/ubuntu/local_output/"
        s3_prefix = "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace/analysis/2020_11_04_CPJUMP1/"
    elif "Orig" in filename_col:
        local_prefix = (
            "/home/ubuntu/local_input/projects/2019_07_11_JUMP-CP/2020_11_04_CPJUMP1/"
        )
        s3_prefix = "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/"
    else:
        # fail here rather than later with a KeyError on a column that was
        # never created
        raise ValueError(f"No S3 path mapping known for column: {filename_col}")

    # regex=False: the prefixes are literal paths, not patterns, and being
    # explicit avoids depending on the pandas version's default
    df_example[s3_column_name] = (
        df_example[pathname_col].str.replace(local_prefix, s3_prefix, regex=False)
        + "/"
        + df_example[filename_col]
    )

    # collect the s3 column name
    s3_columns.append(s3_column_name)

# print the dictionary with indentation from the json module
print(json.dumps(df_example.to_dict(orient="records"), indent=4))

[
    {
        "Image_PathName_CellOutlines": "/home/ubuntu/local_output/BR00117006/analysis/BR00117006-A01-1/outlines",
        "Image_FileName_CellOutlines": "A01_s1--cell_outlines.png",
        "Image_PathName_NucleiOutlines": "/home/ubuntu/local_output/BR00117006/analysis/BR00117006-A01-1/outlines",
        "Image_FileName_NucleiOutlines": "A01_s1--nuclei_outlines.png",
        "Image_PathName_OrigAGP": "/home/ubuntu/local_input/projects/2019_07_11_JUMP-CP/2020_11_04_CPJUMP1/images/BR00117006__2020-11-02T19_54_45-Measurement1/Images",
        "Image_FileName_OrigAGP": "r01c01f01p01-ch2sk1fk1fl1.tiff",
        "Image_PathName_OrigDNA": "/home/ubuntu/local_input/projects/2019_07_11_JUMP-CP/2020_11_04_CPJUMP1/images/BR00117006__2020-11-02T19_54_45-Measurement1/Images",
        "Image_FileName_OrigDNA": "r01c01f01p01-ch5sk1fk1fl1.tiff",
        "Image_PathName_OrigRNA": "/home/ubuntu/local_input/projects/2019_07_11_JUMP-CP/2020_11_04_CPJUMP1/images/BR00117006__2020-11-02T19_54_45-Meas

In [84]:
# build an S3 client with no-sign-request set (for anonymous access to s3 resources)
s3_cli = S3Client(no_sign_request=True)

# map a marker found in the s3 column name to the local download directory
download_dirs = {
    "Outlines": "data/images/outlines",
    "Orig": "data/images/orig",
}

# make sure every local download directory exists
for local_dir in download_dirs.values():
    pathlib.Path(local_dir).mkdir(parents=True, exist_ok=True)

# download the image referenced by every s3 column of every example row
for row in df_example[s3_columns].to_dict(orient="records"):
    for s3_column in s3_columns:

        # wrap the s3 uri in a cloudpath tied to the anonymous client
        image_cloudpath = s3_cli.CloudPath(row[s3_column])

        # outlines go to the outlines directory, originals to the orig directory
        for marker, local_dir in download_dirs.items():
            if marker in s3_column:
                image_cloudpath.download_to(f"{local_dir}/{image_cloudpath.name}")

In [85]:
# show the data tree
# note: `!` is an IPython shell escape, so this runs the external `tree`
# command on the local `data` directory and needs `tree` to be installed
!tree data

[01;34mdata[0m
├── [01;34mimages[0m
│   ├── [01;34morig[0m
│   │   ├── [00mr01c01f01p01-ch2sk1fk1fl1.tiff[0m
│   │   ├── [00mr01c01f01p01-ch3sk1fk1fl1.tiff[0m
│   │   └── [00mr01c01f01p01-ch5sk1fk1fl1.tiff[0m
│   └── [01;34moutlines[0m
│       ├── [00mA01_s1--cell_outlines.png[0m
│       └── [00mA01_s1--nuclei_outlines.png[0m
└── [01;34mplates[0m
    └── [01;34mBR00117006[0m
        └── [00mBR00117006.parquet[0m

6 directories, 6 files
