# Example of CellProfiler project for idrstream

## Import Libraries

In [1]:
import pathlib
import pandas as pd
import shutil

import sys

sys.path.append("../IDR_stream/")
from idrstream.CP_idr import CellProfilerRun

## Initialize idrstream

In [2]:
# directory holding the locations TSV files
# (plate/well/frame image location data for IDR_stream)
locations_dir = pathlib.Path("../../0.locate_data/locations/")

# CellProfiler plugins directory (the one with the runcellpose.py file)
plugins_directory = pathlib.Path("../IDR_stream/idrstream/CP_Plugins")
# CellProfiler pipeline file
pipeline_path = pathlib.Path("../stream_files/CP_files/mitocheck_idr_cp.cppipe")
# idr ID for MitoCheck data
idr_id = "idr0013"

# user's home directory; the tool locations below are resolved against it
home_dir_path = pathlib.Path.home()

# downloader paths
# joined with "/" rather than string interpolation so the result stays a
# well-formed path whatever the home directory is (e.g. no "//" when home is "/")
aspera_path = home_dir_path / ".aspera" / "ascli" / "sdk" / "ascp"
aspera_key_path = pathlib.Path("../stream_files/asperaweb_id_dsa.openssh")
screens_path = pathlib.Path("../stream_files/idr0013-screenA-plates.tsv")
idr_index_name = "idr0013-neumann-mitocheck"

# Fiji install location
fiji_path = home_dir_path / "Desktop" / "Fiji.app"
# no illumination correction for this run
perform_illumination_correction = False

In [3]:
for data_locations_path in sorted(locations_dir.iterdir()):
    # this cell only streams locations files whose name contains "training";
    # every other entry in the locations directory is skipped
    if "training" not in data_locations_path.name:
        continue

    # dataset label derived from the file name ("_locations.tsv" -> "_data")
    data_name = data_locations_path.name.replace("_locations.tsv", "_data")
    print(f"Running IDR_stream CP for {data_name}")

    # temporary data directory that holds intermediate idrstream files
    tmp_dir = pathlib.Path("tmp/")
    # remove tmp directory if it already exists (ex: from a previous IDR_stream run)
    shutil.rmtree(tmp_dir, ignore_errors=True)

    # final data directory (place final .csv.gz metadata+features are saved)
    final_data_dir = (
        pathlib.Path("../extracted_features__no_ic") / data_name / "CP_features"
    )

    # log file for this dataset; an existing log file is left in place,
    # only its parent directory is created when it doesn't exist
    log_file_path = pathlib.Path("logs") / data_name / "cp_idrstream.log"
    log_file_path.parent.mkdir(exist_ok=True, parents=True)

    # initialize IDR_stream cp run
    stream = CellProfilerRun(
        pipeline_path,
        plugins_directory,
        idr_id,
        tmp_dir,
        final_data_dir,
        log=log_file_path,
    )

    # pandas dataframe with plate/well/frame image location data for IDR_stream
    # (tab-separated file, first column used as the index)
    data_to_process = pd.read_csv(data_locations_path, sep="\t", index_col=0)

    # init downloader
    stream.init_downloader(aspera_path, aspera_key_path, screens_path, idr_index_name)

    # init preprocessor
    stream.init_preprocessor(fiji_path, perform_illumination_correction)

    # run cp IDR_stream!
    # NOTE(review): the batch settings are hard-coded and appear to request only
    # batch 19 -- presumably a targeted re-run; confirm before using for a full run
    stream.run_cp_stream(
        data_to_process, batch_size=10, start_batch=19, batch_nums=[19]
    )

Running IDR_stream CP for training_data
[INFO] Overriding Leica ROI Reader; identifier: command:de.biovoxxel.utilities.RoiReader; jar: file:/home/roshankern/Desktop/Fiji.app/plugins/Biovoxxel_Plugins-2.5.6.jar
Completed: 70140K bytes transferred in 2 seconds
 (218571K bits/sec), in 1 file.
CellH5Reader initializing /home/roshankern/Desktop/Github/mitocheck_data/1.idr_streams/streams__no_ic/tmp/downloads/LT0094_04/00319_01.ch5
Plate :/sample/0/plate/
Well :/sample/0/plate/LT0094_04--ex2006_08_09--sp2005_08_03--tt17--c4/experiment/
Site :/sample/0/plate/LT0094_04--ex2006_08_09--sp2005_08_03--tt17--c4/experiment/00319/position/
Parse segmentation ROIs for cell object primary__test : 0
Completed: 70140K bytes transferred in 2 seconds
 (214097K bits/sec), in 1 file.
CellH5Reader initializing /home/roshankern/Desktop/Github/mitocheck_data/1.idr_streams/streams__no_ic/tmp/downloads/LT0094_04/00319_01.ch5
Plate :/sample/0/plate/
Well :/sample/0/plate/LT0094_04--ex2006_08_09--sp2005_08_03--tt17

Times reported are CPU and Wall-clock times for each module
Thu Dec 28 16:15:53 2023: Image # 1, module Images # 1: CPU_time = 0.00 secs, Wall_time = 0.00 secs
Thu Dec 28 16:15:53 2023: Image # 1, module Metadata # 2: CPU_time = 0.00 secs, Wall_time = 0.00 secs
Thu Dec 28 16:15:53 2023: Image # 1, module NamesAndTypes # 3: CPU_time = 0.45 secs, Wall_time = 0.10 secs
Thu Dec 28 16:15:53 2023: Image # 1, module Groups # 4: CPU_time = 0.00 secs, Wall_time = 0.00 secs
** TORCH CUDA version installed and working. **
>>>> using GPU
>> cyto << model set to be used
>>>> model diam_mean =  30.000 (ROIs rescaled to this size during training)
~~~ ESTIMATING CELL DIAMETER(S) ~~~
estimated cell diameter(s) in 5.00 sec
>>> diameter(s) = 
[ 25.31 ]
~~~ FINDING MASKS ~~~
>>>> TOTAL TIME 6.26 sec
Thu Dec 28 16:15:53 2023: Image # 1, module RunCellpose # 5: CPU_time = 7.33 secs, Wall_time = 7.20 secs
  back_pixels = skimage.morphology.erosion(back_pixels_mask, selem=selem)
  back_pixels = skimage.morpho