In [1]:
import pathlib
import pandas as pd
import shutil

import sys

sys.path.append("../IDR_stream/")
from idrstream.DP_idr import DeepProfilerRun

In [2]:
# ---- IDR study and input locations ----
# IDR accession ID of the MitoCheck study
idr_id = "idr0013"
# folder holding the locations files (plate/well/frame image locations) that IDR_stream consumes
locations_dir = pathlib.Path("../../0.locate_data/locations/")

# ---- Aspera downloader settings ----
# the ascp binary is looked up relative to the current user's home directory
home_dir_path = pathlib.Path.home()
aspera_path = pathlib.Path(f"{home_dir_path}/.aspera/ascli/sdk/ascp")
aspera_key_path = pathlib.Path("../stream_files/asperaweb_id_dsa.openssh")
screens_path = pathlib.Path("../stream_files/idr0013-screenA-plates.tsv")
idr_index_name = "idr0013-neumann-mitocheck"

# ---- Fiji preprocessing settings ----
# Fiji install is expected on the user's Desktop
fiji_path = pathlib.Path(f"{home_dir_path}/Desktop/Fiji.app")
# this run skips illumination correction
perform_illumination_correction = False

# ---- segmentation settings for MitoCheck data ----
# NOTE(review): keys presumably map onto CellPose model/eval options
# (e.g. diameter=0 likely means "estimate diameter automatically") — confirm against idrstream
nuclei_model_specs = dict(
    model_type="cyto",
    channels=[0, 0],
    diameter=0,
    flow_threshold=0.8,
    cellprob_threshold=0,
    remove_edge_masks=True,
)

In [3]:
# These paths do not depend on the dataset being processed, so build them once.
# temporary directory that holds intermediate idrstream files
tmp_dir = pathlib.Path("tmp/")
# DeepProfiler config + pretrained weights that get copied into the tmp dir
config_path = pathlib.Path(
    "../stream_files/DP_files/mitocheck_profiling_config.json"
)
checkpoint_path = pathlib.Path(
    "../stream_files/DP_files/efficientnet-b0_weights_tf_dim_ordering_tf_kernels_autoaugment.h5"
)

for data_locations_path in sorted(locations_dir.iterdir()):
    # this run only handles locations files whose name contains "training"
    if "training" not in data_locations_path.name:
        continue

    # dataset label derived from the file name
    # (training_data, negative_control_data, or positive_control_data)
    data_name = data_locations_path.name.replace("_locations.tsv", "_data")
    print(f"Running IDR_stream DP for {data_name}")

    # start from a clean tmp dir (leftovers from a previous IDR_stream run are discarded;
    # a missing directory is not an error)
    shutil.rmtree(tmp_dir, ignore_errors=True)

    # where the final .csv.gz metadata+features are saved
    final_data_dir = pathlib.Path(f"../extracted_features__no_ic/{data_name}/DP_features")

    # per-dataset log file; only its parent directory is created here
    # NOTE: removal of a pre-existing log file is currently disabled
    log_file_path = pathlib.Path(f"logs/{data_name}/dp_idrstream.log")
    log_file_path.parent.mkdir(exist_ok=True, parents=True)

    # set up the IDR_stream DeepProfiler run
    stream = DeepProfilerRun(idr_id, tmp_dir, final_data_dir, log=log_file_path)

    # plate/well/frame image location table for IDR_stream
    data_to_process = pd.read_csv(data_locations_path, sep="\t", index_col=0)

    # configure each pipeline stage: aspera download -> fiji preprocessing -> CellPose segmentation
    stream.init_downloader(aspera_path, aspera_key_path, screens_path, idr_index_name)
    stream.init_preprocessor(fiji_path, perform_illumination_correction)
    stream.init_segmentor(nuclei_model_specs)

    # place the DeepProfiler config and checkpoint where the run expects them
    stream.copy_DP_files(config_path, checkpoint_path)

    # arguments shared by every dataset
    run_options = {"batch_size": 10, "start_batch": 0}
    if data_name == "training_data":
        # training data additionally extracts outlines so MitoCheck labels can be
        # associated with them later
        # NOTE(review): batch_nums=[19] presumably restricts the run to batch 19 — confirm
        run_options.update(batch_nums=[19], extra_metadata=["object_outlines"])

    # run dp IDR_stream!
    stream.run_dp_stream(data_to_process, **run_options)

Running IDR_stream DP for training_data


[ERROR] Cannot create plugin: org.scijava.plugins.scripting.javascript.JavaScriptScriptLanguage


[INFO] Overriding Leica ROI Reader; identifier: command:de.biovoxxel.utilities.RoiReader; jar: file:/home/roshankern/Desktop/Fiji.app/plugins/Biovoxxel_Plugins-2.5.6.jar
>>> GPU activated? 1
Completed: 70140K bytes transferred in 2 seconds
 (215355K bits/sec), in 1 file.
CellH5Reader initializing /home/roshankern/Desktop/Github/mitocheck_data/1.idr_streams/streams__no_ic/tmp/downloads/LT0094_04/00319_01.ch5
Plate :/sample/0/plate/
Well :/sample/0/plate/LT0094_04--ex2006_08_09--sp2005_08_03--tt17--c4/experiment/
Site :/sample/0/plate/LT0094_04--ex2006_08_09--sp2005_08_03--tt17--c4/experiment/00319/position/
Parse segmentation ROIs for cell object primary__test : 0
Completed: 70140K bytes transferred in 2 seconds
 (205867K bits/sec), in 1 file.
CellH5Reader initializing /home/roshankern/Desktop/Github/mitocheck_data/1.idr_streams/streams__no_ic/tmp/downloads/LT0094_04/00319_01.ch5
Plate :/sample/0/plate/
Well :/sample/0/plate/LT0094_04--ex2006_08_09--sp2005_08_03--tt17--c4/experiment/
Si

2023-12-28 22:28:55.472787: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Instructions for updating:
non-resource variables are not supported in the long term
  self.loadSingle(filename, delimiter, dtype)
Instructions for updating:
`normal` is a deprecated alias for `truncated_normal`
Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
non-resource variables are not supported in the long term
Reading metadata form tmp/DP_project/inputs/metadata/index.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Metadata_Plate  10 non-null     object
 1   Metadata_Well   10 non-null     object
 2   Metadata_Site   10 non-null     int64 
 3   Plate_Map_Name  10 non-null     object
 4   DNA             10 non-null     object
 5   Gene            10 non-null     object
 6   Gene_Replicate  10 non-null     int64 
dtypes: int64(2), object(5)
memory usage: 688.0+ bytes
None
{'ENSG00000177426': 0, 'ENSG00000186143': 1, 'KIF20A': 2, 'MYST1': 3, 'failed QC': 4}
Instructions for updating:
`normal` is a deprecated alias for `truncated_normal`
Instructions for updating:
Colocations handled automatically by placer.


2023-12-28 22:28:58.171313: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2023-12-28 22:28:58.194669: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-12-28 22:28:58.194848: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:2e:00.0 name: NVIDIA GeForce RTX 3060 computeCapability: 8.6
coreClock: 1.882GHz coreCount: 28 deviceMemorySize: 11.76GiB deviceMemoryBandwidth: 335.32GiB/s
2023-12-28 22:28:58.194864: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2023-12-28 22:28:58.209052: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2023-12-28 22:28:58.209123: I tensorflow/stream_executor/pl


Model: "efficientnet-b0"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
stem_conv (Conv2D)              (None, 112, 112, 32) 864         input[0][0]                      
__________________________________________________________________________________________________
stem_bn (BatchNormalization)    (None, 112, 112, 32) 128         stem_conv[0][0]                  
__________________________________________________________________________________________________
stem_activation (Activation)    (None, 112, 112, 32) 0           stem_bn[0][0]                    
___________________________________________________________________________________

2023-12-28 22:28:59.603115: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: -5 } dim { size: -6 } dim { size: -7 } dim { size: 1 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } value { dtype: DT_INT32 tensor_shape { dim { size: 2 } } tensor_content: "\200\000\000\000\200\000\000\000" } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA GeForce RTX 3060" frequency: 1882 num_cores: 28 environment { key: "architecture" value: "8.6" } environment { key: "cuda" value: "11020" } environment { key: "cudnn" value: "8100" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 2359296 shared_memory_size_

LT0094_04/319_66-1 (235 cells) : 3.30 secs
LT0094_04/319_89-1 (251 cells) : 1.48 secs
LT0094_44/319_74-1 (133 cells) : 0.95 secs
LT0106_02/287_6-1 (82 cells) : 0.69 secs
LT0106_02/287_33-1 (92 cells) : 0.75 secs


2023-12-28 22:29:07.722659: W tensorflow/core/common_runtime/bfc_allocator.cc:271] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.69GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2023-12-28 22:29:07.722704: W tensorflow/core/common_runtime/bfc_allocator.cc:271] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.69GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


LT0012_25/349_83-1 (843 cells) : 4.10 secs
LT0017_19/365_83-1 (172 cells) : 1.11 secs
LT0017_19/365_93-1 (186 cells) : 1.18 secs
LT0023_04/5_50-1 (267 cells) : 1.60 secs
LT0023_04/5_51-1 (262 cells) : 1.58 secs
Profiling: done




  image_outline_data = pd.read_csv(


  image_outline_data = pd.read_csv(


  image_outline_data = pd.read_csv(


  image_outline_data = pd.read_csv(


  image_outline_data = pd.read_csv(


  image_outline_data = pd.read_csv(


  image_outline_data = pd.read_csv(


  image_outline_data = pd.read_csv(


  image_outline_data = pd.read_csv(


  image_outline_data = pd.read_csv(
