# Run segmentation and feature extraction on data

## Import libraries

In [1]:
import argparse
import pathlib
import pprint
import random

import sys

sys.path.append("../utils")
import cp_parallel

# Detect an IPython/Jupyter session: `get_ipython` is only defined there, so a
# NameError means the code is running as a plain Python script.
try:
    cfg = get_ipython().config
except NameError:
    in_notebook = False
else:
    in_notebook = True

## Set paths and variables

In [2]:
# Batch name to process (always contains batch_ prefix then #)
batch_name = "batch_1"

# Directory holding the corrected images for this batch.
# strict=True raises FileNotFoundError immediately if the folder is missing.
images_base_dir = pathlib.Path(
    f"../1.illumination_correction/Corrected_Images/{batch_name}"
).resolve(strict=True)

if not in_notebook:
    print("Running as script")
    # set up arg parser
    parser = argparse.ArgumentParser(
        description="CellProfiler segmentation and feature extraction"
    )

    # --image_dir is mandatory: when it is omitted args.image_dir is None and
    # pathlib.Path(None) fails with an unhelpful TypeError, so let argparse
    # report the missing argument with a usage message instead.
    parser.add_argument(
        "--image_dir",
        type=str,
        required=True,
        help="Path to the image directory to process corrected images",
    )

    args = parser.parse_args()
    images_dir = pathlib.Path(args.image_dir).resolve(strict=True)
else:
    print("Running in a notebook")
    # every platemap_* sub-folder of the batch is a candidate
    platemap_folders = [
        p
        for p in images_base_dir.iterdir()
        if p.is_dir() and p.name.startswith("platemap_")
    ]
    if not platemap_folders:
        raise ValueError(f"No platemap folders found in {batch_name}")
    # no --image_dir is available interactively, so pick one platemap at random
    # (unseeded: the selected folder can differ between runs)
    images_dir = random.choice(platemap_folders).resolve(strict=True)
    print(f"Processing images in: {images_dir}")

# set the run type for the parallelization
run_name = "cp_analysis"

# set path for CellProfiler pipeline (must already exist)
path_to_pipeline = pathlib.Path("./pipeline/analysis.cppipe").resolve(strict=True)

# set main output dir for all plates if it doesn't exist
# NOTE(review): output_dir is reassigned to ./cp_output in the cell that builds
# plate_info_dictionary before it is ever used, so ./sqlite_outputs looks like
# a leftover - confirm before removing.
output_dir = pathlib.Path("./sqlite_outputs")
output_dir.mkdir(exist_ok=True)

Running in a notebook
Processing images in: /home/jenna/targeted_fibrosis_drug_screen/1.illumination_correction/Corrected_Images/batch_1/platemap_4


### List the plates in the selected image directory

In [3]:
# names of all plate folders (prefixed with "CARD") inside the selected image directory
plate_names = [
    entry.name
    for entry in images_dir.iterdir()
    if entry.is_dir() and entry.name.startswith("CARD")
]

# report how many plates were found, then one plate name per line
print("There are a total of", len(plate_names), "plates. The names of the plates are:")
for plate_name in plate_names:
    print(plate_name)

There are a total of 4 plates. The names of the plates are:
CARD-CelIns-CX7_251212180001
CARD-CelIns-CX7_251211180001
CARD-CelIns-CX7_251212100001
CARD-CelIns-CX7_251213150001


## Create dictionary with all plate data to run CellProfiler in parallel

In [5]:
# CellProfiler pipeline applied to every plate (must already exist)
path_to_pipeline = pathlib.Path("./pipeline/analysis.cppipe").resolve(strict=True)

# top-level output folder shared by all plates; created if missing
output_dir = pathlib.Path("./cp_output").resolve(strict=False)
output_dir.mkdir(exist_ok=True)

# maps plate name -> paths that describe one CellProfiler run
# NOTE(review): keys are bare plate folder names, so a plate name repeated in
# two platemap folders would overwrite the earlier entry - confirm names are unique.
plate_info_dictionary = {}

# NOTE(review): this walks every platemap_* folder of the batch and never uses
# images_dir (the --image_dir argument) - confirm that processing the whole
# batch is intended.
for platemap_folder in images_base_dir.glob("platemap_*"):
    if not platemap_folder.is_dir():
        continue

    for plate_folder in platemap_folder.iterdir():
        # only plate directories (prefixed with "CARD") are of interest
        if not (plate_folder.is_dir() and plate_folder.name.startswith("CARD")):
            continue

        # nested output layout: cp_output/batch/platemap_#/plate
        plate_output_dir = (
            output_dir / batch_name / platemap_folder.name / plate_folder.name
        )

        # a plate counts as processed once its output folder exists and is non-empty
        already_processed = plate_output_dir.exists() and any(
            plate_output_dir.iterdir()
        )
        if already_processed:
            print(
                f"{plate_output_dir} already exists and contains files, skipping creation and dictionary."
            )
            continue

        # create the output folder first so it can be resolved strictly below
        plate_output_dir.mkdir(parents=True, exist_ok=True)
        plate_info_dictionary[plate_folder.name] = {
            "path_to_images": plate_folder.resolve(strict=True),
            "path_to_output": plate_output_dir.resolve(strict=True),
            "path_to_pipeline": path_to_pipeline,
        }

# view the dictionary to check
pprint.pprint(plate_info_dictionary, indent=4)

{   'CARD-CelIns-CX7_251023210001': {   'path_to_images': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/1.illumination_correction/Corrected_Images/batch_1/platemap_1/CARD-CelIns-CX7_251023210001'),
                                        'path_to_output': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/2.cellprofiler_processing/cp_output/batch_1/platemap_1/CARD-CelIns-CX7_251023210001'),
                                        'path_to_pipeline': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/2.cellprofiler_processing/pipeline/analysis.cppipe')},
    'CARD-CelIns-CX7_251124150001': {   'path_to_images': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/1.illumination_correction/Corrected_Images/batch_1/platemap_1/CARD-CelIns-CX7_251124150001'),
                                        'path_to_output': PosixPath('/home/jenna/targeted_fibrosis_drug_screen/2.cellprofiler_processing/cp_output/batch_1/platemap_1/CARD-CelIns-CX7_251124150001'),
                          

## Run CellProfiler Parallel

Note: This code cell is not executed in the notebook; CellProfiler is run by executing this notebook as a script instead.

In [None]:
# an empty dictionary means there are no plates left to process
if not plate_info_dictionary:
    print("No new plates to process. Exiting script.")
else:
    # hand every collected plate to the parallel CellProfiler runner
    cp_parallel.run_cellprofiler_parallel(
        plate_info_dictionary=plate_info_dictionary,
        run_name=run_name,
        group_level="plate",
    )