# End-to-End Notebook GSK & Broad Institute

This notebook is designed to carry out the following steps:

1. Load pre-trained virtual stain model and generate predictions
2. Generate instance masks (nuclei or cytoplasm) via CellPose for virtual stain predictions
3. Normalise predictions to 8-bit (CellProfiler requirement)
4. Run the CellProfiler pipeline and save features.





## Load libraries and helper functions


In [5]:

import glob
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# The project-local packages below presumably live under `root_dir` (the
# recorded output of this cell shows "No module named 'slurm'" when they were
# imported before the path was extended), so extend sys.path first.
root_dir = "../../../"
sys.path.append(root_dir)

from slurm.sbatch import submit_array
from slurm.commands import inference_cellpose
from cellpose.io import read_yaml, write_yaml

def dict_to_str(_dict: dict):
    """
    Flatten a flag -> value mapping into one command-line fragment.

    Every entry is rendered as ``"<key> <value> "`` (each pair is followed by
    a single space, so the result ends with a trailing space) in the
    dictionary's iteration order. Keys and values must be strings.

    :param _dict: mapping of flag name to flag value
    :return: the concatenated ``key value`` pairs
    """
    return ''.join(flag + ' ' + value + ' ' for flag, value in _dict.items())


def test_pix2pixHD(py_file: str, arg_dict: dict):
    """
    Build the ``python <script> <flags>`` command string for a Pix2PixHD run.

    In this notebook the script passed in is the Pix2PixHD test (inference)
    script; the flags are rendered with ``dict_to_str``.

    :param py_file: path to the python script to execute
    :param arg_dict: mapping of command-line flag to string value
    :return: the assembled command string
    """
    command = f'python {py_file} {dict_to_str(arg_dict)}'
    return command

def normalize(arr, low=85.0, high=12573.0):
    """
    8bit normalisation.
    Despite full range being 65535 we found when we randomly sampled
    from our DAPI stain the average of 10,000 samples over three random
    seeds for min and max was 85.0 and 12573 so we perform minmax norm
    with these values. This is only completed as Cellprofiler pipeline
    requires images to be 8bit integers.

    Intensities at or below ``low`` map to 0 and intensities at or above
    ``high`` map to 255; values in between are scaled linearly and truncated.

    :param arr: array of raw intensities
    :param low: intensity mapped to 0 (defaults to the sampled minimum)
    :param high: intensity mapped to 255 (defaults to the sampled maximum)
    :return: uint8 array with the same shape as ``arr``
    """
    scale_arr = (np.asarray(arr) - low) / (high - low)
    # Clip before the cast: low/high are sample averages, so real pixels can
    # fall outside them, and an out-of-range float does not saturate when cast
    # to uint8 (it wraps around / is undefined).
    scale_arr = np.clip(scale_arr, 0.0, 1.0) * 255
    return scale_arr.astype(np.uint8)

ModuleNotFoundError: No module named 'slurm'

## Load pre-trained virtual stain model and generate predictions

This notebook is used to run inference on a pre-trained Pix2PixHD GAN model

1. Example bright-field "AssayPlate_PerkinElmer_CellCarrierUltra_P24_t0024F005L01A01Z01C01.tif". Predictions are stored in separate folders based on P24.
    This leads to 387 folders. Each folder is then parallelised and run concurrently through the next stages.


In [7]:
# Pix2PixHD checkout holding the inference script and the pre-trained weights.
repo_path = '/hpc/projects/upt/samuel_tonks_experimental_space/repos/gskgithub/GSK-Broad/pix2pixHD/'
py_file = os.path.join(repo_path, 'test_gskbroad.py')

# Conda environment handed to the job submission helper.
conda_path = '/hpc/user_apps/bioimaging_analytics/conda_environments/pix2pixHD_CUDA11/'

model = 'nuclei' # nuclei or cyto

model_dir = os.path.join(repo_path, 'weights', model)

output_folder_name = 'GSK-Broad'

# Bright-field input images.
input_dir = '/hpc/scratch/rdip1/smt29021/XAI/ELN128360/BFcellhealth_20230322_113718/AssayPlate_PerkinElmer_CellCarrierUltra/'

# Root folder that receives the virtual stain predictions.
output_dir = os.path.join(
    '/hpc/projects/upt/samuel_tonks_experimental_space/repos/gskgithub/',
    output_folder_name,
)

# Checkpoint to load; also names the results sub-folder created below.
which_epoch = 'latest'

# Pix2PixHD GAN params
# (insertion order is kept as-is because it determines the flag order in the command)
command_list = []
arg_dict = {}
# To be changed
arg_dict['--checkpoints_dir'] = model_dir
arg_dict['--dataroot'] = input_dir
arg_dict['--name'] = 'gsk-broad'

# GSK-Broad data is 1000x1000
arg_dict['--loadSize'] = '1000'
arg_dict['--output_reshape'] = '1000'
arg_dict['--resize_or_crop'] = 'none'
arg_dict['--how_many'] = '3' #  total number of bright-field

# Not to be changed
arg_dict['--data_type'] = '16'
arg_dict['--label_nc'] = '0'
arg_dict['--input_nc'] = '1'
arg_dict['--output_nc'] = '1'
arg_dict['--no_instance'] = ''
arg_dict['--gpu_ids'] = '0'
arg_dict['--norm'] = 'instance'
arg_dict['--results_dir'] = output_dir
arg_dict['--which_epoch'] = which_epoch

# Create the results folder up front; exist_ok makes re-running the cell safe.
os.makedirs(os.path.join(output_dir, f"test_{which_epoch}"), exist_ok=True)

command = test_pix2pixHD(py_file, arg_dict)
command_list.append(command)

print("Number of Jobs {}".format(len(command_list)))

### Job Settings ###

job_name = 'VNuclei_PHD_Inference'

# Every sbatch option carries its own leading space; the very first space is
# dropped once the fragments are joined.
sbatch_options = [
    ' --job-name={}'.format(job_name),
    ' --time=4:00:00',
    ' --nodes=1',
    ' --partition=gpu',
    ' --gres=gpu:a6000:1',
    ' --ntasks-per-node=1',
    ' --output=./slurm_outs/"slurm-%A_%a.out"',
]
node_setting = ''.join(sbatch_options)[1:]

# Folder referenced by the --output option above.
os.makedirs('./slurm_outs', exist_ok=True)

jobid = submit_array(root_dir, command_list, node_setting, job_name, repo_path, conda_path)

Number of Jobs 1
Submitted batch job 26801



## Generate instance masks (nuclei or cytoplasm) via CellPose for virtual stain predictions

In [None]:
# HPC Settings
# Choose HPC
hpc = "US" # Should be "UK" or "US"

# Set HPC related variables
if hpc == "UK":
    hpc_project = "/hpc/projects/stv/bioimaging_analytics"
    hpc_cpu = " --partition=stv-cpu"
    hpc_gpu = " --partition=stv-gpu"

elif hpc == "US":
    hpc_project = "/hpc/projects/upt/samuel_tonks_experimental_space/experiments/"
    hpc_cpu = " --partition=up-cpu"
    hpc_gpu = " --partition=up-gpu"

else:
    # Fail here rather than with a NameError on hpc_project further down.
    raise ValueError(f'HPC setting {hpc!r} does not exist; expected "UK" or "US".')

# Define Common Variables

# Set dataset name, should be consistent with folder structure
dataset_name = 'GSKBroad'
# Placeholder: replace with the conda environment that provides cellpose.
conda_path = "/path/to/cellpose/env"

# Populate shared variables
project_dir = os.path.join(hpc_project, "GSK-Broad")
config_dir = os.path.join(root_dir, "configs")
tmp_dir = os.path.join(root_dir, "tmp")


In [None]:
##### GSK-Broad

# Identify all virtual stain predictions below the Pix2PixHD output folder
gsk_broad = sorted(Path(f'{output_dir}/').glob('**/*_virtualstain.tiff'))

# Group the images by the directory that contains them, so each folder's path
# comes straight from the files that were found.
images_by_dir = {}
for image_path in gsk_broad:
    images_by_dir.setdefault(image_path.parent, []).append(image_path)

# Identify Subfolders (names only; later cells iterate over this set)
folders = set(image_dir.name for image_dir in images_by_dir)

# Suffix of the yml file written for each supported segmentation model
yml_suffix_by_model = {'cyto': '_Cell.yml', 'nuclei': '_Nuclei.yml'}
if model not in yml_suffix_by_model:
    raise ValueError(f"model must be 'cyto' or 'nuclei', got {model!r}")

py_path = os.path.join(root_dir, "cellpose", "inference_cellpose.py")

# Iterate input folders and build one cellpose segmentation command per folder
command_list = []
for image_dir in sorted(images_by_dir):
    dataset_name = image_dir.name
    yml_dir = os.path.join(tmp_dir, "inference_cellpose", dataset_name)
    # Make sure the folder for the yml file exists before it is written.
    os.makedirs(yml_dir, exist_ok=True)

    # Build Input: Output pairs for the virtual stain images only, so masks
    # left behind by an earlier run are not fed back into cellpose.
    io_list = []
    for image_path in images_by_dir[image_dir]:
        file = image_path.name
        # NOTE(review): the inputs end in ".tiff", so file[:-4] keeps the dot and
        # the mask is named "<stem>._mask.tiff". Kept unchanged because files on
        # disk / the CellProfiler project may rely on that name - confirm.
        output_file = os.path.join(str(image_dir), file[:-4]+'_mask.tiff')
        io_list.append({"input": str(image_path), "output": output_file})

    # Build the cellpose command for the selected model (cell or nuclei)
    yml_path = os.path.join(yml_dir, dataset_name + yml_suffix_by_model[model])
    args_dict = {}
    args_dict["--chan"] = "0"
    args_dict["--save_tif"] = ""
    args_dict["--use_gpu"] = ""
    args_dict["--diameter"] = "0."
    args_dict["--file_yaml"] = yml_path
    args_dict["--pretrained_model"] = model

    command = inference_cellpose(py_path, args_dict)
    command_list.append(command)
    write_yaml(yml_path, io_list)

# Show the last command as a sanity check (nothing to show when no images were found)
if command_list:
    print(command_list[-1])
print(f"Number of Jobs: {len(command_list)}")

# Job array settings
job_name = "cellpose/"+dataset_name
out_dir = os.path.join(tmp_dir, job_name)

# Every sbatch option carries its own leading space (hpc_gpu included); the
# very first space is dropped once the fragments are joined.
sbatch_options = [
    f" --job-name={job_name}",
    " --time=2-00:00",
    " --nodes=1",
    hpc_gpu,
    " --gres=gpu:gtx1080ti:1",
    " --mem=64000",
    " --ntasks-per-node=1",
    f" --output={out_dir}/'slurm-%A_%a.out'",
]
node_setting = "".join(sbatch_options)[1:]

# Folder referenced by the --output option above.
os.makedirs(out_dir, exist_ok=True)
submit_array(tmp_dir, command_list, node_setting, job_name, repo_path=None, conda_path=conda_path, model='cellpose')

## Normalise predictions to 8bit (cellprofiler requirement)

In [None]:
# NOTE(review): `gsk_broad_path`, `imread`, `imsave` and `center_crop` are not
# defined or imported anywhere in this notebook - confirm where they come from
# before running this cell on a fresh kernel.

# Single destination for the 8-bit images and their masks. A dedicated name is
# used so the Pix2PixHD `output_dir` defined earlier is not overwritten.
eight_bit_dir = f'{gsk_broad_path}/8bit_images_16bit_masks/'
os.makedirs(eight_bit_dir, exist_ok=True)

for i in folders:
    # Only look inside the folder being processed; globbing the whole tree on
    # every iteration repeats the work and also picks up the files already
    # written to the destination folder.
    folder_dir = Path(gsk_broad_path) / i
    masks = sorted(folder_dir.glob('**/*mask*.tif*'))
    mask_set = set(masks)
    # Drop any mask file that the sample pattern happens to match as well.
    samples = sorted(p for p in folder_dir.glob('**/*syn*.tif*') if p not in mask_set)
    print(i,len(samples),len(masks))

    # Samples and masks are paired by sorted position, so the counts must agree.
    if len(samples) != len(masks):
        raise ValueError(
            f'{i}: found {len(samples)} samples but {len(masks)} masks; cannot pair them'
        )

    for sample_path, mask_path in zip(samples, masks):
        # normalize() already returns uint8, so no further cast is needed.
        tmp_8bit = normalize(imread(sample_path))
        imsave(os.path.join(eight_bit_dir, sample_path.name), tmp_8bit, imagej=True)

        mask = center_crop(imread(mask_path))
        imsave(os.path.join(eight_bit_dir, mask_path.name), mask, imagej=True)



## CellProfiler - run pipeline and save features.


In [None]:
# Iterate input folders and build the CellProfiler command that is submitted via SLURM.
#
# NOTE(review): `gsk_broad_path` is not defined anywhere in this notebook - confirm
# where it comes from before running on a fresh kernel.
# NOTE(review): `command_list` is re-initialised on every iteration, so only the
# command built for the last folder is submitted. `cp_path` and `input_path` do not
# depend on `f`, so every iteration builds the same command anyway - confirm whether
# one job over the shared 8-bit folder or one job per folder is intended.

for f in list(folders):
    command_list = []
    # NOTE(review): `path` is assigned but not used below.
    path = os.path.join(
        gsk_broad_path,
        f,)
    ### Run in via SLURM
    command_list = []
    # Placeholder path to the CellProfiler project file.
    cp_path = os.path.join(
        './cellprofiler/...cpproj')
    # Folder written by the 8-bit normalisation step; used as both input and output.
    input_path = os.path.join(
        f'{gsk_broad_path}/8bit_images_16bit_masks/'
    )

    # NOTE(review): this value is a placeholder and the submit call below passes
    # conda_path=None instead of it - confirm which is intended.
    conda_path = '/path/to/cellprofiler/conda/env/'
    dataset_name = f'GSK-Broad: {f}' 
    command = f'cellprofiler -c -r -p {cp_path} -i {input_path} -o {input_path}'
    command_list.append(command)
    
# Job array settings
# NOTE(review): job_name is the value left over from the last loop iteration and
# contains a colon and a space; it is used unquoted in --job-name and in the output
# folder path - verify that the submission helper copes with that.
job_name = dataset_name
out_dir = os.path.join(tmp_dir, job_name)
node_setting = ""
node_setting = node_setting+f" --job-name={job_name}"
node_setting = node_setting+" --time=2-00:00"
node_setting = node_setting+" --nodes=1"
# NOTE(review): this requests the GPU partition and a GPU while `hpc_cpu` is defined
# but never used - confirm whether the CellProfiler job actually needs a GPU.
node_setting = node_setting+hpc_gpu
node_setting = node_setting+" --gres=gpu:gtx1080ti:1"
node_setting = node_setting+" --mem=64000"
node_setting = node_setting+" --ntasks-per-node=1"
node_setting = node_setting+" --export=NONE"
node_setting = node_setting+f" --output={out_dir}/'slurm-%A_%a.out'"
# Drop the leading space left by the first option.
node_setting = node_setting[1:]

os.makedirs(out_dir, exist_ok=True)
# NOTE(review): `submit_sbatch` is not imported in this notebook (only `submit_array`
# is) - confirm the intended helper and import it in the first cell.
submit_sbatch(tmp_dir, command_list, node_setting, job_name, repo_path=None,conda_path=None, model='cellprofiler')

## End