This notebook converts single channel grayscale images to 5 channel images by adding blank channels. 
This is done to make the images compatible with the pre-trained models that expect 5 channel images. 
The code in this notebook will need to change to match a unique dataset, regretfully.    

Note that the data used here has four channels, but the model needs 5 channels input.

In [1]:
import multiprocessing as mp
import os
import pathlib
import shutil
import sys
from typing import List, Optional, Tuple

import cv2

# show the image
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tifffile as tiff
import tqdm

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

## Import paths

In [2]:
# set the path to the data directory
data_file_dir = pathlib.Path(
    "../../../5.process_CP_features/data/4.normalized_data/profiles/normalized_profile.parquet"
).resolve(strict=True)

# read in the data
cp_feature_data = pd.read_parquet(data_file_dir)
# print the data
print(cp_feature_data.shape)
cp_feature_data.head()

(209311, 2332)


Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_Variance_CL_488_2_3_02_256,Nuclei_Texture_Variance_CL_488_2_3_03_256,Nuclei_Texture_Variance_CL_561_3_00_256,Nuclei_Texture_Variance_CL_561_3_01_256,Nuclei_Texture_Variance_CL_561_3_02_256,Nuclei_Texture_Variance_CL_561_3_03_256,Nuclei_Texture_Variance_DNA_3_00_256,Nuclei_Texture_Variance_DNA_3_01_256,Nuclei_Texture_Variance_DNA_3_02_256,Nuclei_Texture_Variance_DNA_3_03_256
0,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,2,...,-0.134476,-0.135105,-0.500936,-0.496649,-0.505593,-0.482682,-0.196127,-0.19531,-0.196272,-0.196929
1,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,23,...,-0.11927,-0.135105,-0.283499,-0.252914,-0.384439,-0.396934,-0.196127,-0.19531,-0.196272,-0.196929
2,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,31,...,-0.134476,-0.135105,-0.311619,-0.270867,-0.33241,-0.278933,-0.196127,-0.19531,-0.196272,-0.196929
3,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,35,...,-0.134476,-0.135105,-0.433799,-0.442503,-0.444581,-0.407869,-0.196127,-0.19531,-0.196272,-0.196929
4,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,40,...,-0.134476,-0.135105,0.614084,0.861652,0.451436,0.35431,-0.196127,-0.19531,-0.196272,-0.196929


In [3]:
well_fov = cp_feature_data["Metadata_Well"] + "_F" + cp_feature_data["Metadata_FOV"]
cp_feature_data.insert(0, "Metadata_Well_FOV", well_fov)
# get columns that contain Metadata
metadata_columns = [col for col in cp_feature_data.columns if "Metadata" in col]
metadata_df = cp_feature_data[metadata_columns]
# get columns that contain Features
feature_df = cp_feature_data.drop(columns=metadata_columns)
# show all columns
metadata_df.head()

Unnamed: 0,Metadata_Well_FOV,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,...,Metadata_coordinates_x,Metadata_track_id,Metadata_t,Metadata_y,Metadata_x,Metadata_id,Metadata_parent_track_id,Metadata_parent_id,Metadata_coordinates_y,Metadata_distance
0,D-02_F0001,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,...,"[825.4311965811966, 26.73931623931624]",1,0.0,27.0,825.0,1000002.0,-1,-1.0,"[825.0, 27.0]",0.503872
1,D-02_F0001,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,...,"[235.58823529411765, 212.98980392156864]",2,0.0,213.0,236.0,1000023.0,-1,-1.0,"[236.0, 213.0]",0.411891
2,D-02_F0001,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,...,"[435.6968306922435, 292.023352793995]",3,0.0,292.0,436.0,1000031.0,-1,-1.0,"[436.0, 292.0]",0.304067
3,D-02_F0001,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,...,"[954.0091533180778, 360.1592677345538]",4,0.0,360.0,954.0,1000036.0,-1,-1.0,"[954.0, 360.0]",0.159531
4,D-02_F0001,1,D-02,192,Staurosporine,0.0,negative,1,1,0.0,...,"[996.5272542027509, 379.46153846153845]",5,0.0,379.0,997.0,1000037.0,-1,-1.0,"[997.0, 379.0]",0.660686


This cell is not run as it takes a long time to run...

In [4]:
def get_crop_counts(list_of_counts: List[Tuple[int, int, int]]) -> Tuple[int, int, int]:
    """
    Get the total counts of successful crops and ommited crops

    Parameters
    ----------
    list_of_counts : List[Tuple[int, int, int]]
        A list of tuples containing the counts of successful crops and ommited crops

    Returns
    -------
    Tuple[int, int, int]
        A tuple containing the total counts of successful crops and ommited crops
    """
    total_ommited = 0
    total_sucessful = 0
    total_total = 0
    for ommited, sucessful, total in list_of_counts:
        total_ommited += ommited
        total_sucessful += sucessful
        total_total += total
    assert total_total == total_ommited + total_sucessful
    return (total_ommited, total_sucessful, total_total)

In [5]:
def crop_image(
    i: int,
    image_path: str,
    radius: int = 50,
    add_channels: Optional[bool] = False,
    total_channels: int = 5,
) -> None:
    """

    Crop the image based on the metadata and save the cropped image to disk
    Also output extracted metadata for the cropped image

    Parameters
    ----------
    i : int
        This is the iterator index for the metadata_df
    image_path : str
        Path to the image directory
    radius : int, optional
        The radius to crop the image by, by default 50
    add_channels : Optional[bool], optional
        This is a bool argument if set True will add extra channels to add up to 5 total , by default False

    Returns
    -------
    None
    """
    sucessful_count = 0
    ommited_count = 0
    total_count = 1
    image_information_df = metadata_df.copy().iloc[i]

    center_y = image_information_df["Metadata_Nuclei_Location_Center_Y"].astype(int)
    center_x = image_information_df["Metadata_Nuclei_Location_Center_X"].astype(int)
    well_fov = image_information_df["Metadata_Well_FOV"]
    image_path = pathlib.Path(f"{str(image_path)}{well_fov}").resolve(strict=True)
    # DNA
    image_name_DNA = pathlib.Path(image_information_df["Metadata_Image_FileName_DNA"])
    image_path_DNA = pathlib.Path(image_path / image_name_DNA).resolve(strict=True)
    # 488_1
    image_name_488_1 = pathlib.Path(
        image_information_df["Metadata_Image_FileName_CL_488_1"]
    )
    image_path_488_1 = pathlib.Path(image_path / image_name_488_1).resolve(strict=True)
    # 488_2
    image_name_488_2 = pathlib.Path(
        image_information_df["Metadata_Image_FileName_CL_488_2"]
    )
    image_path_488_2 = pathlib.Path(image_path / image_name_488_2).resolve(strict=True)
    # 561
    image_name_561 = pathlib.Path(
        image_information_df["Metadata_Image_FileName_CL_561"]
    )
    image_path_561 = pathlib.Path(image_path / image_name_561).resolve(strict=True)
    image_DNA = tiff.imread(image_path_DNA)
    image_488_1 = tiff.imread(image_path_488_1)
    image_488_2 = tiff.imread(image_path_488_2)
    image_561 = tiff.imread(image_path_561)

    image_DNA_crop = image_DNA[
        center_y - radius : center_y + radius, center_x - radius : center_x + radius
    ]
    image_488_1_crop = image_488_1[
        center_y - radius : center_y + radius, center_x - radius : center_x + radius
    ]
    image_488_2_crop = image_488_2[
        center_y - radius : center_y + radius, center_x - radius : center_x + radius
    ]
    image_561_crop = image_561[
        center_y - radius : center_y + radius, center_x - radius : center_x + radius
    ]

    # check if crop is an edge case
    # Where edge case is cells that are too close to the edge of the image to crop
    # This ensures that all crops are the same dimensions and can be used in the model
    if image_DNA_crop.shape[0] < radius * 2 or image_DNA_crop.shape[1] < radius * 2:
        ommited_count = 1
        return (ommited_count, sucessful_count, total_count)
    # merge the channels to a single image
    image_merge = np.stack(
        [image_DNA_crop, image_488_1_crop, image_488_2_crop, image_561_crop], axis=-1
    )
    if add_channels:
        if image_merge.shape[-1] < total_channels:
            channels_to_add = total_channels - image_merge.shape[-1]
            for _ in range(channels_to_add):
                # add a new channel of all zeros
                new_channels = np.zeros((image_merge.shape[0], image_merge.shape[1], 1))
                image_merge = np.concatenate((image_merge, new_channels), axis=-1)
    # save images to disk
    image_save_path = pathlib.Path(
        # f"../../../data/processed_images/crops/{image_information_df['Metadata_Well_FOV']}/time_{image_information_df['Metadata_Time']}_image_number_{image_information_df['Metadata_ImageNumber']}_cell_number_{image_information_df['Metadata_Nuclei_Number_Object_Number']}/"
        f"../data/processed_images/sc_crops/{image_information_df['Metadata_Well_FOV']}_time_{image_information_df['Metadata_Time']}_image_number_{image_information_df['Metadata_ImageNumber']}_cell_number_{image_information_df['Metadata_Nuclei_Number_Object_Number']}/"
    )
    image_save_path.mkdir(parents=True, exist_ok=True)
    file_name = image_information_df["Metadata_Image_FileName_DNA"].replace(
        ".tiff",
        f'image_number_{image_information_df["Metadata_ImageNumber"]}_cell_number_{image_information_df["Metadata_Nuclei_Number_Object_Number"]}_crop.tiff',
    )
    image_save_path = pathlib.Path(image_save_path / file_name)
    if os.path.exists(image_save_path):
        sucessful_count = 1
        return (ommited_count, sucessful_count, total_count)
    tiff.imwrite(image_save_path, image_merge)
    sucessful_count = 1
    return (ommited_count, sucessful_count, total_count)

In [6]:
image_path = pathlib.Path(
    "../../../2.cellprofiler_ic_processing/illum_directory/timelapse/20231017ChromaLive_6hr_4ch_MaxIP_"
)
radius = 50

In [7]:
# set the number of processes to use
if in_notebook:
    num_processes = mp.cpu_count() - 4
else:
    num_processes = mp.cpu_count()
print(f"Number of processes: {num_processes}")
# get the total number of rows in the metadata_df
total_crops = metadata_df.shape[0]
print(f"There are {total_crops:,} images to crop")

process_list = [
    mp.Process(target=crop_image, args=(i, image_path, radius, False))
    for i in range(total_crops)
]
print(f"There are {len(process_list):,} processes to run")
pool = mp.Pool(num_processes)
# run the processes in the pool with multiple args
results = pool.starmap_async(
    crop_image, [(i, image_path, radius, False) for i in range(total_crops)]
)
pool.close()
pool.join()
pool.terminate()
results = results.get()
final_counts = get_crop_counts(results)
# print the totals with commas for easy reading
print(
    f"Total crops: {final_counts[2]:,}, Sucessful crops: {final_counts[1]:,}, Ommited crops: {final_counts[0]:,}"
)

Number of processes: 20
There are 209,311 images to crop
There are 209,311 processes to run
Total crops: 209,311, Sucessful crops: 184,115, Ommited crops: 25,196
