### Generate a 70-15-15 train/val/test split of the 1024 x 1024 images, then create a second version of the dataset in which each image is cut into 256 x 256 tiles.

### First code chunk: rename the images and move them into the dataset folders

In [2]:
import os
import shutil
import random
import math
from PIL import Image

In [None]:
# Seed Python's global `random` generator (the one random.shuffle uses further down).
random_seed = 42
random.seed(random_seed)

# Source folders holding the raw tiles, one folder per sample.
s1_src = r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\sample 1"
s2_src = r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\sample 2"

# Folders of the dataset_v1 train/val/test layout.
# NOTE(review): test_dst is not referenced by any cell visible in this notebook.
train_dst = r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1\train"
val_dst = r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1\val"
test_dst = r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1\test"

def rename_and_split_images(src_folder, dst_folder):
    """Move every PNG below ``src_folder`` into ``dst_folder`` under a flattened name.

    Despite its name this function does not split anything: it only flattens the
    folder tree. Each ``.png`` found anywhere below an immediate subfolder of
    ``src_folder`` is moved directly into ``dst_folder`` and renamed to
    ``"{sample}_{subfolder}_{parent}_{original file name}"`` where

    * ``sample`` is the last path component of ``src_folder``,
    * ``subfolder`` is the immediate subfolder of ``src_folder`` being walked,
    * ``parent`` is the name of the directory that directly contains the file.

    Empty ``HE`` and ``IHC`` subfolders are created inside ``dst_folder`` (they are
    created even when no PNG is found).

    Args:
        src_folder: Folder of one sample; only its subdirectories are searched,
            files lying directly in it are ignored.
        dst_folder: Folder that receives the renamed files; created if missing.

    Raises:
        OSError: If listing, creating or moving fails (for example when a file
            with the target name already exists on Windows).
    """
    # normpath drops a trailing separator, which would otherwise make basename "".
    sample = os.path.basename(os.path.normpath(src_folder))

    # Loop-invariant setup: create the destination and its HE/IHC subfolders once
    # instead of once per moved file.
    for stain_dir in ("HE", "IHC"):
        os.makedirs(os.path.join(dst_folder, stain_dir), exist_ok=True)

    for subfolder in os.listdir(src_folder):
        subfolder_path = os.path.join(src_folder, subfolder)
        if not os.path.isdir(subfolder_path):
            continue
        for root, _dirs, files in os.walk(subfolder_path):
            parent_folder = os.path.basename(root)
            for file in files:
                if not file.endswith(".png"):
                    continue
                new_filename = f"{sample}_{subfolder}_{parent_folder}_{file}"
                # shutil.move renames when possible and falls back to copy+delete
                # when source and destination are on different volumes.
                shutil.move(os.path.join(root, file), os.path.join(dst_folder, new_filename))

# Flatten both samples: sample 1 lands in the train folder, sample 2 in the val folder.
for _sample_src, _flat_dst in ((s1_src, train_dst), (s2_src, val_dst)):
    rename_and_split_images(_sample_src, _flat_dst)

### Second chunk: split the images and move each one into its corresponding directory. Run the cells below once per sample (first sample 1, then sample 2), uncommenting the lines for the sample being processed.

In [7]:
# Choose the sample to split. Run this cell and the split cell below once per sample:
# the rename step put sample 1 in the train folder and sample 2 in the val folder.
sample_tag = "sample 1_"
staged_dir = r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1\train"
# sample_tag = "sample 2_"
# staged_dir = r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1\val"

# os.listdir returns names in arbitrary order. The HE and IHC lists built below are
# later paired purely by position, so sort the listing to make that pairing the same
# on every run.
# NOTE(review): assumes matching HE/IHC tiles sort into the same relative order
# (e.g. zero-padded tile numbers) - confirm against the real file names.
# The name s1_images is kept for compatibility; it holds whichever sample is selected above.
s1_images = [os.path.join(staged_dir, x) for x in sorted(os.listdir(staged_dir)) if x.endswith(".png")]

ihc_a_he_list, ihc_b_he_list, ihc_c_he_list = [], [], []
ihc_a_ihc_list, ihc_b_ihc_list, ihc_c_ihc_list = [], [], []

# Stain panel -> (list for its HE tiles, list for its non-HE tiles).
stain_buckets = {
    "IHCA": (ihc_a_he_list, ihc_a_ihc_list),
    "IHCB": (ihc_b_he_list, ihc_b_ihc_list),
    "IHCC": (ihc_c_he_list, ihc_c_ihc_list),
}

for images in s1_images:
    if sample_tag not in images:
        raise ValueError(f"{images!r} does not contain the sample tag {sample_tag!r}")
    descriptor = images.split(sample_tag)[1]  # everything after "<sample>_"
    ihc_stain_type = descriptor.split(" ")[0]  # IHCA,IHCB,IHCC
    ihc_or_he = descriptor.split("_image_tile_")[0].split("_")[-1]  # HE or ISLET ,etc.
    if ihc_stain_type not in stain_buckets:
        continue  # names with any other stain prefix are left where they are
    he_bucket, ihc_bucket = stain_buckets[ihc_stain_type]
    if ihc_or_he == "HE":
        he_bucket.append(images)
    else:
        ihc_bucket.append(images)

def shuffle_lists_equally(he_list, ihc_list):
    """Shuffle two parallel sequences with one shared random permutation.

    Element ``i`` of ``he_list`` and element ``i`` of ``ihc_list`` are treated as
    a pair; the pairs are shuffled with the global ``random`` generator, so the
    two returned sequences stay aligned index-by-index.

    Args:
        he_list: First sequence.
        ihc_list: Second sequence, paired positionally with ``he_list``.

    Returns:
        A 2-tuple ``(first, second)`` of tuples, in the same order as the
        arguments. If the inputs differ in length, the surplus elements of the
        longer one are dropped (``zip`` stops at the shorter input). Two empty
        tuples are returned when there are no pairs.
    """
    zipped = list(zip(he_list, ihc_list))
    if not zipped:
        # zip(*[]) yields nothing, so the two-name unpacking below would raise.
        return (), ()
    random.shuffle(zipped)
    he_shuffled, ihc_shuffled = zip(*zipped)
    return he_shuffled, ihc_shuffled

# Seed right before shuffling so the split is identical on every run of this cell,
# even when the setup cell that calls random.seed(random_seed) was not executed in
# the current kernel. 42 matches random_seed in that setup cell.
random.seed(42)

# Shuffle each stain's IHC/HE lists with one shared permutation so pairs stay aligned,
# and convert the returned tuples back to lists.
ihc_a_ihc_list, ihc_a_he_list = map(list, shuffle_lists_equally(ihc_a_ihc_list, ihc_a_he_list))
ihc_b_ihc_list, ihc_b_he_list = map(list, shuffle_lists_equally(ihc_b_ihc_list, ihc_b_he_list))
ihc_c_ihc_list, ihc_c_he_list = map(list, shuffle_lists_equally(ihc_c_ihc_list, ihc_c_he_list))

In [8]:
def split_move_images(ihc_list, he_list, dst_src = r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1"):
    """Move paired IHC/HE image files into train/val/test folders (about 70/15/15).

    Both lists are cut at the same two indices, computed from ``len(he_list)``:
    ``ceil(0.7 * n)`` ends the train part and ``ceil(0.85 * n)`` ends the val
    part; the remainder is the test part. Files from ``he_list`` go to
    ``<dst_src>/<split>/HE`` and files from ``ihc_list`` to
    ``<dst_src>/<split>/IHC``, keeping their base names. Because the same indices
    are used for both lists, pair ``i`` ends up in the same split on both sides.

    Args:
        ihc_list: Paths of the IHC images, index-aligned with ``he_list``.
        he_list: Paths of the HE images.
        dst_src: Root folder of the dataset; the ``train``/``val``/``test``
            folders and their ``HE``/``IHC`` subfolders are created if missing.

    Raises:
        OSError: If a destination folder cannot be created or a file cannot be moved.
    """
    total_num = len(he_list)
    train_num = math.ceil(total_num*0.7)
    val_num = math.ceil(total_num*0.85)
    # (split name, slice start, slice stop); a stop of None means "to the end".
    split_bounds = (
        ("train", 0, train_num),
        ("val", train_num, val_num),
        ("test", val_num, None),
    )

    for stain_dir, paths in (("HE", he_list), ("IHC", ihc_list)):
        for split_name, start, stop in split_bounds:
            target_dir = os.path.join(dst_src, split_name, stain_dir)
            # Without this, moving into a split folder nobody created yet fails.
            os.makedirs(target_dir, exist_ok=True)
            for src_path in paths[start:stop]:
                shutil.move(src_path, os.path.join(target_dir, os.path.basename(src_path)))

# Split and move the shuffled pairs of each stain panel in turn (A, then B, then C).
for _ihc_paths, _he_paths in (
    (ihc_a_ihc_list, ihc_a_he_list),
    (ihc_b_ihc_list, ihc_b_he_list),
    (ihc_c_ihc_list, ihc_c_he_list),
):
    split_move_images(_ihc_paths, _he_paths, dst_src=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1")

### Separate code chunk: create the 256 x 256 tiled version of the dataset above

In [9]:
def resize_image(input_dir, output_dir, tile_size=256, grid=4):
    """Cut every PNG in ``input_dir`` into ``grid`` x ``grid`` square tiles.

    Despite its name this function does not rescale anything; it crops. For each
    ``.png`` directly inside ``input_dir`` it saves ``grid * grid`` crops of
    ``tile_size`` x ``tile_size`` pixels into ``output_dir``. The tile at column
    ``i`` and row ``j`` (both zero-based, measured from the top-left corner) is
    saved as ``"<stem>_<i+1><j+1><ext>"`` - the first digit is the horizontal
    position, the second the vertical one.

    Args:
        input_dir: Folder containing the source PNG files.
        output_dir: Folder that receives the tiles; created if missing.
        tile_size: Edge length of each tile in pixels. The default of 256 with
            ``grid=4`` covers a 1024 x 1024 image exactly.
        grid: Number of tiles along each axis.

    Raises:
        OSError: If a folder cannot be listed/created or an image cannot be
            read or written.
    """
    os.makedirs(output_dir, exist_ok=True)
    input_files = [x for x in os.listdir(input_dir) if x.endswith(".png")]
    for input_file in input_files:
        # Split off the extension once; str.replace(".png", ...) would also rewrite
        # any ".png" occurring earlier in the file name.
        stem, ext = os.path.splitext(input_file)
        # The context manager closes the underlying file once all tiles are saved.
        with Image.open(os.path.join(input_dir, input_file)) as image:
            for i in range(grid):
                for j in range(grid):
                    x_offset = i * tile_size
                    y_offset = j * tile_size
                    tile = image.crop((x_offset, y_offset, x_offset + tile_size, y_offset + tile_size))
                    tile.save(os.path.join(output_dir, f"{stem}_{i + 1}{j + 1}{ext}"))

    print("All images processed.")

In [10]:
# Tile the HE images of the test split.
resize_image(
    input_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1\test\HE",
    output_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1_256x256\test\HE",
)

All images processed.


In [11]:
# Tile the IHC images of the test split.
resize_image(
    input_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1\test\IHC",
    output_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1_256x256\test\IHC",
)

All images processed.


In [12]:
# Tile the HE images of the train split.
resize_image(
    input_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1\train\HE",
    output_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1_256x256\train\HE",
)

All images processed.


In [13]:
# Tile the IHC images of the train split.
resize_image(
    input_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1\train\IHC",
    output_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1_256x256\train\IHC",
)

All images processed.


In [14]:
# Tile the HE images of the val split.
resize_image(
    input_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1\val\HE",
    output_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1_256x256\val\HE",
)

All images processed.


In [15]:
# Tile the IHC images of the val split.
resize_image(
    input_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1\val\IHC",
    output_dir=r"\\shelter\Kyu\IHC2HE\Balanced_Aligned\dataset_v1_256x256\val\IHC",
)

All images processed.
