In [None]:
# Load all imports

import os
from pathlib import Path

import cv2
import numpy as np
import numpy.typing as npt
import pandas as pd
from PIL import Image
from tqdm import tqdm

### 1.1 Prepare the bleaching dataset

Originally, the bleaching dataset only has bleached and non-bleached masks. 
The union of these two masks is saved to represent a coral mask.

In [None]:
# Define the directories
root_dir = Path("data/coral_bleaching")
images_dir = root_dir / "images"
bleached_dir = root_dir / "masks_bleached"
non_bleached_dir = root_dir / "masks_non_bleached"
out_dir = root_dir / "masks_coral"
out_dir.mkdir(parents=True, exist_ok=True)

# Loop over all images, and create the coral masks.
# `created` counts the coral masks written; `fails` counts the images skipped
# because a source mask is missing or the two masks differ in size.
created = fails = 0
for img_path in sorted(images_dir.glob("*.jpg")):
    stem = img_path.stem
    bleached_path = bleached_dir / f"{stem}_bleached.png"
    non_bleached_path = non_bleached_dir / f"{stem}_non_bleached.png"

    # Skip images without both masks instead of aborting the whole run.
    if not (bleached_path.exists() and non_bleached_path.exists()):
        fails += 1
        continue

    # The context managers close the file handles once the pixels are copied.
    with Image.open(bleached_path) as bleached_img, Image.open(non_bleached_path) as non_bleached_img:
        mask_bleached = np.asarray(bleached_img.convert("L"), dtype=np.uint8)
        mask_non_bleached = np.asarray(non_bleached_img.convert("L"), dtype=np.uint8)

    # A bitwise OR needs equally sized masks; a mismatch counts as a failure.
    if mask_bleached.shape != mask_non_bleached.shape:
        fails += 1
        continue

    # Element-wise bitwise OR of the two grayscale masks.
    # NOTE(review): this is the set union only if both masks are binary (0/255) - confirm.
    mask_coral = mask_bleached | mask_non_bleached

    output_path = out_dir / f"{stem}.png"
    Image.fromarray(mask_coral).save(output_path)
    created += 1

print(f"Created {created} coral masks, skipped {fails} images.")

The bleaching images and their new coral masks are collected in a DataFrame.  
Any images with less than 1% coral coverage are removed.

In [None]:
# Load all datasets, and remove any coral masks that have less than 1% coverage.

root_dir = Path("data/coral_bleaching")
images_dir = root_dir / "images"
coral_dir = root_dir / "masks_coral"

# Minimum fraction of coral pixels an image needs to stay in the dataset.
MIN_CORAL_COVERAGE = 0.01

rows = []
missing_masks = 0  # images skipped because no coral mask file exists for them

for img_path in sorted(images_dir.glob("*.jpg")):
    stem = img_path.stem

    paths = {
        "image": img_path,
        "mask_coral": coral_dir / f"{stem}.png",
    }

    # Check for the mask before opening it, so a missing file is skipped
    # rather than raising in the middle of the loop.
    if not paths["mask_coral"].exists():
        missing_masks += 1
        continue

    # Scale the grayscale mask to [0, 1]; the coverage is then the mean pixel value.
    # NOTE(review): assumes coral pixels are stored as 255 - confirm.
    with Image.open(paths["mask_coral"]) as mask_img:
        array = np.array(mask_img.convert("L")).astype(np.uint8) / 255
    coral_coverage = array.sum() / array.size

    if coral_coverage >= MIN_CORAL_COVERAGE:
        rows.append({
            "stem": stem,
            "image_path": str(paths["image"]),
            "mask_coral_path": str(paths["mask_coral"]),
            "coral_coverage": round(float(coral_coverage), 3),
            "source": "coral_bleaching_reefsupport",
        })

df_clean = pd.DataFrame(rows)
print(f"Kept {len(df_clean)} bleaching images, skipped {missing_masks} without a coral mask.")

### 1.2 Prepare the benthic dataset

The benthic dataset has masks for hard and soft corals. We create a new mask with the union of these two to represent a general coral mask.  
Any images with less than 1% coral coverage are removed, and the results are appended to the previous DataFrame.

In [None]:
root_dir = Path("data/benthic_datasets")

# Minimum fraction of coral pixels an image needs to stay in the dataset.
MIN_CORAL_COVERAGE = 0.01

benthic_rows = []
missing_masks = 0  # images skipped because no stitched mask exists for them

# Iterate over site directories only: stray files (e.g. .DS_Store) and folders
# without an "images" subdirectory are not sites and must not get a masks_coral
# directory created beneath them.
for site_dir in sorted(p for p in root_dir.iterdir() if p.is_dir()):
    site = site_dir.name
    images_dir = site_dir / "images"
    if not images_dir.is_dir():
        continue
    coral_stitched_dir = site_dir / "masks_stitched"
    coral_dir = site_dir / "masks_coral"
    coral_dir.mkdir(parents=True, exist_ok=True)

    for img_path in sorted(images_dir.glob("*.[jJ][pP][gG]")):
        stem = img_path.stem

        # Save all paths
        paths = {
            "image": img_path,
            "coral_stitched_dir": coral_stitched_dir / f"{stem}_mask.png",
            "mask_coral": coral_dir / f"{stem}_binary.png",
        }

        # Skip images without a stitched mask instead of aborting the whole run.
        if not paths["coral_stitched_dir"].exists():
            missing_masks += 1
            continue

        # Convert rgb array to coral-only binary array and save the new array:
        # every pixel with a non-zero value in any channel becomes 255.
        with Image.open(paths["coral_stitched_dir"]) as mm:
            rgb_array = np.array(mm.convert("RGB"))
        binary_array = (rgb_array.any(axis=-1)).astype(np.uint8) * 255
        Image.fromarray(binary_array).save(paths["mask_coral"])

        # Compute the coral coverage, exclude any images with <0.01 coverage, and add the others to the dataframe
        coral_coverage = (binary_array / 255).sum() / binary_array.size
        if coral_coverage >= MIN_CORAL_COVERAGE:
            benthic_rows.append({
                "stem": stem,
                "image_path": str(paths["image"]),
                "mask_coral_path": str(paths["mask_coral"]),
                "coral_coverage": round(float(coral_coverage), 3),
                "source": str(site),
            })

# Append all sites in a single concat. Dropping repeated image paths keeps the
# cell idempotent: re-running it does not duplicate the benthic rows.
if benthic_rows:
    df_clean = pd.concat(
        [df_clean, pd.DataFrame(benthic_rows)], ignore_index=True
    ).drop_duplicates(subset="image_path", ignore_index=True)

print(f"Kept {len(benthic_rows)} benthic images, skipped {missing_masks} without a stitched mask.")

### 1.3 Train/test/val split

A train/test/val split of 70/15/15 is good practice.  
To ensure domain robustness across reef locations, images from one source should not be in multiple splits.  
To achieve this, the images from sources are divided among the splits.  
The division that was used achieves a 72/13/15 split, which is quite good.

In [None]:
# Show the number of images per source (sorted by source name).
# A plain loop variable is used inside the f-string: nesting double quotes in a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
source_counts = df_clean["source"].value_counts().sort_index()
for source, n_images in source_counts.items():
    print(f"{source}: {n_images}")
print()

# Create a custom division of the sources over the 3 splits.
train_sources = ["SEAFLOWER_BOLIVAR", "SEAVIEW_ATL", "SEAVIEW_PAC_AUS", "UNAL_BLEACHING_TAYRONA", "coral_bleaching_reefsupport"]
test_sources = ["SEAFLOWER_COURTOWN", "SEAVIEW_PAC_USA"]
val_sources = ["SEAVIEW_IDN_PHL", "TETES_PROVIDENCIA"]

# Lookup table source -> split, built from the three lists above.
source_to_split = {
    source: split_name
    for split_name, sources in (("train", train_sources), ("test", test_sources), ("val", val_sources))
    for source in sources
}

# Every source must belong to exactly one split. Fail here with a clear message
# instead of writing empty split values to the CSV and failing later.
unassigned_sources = sorted(set(df_clean["source"]) - set(source_to_split))
if unassigned_sources:
    raise ValueError(f"Sources without a train/test/val assignment: {unassigned_sources}")

df_clean["split"] = df_clean["source"].map(source_to_split)

# Save the csv file
Path("csv_folder/").mkdir(parents=True, exist_ok=True)
df_clean.to_csv("csv_folder/df_clean.csv")

# Show train/test/val ratios
split_counts = df_clean["split"].value_counts().sort_index()
split_perc = (split_counts / len(df_clean) * 100).round(1)

for split in ["train", "val", "test"]:
    count = split_counts.get(split, 0)
    perc = split_perc.get(split, 0)
    print(f"  {split:<6}: {count:>4} images ({perc:>4.1f}%)")

### 1.4 Resize the images and masks

After some testing, we decided to resize all images and masks to 512x512.  
This seemed like a good balance between preserving model accuracy and keeping the computational cost affordable. 

In [None]:
output_dir = Path("data/final_dataset")
output_dir.mkdir(parents=True, exist_ok=True)
img_size = (512,512)
rows_out = []

# Every row needs a split, because the split name is part of the output path.
if df_clean["split"].isna().any():
    raise ValueError("Some rows in df_clean have no train/test/val split assigned.")

# Output files are named after the stem only, so two rows with the same stem in
# one split would silently overwrite each other on disk.
stem_collisions = df_clean[df_clean.duplicated(subset=["split", "stem"], keep=False)]
if not stem_collisions.empty:
    raise ValueError(
        f"{len(stem_collisions)} rows share a stem within the same split and would "
        f"overwrite each other: {sorted(stem_collisions['stem'].unique())[:10]}"
    )

# Create the output directories once per split instead of once per image.
for split_name in df_clean["split"].unique():
    (output_dir / split_name / "images").mkdir(parents=True, exist_ok=True)
    (output_dir / split_name / "masks_coral").mkdir(parents=True, exist_ok=True)

for _, row in tqdm(df_clean.iterrows(), total=len(df_clean)):
    # Define variables per row
    stem = row["stem"]
    img_path = Path(row["image_path"])
    mask_path = Path(row["mask_coral_path"])
    source = row["source"]
    split = row["split"]

    # Define output paths
    out_img_path  = output_dir / split / "images" / f"{stem}.jpg"
    out_mask_path = output_dir / split / "masks_coral" / f"{stem}.png"

    # Read image and mask. cv2.imread returns None instead of raising when a
    # file is missing or unreadable, so check explicitly.
    img = cv2.imread(str(img_path), cv2.IMREAD_COLOR)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
    if mask is None:
        raise FileNotFoundError(f"Could not read mask: {mask_path}")

    # Resize image and mask to 512x512.
    # Nearest-neighbour interpolation keeps the mask free of interpolated in-between values.
    img_resized  = cv2.resize(img, img_size, interpolation=cv2.INTER_CUBIC)
    mask_resized = cv2.resize(mask, img_size, interpolation=cv2.INTER_NEAREST)

    # Save resized images and masks
    cv2.imwrite(str(out_img_path), img_resized, [cv2.IMWRITE_JPEG_QUALITY, 95])
    cv2.imwrite(str(out_mask_path), mask_resized)

    # Create new row for dataframe
    rows_out.append({
        "stem": stem,
        "image_path": str(out_img_path),
        "mask_coral_path": str(out_mask_path),
        "source": source,
        "split": split,
    })

df_resized = pd.DataFrame(rows_out)
# Make sure the target folder exists even if the split cell was not the one that created it.
Path("csv_folder").mkdir(parents=True, exist_ok=True)
df_resized.to_csv("csv_folder/resized_df.csv")