### Data exploration and preprocessing for the [...] dataset

In [None]:
"""
Load the source dataset and preprocess it

__ imgs
    |
    |__ fold_1
          |__ datasetname_id.png --> images
          |__ ...
    |__ fold_2
          |__ datasetname_id.png
          |__ ...
    |__ fold_3
          |__ datasetname_id.png
          |__ ...
    |__ fold_4
          |__ datasetname_id.png
          |__ ...
    |__ fold_5
          |__ datasetname_id.png
          |__ ...

__ masks
    |
    |__ fold_1
          |__ datasetname_id.png  --> segmentation mask
          |__ ...
    |__ fold_2
          |__ datasetname_id.png
          |__ ...
    |__ fold_3
          |__ datasetname_id.png
          |__ ...
    |__ fold_4
          |__ datasetname_id.png
          |__ ...
    |__ fold_5
          |__ datasetname_id.png
          |__ ...

For segmentation masks --> 0 and 1 (if more than 2 labels --> use SimpleITK to encode integers in images)
"""

In [None]:
import os
import zipfile
import random
import shutil
from PIL import Image
from tqdm import tqdm
import json 
import subprocess
from pycocotools import mask as coco_mask
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np

from helpers import check_dir_consistency, visualize_images, get_all_dims, create_histograms_of_dims

### Download the dataset and its annotations

In [None]:
seed = 42

In [None]:
# Root locations used by the whole notebook. Edit the two absolute paths below
# to match your machine; tmp_data_dir is derived from output_dir.
data_source_dir = '/home/rob/Documents/3_projects/bench/coco/data'  # <--- modify this
output_dir = '/home/rob/Documents/3_projects/bench/coco/output'  # <--- modify this
tmp_data_dir = os.path.join(output_dir, 'tmp_data')  # scratch space for extracted archives

# Switches for the expensive one-off steps; enable them for a first run.
skip_dl = True           # True -> the download cell only prints a message
needs_unzip = False      # True -> extract the archives into tmp_data_dir
needs_data_copy = False  # True -> copy images/masks into tmp_data_dir/imgs and tmp_data_dir/masks

# Create the working directories up front. A plain loop is used because the
# calls are made only for their side effect (no throw-away list of None).
for required_dir in (data_source_dir, output_dir, tmp_data_dir):
    os.makedirs(required_dir, exist_ok=True)

In [None]:
# Fetch the COCO 2017 archives, but only when downloading is enabled and the
# source directory is still empty, so existing downloads are left untouched.
if not skip_dl and not os.listdir(data_source_dir):
    # Remote URL -> file name the archive is stored under in data_source_dir.
    downloads = {
        'http://images.cocodataset.org/zips/val2017.zip': 'coco_val2017.zip',
        'http://images.cocodataset.org/zips/train2017.zip': 'coco_train2017.zip',
        'http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip': 'stuff_annotations_trainval2017.zip',
    }
    for url, archive_name in downloads.items():
        # `-O` already carries the full destination path, so the redundant
        # `-P` directory prefix is not passed. check=True turns a failed
        # download into a CalledProcessError instead of continuing silently.
        subprocess.run(
            ['wget', url, '-O', os.path.join(data_source_dir, archive_name)],
            check=True,
        )
else:
    print('Skipping download.')

In [None]:
data_sources = [os.path.join(data_source_dir, f) for f in os.listdir(data_source_dir)]
data_sources

In [None]:
# Unzip downloaded directories
def unzip_dirs(data_sources: list, tmp_data_dir: str) -> list:
    """Extract each zip archive into its own sub-directory of ``tmp_data_dir``.

    An archive ``<name>.zip`` is unpacked into ``<tmp_data_dir>/<name>``; the
    destination directory is created when it does not exist yet.

    Args:
        data_sources: Paths of the zip archives to extract.
        tmp_data_dir: Directory under which one sub-directory per archive is
            created.

    Returns:
        The extraction directories, in the same order as ``data_sources``.
    """
    subdirs: list = []

    for archive_path in tqdm(data_sources, desc='Unzip dir'):
        # Destination is named after the archive, minus its extension.
        archive_stem = os.path.splitext(os.path.basename(archive_path))[0]
        subdir_path = os.path.join(tmp_data_dir, archive_stem)
        subdirs.append(subdir_path)

        os.makedirs(subdir_path, exist_ok=True)

        with zipfile.ZipFile(archive_path, 'r') as zip_file:
            print(f'Unzipping {archive_path} into {subdir_path}')
            zip_file.extractall(subdir_path)

    return subdirs

if needs_unzip:
    subdirs = unzip_dirs(data_sources, tmp_data_dir)

In [None]:
annotations_dir = os.path.join(tmp_data_dir, 'stuff_annotations_trainval2017/annotations')
annotations_train = os.path.join(annotations_dir, 'stuff_train2017.json')
annotations_val = os.path.join(annotations_dir, 'stuff_val2017.json')

## Only use Train Data and make 5 folds from them

In [None]:
img_source_dir = os.path.join(tmp_data_dir, 'coco_train2017', 'train2017')

In [None]:
all_dims = get_all_dims(img_source_dir)
all_dims

In [None]:
def plot_unique_dims(dimensions):
    """Draw a scatter plot with one point per entry of ``dimensions``.

    Args:
        dimensions: Collection of indexable items; element 0 of each item is
            plotted on the x axis (labelled as width) and element 1 on the
            y axis (labelled as height).
    """
    widths = [size[0] for size in dimensions]
    heights = [size[1] for size in dimensions]

    # Build the scatter plot, then decorate it and display it.
    plt.scatter(widths, heights)
    plt.xlabel("Width (pixels)")
    plt.ylabel("Height (pixels)")
    plt.title("Unique Image Dimensions (Width x Height)")
    plt.grid(True)
    plt.show()

In [None]:
plot_unique_dims(all_dims)

In [None]:
min_width = min(dim[0] for dim in all_dims)
min_height = min(dim[1] for dim in all_dims)
max_width = max(dim[0] for dim in all_dims)
max_height = max(dim[1] for dim in all_dims)
print(f"Minimum width: {min_width} | max width: {max_width}")
print(f"Minimum height: {min_height} | max height: {max_height}")

In [None]:
print(img_source_dir)

In [None]:
train_masks_dir = os.path.join(tmp_data_dir, 'stuff_annotations_trainval2017', 'annotations', 'stuff_train2017_pixelmaps.zip')
valid_masks_dir = os.path.join(tmp_data_dir, 'stuff_annotations_trainval2017', 'annotations', 'stuff_val2017_pixelmaps.zip')

In [None]:
def extract_zip(zip_path, extract_to):
    """Unpack the whole archive at ``zip_path`` into ``extract_to``.

    Prints a confirmation line once extraction has finished.

    Args:
        zip_path: Path of the zip archive to read.
        extract_to: Directory the archive members are extracted into.
    """
    archive = zipfile.ZipFile(zip_path, mode='r')
    with archive:
        archive.extractall(path=extract_to)
        print(f"Finished extracting: {zip_path}")

In [None]:
if needs_unzip:
    extract_zip(train_masks_dir, tmp_data_dir)

In [None]:
if needs_unzip:
    extract_zip(valid_masks_dir, tmp_data_dir)

In [None]:
unziped_train_masks_dir = os.path.join(tmp_data_dir, 'stuff_train2017_pixelmaps')
unziped_valid_masks_dir = os.path.join(tmp_data_dir, 'stuff_val2017_pixelmaps')

## Get Train images

In [None]:
imgs_dir = os.path.join(tmp_data_dir, 'imgs')
os.makedirs(imgs_dir, exist_ok=True)

In [None]:
# valid_masks_files = set(os.listdir(unziped_valid_masks_dir))

In [None]:
# len(valid_masks_files)

In [None]:
if needs_data_copy:
    for img_file in tqdm(os.listdir(img_source_dir)):
        src_path = os.path.join(img_source_dir, img_file)
        dest_path = os.path.join(imgs_dir, img_file)
        shutil.copy(src_path, dest_path)

## Get Train masks

In [None]:
masks_dir = os.path.join(tmp_data_dir, 'masks')
os.makedirs(masks_dir, exist_ok=True)

In [None]:
if needs_data_copy:
    for img_file in tqdm(os.listdir(unziped_train_masks_dir)):
        src_path = os.path.join(unziped_train_masks_dir, img_file)
        dest_path = os.path.join(masks_dir, img_file)
        shutil.copy(src_path, dest_path)

In [None]:
print(imgs_dir)

In [None]:
print(masks_dir)

In [None]:
img_files = os.listdir(imgs_dir)

mask_files = os.listdir(masks_dir)
print(len(img_files) == len(mask_files))

In [None]:
result = check_dir_consistency(imgs_dir, masks_dir)
result

## Finally, copy the images and masks into the output directories

In [None]:
output_dir_imgs = os.path.join(output_dir, 'imgs')
os.makedirs(output_dir_imgs, exist_ok=True)

output_dir_masks = os.path.join(output_dir, 'masks')
os.makedirs(output_dir_masks, exist_ok=True)

In [None]:
def create_folds_and_split_data(img_folder: str, mask_folder: str, output_dir: str, seed: int = 42, n_folds: int = 5):
    """Shuffle the images and copy them, with their masks, into fold directories.

    Creates ``<output_dir>/imgs/fold_<k>`` and ``<output_dir>/masks/fold_<k>``
    for ``k`` in ``0 .. n_folds - 1`` (folds are numbered from 0). The image
    file names are sorted, shuffled with NumPy's global RNG seeded with
    ``seed``, and assigned to folds in consecutive chunks of
    ``len(images) // n_folds`` files; any remainder goes into the last fold.
    The mask for an image is looked up in ``mask_folder`` under the image's
    name up to its first ``.`` plus ``.png``.

    Args:
        img_folder: Directory containing the source images.
        mask_folder: Directory containing the source masks.
        output_dir: Directory under which ``imgs`` and ``masks`` are created.
        seed: Seed passed to ``np.random.seed`` (this resets the global NumPy
            RNG state).
        n_folds: Number of folds to create; must be at least 1.

    Returns:
        Tuple ``(img_output_dir, mask_output_dir)``.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
        FileNotFoundError: Raised by ``shutil.copy`` when an image has no
            mask file in ``mask_folder``.
    """
    if n_folds < 1:
        raise ValueError(f'n_folds must be at least 1, got {n_folds}')

    # 0. Set random seed
    np.random.seed(seed)

    # 1. Create folder structure
    img_output_dir = os.path.join(output_dir, 'imgs')
    mask_output_dir = os.path.join(output_dir, 'masks')

    for fold in range(n_folds):
        os.makedirs(os.path.join(img_output_dir, f'fold_{fold}'), exist_ok=True)
        os.makedirs(os.path.join(mask_output_dir, f'fold_{fold}'), exist_ok=True)

    # 2. Shuffle and split images and masks.
    # Sorting first makes the shuffle independent of the listing order.
    img_files = sorted(os.listdir(img_folder))
    np.random.shuffle(img_files)

    # At least 1, so that fewer images than folds does not divide by zero.
    fold_size = max(1, len(img_files) // n_folds)
    print(f'Size of fold: {fold_size}')

    # `total` is given explicitly because enumerate() has no length, and
    # without it tqdm cannot display a progress bar.
    for i, img_file in tqdm(enumerate(img_files), total=len(img_files)):
        # Leftover files (when the count is not a multiple of n_folds) go into
        # the last fold instead of opening an extra, smaller one.
        fold_num = min(i // fold_size, n_folds - 1)

        # 3. Copy files
        img_src = os.path.join(img_folder, img_file)
        img_dst = os.path.join(img_output_dir, f'fold_{fold_num}', img_file)
        shutil.copy(img_src, img_dst)

        # Copy corresponding mask
        mask_file = img_file.split('.')[0] + '.png'
        mask_src = os.path.join(mask_folder, mask_file)
        mask_dst = os.path.join(mask_output_dir, f'fold_{fold_num}', mask_file)
        shutil.copy(mask_src, mask_dst)

    return img_output_dir, mask_output_dir


In [None]:
img_output_dir, mask_output_dir = create_folds_and_split_data(imgs_dir, masks_dir, output_dir, seed)

In [None]:
[len(os.listdir(os.path.join(output_dir, f'masks/fold_{i}'))) for i in range(5)]

In [None]:
len(os.listdir(os.path.join(output_dir, 'masks/fold_4')))

In [None]:
def verify_image_mask_correspondence(output_dir, img_folder, mask_folder, n_folds=5):
    """
    Verify that every image in each fold has a corresponding mask and vice versa,
    and that the total count of images and masks matches the original dataset.

    Images and masks are matched on the part of the file name that precedes
    the first ``.``.

    Args:
    output_dir (str): The base directory where the 'imgs' and 'masks' subdirectories are located.
    img_folder (str): The directory containing the original images.
    mask_folder (str): The directory containing the original masks.
    n_folds (int): Number of folds to check for correspondence (fold_0 .. fold_{n_folds-1}).

    Raises:
    Exception: If an image doesn't have a corresponding mask, a mask doesn't have a corresponding image,
               or the total count of images and masks doesn't match the original dataset.

    Returns:
    bool: True if all tests pass. A failed test raises instead of returning False.
    """
    img_output_dir = os.path.join(output_dir, 'imgs')
    mask_output_dir = os.path.join(output_dir, 'masks')
    original_img_count = len(os.listdir(img_folder))
    original_mask_count = len(os.listdir(mask_folder))

    total_img_count = 0
    total_mask_count = 0

    for i in tqdm(range(n_folds)):
        img_files = os.listdir(os.path.join(img_output_dir, f'fold_{i}'))
        mask_files = os.listdir(os.path.join(mask_output_dir, f'fold_{i}'))

        total_img_count += len(img_files)
        total_mask_count += len(mask_files)

        # Sets give constant-time membership and direct set differences.
        # The mask stems must be derived from the mask listing, otherwise the
        # two tests below compare the images with themselves.
        img_stems = {name.split('.')[0] for name in img_files}
        mask_stems = {name.split('.')[0] for name in mask_files}

        # Test 1: Each image has a corresponding mask
        images_without_mask = img_stems - mask_stems
        if images_without_mask:
            # min() picks a deterministic example to report.
            raise Exception(f"Image {min(images_without_mask)} in fold_{i} does not have a corresponding mask.")

        # Test 2: Each mask has a corresponding image
        masks_without_image = mask_stems - img_stems
        if masks_without_image:
            raise Exception(f"Mask {min(masks_without_image)} in fold_{i} does not have a corresponding image.")

    # Test 3: Total count of images and masks matches the original dataset
    if total_img_count != original_img_count or total_mask_count != original_mask_count:
        raise Exception("The total count of images or masks does not match the original dataset.")

    return True

In [None]:
# Check the folds against the ORIGINAL flat directories (imgs_dir / masks_dir).
# Passing the fold roots (img_output_dir / mask_output_dir) here would make the
# "original" counts equal to the number of fold_* directories, not the number
# of files, so the total-count test could not be meaningful.
try:
    tests_passed = verify_image_mask_correspondence(output_dir, imgs_dir, masks_dir, n_folds=5)
    if tests_passed:
        print("All tests passed successfully.")
except Exception as e:
    # The verifier signals every failed check with a plain Exception; report
    # it rather than aborting the notebook run.
    print(f"Error: {e}")

## Colormapping for masks

In [4]:
annotation_file_json = os.path.join(tmp_data_dir, 'stuff_annotations_trainval2017', 'annotations', 'stuff_train2017.json')

In [5]:
with open(annotation_file_json, 'r') as f:
    coco_data = json.load(f)

In [6]:
# NOTE: despite its name, `categories` holds the annotation records
# (coco_data['annotations']), not the dataset's category definitions.
categories = coco_data['annotations']

# Distinct category ids referenced by at least one annotation.
unique_category_ids = set(ann['category_id'] for ann in categories)

num_unique_classes = len(unique_category_ids)

# The message is printed in French: "Number of unique classes in the annotations".
print(f"Nombre de classes uniques dans les annotations : {num_unique_classes}")

Nombre de classes uniques dans les annotations : 92


In [7]:
unique_category_ids

{92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183}