In [1]:
from utils import tile_dataset, rasterize_shapefiles, resample
from MemoryMapDataset import MemmapDataset
import numpy as np
import psutil
import os
import gc

The processing pipeline assumes that the data in the `DATA_PATH` folder is organized as follows:
- Each chunk is in its own folder, named 'Chunk x' or 'Chunk x x-x'.
- The RGB tif should be named 'Chunkx.tif' or 'Chunkx_x-x.tif'.
- The label shape file and its corresponding label files should be in a folder called 'labels' inside the matching 'Chunk x' / 'Chunk x x-x' folder. The names of these files do not need to follow any particular format.

In [None]:
# Setup: data locations and tiling configuration
BASE_PATH = "A:\\Desktop\\Drone_Data"
DATA_PATH = os.path.join(BASE_PATH, "original_data")
TILE_SIZE = 224

# The combined image/label arrays live in DATA_PATH and are prefixed with the tile size.
COMBINED_IMAGES_FILE, COMBINED_LABELS_FILE = (
    os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_images.npy'),
    os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_labels.npy'),
)

# Derive how many chunks may be buffered at once from the RAM that is currently available.
# RAM figures below are in MB; the chunk size is given in GB and scaled by 1024 where needed.
average_chunk_size = 2  # Average chunk size in GB.
available_ram = psutil.virtual_memory().available / (1024 ** 2)
# Hold back 8GB plus one average chunk of the *available* RAM as headroom.
safe_ram_usage = available_ram - 1024 * (8 + average_chunk_size)
CHUNK_BUFFER_SIZE = int(safe_ram_usage // (1024 * average_chunk_size))

# Stop early when not even a single chunk fits into the remaining budget.
if CHUNK_BUFFER_SIZE < 1:
    raise ValueError("Insufficient RAM to process chunks. Please increase available RAM or reduce average chunk size and proceed with caution.")

print(CHUNK_BUFFER_SIZE, f"chunks can be buffered in RAM at a time based on {int(available_ram // 1024)}GB available RAM.")


19 chunks can be buffered in RAM at a time based on 48GB available RAM.


In [None]:
# Optional step: resample the TIFF files to a lower resolution

resample_tiffs = False  # Flip to True to work on resampled TIFF files
target_resolution = 0.06  # in meters
resample_output_path = os.path.join(BASE_PATH, str(target_resolution).replace('.', '_') + "m")

if resample_tiffs:
    # Resample only when the output folder is missing; an existing folder is reused as-is.
    if not os.path.exists(resample_output_path):
        resample(DATA_PATH, resample_output_path, target_resolution)
    else:
        print("Resampled data already exists at", resample_output_path,". Setting PATH to resampled directory.")

    # From here on, DATA_PATH and both combined-file paths point into the resampled directory.
    DATA_PATH = resample_output_path
    COMBINED_IMAGES_FILE, COMBINED_LABELS_FILE = (
        os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_images.npy'),
        os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_labels.npy'),
    )

In [None]:
# Convert all label shape files to tif
# The helper is run on DATA_PATH, i.e. the original data folder, or the resampled
# folder if the resampling cell above reassigned DATA_PATH.
# NOTE(review): presumably it walks every chunk's 'labels' folder — confirm in utils.
rasterize_shapefiles(DATA_PATH)

In [None]:
# Convert all tif pairs into tiled datasets

"""
NOTE: This will take a lot of time, memory, and storage space.
You should have at least 32GB of RAM and triple the chunk folder size of storage. If you don't have enough RAM,
you can run this script in smaller chunks by lowering the CHUNK_BUFFER_SIZE variable.
"""

# Inputs come from DATA_PATH; TILE_SIZE is passed as the tile edge length (image_size) and
# CHUNK_BUFFER_SIZE as the number of chunks to buffer at once.
# NOTE(review): presumably the results are written to COMBINED_IMAGES_FILE and
# COMBINED_LABELS_FILE — confirm against utils.tile_dataset.
tile_dataset(DATA_PATH, COMBINED_IMAGES_FILE, COMBINED_LABELS_FILE, chunk_buffer_size=CHUNK_BUFFER_SIZE, image_size=TILE_SIZE)

In [None]:
# Shuffle data one entry at a time using Fisher-Yates shuffle
def shuffle_data(images_path, labels_path, seed=None):
    """Shuffle paired image/label ``.npy`` files on disk, in place.

    Both files are opened as writable memory maps and permuted with the same
    Fisher-Yates shuffle, so entry ``k`` of the images file still belongs to
    entry ``k`` of the labels file afterwards. Only two entries are held in
    memory at any time.

    Args:
        images_path: Path to the ``.npy`` file with the images; rewritten in place.
        labels_path: Path to the ``.npy`` file with the labels; rewritten in place.
        seed: Optional seed for a private ``np.random.RandomState``, making the
            permutation reproducible. With ``None`` (the default) NumPy's global
            random state is used, as before.

    Raises:
        ValueError: If the two files do not hold the same number of entries.
    """
    def _open_maps():
        # 'r+' maps the files read-write so assignments go straight to disk.
        return (np.load(images_path, mmap_mode='r+'),
                np.load(labels_path, mmap_mode='r+'))

    def _swap_entries(array, a, b):
        # array[a] is a *view* for multi-dimensional arrays, so the tuple swap
        # `array[a], array[b] = array[b], array[a]` would overwrite entry a
        # before its old content is stored in b, duplicating b and losing a.
        # Copy one entry out first.
        held = np.array(array[a])
        array[a] = array[b]
        array[b] = held

    rng = np.random if seed is None else np.random.RandomState(seed)

    images, labels = _open_maps()
    dataset_size = images.shape[0]
    label_count = labels.shape[0]
    if label_count != dataset_size:
        # Release the maps before raising so the files are not left open.
        del images, labels
        raise ValueError(
            f"images and labels must have the same number of entries "
            f"(got {dataset_size} and {label_count})."
        )

    for i in range(dataset_size - 1, 0, -1):
        print(f"Percent Shuffled: {100*(dataset_size-i)/dataset_size:.2f}%", end='\r')
        j = rng.randint(0, i + 1)  # upper bound is exclusive, so j is in [0, i]
        if j != i:
            _swap_entries(images, i, j)
            _swap_entries(labels, i, j)

        if i % 5000 == 0:
            # Periodically write pending changes to disk and reopen the maps so
            # touched pages do not pile up in memory.
            images.flush()
            labels.flush()
            del images, labels
            gc.collect()

            images, labels = _open_maps()

    # Make sure the final swaps reach the files before the maps go out of scope.
    images.flush()
    labels.flush()



# Shuffles the combined image and label files on disk in place; running this cell
# again reshuffles the already-shuffled files.
shuffle_data(COMBINED_IMAGES_FILE, COMBINED_LABELS_FILE)

Percent Shuffled: 100.00%

In [7]:
# Example dataset

# Open the combined arrays as memory maps so the data stays on disk instead of being
# read fully into RAM. 'r+' maps the files read-write, so assigning into `images` or
# `labels` modifies the files on disk.
images = np.load(COMBINED_IMAGES_FILE, mmap_mode='r+')
labels = np.load(COMBINED_LABELS_FILE, mmap_mode='r+')

# Wrap both memmaps in the project's MemmapDataset and print a quick sanity check:
# number of entries and the shape of the first image and label.
dataset = MemmapDataset(images, labels)
print(f"Dataset length: {len(dataset)}")
print(f"Dataset image shape: {dataset.images[0].shape}")
print(f"Dataset label shape: {dataset.labels[0].shape}")

Dataset length: 193249
Dataset image shape: (3, 224, 224)
Dataset label shape: (1, 224, 224)
