In [None]:
from utils import tile_dataset, rasterize_shapefiles, resample
from MemoryMapDataset import MemmapDataset
import numpy as np
import os

### This notebook shows how to process raw tiff files and their labels into a dataset.

The processing pipeline assumes that the data in the `DATA_PATH` folder is in the following format:
- Each chunk is in its own folder and named 'Chunk x' or 'Chunk x x-x'
- The RGB tif should be named 'Chunkx.tif' or 'Chunkx_x-x.tif'
- The label shapefile and its corresponding label files should be in a folder called 'labels' inside the matching 'Chunk x' / 'Chunk x x-x' folder; the names of these files do not need to follow any particular format.

In [10]:
# Setup
# Edge length of a tile; handed to tile_dataset as image_size and used as the
# prefix of the combined dataset file names below.
TILE_SIZE = 512

# Root folder of the drone data. The default is the original, machine-specific
# location; set the DRONE_DATA_BASE_PATH environment variable to run the
# notebook on another machine without editing this cell. An unset or empty
# variable falls back to the default.
BASE_PATH = os.environ.get("DRONE_DATA_BASE_PATH") or "A:\\Drone_Data"
# Folder that holds the chunk directories with the raw TIFFs and their labels.
DATA_PATH = os.path.join(BASE_PATH, "original_data")
# Combined .npy files for the tiled dataset, stored next to the source data.
COMBINED_IMAGES_FILE = os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_images.npy')
COMBINED_LABELS_FILE = os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_labels.npy')

In [None]:
# Optional step: resample the TIFF files to a lower resolution before tiling.
resample_tiffs = False  # Flip to True to enable resampling
target_resolution = 0.06  # in meters
# The output folder is named after the resolution with '.' replaced by '_',
# followed by 'm' (0.06 -> "0_06m"), and lives directly under BASE_PATH.
resample_output_path = os.path.join(BASE_PATH, f"{str(target_resolution).replace('.', '_')}m")

if resample_tiffs:
    # Only resample when the output location does not exist yet; an existing
    # path is reused as-is.
    if not os.path.exists(resample_output_path):
        resample(DATA_PATH, resample_output_path, target_resolution)
    else:
        print("Resampled data already exists at", resample_output_path,". Setting PATH to resampled directory.")

    # From here on DATA_PATH and the combined dataset files refer to the
    # resampled directory, so every later cell works on the resampled data.
    DATA_PATH = resample_output_path
    COMBINED_IMAGES_FILE, COMBINED_LABELS_FILE = (
        os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_images.npy'),
        os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_labels.npy'),
    )

In [None]:
# Convert all label shape files to tif.
# Operates on whatever DATA_PATH currently points to, so run the optional
# resampling cell first if the resampled directory should be used.
rasterize_shapefiles(DATA_PATH)

In [5]:
# Tile every tif pair found under DATA_PATH into TILE_SIZE tiles and write the
# result to COMBINED_IMAGES_FILE / COMBINED_LABELS_FILE.

"""
NOTE: This will take a lot of time, memory, and storage space.
You should have at least 16GB of RAM and triple the chunk folder size of storage.
"""

tile_dataset(
    DATA_PATH,
    COMBINED_IMAGES_FILE,
    COMBINED_LABELS_FILE,
    image_size=TILE_SIZE,
)

Processing 99 chunk directories

[1/99] Processing chunk: Chunk 1...
Number of valid pairs: 1968
Number of valid image tiles found: 1968
Buffer size reached 1 chunks, saving temp files.

[2/99] Processing chunk: Chunk 1 0-0...
Number of valid pairs: 1521
Number of valid image tiles found: 1521
Buffer size reached 1 chunks, saving temp files.

[3/99] Processing chunk: Chunk 1 0-1...
Number of valid pairs: 1521
Number of valid image tiles found: 1521
Buffer size reached 1 chunks, saving temp files.

[4/99] Processing chunk: Chunk 1 0-2...
Number of valid pairs: 1170
Number of valid image tiles found: 1170
Buffer size reached 1 chunks, saving temp files.

[5/99] Processing chunk: Chunk 1 1-0...
Number of valid pairs: 1521
Number of valid image tiles found: 1521
Buffer size reached 1 chunks, saving temp files.

[6/99] Processing chunk: Chunk 1 1-1...
Number of valid pairs: 1521
Number of valid image tiles found: 1521
Buffer size reached 1 chunks, saving temp files.

[7/99] Processing chunk

Combining temp files: 100%|██████████| 97/97 [14:12<00:00,  8.78s/file]



Done tiling tif pairs
Deleting temporary files...


In [None]:
import torch
import torchvision.transforms.v2 as v2

# Example: open the combined arrays as read-only memory maps (mmap_mode='r')
# instead of loading them fully into RAM, then wrap them in a MemmapDataset.
images = np.load(COMBINED_IMAGES_FILE, mmap_mode='r')
labels = np.load(COMBINED_LABELS_FILE, mmap_mode='r')

# Transform pipeline: convert each sample to an image tensor, then to float32
# with value scaling enabled.
dataset = MemmapDataset(
    images,
    labels,
    transforms=v2.Compose([
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
    ]),
)

# Quick sanity check of the dataset size and the per-sample array shapes.
print(f"Dataset length: {len(dataset)}")
print(f"Dataset image shape: {dataset.images[0].shape}")
print(f"Dataset label shape: {dataset.labels[0].shape}")

Dataset length: 244746
Dataset image shape: (3, 512, 512)
Dataset label shape: (1, 512, 512)
