In [7]:
from utils import tile_dataset, rasterize_shapefiles, resample
from MemoryMapDataset import MemmapDataset
import numpy as np
import os

### This notebook shows how to process raw tiff files and their labels into a dataset.

The processing pipeline assumes that the data in the `DATA_PATH` folder is in the following format:
- Each chunk is in its own folder, named 'Chunk x' or 'Chunk x x-x'
- The RGB tif should be named 'Chunkx.tif' or 'Chunkx_x-x.tif'
- The label shapefile and its corresponding label files should be in a folder called 'labels' inside the matching 'Chunk x' / 'Chunk x x-x' folder; the names of these files do not need to follow any particular format.

In [8]:
# Setup
# Tile edge length in pixels; also used as the prefix of the combined .npy file names.
TILE_SIZE = 512
# Root folder of the project data. Set the MANGROVE_BASE_PATH environment variable to
# run this notebook on another machine; an unset or empty variable falls back to the
# original default location.
BASE_PATH = os.environ.get("MANGROVE_BASE_PATH") or "/Users/gage/Desktop/Mangrove"
# Folder holding the 'Chunk ...' directories with the raw TIFFs and their labels.
DATA_PATH = os.path.join(BASE_PATH, "Drone Data")
# Output files for the combined image tiles and label tiles.
COMBINED_IMAGES_FILE = os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_images.npy')
COMBINED_LABELS_FILE = os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_labels.npy')

In [None]:
# Optional step: resample the TIFF files to a coarser resolution before tiling.
resample_tiffs = False  # flip to True to enable resampling
target_resolution = 0.06  # in meters
# Output folder is named after the resolution with '.' replaced by '_', e.g. "0_06m".
resample_output_path = os.path.join(
    BASE_PATH,
    str(target_resolution).replace('.', '_') + "m",
)

if resample_tiffs:
    # Only run the resampling when its output folder does not exist yet.
    if not os.path.exists(resample_output_path):
        resample(DATA_PATH, resample_output_path, target_resolution)
    else:
        print("Resampled data already exists at", resample_output_path,". Setting PATH to resampled directory.")

    # From here on the notebook works on the resampled data, so the data folder
    # and both combined-output file paths are re-pointed at it.
    DATA_PATH = resample_output_path
    COMBINED_IMAGES_FILE = os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_images.npy')
    COMBINED_LABELS_FILE = os.path.join(DATA_PATH, f'{TILE_SIZE}dataset_labels.npy')

In [9]:
# Convert all label shape files to tif
# Runs the project helper over everything under DATA_PATH.
# NOTE(review): judging by the output of a previous run, a chunk whose 'labels'
# folder has no shapefile is reported and the run continues instead of raising;
# confirm against utils.rasterize_shapefiles.
rasterize_shapefiles(DATA_PATH)

Rasterizing Chunk 4 2-0 shapefile... (1/88)
No shapefiles found in /Users/gage/Desktop/Mangrove/Drone Data/Chunk 4 2-0/labels
Failed to read Chunk 4 2-0 shapefile. Please check shape data.


Done rasterizing shapefiles


In [10]:
# Convert all tif pairs into tiled datasets
#
# NOTE: This will take a lot of time, memory, and storage space.
# You should have at least 16GB of RAM and free storage of roughly triple the
# size of the chunk folders.
tile_dataset(
    DATA_PATH,
    COMBINED_IMAGES_FILE,
    COMBINED_LABELS_FILE,
    image_size=TILE_SIZE,
    # presumably a threshold for dropping tiles dominated by a single label value;
    # TODO confirm the exact meaning against utils.tile_dataset
    filter_monolithic_labels=0.9,
)

Processing 88 chunk directories

[1/88] Processing chunk: Chunk 4 2-0...
No label TIFF files found in /Users/gage/Desktop/Mangrove/Drone Data/Chunk 4 2-0/labels. Skipping...

[2/88] Processing chunk: Chunk 7 0-2...
Number of valid pairs: 214
Saving temp files...

[3/88] Processing chunk: Chunk 7 2-1...
Number of valid pairs: 686
Saving temp files...

[4/88] Processing chunk: Chunk 6 2-1...
Number of valid pairs: 179
Saving temp files...

[5/88] Processing chunk: Chunk 10...
Number of valid pairs: 1041
Saving temp files...

[6/88] Processing chunk: Chunk 17...
Number of valid pairs: 2060
Saving temp files...

[7/88] Processing chunk: Chunk 6 0-2...
Number of valid pairs: 533
Saving temp files...

[8/88] Processing chunk: Chunk 21...
Number of valid pairs: 2485
Saving temp files...

[9/88] Processing chunk: Chunk 19...
Number of valid pairs: 2196
Saving temp files...

[10/88] Processing chunk: Chunk 4 0-2...
Number of valid pairs: 41
Saving temp files...

[11/88] Processing chunk: Chunk 

Combining temp files: 100%|██████████| 84/84 [01:04<00:00,  1.30file/s]



Done tiling tif pairs
Deleting temporary files...


In [11]:
import torch
import torchvision.transforms.v2 as v2

# Example dataset
# Open the combined arrays as read/write memory maps rather than loading them fully.
images = np.load(COMBINED_IMAGES_FILE, mmap_mode='r+')
labels = np.load(COMBINED_LABELS_FILE, mmap_mode='r+')

# Transform pipeline handed to the dataset: ToImage, then conversion to float32 with scaling.
tensor_pipeline = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
])
dataset = MemmapDataset(images, labels, transforms=tensor_pipeline)

# Quick sanity report: number of samples and the shape of the first image/label entry.
print(f"Dataset length: {len(dataset)}")
print(f"Dataset image shape: {dataset.images[0].shape}")
print(f"Dataset label shape: {dataset.labels[0].shape}")

# Only the first 1000 label entries are inspected here, so the printed set of
# values may not cover every value present in the full label array.
uniques = np.unique(labels[:1000])
print(f"Unique values in labels: {uniques}")

Dataset length: 63260
Dataset image shape: (3, 512, 512)
Dataset label shape: (1, 512, 512)
Unique values in labels: [  0   1 255]


In [12]:
# Shuffle the dataset; progress is reported as "Percent Shuffled".
# NOTE(review): images/labels were opened with mmap_mode='r+', so if shuffle()
# reorders the arrays in place this rewrites the .npy files on disk and
# re-running the cell shuffles again; confirm against MemmapDataset.shuffle.
dataset.shuffle()

Percent Shuffled: 100.00%