# Setup

In [1]:
import sys
import os
import numpy as np
from dotenv import load_dotenv
import rasterio
import threading
from PIL import Image

# Load variables from a .env file into the process environment.
load_dotenv()

# Make the project's `src` package importable for the import below.
# Skip the append when CODE_ROOT_PATH is unset (os.getenv returns None, which
# must not end up on sys.path) or when the path is already registered, so
# re-running this cell does not pile up duplicate entries.
code_root = os.getenv('CODE_ROOT_PATH')
if code_root is not None and code_root not in sys.path:
    sys.path.append(code_root)

from src.data import dataset_utility as dutil

# Filesystem locations are read from the environment; each one is None when
# its variable is not set.
#   ALL_INDICES_PATH      - file loaded with np.load to obtain all patch indices
#   PREPROCESSED_DATA_DIR - directory the extracted patches are saved into
#   RANDOM_INDICES_PATH   - file the selected random indices are saved to
ALL_INDICES_PATH, PREPROCESSED_DATA_DIR, RANDOM_INDICES_PATH = (
    os.environ.get(variable)
    for variable in ('ALL_INDICES_PATH', 'PREPROCESSED_DATA_DIR', 'RANDOM_INDICES_PATH')
)

# Preprocessing

In [2]:
patch_indices = np.load(ALL_INDICES_PATH)

# Shuffle in place with a fixed seed so that Restart & Run All selects the
# same patches every time; change SHUFFLE_SEED to draw a different sample.
SHUFFLE_SEED = 42
np.random.default_rng(SHUFFLE_SEED).shuffle(patch_indices)

In [3]:
dataset_size = 400_000

# Fail early with a clear message instead of an IndexError from inside the
# selection when fewer indices are available than requested.
if len(patch_indices) < dataset_size:
    raise ValueError(
        f"Requested {dataset_size} patches but only {len(patch_indices)} indices are available."
    )

# patch_indices was shuffled in the previous cell, so its first
# `dataset_size` entries form the random sample. Convert to plain Python ints.
random_indices = [int(idx) for idx in patch_indices[:dataset_size]]

# Persist the selection alongside the preprocessed data.
np.save(RANDOM_INDICES_PATH, random_indices)

In [4]:
def store_patches(indices, start_idx, end_idx):
    """Fetch and save the patches for positions ``start_idx`` to ``end_idx - 1``.

    For every position in the half-open range, the patch identified by
    ``indices[position]`` is obtained through ``dutil.get_patch`` and written
    with ``np.save`` into ``PREPROCESSED_DATA_DIR``. The file is named after
    the *position* in ``indices`` (not the patch index itself); ``np.save``
    appends the ``.npy`` extension.

    Args:
        indices: Sequence of patch identifiers accepted by ``dutil.get_patch``.
        start_idx: First position in ``indices`` to process (inclusive).
        end_idx: Position at which to stop (exclusive).

    Returns:
        None. An empty range performs no work.
    """
    for position in range(start_idx, end_idx):
        patch = dutil.get_patch(indices[position])
        destination = os.path.join(PREPROCESSED_DATA_DIR, str(position))
        np.save(destination, patch)

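Executing the next cell takes ~18h for 400000 patches. To monitor progress while it runs, use the following shell commands inside the directory `PREPROCESSED_DATA_DIR`:
- `$ ls | wc -l`: number of files in the directory
- `$ du -sh`: total size of the directory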

In [5]:
print(f"Reading {dataset_size} random image patches.")

num_threads = 10
# Integer division keeps the chunk size exact; the last thread additionally
# takes the remainder so no patches are dropped when dataset_size is not a
# multiple of num_threads.
patches_per_thread = dataset_size // num_threads

threads = []
for i in range(num_threads):
    start_idx = i * patches_per_thread
    end_idx = dataset_size if i == num_threads - 1 else start_idx + patches_per_thread
    worker = threading.Thread(target=store_patches, args=(random_indices, start_idx, end_idx))
    threads.append(worker)
    worker.start()

# Block until every worker has finished writing its share of the patches.
for worker in threads:
    worker.join()

Reading 400000 random image patches.
