In [1]:
import os
from pathlib import Path
from pprint import pprint

import rasterio
from config import SETTINGS
from minio import Minio
from minio.commonconfig import CopySource
from minio.error import InvalidResponseError, S3Error
from tqdm.notebook import tqdm

# Search MinIO for mosaic images

This notebook searches storage buckets on MinIO and attempts to identify georeferenced mosaic images for further processing (e.g. for upload to GeoServer). **The algorithm for identifying mosaics uses simple rules and will probably miss some images**, but it's a start. A file is identified as a relevant mosaic if all of the following are true:

 * The file extension is `.tif`
 
 * The file size is larger than some user-specified threshold
 
 * The number of image bands is greater than some minimum threshold (e.g. to avoid processing single-band images, such as DSMs)
 
 * The file has a valid co-ordinate reference system
 
## 1. Manipulating files on MinIO from JupyterHub

The MinIO PVC is mounted as a **read-only** volume within users' `$HOME` directories. This means we can use standard Python tools to search the file system and identify mosaics, but we can't modify files or copy them to new locations. **The volume is read-only to prevent accidental deletion of SeaBee data**. To edit/write data on the MinIO PVC from JupyterHub, users should use the MinIO client. This is a little more fiddly, but provides fine-grained user-level access control, which is valuable.

### 1.1. Identifying mosaics

In [2]:
# Top-level folder to search recursively for .tif files. This is a path on the
# MinIO volume as mounted (read-only) in JupyterHub
parent_folder = r"/home/notebook/shared-seabee-ns9879k/niva/2022/2022-08-31_RUNDE/2022-08-31_Remoy"

# MinIO bucket to copy identified mosaics to (created later if it does not exist)
mosaic_bucket = "all-mosaics"

# Minimum image size to consider in MB. Files must be strictly larger than this
# (the search below converts MB to bytes using 1 MB = 1e6 bytes)
min_size_mb = 100

# Minimum number of bands. Images with fewer bands (e.g. single-band DSMs) are
# not treated as mosaics
min_bands = 3

In [3]:
# Find all .tif files recursively that are larger than the size threshold.
# Only regular files are considered: 'rglob' can also return directories or
# broken links whose names end in '.tif', and these would otherwise break the
# size check or the raster read below
flist = [
    path
    for path in Path(parent_folder).rglob("*.tif")
    if path.is_file() and os.path.getsize(path) > min_size_mb * 1e6
]
print(f"{len(flist)} '.tif' files found with size > {min_size_mb} MB.")

# How many have a valid CRS and enough bands?
mosaic_list = []
for fpath in flist:
    try:
        with rasterio.open(fpath) as src:
            crs = src.crs
            n_bands = src.count
            if crs and (n_bands >= min_bands):
                mosaic_list.append(fpath)
    except rasterio.errors.RasterioIOError as err:
        # One corrupt or unsupported file should not abort the whole search.
        # Report it and carry on with the remaining candidates
        print(f"Skipping unreadable file '{fpath}': {err}")
print(f"Of these, {len(mosaic_list)} have a valid CRS and {min_bands} or more bands.")
pprint(mosaic_list)

104 '.tif' files found with size > 100 MB.
Of these, 7 have a valid CRS and 3 or more bands.
[PosixPath('/home/notebook/shared-seabee-ns9879k/niva/2022/2022-08-31_RUNDE/2022-08-31_Remoy/1_drone/2_SpectroFly/1404_MS_120m/Reflectance_Maps/composite/Remoy_20220831_1404_MS_comp.tif'),
 PosixPath('/home/notebook/shared-seabee-ns9879k/niva/2022/2022-08-31_RUNDE/2022-08-31_Remoy/1_drone/2_SpectroFly/0730_RGB_120m/Mosaics/20220831_0730_RGB_120m_transparent_mosaic_group1.tif'),
 PosixPath('/home/notebook/shared-seabee-ns9879k/niva/2022/2022-08-31_RUNDE/2022-08-31_Remoy/1_drone/2_SpectroFly/0730_RGB_120m/PIX4d_Mapper/20220831_0730_RGB_120m/3_dsm_ortho/2_mosaic/20220831_0730_RGB_120m_transparent_mosaic_group1.tif'),
 PosixPath('/home/notebook/shared-seabee-ns9879k/niva/2022/2022-08-31_RUNDE/2022-08-31_Remoy/1_drone/2_SpectroFly/1043_MS_120m/Reflectance_Maps/composite/Remoy_20220831_1043_MS_comp_v2.tif'),
 PosixPath('/home/notebook/shared-seabee-ns9879k/niva/2022/2022-08-31_RUNDE/2022-08-31_Remoy/

### 1.2. Copying mosaics

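The mosaics identified above are copied to a dedicated bucket on MinIO (`mosaic_bucket`, which is created if it does not already exist). Each mosaic is stored under its file name only, so a mosaic whose file name is already present in the bucket is skipped.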

In [4]:
# Create the MinIO client for the SeaBee storage endpoint, authenticating with
# the access ID and secret key held in SETTINGS
minio_endpoint = "storage.seabee.sigma2.no"
sigma2_client = Minio(
    minio_endpoint,
    access_key=SETTINGS.ACCESS_ID,
    secret_key=SETTINGS.SECRET_KEY,
)

In [5]:
# Create new bucket if necessary
found = sigma2_client.bucket_exists(mosaic_bucket)
if not found:
    sigma2_client.make_bucket(mosaic_bucket)

# Copy files
for fpath in tqdm(mosaic_list):
    # The original folder structure on MinIO contains duplicate files in different
    # locations. Check whether the file is already copied, and skip it if so.
    # Objects in the mosaic bucket are keyed by file name only
    try:
        sigma2_client.stat_object(mosaic_bucket, fpath.name)
        exists = True
    except S3Error as err:
        # Only a missing object means "not copied yet". Any other S3 error
        # (e.g. a permissions or server problem) is a real failure and must not
        # be mistaken for an absent file
        if err.code != "NoSuchKey":
            raise
        exists = False

    # Copy the file
    if not exists:
        # Translate the mounted path into a MinIO bucket and object key. This
        # relies on the layout used by 'parent_folder', i.e.
        # /home/notebook/<mount>/<bucket>/<object path>, so that parts[4] is the
        # bucket name and parts[5:] is the path of the object within it
        src_bucket = fpath.parts[4]
        src_path = "/".join(fpath.parts[5:])
        try:
            sigma2_client.copy_object(
                mosaic_bucket,
                fpath.name,
                CopySource(src_bucket, src_path),
            )
        except InvalidResponseError as err:
            print(err)
        except ValueError:
            print("Cannot parse:", fpath)

  0%|          | 0/7 [00:00<?, ?it/s]

### 1.3. Upload (local) files to MinIO

Instead of copying files that are already on MinIO, the code below illustrates how to upload new files (either from JupyterHub or another local directory).

In [None]:
# bucket = "test-bucket"
# found = sigma2_client.bucket_exists(bucket)
# if not found:
#     sigma2_client.make_bucket(bucket)

# fpath = (
#     r"/home/jovyan/shared/drones/20220831_0730_RGB_120m_transparent_mosaic_group1.tif"
# )
# fname = os.path.basename(fpath)

# try:
#     with open(fpath, "rb") as file_data:
#         file_stat = os.stat(fpath)
#         sigma2_client.put_object(bucket, fname, file_data, file_stat.st_size)
# except InvalidResponseError as err:
#     print(err)