## OTC Load test

### User input

In [1]:
# Location of the GeoPackage with the samples; it is read with geopandas and split into jobs below
input_df = "https://artifactory.vgt.vito.be/artifactory/auxdata-public/gfmap/DEMO_CROPTYPE.gpkg"
# Root folder handed to the job manager as output directory; the job tracking CSV is written there too
output_path = '/data/users/Public/vincent.verelst/otc_load_test/'

### Setting up the logging module 

In [2]:
# Configuring the logging for the openeo_gfmap package
from openeo_gfmap.manager import _log
import logging

# Name used to recognise, on a re-run, the handler installed by this cell
NOTEBOOK_HANDLER_NAME = "gfmap_notebook_stream"

_log.setLevel(logging.DEBUG)

# Running this cell more than once must not stack several handlers on the
# logger, otherwise every record is printed once per handler. Drop the handler
# left behind by a previous execution before installing a new one.
for stale_handler in [
    handler for handler in _log.handlers
    if handler.get_name() == NOTEBOOK_HANDLER_NAME
]:
    _log.removeHandler(stale_handler)

stream_handler = logging.StreamHandler()
stream_handler.set_name(NOTEBOOK_HANDLER_NAME)
_log.addHandler(stream_handler)

formatter = logging.Formatter('%(asctime)s|%(name)s|%(levelname)s:  %(message)s')
stream_handler.setFormatter(formatter)

# Exclude the other loggers from other libraries
class MyLoggerFilter(logging.Filter):
    """Let through only the records whose logger name is exactly `_log.name`."""

    def filter(self, record):
        return record.name == _log.name

stream_handler.addFilter(MyLoggerFilter())


### First step: splitting the job

In [3]:
from pathlib import Path
import geopandas as gpd
from openeo_gfmap.manager.job_splitters import split_job_s2grid

# Read the samples referenced by the user input
base_df_path = input_df
base_df = gpd.read_file(base_df_path)

# Let GFMAP split the dataset into sub-datasets (called with max_points=200)
split_jobs = split_job_s2grid(base_df, max_points=200)
print(f'{len(split_jobs)} sub-datasets.')

# Keep only the sub-datasets in which at least one point carries the "extract" flag
split_jobs = list(filter(lambda job: job.extract.any(), split_jobs))
print(f'{len(split_jobs)} sub-datasets after filtering sub-datasets with no point to extract.')


  polygons["geometry"] = polygons.geometry.centroid


95 sub-datasets.
87 sub-datasets after filtering sub-datasets with no point to extract.



  s2_grid["geometry"] = s2_grid.geometry.centroid



### Second step: creating a dataframe for the GFMAP Job Manager

In [4]:
from openeo_gfmap import Backend
from typing import List
import pandas as pd

def create_job_dataframe(
    backend: Backend, split_jobs: List[gpd.GeoDataFrame]
) -> pd.DataFrame:
    """Create a dataframe from the split jobs, containing all the necessary information to run the job.

    Parameters
    ----------
    backend : Backend
        Backend the jobs are meant for; its `value` is stored in `backend_name`.
    split_jobs : List[gpd.GeoDataFrame]
        Sub-datasets, one per job. Each one is read through its `valid_date`
        and `tile` columns and serialised with `to_json()`.

    Returns
    -------
    pd.DataFrame
        One row per sub-dataset with the columns `backend_name`,
        `out_extension`, `start_date`, `end_date`, `s2_tile` and `geometry`.
        An empty `split_jobs` gives an empty dataframe that still has these
        columns.
    """
    columns = [
        "backend_name",
        "out_extension",
        "start_date",
        "end_date",
        "s2_tile",
        "geometry",
    ]
    # Half of the temporal window: the full window is about 1.5 year
    half_window = pd.Timedelta(days=275)  # A bit more than 9 months

    rows = []
    for job in split_jobs:
        # The window is centred on the mean (not the median) of the valid dates
        mean_time = pd.to_datetime(job.valid_date).mean()
        rows.append(
            {
                "backend_name": backend.value,
                "out_extension": ".parquet",
                "start_date": (mean_time - half_window).strftime("%Y-%m-%d"),
                "end_date": (mean_time + half_window).strftime("%Y-%m-%d"),
                # Only the tile of the first row is read; presumably all the
                # rows of a sub-dataset share it — verify against the splitter
                "s2_tile": job.tile.iloc[0],
                "geometry": job.to_json(),
            }
        )

    # Passing `columns` keeps the schema stable even when there is no job at all
    return pd.DataFrame(rows, columns=columns)

# Build the job dataframe for the OTC backend from the filtered sub-datasets
job_df = create_job_dataframe(Backend.OTC, split_jobs)

job_df

Unnamed: 0,backend_name,out_extension,start_date,end_date,s2_tile,geometry
0,otc,.parquet,2020-08-30,2022-03-03,31UDS,"{""type"": ""FeatureCollection"", ""features"": [{""i..."
1,otc,.parquet,2020-08-30,2022-03-03,31UES,"{""type"": ""FeatureCollection"", ""features"": [{""i..."
2,otc,.parquet,2020-08-30,2022-03-03,31UFS,"{""type"": ""FeatureCollection"", ""features"": [{""i..."
3,otc,.parquet,2020-08-30,2022-03-03,32TPT,"{""type"": ""FeatureCollection"", ""features"": [{""i..."
4,otc,.parquet,2020-08-30,2022-03-03,33TVM,"{""type"": ""FeatureCollection"", ""features"": [{""i..."
...,...,...,...,...,...,...
82,otc,.parquet,2020-08-30,2022-03-03,35VNC,"{""type"": ""FeatureCollection"", ""features"": [{""i..."
83,otc,.parquet,2020-08-30,2022-03-03,35VNC,"{""type"": ""FeatureCollection"", ""features"": [{""i..."
84,otc,.parquet,2020-08-30,2022-03-03,35VNC,"{""type"": ""FeatureCollection"", ""features"": [{""i..."
85,otc,.parquet,2020-08-30,2022-03-03,35VND,"{""type"": ""FeatureCollection"", ""features"": [{""i..."


### Sub-sampling job dataframe to reduce execution time

In [5]:
# Positions of the jobs kept for the load test
SUBSAMPLE_JOB_INDICES = [2]

# Rebuild the dataframe from `split_jobs` instead of slicing `job_df` onto
# itself: the self-referential form fails with an out-of-bounds error when the
# cell is run a second time, because only one row is left by then.
job_df = (
    create_job_dataframe(Backend.OTC, split_jobs)
    .iloc[SUBSAMPLE_JOB_INDICES]
    .reset_index(drop=True)
)

In [6]:
import geojson

def get_job_nb_points(row: pd.Series) -> int:
    """Count the features of the row geometry whose `extract` property is truthy."""
    features = geojson.loads(row.geometry)['features']
    return sum(1 for feature in features if feature.properties.get("extract"))

# Store, for every job row, the number of features flagged with `extract`
job_df['nb_points'] = job_df.apply(get_job_nb_points, axis=1)
job_df

Unnamed: 0,backend_name,out_extension,start_date,end_date,s2_tile,geometry,nb_points
0,otc,.parquet,2020-08-30,2022-03-03,31UFS,"{""type"": ""FeatureCollection"", ""features"": [{""i...",3


### Third step: implement the datacube creator function.

Implement a function to create, from the additional rows provided before, an `openeo.BatchJob` that will be used to run the job.

In this case we extract Sentinel-2 data around a 64x64 pixel square of polygons which have the field `extract=True` (although we keep them in the row for the post-job action.)

Note:
Because the polygons to extract are specified in UTM dimensions (required to have a specific size), the dataset of polygons cannot be sent directly through the openeo process graph (GeoJSON only supports lat/lon coordinates). The sub-datasets of polygons are therefore uploaded to a publicly accessible URL so they can be used later by openeo during the execution of the job.

In [7]:
from pathlib import Path
from typing import List, Optional

from openeo import UDF, Connection, DataCube
from openeo_gfmap import (
    BackendContext,
    BoundingBoxExtent,
    FetchType,
    SpatialContext,
    TemporalContext,
)
from openeo_gfmap.fetching.generic import build_generic_extractor
from openeo_gfmap.fetching.s1 import build_sentinel1_grd_extractor
from openeo_gfmap.fetching.s2 import build_sentinel2_l2a_extractor
from openeo_gfmap.preprocessing.compositing import (
    max_ndvi_compositing,
    mean_compositing,
)
from openeo_gfmap.preprocessing.interpolation import linear_interpolation
from openeo_gfmap.preprocessing.sar import compress_backscatter_uint16

def raw_datacube_S2(
    connection: Connection,
    backend_context: BackendContext,
    spatial_extent: SpatialContext,
    temporal_extent: TemporalContext,
    bands: List[str],
    fetch_type: FetchType,
    filter_tile: Optional[str] = None,
    distance_to_cloud_flag: Optional[bool] = True,
    additional_masks_flag: bool = True,
    apply_mask_flag: bool = False,
) -> DataCube:
    """Extract Sentinel-2 datacube from OpenEO using GFMAP routines.
    Raw data is extracted with no cloud masking applied by default (can be
    enabled by setting `apply_mask_flag=True`). In addition to the raw band
    values, a cloud mask computed from the dilation of the SCL layer and,
    optionally, the distance to cloud can be merged into the cube.

    Parameters
    ----------
    connection : Connection
        OpenEO connection instance.
    backend_context : BackendContext
        GFMAP Backend context to use for extraction.
    spatial_extent : SpatialContext
        Spatial context to extract data from, can be a GFMAP BoundingBoxExtent,
        a GeoJSON dict or an URL to a publicly accessible GeoParquet file.
    temporal_extent : TemporalContext
        Temporal context to extract data from.
    bands : List[str]
        List of Sentinel-2 bands to extract.
    fetch_type : FetchType
        GFMAP Fetch type to use for extraction.
    filter_tile : Optional[str], optional
        Filter by tile ID, by default disabled. This forces the process to only
        one tile ID from the Sentinel-2 collection.
    distance_to_cloud_flag : Optional[bool], optional
        Compute the distance to cloud, by default True. The UDF source
        `udf_distance_to_cloud.py` is looked up next to this file, or in the
        current working directory when `__file__` is not defined (notebook).
    additional_masks_flag : bool, optional
        Add the additional masks to the cube, by default True. This includes the
        distance to cloud and the SCL dilation mask.
    apply_mask_flag : bool, optional
        Apply cloud masking, by default False. Can be enabled for high
        optimization of memory usage.

    Returns
    -------
    DataCube
        The cube returned by the GFMAP Sentinel-2 L2A extractor for the given
        spatial and temporal extents.
    """
    # Extract the SCL collection only
    scl_cube_properties = {"eo:cloud_cover": lambda val: val <= 95.0}
    if filter_tile:
        scl_cube_properties["tileId"] = lambda val: val == filter_tile

    scl_cube = connection.load_collection(
        collection_id="SENTINEL2_L2A",
        bands=["SCL"],
        temporal_extent=[temporal_extent.start_date, temporal_extent.end_date],
        # The spatial extent is only given at load time for tile-based fetching
        spatial_extent=dict(spatial_extent) if fetch_type == FetchType.TILE else None,
        properties=scl_cube_properties,
    )

    # Resample to 10m resolution for the SCL layer
    scl_cube = scl_cube.resample_spatial(10)

    # Compute the SCL dilation mask
    scl_dilated_mask = scl_cube.process(
        "to_scl_dilation_mask",
        data=scl_cube,
        scl_band_name="SCL",
        kernel1_size=17,  # 17px dilation on a 10m layer
        kernel2_size=77,  # 77px dilation on a 10m layer
        mask1_values=[2, 4, 5, 6, 7],
        mask2_values=[3, 8, 9, 10, 11],
        erosion_kernel_size=3,
    ).rename_labels("bands", ["S2-L2A-SCL_DILATED_MASK"])

    additional_masks = scl_dilated_mask

    if distance_to_cloud_flag:
        # `__file__` does not exist when this function is defined in a notebook
        # cell, which made this branch fail with a NameError. Fall back to the
        # current working directory to locate the UDF source in that case.
        try:
            udf_directory = Path(__file__).parent
        except NameError:
            udf_directory = Path.cwd()

        # Compute the distance to cloud and add it to the cube
        distance_to_cloud = scl_cube.apply_neighborhood(
            process=UDF.from_file(udf_directory / "udf_distance_to_cloud.py"),
            size=[
                {"dimension": "x", "unit": "px", "value": 256},
                {"dimension": "y", "unit": "px", "value": 256},
                {"dimension": "t", "unit": "null", "value": "P1D"},
            ],
            overlap=[
                {"dimension": "x", "unit": "px", "value": 16},
                {"dimension": "y", "unit": "px", "value": 16},
            ],
        ).rename_labels("bands", ["S2-L2A-DISTANCE-TO-CLOUD"])

        additional_masks = scl_dilated_mask.merge_cubes(distance_to_cloud)

    # Try filtering using the geometry
    if fetch_type == FetchType.TILE:
        additional_masks = additional_masks.filter_spatial(spatial_extent.to_geojson())

    # Create the job to extract S2
    extraction_parameters = {
        "target_resolution": None,  # Disable target resolution
        "load_collection": {
            "eo:cloud_cover": lambda val: val <= 95.0,
        },
    }
    if additional_masks_flag:
        extraction_parameters["pre_merge"] = additional_masks
    if filter_tile:
        # Same tile restriction as the one applied on the SCL cube above
        extraction_parameters["load_collection"]["tileId"] = (
            lambda val: val == filter_tile
        )
    if apply_mask_flag:
        extraction_parameters["pre_mask"] = scl_dilated_mask

    extractor = build_sentinel2_l2a_extractor(
        backend_context,
        bands=bands,
        fetch_type=fetch_type,
        **extraction_parameters,
    )

    return extractor.get_cube(connection, spatial_extent, temporal_extent)

In [8]:
import openeo

import requests
from tempfile import NamedTemporaryFile
import os
import pandas as pd
import geojson
from shapely.geometry import Point

from openeo_gfmap import TemporalContext, Backend, BackendContext, FetchType, SpatialContext
from openeo_gfmap.fetching import build_sentinel2_l2a_extractor
from openeo_gfmap.preprocessing import linear_interpolation, median_compositing

def filter_extract_true(
    geometries: geojson.FeatureCollection,
) -> geojson.FeatureCollection:
    """Build a new Feature Collection holding only the features whose `extract` property equals 1.

    Features without an `extract` property are treated as if it were 0 and are left out.
    """
    kept_features = []
    for feature in geometries.features:
        if feature.properties.get("extract", 0) == 1:
            kept_features.append(feature)
    return geojson.FeatureCollection(kept_features)

def create_datacube(
    row: pd.Series,
    connection: openeo.Connection,
    provider,
    connection_provider,
    executor_memory: str = "5G",
    executor_memory_overhead: str = "2G",
):
    """Creates an OpenEO BatchJob from the given row information.

    Parameters
    ----------
    row : pd.Series
        Job row; `start_date`, `end_date`, `geometry` (GeoJSON string),
        `backend_name` and `s2_tile` are read from it.
    connection : openeo.Connection
        Connection used to build the process graph and create the job.
    provider, connection_provider
        Not used by this function; kept in the signature for the caller.
    executor_memory : str
        Value of the `executor-memory` job option.
    executor_memory_overhead : str
        Value of the `executor-memoryOverhead` job option.

    Returns
    -------
    The job created by `cube.create_job`, with Parquet as output format.
    """

    # Load the temporal extent
    temporal_extent = TemporalContext(row.start_date, row.end_date)

    # Get the feature collection containing the geometry to the job
    geometry = geojson.loads(row.geometry)
    assert isinstance(geometry, geojson.FeatureCollection)

    # Keep only the features with the extract flag and use them as the spatial
    # extent. The filtered collection used to be computed but then ignored, so
    # the unfiltered features were sent to the extraction and the aggregation.
    spatial_extent = filter_extract_true(geometry)
    assert len(spatial_extent.features) > 0, "No geometries with the extract flag found"

    # Backend name and fetching type
    backend = Backend(row.backend_name)
    backend_context = BackendContext(backend)

    # Select some bands to download (chosen at random at this point)
    bands_to_download = [
        "S2-L2A-B04",
        "S2-L2A-B08",
    ]

    fetch_type = FetchType.POINT

    cube = raw_datacube_S2(
        connection=connection,
        backend_context=backend_context,
        spatial_extent=spatial_extent,
        temporal_extent=temporal_extent,
        bands=bands_to_download,
        fetch_type=fetch_type,
        distance_to_cloud_flag=False,
        additional_masks_flag=False,
        apply_mask_flag=True,
    )

    # Vegetation index computed from the two downloaded bands.
    # NOTE(review): the `+ 1` term looks like it expects reflectances in
    # [0, 1] — confirm the scaling of the band values returned by the backend.
    nir = cube.band('S2-L2A-B08')
    red = cube.band('S2-L2A-B04')
    cube = 2.5 * (nir - red) / (nir + 2.4 * red + 1)
    cube = cube.add_dimension("bands", 'S2-L2A-EVI', "bands")

    # Create monthly median composites
    cube = median_compositing(cube=cube, period="month")
    # Perform linear interpolation
    cube = linear_interpolation(cube)

    # Finally, create a vector cube based on the Point geometries
    cube = cube.aggregate_spatial(geometries=spatial_extent, reducer="mean")

    job_options = {
        "executor-memory": executor_memory,
        "executor-memoryOverhead": executor_memory_overhead,
    }
    return cube.create_job(
        out_format="Parquet",
        title=f"GFMAP_Feature_Extraction_S2_{row.s2_tile}",
        job_options=job_options,
    )

### Fourth step: create output paths

Implement a function that, from the sample index and the job row, determines the path to save the assets to.

In [9]:
from openeo_gfmap.manager.job_splitters import load_s2_grid

# Load the S2 grid
# NOTE(review): `s2_grid` is not referenced by any other cell visible in this notebook — confirm it is still needed
s2_grid = load_s2_grid()

In [10]:
from pathlib import Path
import xarray as xr

def generate_output_path(root_folder: Path, geometry_index: int, row: pd.Series):
    """Return `<root_folder>/<s2_tile>/<sample id><out_extension>` for one feature of the job.

    The sample id is read from the properties of the feature at position
    `geometry_index` in the row geometry: `sample_id` when present and not
    None, otherwise `sampleID`.
    """
    properties = geojson.loads(row.geometry)[geometry_index].properties

    sample_id = properties.get("sample_id", None)
    if sample_id is None:
        sample_id = properties["sampleID"]

    return root_folder / row.s2_tile / f"{sample_id}{row.out_extension}"
    

### Fifth and last step: Running the manager

Let's initialize the Job Manager as defined in GFMAP, and then run it using the functions defined previously.

STAC related parameters such as `collection_id` and `collection_description` are also required.

In [11]:
%load_ext autoreload

In [12]:
%autoreload 2
from openeo_gfmap.manager.job_manager import GFMAPJobManager
from openeo_gfmap.backend import otc_connection


# Everything is written below the user-defined output folder, including the job tracking CSV
base_output_dir = Path(output_path)
tracking_job_csv = base_output_dir / 'job_tracker.csv'

# The manager resolves the path of each asset through `generate_output_path`
manager = GFMAPJobManager(
    output_dir=base_output_dir,
    output_path_generator=generate_output_path,
    collection_id="SENTINEL2_L2A",
    poll_sleep=60,
    n_threads=2,
    post_job_params={}
)

# Register the OTC backend under the same name as stored in the `backend_name` column
manager.add_backend(
    Backend.OTC.value, otc_connection, parallel_jobs=6
)
# Launch the jobs described by `job_df`; each job is built by `create_datacube`
manager.run_jobs(job_df, create_datacube, tracking_job_csv)

2024-06-10 15:16:37,544|openeo_gfmap.manager|INFO:  Starting a fresh STAC collection.
2024-06-10 15:16:37,544|openeo_gfmap.manager|INFO:  Starting a fresh STAC collection.
2024-06-10 15:16:37,548|openeo_gfmap.manager|INFO:  Starting ThreadPoolExecutor with 2 workers.
2024-06-10 15:16:37,548|openeo_gfmap.manager|INFO:  Starting ThreadPoolExecutor with 2 workers.
2024-06-10 15:16:37,552|openeo_gfmap.manager|INFO:  Creating and running jobs.
2024-06-10 15:16:37,552|openeo_gfmap.manager|INFO:  Creating and running jobs.
2024-06-10 15:16:37,594|openeo_gfmap.manager|DEBUG:  Normalizing dataframe. Columns: Index(['backend_name', 'out_extension', 'start_date', 'end_date', 's2_tile',
       'geometry', 'nb_points', 'status', 'id', 'start_time', 'cpu', 'memory',
       'duration', 'description', 'costs'],
      dtype='object')
2024-06-10 15:16:37,594|openeo_gfmap.manager|DEBUG:  Normalizing dataframe. Columns: Index(['backend_name', 'out_extension', 'start_date', 'end_date', 's2_tile',
       'g

Authenticated using refresh token.


2024-06-10 15:18:41,836|openeo_gfmap.manager|DEBUG:  Status of job j-2406103019e44272bfb5c77983add778 is running (on backend otc).
2024-06-10 15:18:41,836|openeo_gfmap.manager|DEBUG:  Status of job j-2406103019e44272bfb5c77983add778 is running (on backend otc).
2024-06-10 15:19:42,784|openeo_gfmap.manager|DEBUG:  Status of job j-2406103019e44272bfb5c77983add778 is running (on backend otc).
2024-06-10 15:19:42,784|openeo_gfmap.manager|DEBUG:  Status of job j-2406103019e44272bfb5c77983add778 is running (on backend otc).
2024-06-10 15:21:09,774|openeo_gfmap.manager|DEBUG:  Status of job j-2406103019e44272bfb5c77983add778 is running (on backend otc).
2024-06-10 15:21:09,774|openeo_gfmap.manager|DEBUG:  Status of job j-2406103019e44272bfb5c77983add778 is running (on backend otc).
2024-06-10 15:22:10,511|openeo_gfmap.manager|DEBUG:  Status of job j-2406103019e44272bfb5c77983add778 is running (on backend otc).
2024-06-10 15:22:10,511|openeo_gfmap.manager|DEBUG:  Status of job j-2406103019e442