In [1]:
import os
from datetime import datetime as dt
from pathlib import Path
from pprint import pprint

import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio as rio
from rasterio import windows
from shapely import box

In [None]:
def clip_and_stack(entries, location, write_to):
    """Clip single-band rasters to a shared extent and stack them into one file.

    Every raster listed in ``entries`` is opened, windowed to the intersection
    of its own bounds with the bounding box of ``location`` (reprojected to the
    raster CRS), and band 1 of that window is read. The clipped bands are then
    written, in the order of ``entries``, to a new raster at ``write_to``; each
    output band is described with its key from ``entries``.

    Parameters
    ----------
    entries : pandas.Series or dict
        Maps a band name to the path of the raster holding that band.
    location : object exposing ``to_crs(crs).total_bounds``
        Area of interest; the notebook passes the frame read with geopandas.
    write_to : path-like
        Destination of the stacked raster.

    Raises
    ------
    TypeError
        If ``entries`` is neither a pandas Series nor a dictionary.
    ValueError
        If ``entries`` is empty, or a raster does not overlap ``location``.
    RuntimeError
        If the clipped rasters disagree on any profile entry (size, transform,
        CRS, dtype, ...), since they could not share one output profile.
    """
    if isinstance(entries, pd.Series):
        entries = entries.to_dict()
    elif not isinstance(entries, dict):
        raise TypeError(f"entries must be a pandas series or a dictionary, not {type(entries)}")

    if not entries:
        raise ValueError("entries is empty, there is nothing to clip and stack")

    # Retrieve the content of the rasters
    array_pairs, profile_pairs, tag_pairs = {}, {}, {}
    for raster_name, raster_path in entries.items():

        with rio.open(raster_path) as raster:

            profile = raster.profile

            # Shared extent: raster footprint ∩ bounding box of the location,
            # both expressed in the CRS of the raster. The `location` argument
            # is used here rather than any notebook-level variable.
            naive_bounds = box(*location.to_crs(profile["crs"]).total_bounds)
            raster_bounds = box(*raster.bounds)
            shared_extent = raster_bounds.intersection(naive_bounds)
            if shared_extent.is_empty or shared_extent.area == 0:
                raise ValueError(
                    f"{raster_name} ({raster_path}) does not overlap the requested location"
                )

            # Clipping window, then a profile describing only that window.
            # NOTE(review): relies on Window.round() being provided by the
            # installed rasterio version - confirm.
            window = windows.from_bounds(*shared_extent.bounds, profile["transform"])
            window = window.round()
            profile["transform"] = windows.transform(window, profile["transform"])
            profile["height"] = int(window.height)
            profile["width"] = int(window.width)

            # PARTIAL read: only the pixels inside the window are loaded
            array_pairs[raster_name] = raster.read(1, window=window)

            profile_pairs[raster_name] = pd.Series(profile)
            tag_pairs[raster_name] = pd.Series(raster.tags())

    # One row per raster, one column per profile entry / tag
    profile_frame = pd.concat(profile_pairs, axis=1).T
    tag_frame = pd.concat(tag_pairs, axis=1).T

    # All rasters must agree on every profile entry to share one output file
    nunique_profiles = profile_frame.nunique(axis=0)
    several_profiles = (nunique_profiles > 1)
    if several_profiles.any():
        offending_entries = nunique_profiles[several_profiles].index.tolist()
        raise RuntimeError(f"{offending_entries} have several possible values")

    # Rows are interchangeable after the check above: take the first one
    out_profile = profile_frame.iloc[0].to_dict()
    out_profile["count"] = len(array_pairs)

    # Keep only the tags carrying one single value across ALL rasters.
    # dropna=False makes a tag that is missing from some rasters count as a
    # second value, so it is dropped instead of possibly being written as NaN.
    nunique_tags = tag_frame.nunique(axis=0, dropna=False)
    relevant_tags = nunique_tags[nunique_tags == 1].index.tolist()
    tags = tag_frame[relevant_tags].iloc[0].to_dict()

    # Write the output file
    with rio.open(write_to, "w", **out_profile) as out_raster:

        # Per-band offsets and scales, in band (= entries) order, passed as
        # plain lists of floats rather than as pandas objects.
        if "add_offset" in tag_frame.columns:
            out_raster.offsets = tag_frame["add_offset"].astype(float).tolist()

        if "scale_factor" in tag_frame.columns:
            out_raster.scales = tag_frame["scale_factor"].astype(float).tolist()

        out_raster.update_tags(**tags)

        for band_idx, (band_name, band_data) in enumerate(array_pairs.items(), 1):
            out_raster.write(band_data, band_idx)
            out_raster.set_band_description(band_idx, band_name)

In [3]:
# Root folder holding the HLS downloads. Set the HLS_PARENT_DIR environment
# variable to run the notebook on another machine; the original local path
# remains the fallback so existing runs are unaffected.
PARENT_DIR = Path(os.environ.get("HLS_PARENT_DIR", r"/home/iborlafm/Downloads/Mozambique/hls"))

In [4]:
# Area of interest: read the extent file stored next to the downloads, and
# keep the geometry of the row labelled 0 at hand.
aoi_path = PARENT_DIR / "extentUTM.geojson"
aoi = gpd.read_file(aoi_path)
geom = aoi.loc[0, "geometry"]

## List the available files

In [5]:
path_pile, info_pile, name_pile = [], [], []

for product_path in PARENT_DIR.rglob("HLS.*tif"):

    # The dot-separated stem is reordered so that the field found at position 1
    # moves behind positions 2 and 3, and the two pieces at positions 4 and 5
    # are glued back into one version string. Whatever follows is the suffix.
    parts = product_path.stem.split(".")
    version = f"{parts[4]}.{parts[5]}"
    stem_info = [parts[0], parts[2], parts[3], parts[1], version, *parts[6:]]

    path_pile.append(product_path)
    info_pile.append(stem_info)

    # Observation name: every field but the trailing suffix
    name_pile.append("_".join(stem_info[:-1]))

columns = ["product", "tile", "time", "sensor", "version", "suffix"]

# "B8A" is relabelled "B08A" before sorting on the suffix
path_frame = (
    pd.DataFrame(info_pile, index=path_pile, columns=columns)
    .reset_index(names="path")
    .assign(stem=name_pile)
    .assign(suffix=lambda frame: frame["suffix"].replace("B8A", "B08A"))
    .sort_values(["stem", "suffix"], ascending=True)
)

## Reshape the frame, rename the bands, drop the non-shared bands

In [6]:
# Band suffix -> common band name, for the L30 and the S30 products
common_bands = {"B01": "CoastalAerosol", "B02": "Blue", "B03": "Green", "B04": "Red"}
landsat_bands = {**common_bands, "B05": "NIRnarrow", "B06": "SWIR1", "B07": "SWIR2"}
sentinel_bands = {**common_bands, "B08A": "NIRnarrow", "B11": "SWIR1", "B12": "SWIR2"}


def _pivot_sensor_bands(frame, sensor, unused_suffixes, band_names):
    """Return one row per observation (stem) and one column per kept band.

    Parameters
    ----------
    frame : pandas.DataFrame
        Long table with at least the "sensor", "stem", "suffix" and "path" columns.
    sensor : str
        Value of the "sensor" column to keep.
    unused_suffixes : list of str
        Suffixes to discard. Suffixes absent from the pivoted table are skipped
        silently, so a band that was never downloaded does not raise a KeyError.
    band_names : dict
        Maps the remaining suffixes to their output column names.
    """
    return (
        frame[frame["sensor"] == sensor]
        .pivot(index="stem", columns="suffix", values="path")
        .drop(columns=unused_suffixes, errors="ignore")
        .rename(columns=band_names)
    )


landsat_frame = _pivot_sensor_bands(
    path_frame, "L30", ["B09", "B10", "B11"], landsat_bands
)

# The S30 suffixes are dropped BEFORE renaming, so S30 "B05"-"B07" never get
# the names those suffixes carry in the L30 mapping.
sentinel_frame = _pivot_sensor_bands(
    path_frame, "S30", ["B05", "B06", "B07", "B08", "B09", "B10"], sentinel_bands
)

both_frame = pd.concat([landsat_frame, sentinel_frame], axis=0)

In [7]:
# Preview the first rows of the S30 table rather than rendering the whole frame
sentinel_frame.head()

suffix,CoastalAerosol,Blue,Green,Red,NIRnarrow,SWIR1,SWIR2,Fmask,SAA,SZA,VAA,VZA
stem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
HLS_T36KXE_2018239T073719_S30_v2.0,/home/iborlafm/Downloads/Mozambique/hls/2018/H...,/home/iborlafm/Downloads/Mozambique/hls/2018/H...,/home/iborlafm/Downloads/Mozambique/hls/2018/H...,/home/iborlafm/Downloads/Mozambique/hls/2018/H...,/home/iborlafm/Downloads/Mozambique/hls/2018/H...,/home/iborlafm/Downloads/Mozambique/hls/2018/H...,/home/iborlafm/Downloads/Mozambique/hls/2018/H...,/home/iborlafm/Downloads/Mozambique/hls/2018/H...,/home/iborlafm/Downloads/Mozambique/hls/2018/H...,/home/iborlafm/Downloads/Mozambique/hls/2018/H...,/home/iborlafm/Downloads/Mozambique/hls/2018/H...,/home/iborlafm/Downloads/Mozambique/hls/2018/H...
HLS_T36KXE_2024253T073611_S30_v2.0,/home/iborlafm/Downloads/Mozambique/hls/2024/H...,/home/iborlafm/Downloads/Mozambique/hls/2024/H...,/home/iborlafm/Downloads/Mozambique/hls/2024/H...,/home/iborlafm/Downloads/Mozambique/hls/2024/H...,/home/iborlafm/Downloads/Mozambique/hls/2024/H...,/home/iborlafm/Downloads/Mozambique/hls/2024/H...,/home/iborlafm/Downloads/Mozambique/hls/2024/H...,/home/iborlafm/Downloads/Mozambique/hls/2024/H...,/home/iborlafm/Downloads/Mozambique/hls/2024/H...,/home/iborlafm/Downloads/Mozambique/hls/2024/H...,/home/iborlafm/Downloads/Mozambique/hls/2024/H...,/home/iborlafm/Downloads/Mozambique/hls/2024/H...


## Clip and stack the rasters

In [8]:
# Columns passed together to clip_and_stack for the "*_bands.tif" outputs
selected_columns = [
    "Blue",
    "Green",
    "Red",
    "NIRnarrow",
    "SWIR1",
    "SWIR2",
]

In [9]:
for observation_name, observation_files in both_frame.iterrows():

    # Two outputs per observation, written in this order: the stacked
    # bands first, then the single-band Fmask.
    outputs = (
        (selected_columns, PARENT_DIR / f"{observation_name}_bands.tif"),
        (["Fmask"], PARENT_DIR / f"{observation_name}_Fmask.tif"),
    )

    for wanted_columns, target_path in outputs:
        clip_and_stack(
            entries=observation_files[wanted_columns],
            location=aoi,
            write_to=target_path,
        )

{'ACCODE': 'Lasrc; Lasrc',
 'AREA_OR_POINT': 'Area',
 'HLS_PROCESSING_TIME': '2022-01-13T15:21:32Z',
 'HORIZONTAL_CS_NAME': 'UTM, WGS84, UTM ZONE 36; UTM, WGS84, UTM ZONE 36',
 'L1_PROCESSING_TIME': '2020-08-31T05:57:46Z; 2020-08-31T05:57:45Z',
 'LANDSAT_PRODUCT_ID': 'LC08_L1TP_167072_20180903_20200831_02_T1; '
                       'LC08_L1TP_167073_20180903_20200831_02_T1',
 'LANDSAT_SCENE_ID': 'LC81670722018246LGN00; LC81670732018246LGN00',
 'MEAN_SUN_AZIMUTH_ANGLE': '51.269228008108',
 'MEAN_SUN_ZENITH_ANGLE': '39.4897898872913',
 'MEAN_VIEW_AZIMUTH_ANGLE': '110.513883291019',
 'MEAN_VIEW_ZENITH_ANGLE': '3.84456765958306',
 'NBAR_SOLAR_ZENITH': '37.7329480886891',
 'NCOLS': '3660',
 'NROWS': '3660',
 'OVR_RESAMPLING_ALG': 'NEAREST',
 'PROCESSING_LEVEL': 'L1TP; L1TP',
 'SENSING_TIME': '2018-09-03T07:40:57.5233930Z; 2018-09-03T07:41:21.4652640Z',
 'SENSOR': 'OLI_TIRS; OLI_TIRS',
 'SENTINEL2_TILEID': '36KXE',
 'SPATIAL_RESOLUTION': '30',
 'TIRS_SSM_MODEL': 'FINAL; FINAL',
 'TIRS_SSM_