In [None]:
%load_ext autoreload
%autoreload 2


## Libraries

In [None]:
# Add root to path
import sys
import os
from pathlib import Path


sys.path.append("..")
from component.script.utilities.file_filter import (
    list_files_by_extension,
    filter_files_by_keywords,
)

## Set user parameters

In [None]:
project_name = "test"


In [None]:
user_defined_epsg_code = None


## Describe variables

In [None]:
forest_source = "gfc"
tree_cover_threshold = 10
years: list[int] = [2015, 2020, 2024]


In [None]:
calculate_forest_loss = True
calculate_forest_loss_stack = True


In [None]:
vector_binary = ["aoi"]
vector_unique_values = None

In [None]:
raster_categorical_variables = ["towns", "rivers", "roads", "pa", forest_source]
raster_continuos_variables = ["altitude", "slope"]

In [None]:
raster_edge = ["forest"] # from class 1 (forest) to 0 (non-forest)
raster_distance = ["rivers", "roads", "town"] # from class 0 to 1

## Connect folders

In [None]:
root_folder: Path = Path.cwd().parent
downloads_folder: Path = root_folder / "data"
downloads_folder.mkdir(parents=True, exist_ok=True)


In [None]:
project_folder = downloads_folder / project_name
project_folder.mkdir(parents=True, exist_ok=True)
data_raw_folder = project_folder / "data_raw"
data_raw_folder.mkdir(parents=True, exist_ok=True)
processed_data_folder = project_folder / "data"
processed_data_folder.mkdir(parents=True, exist_ok=True)


## Calculate EPSG code

In [None]:
from component.script.geo_utils import calculate_utm_rioxarray

In [None]:
# Derive an EPSG code from the project's raw "<project_name>_subj.tif" raster.
calculated_epsg = calculate_utm_rioxarray(f"{data_raw_folder}/{project_name}_subj.tif")
calculated_epsg

In [None]:
# A user-supplied EPSG code takes precedence; otherwise fall back to the computed one.
epsg_code = (
    calculated_epsg if user_defined_epsg_code is None else user_defined_epsg_code
)


In [None]:
epsg_code

## Define base raster

In [None]:
# Base raster that all the other layers are aligned to
base_file_raster = f"{data_raw_folder}/{project_name}_subj.tif"
base_file_raster


## Reproject Base file

In [None]:
from osgeo import gdal

def reproject_raster_gdal_warp(
    input_file: str,
    output_file: str,
    target_epsg: str,
    resolution: int | float = 30,
    resampling_method: str = "near",
) -> None:
    """
    Reproject a raster with ``gdal.Warp`` and save it as a DEFLATE-compressed GeoTIFF.

    Parameters:
    input_file (str or path-like): The path to the input raster file.
    output_file (str or path-like): The path where the reprojected raster file will be saved.
    target_epsg (str or int): Target coordinate reference system. A bare numeric
        code (``32633`` or ``"32633"``) is expanded to ``"EPSG:32633"``; any other
        value is handed to GDAL unchanged (e.g. ``"EPSG:32633"``).
    resolution (int | float): Output pixel size, used for both ``xRes`` and ``yRes``.
    resampling_method (str): GDAL resampling algorithm name (e.g. ``"near"``,
        ``"bilinear"``).

    Returns:
    None

    Raises:
    FileNotFoundError: If GDAL cannot open ``input_file``.
    RuntimeError: If ``gdal.Warp`` does not return an output dataset.
    """
    # Normalise path-like inputs to plain strings before handing them to GDAL.
    input_path = str(input_file)
    output_path = str(output_file)

    # A bare EPSG number is not a usable SRS definition for GDAL; add the authority prefix.
    dst_srs = target_epsg
    if str(target_epsg).strip().isdigit():
        dst_srs = f"EPSG:{int(str(target_epsg).strip())}"

    # Open the input dataset only to read its projection, then release it.
    dataset = gdal.Open(input_path)
    if dataset is None:
        raise FileNotFoundError(f"Input file {input_file} not found.")
    src_proj = dataset.GetProjection()
    dataset = None

    # NOTE(review): "overwrite" is kept from the original code, but it does not look
    # like a documented GDAL warp option (those are NAME=VALUE pairs) - confirm.
    param = gdal.WarpOptions(
        warpOptions=["overwrite"],
        srcSRS=src_proj,
        dstSRS=dst_srs,
        targetAlignedPixels=True,  # snap the output extent to multiples of the resolution
        resampleAlg=resampling_method,
        xRes=resolution,
        yRes=resolution,
        multithread=True,
        creationOptions=[
            "COMPRESS=DEFLATE",
            "PREDICTOR=2",
            "BIGTIFF=YES",
        ],
    )

    # Perform reprojection and make a silent failure visible.
    warped = gdal.Warp(output_path, input_path, format="GTiff", options=param)
    if warped is None:
        raise RuntimeError(f"gdal.Warp failed to create {output_path}.")

    # Dropping the reference closes the output dataset and flushes it to disk.
    warped = None


In [None]:
# Build the output path of the reprojected base raster inside the processed-data folder.
base_file_raster = Path(base_file_raster)
base_name = base_file_raster.stem
reprojected_file_path = Path(processed_data_folder) / f"{base_name}_reprojected.tif"

# reproject_raster_gdal_warp is annotated to take string paths, so pass str(...)
# rather than Path objects. It returns None: reprojected_base_file is kept only
# for backward compatibility, and the result lives at reprojected_file_path.
reprojected_base_file = reproject_raster_gdal_warp(
    str(base_file_raster),
    str(reprojected_file_path),
    epsg_code,
    resolution=30.0,
)

In [None]:
import rioxarray
import odc.geo.xr  # do not delete this


def get_geobox(tif_file: str = None):
    """Open ``tif_file`` lazily with rioxarray and return its ``.odc.geobox``.

    The ``.odc`` accessor relies on the ``odc.geo.xr`` import made elsewhere in
    this notebook.
    """
    open_options = {"chunks": "auto", "cache": False, "lock": False}
    return rioxarray.open_rasterio(tif_file, **open_options).odc.geobox


In [None]:
base_geobox = get_geobox(reprojected_file_path)
base_geobox


## Calculate Forest Loss 

In [None]:
import xarray as xr
import rioxarray
import numpy as np


def process_forest_loss_xarray(input1_path, input2_path, output_path):
    """
    Compare two forest rasters and write a forest-loss GeoTIFF.

    Output pixel values (uint8):
    - 0: ``input1 == 1`` and ``input2 == 0``
    - 1: ``input1 == 1`` and ``input2 == 1``
    - 255: every other case (written as the nodata value)

    Pixels equal to a raster's declared nodata value are never classified as 0 or 1.

    Parameters:
    input1_path: Path to the first raster.
    input2_path: Path to the second raster.
    output_path: Path of the GeoTIFF to write.

    Raises:
    ValueError: If the bounds of input1 do not contain the bounds of input2.
    """
    # Open both rasters lazily and drop length-1 dimensions.
    open_options = {"chunks": "auto", "cache": False, "lock": False}
    input1 = rioxarray.open_rasterio(input1_path, **open_options).squeeze()
    input2 = rioxarray.open_rasterio(input2_path, **open_options).squeeze()

    # Bounds are (left, bottom, right, top); input1 must contain input2.
    left1, bottom1, right1, top1 = input1.rio.bounds()
    left2, bottom2, right2, top2 = input2.rio.bounds()
    if not (
        left1 <= left2 and right1 >= right2 and top1 >= top2 and bottom1 <= bottom2
    ):
        raise ValueError(
            "The bounds of input1 must be equal to or larger than those of input2."
        )

    # Build the validity conditions only for rasters that declare a nodata value,
    # instead of comparing the data against None.
    nodata1 = input1.rio.nodata
    nodata2 = input2.rio.nodata

    forest_at_start = input1 == 1
    if nodata1 is not None:
        forest_at_start = forest_at_start & (input1 != nodata1)

    lost = forest_at_start & (input2 == 0)
    kept = forest_at_start & (input2 == 1)
    if nodata2 is not None:
        valid2 = input2 != nodata2
        lost = lost & valid2
        kept = kept & valid2

    # 0 = lost, 1 = still forest, 255 = nodata for everything else.
    output = xr.where(lost, 0, xr.where(kept, 1, 255)).astype("uint8")

    # Set proper metadata
    output.rio.write_nodata(255, inplace=True)
    output.rio.write_crs(input1.rio.crs, inplace=True)
    output.rio.write_transform(input1.rio.transform(), inplace=True)

    output.rio.to_raster(
        output_path,
        driver="GTiff",
        compress="DEFLATE",
        predictor=2,
        bigtiff="YES",
        tiled=True,
    )


In [None]:
import re

# List all raster files in the input folder
forest_raster_files = list_files_by_extension(data_raw_folder, [".tiff", ".tif"])


# Define the words to filter by. The threshold keyword is added only when a
# threshold is set; a threshold of 0 or None means "no threshold keyword", so
# filter_words is always defined (previously 0 left it undefined or stale).
filter_words = ["forest", forest_source]
if tree_cover_threshold:
    filter_words.append(str(tree_cover_threshold))

# NOTE(review): the positional arguments (False, ["loss"], True) are passed
# through unchanged; presumably ["loss"] is a list of keywords to exclude -
# confirm against filter_files_by_keywords.
filtered_raster_files = filter_files_by_keywords(
    forest_raster_files,
    filter_words,
    False,
    ["loss"],
    True,
)


# Function to extract the year from a filename
def extract_year(filename):
    """Return the first run of four digits in the file's base name as an int.

    Only the base name is searched, so digits in parent folders are ignored.
    Returns None when the base name has no four consecutive digits.
    """
    base_name = os.path.basename(filename)
    found = re.search(r"\d{4}", base_name)
    if found is None:
        return None
    return int(found.group())


# Sort the filtered raster files by the year in their name, in ascending order.
# A file without a year would give a None sort key and an opaque TypeError,
# so report such files explicitly before sorting.
_files_with_year = [(extract_year(file), file) for file in filtered_raster_files]
_files_without_year = [file for year, file in _files_with_year if year is None]
if _files_without_year:
    raise ValueError(
        f"No 4-digit year found in the name of: {_files_without_year}. "
        "Forest rasters cannot be sorted chronologically."
    )

# Sort on the year only, so files sharing a year keep their original order.
sorted_raster_files = [
    file for _, file in sorted(_files_with_year, key=lambda pair: pair[0])
]

sorted_raster_files  # Print the sorted list to verify


In [None]:
from component.script.utilities.file_helpers import generate_output_filename_loss


if calculate_forest_loss is True:
    # The three comparisons below need three forest rasters (one per date).
    if len(sorted_raster_files) < 3:
        raise ValueError(
            "Forest loss needs three forest rasters, "
            f"found {len(sorted_raster_files)}: {sorted_raster_files}"
        )

    # (start, end) index pairs into sorted_raster_files: 1st-2nd, 1st-3rd, 2nd-3rd.
    forest_loss_filenames = []
    for start_index, end_index in ((0, 1), (0, 2), (1, 2)):
        start_raster = sorted_raster_files[start_index]
        end_raster = sorted_raster_files[end_index]
        loss_filename = generate_output_filename_loss(start_raster, end_raster)
        # Skip the computation when the output already exists.
        if not Path(loss_filename).exists():
            process_forest_loss_xarray(start_raster, end_raster, loss_filename)
        forest_loss_filenames.append(loss_filename)

    # Keep the individual names that earlier versions of this cell defined.
    (
        forest_loss1_filename,
        forest_loss2_filename,
        forest_loss3_filename,
    ) = forest_loss_filenames
    print("Done!")


In [None]:
import numpy as np
import rasterio


def generate_deforestation_raster(
    raster1_path, raster2_path, raster3_path, output_path
):
    """
    Generate a deforestation raster from three input rasters.

    Output pixel values (uint8, written with nodata = 0):
    - 1: raster1 == 1 and raster2 == 0 (deforestation in period 1-2)
    - 2: raster2 == 1 and raster3 == 0 (deforestation in period 2-3)
    - 3: raster3 == 1 where neither of the above applies (remaining forest)
    - 0: everything else

    Parameters:
    - raster1_path: Path to the first raster file (period 1).
    - raster2_path: Path to the second raster file (period 2).
    - raster3_path: Path to the third raster file (period 3).
    - output_path: Path to save the output raster file.

    Raises:
    - ValueError: If the first bands of the three rasters differ in shape.
    """

    # Open the input rasters
    with (
        rasterio.open(raster1_path) as src1,
        rasterio.open(raster2_path) as src2,
        rasterio.open(raster3_path) as src3,
    ):
        # Read the first band of each raster into numpy arrays
        raster1 = src1.read(1)
        raster2 = src2.read(1)
        raster3 = src3.read(1)

        # Take the output metadata while src1 is still open; reading it after
        # the with-block would access a closed dataset.
        meta = src1.meta.copy()

    # The pixel-wise comparisons below need identical grids.
    if not (raster1.shape == raster2.shape == raster3.shape):
        raise ValueError(
            "Input rasters must have the same shape, got "
            f"{raster1.shape}, {raster2.shape} and {raster3.shape}."
        )

    # Create an output array initialized with NoData value (0)
    output_raster = np.zeros_like(raster1, dtype=np.uint8)

    # Set the values based on deforestation periods
    output_raster[(raster1 == 1) & (raster2 == 0)] = 1  # Deforestation in period 1-2
    output_raster[(raster2 == 1) & (raster3 == 0)] = 2  # Deforestation in period 2-3
    # Set the remaining forest value only where no deforestation has been marked
    output_raster[(output_raster == 0) & (raster3 == 1)] = 3

    # Define the metadata for the output raster
    meta.update({"count": 1, "dtype": np.uint8, "nodata": 0, "compress": "deflate"})

    # Write the output raster to a file
    with rasterio.open(output_path, "w", **meta) as dst:
        dst.write(output_raster, 1)

    print(f"Done in {output_path}")


In [None]:
from component.script.utilities.file_helpers import generate_output_filename_stack


if calculate_forest_loss_stack is True:
    # Output name derived from the three forest rasters, in chronological order.
    total_forest_loss_filename = generate_output_filename_stack(
        sorted_raster_files[0], sorted_raster_files[1], sorted_raster_files[2]
    )
    # Only build the stacked raster when it is not on disk yet.
    if Path(total_forest_loss_filename).exists():
        print(f"File already exists in: {total_forest_loss_filename}")
    else:
        total_forest_loss = generate_deforestation_raster(
            sorted_raster_files[0],
            sorted_raster_files[1],
            sorted_raster_files[2],
            total_forest_loss_filename,
        )
    print("Done!")


## Reproject and Rasterize Vector Data

In [None]:
def xr_rasterize(
    shapefile_path: str = None,
    geobox=None,
    crs=None,
    output_path: str = None,
    mode: str = "binary",
    **rasterio_kwargs,
):
    """
    Rasterizes a vector shapefile into a raster array.

    This function provides unified functionality for both binary and unique ID rasterization.

    Parameters
    ----------
    shapefile_path : str
        Path to the input shapefile containing vector data.
    geobox : odc.geo.geobox.GeoBox
        The spatial template defining the shape, transform and CRS of the
        output raster.
    crs : str or CRS object, optional
        Fallback CRS used to reproject the vector data when ``geobox.crs``
        is ``None`` (e.g. 'EPSG:3577').
    output_path : string, optional
        If given, the rasterized data is exported to this path as a
        DEFLATE-compressed, tiled GeoTIFF. If ``None``, nothing is written.
    mode : str, optional
        Rasterization mode: 'binary' or 'unique'.
        - 'binary': burns the value 1 for every feature, 0 elsewhere (default)
        - 'unique': burns a unique integer ID (1..N, in file order) per feature
    **rasterio_kwargs :
        A set of keyword arguments to ``rasterio.features.rasterize``.
        Can include: 'all_touched', 'merge_alg', 'dtype'.
        When 'dtype' is not given, 'uint8' is used, except in 'unique' mode
        with more than 255 features, where the smallest of 'uint16'/'uint32'
        able to hold every ID is chosen.

    Returns
    -------
    da_rasterized : xarray.DataArray
        The rasterized vector data.

    Raises
    ------
    ValueError
        If ``mode`` is not 'binary' or 'unique', or if no CRS is available
        from either ``geobox`` or ``crs``.
    """

    import geopandas as gpd
    from odc.geo.xr import wrap_xr

    # Import the submodule explicitly: a bare `import rasterio` does not
    # guarantee that `rasterio.features` is loaded.
    from rasterio import features

    if mode not in ("binary", "unique"):
        raise ValueError("Mode must be either 'binary' or 'unique'")

    # Read the shapefile
    gdf = gpd.read_file(filename=shapefile_path, engine="fiona")

    # Reproject vector data to the raster's CRS, falling back to `crs`
    target_crs = geobox.crs if geobox.crs is not None else crs
    if target_crs is None:
        raise ValueError("No CRS available: geobox has no CRS and `crs` was not given.")
    gdf_reproj = gdf.to_crs(crs=target_crs)

    # A caller-supplied dtype wins; popping it avoids passing `dtype` twice.
    dtype = rasterio_kwargs.pop("dtype", None)

    if mode == "binary":
        # Bare geometries are burned with rasterize's default value (1).
        shapes = gdf_reproj.geometry
        if dtype is None:
            dtype = "uint8"
    else:
        # Pair every geometry with a unique integer ID starting from 1.
        feature_count = len(gdf_reproj)
        shapes = list(zip(gdf_reproj.geometry, range(1, feature_count + 1)))
        if dtype is None:
            # uint8 only holds IDs up to 255; widen when there are more features.
            if feature_count <= 255:
                dtype = "uint8"
            elif feature_count <= 65535:
                dtype = "uint16"
            else:
                dtype = "uint32"

    # Rasterize shapes into a numpy array
    im = features.rasterize(
        shapes=shapes,
        out_shape=geobox.shape,
        transform=geobox.transform,
        dtype=dtype,
        **rasterio_kwargs,
    )

    # Convert numpy array to a full xarray.DataArray
    da_rasterized = wrap_xr(im=im, gbox=geobox)

    # Export only when an output path was requested. The `.rio` accessor relies
    # on rioxarray having been imported elsewhere in the notebook.
    if output_path is not None:
        da_rasterized.rio.to_raster(
            output_path,
            driver="GTiff",
            compress="DEFLATE",
            predictor=2,
            bigtiff="YES",
            tiled=True,
        )

    return da_rasterized


In [None]:
from component.script.geo_utils import reproject_shapefile


def rasterize_shp_files(input_folder, output_folder, geobox):
    """
    Rasterize the configured shapefiles and reproject the AOI shapefile(s).

    Shapefiles in ``input_folder`` selected by the notebook-level
    ``vector_binary`` keywords are rasterized with ``xr_rasterize`` in
    'binary' mode, and those selected by ``vector_unique_values`` in 'unique'
    mode. Each output is written to ``output_folder`` as
    ``<name>_reprojected.tif``. Shapefiles selected by the keyword "aoi" are
    additionally reprojected with ``reproject_shapefile`` to
    ``<name>_reprojected.shp``.

    Parameters:
    input_folder (str): The path to the folder containing .shp files.
    output_folder (str): The path to the folder where outputs will be saved.
    geobox: Spatial template passed to ``xr_rasterize``; its CRS
        (``geobox.crs.to_epsg()``) is the target of the AOI reprojection.
    """

    print("Processing vector data...")
    shp_files = list_files_by_extension(input_folder, [".shp"])

    print(f"There's {len(shp_files)} shape files...")

    # One pass per rasterization mode; a None or empty keyword list disables it.
    for keywords, mode in (
        (vector_binary, "binary"),
        (vector_unique_values, "unique"),
    ):
        if keywords is None or len(keywords) == 0:
            continue

        for shp_file in filter_files_by_keywords(shp_files, keywords):
            # Name the output raster after the shapefile (without extension)
            base_name = os.path.splitext(os.path.basename(shp_file))[0]
            tif_path = os.path.join(output_folder, f"{base_name}_reprojected.tif")
            xr_rasterize(
                shapefile_path=shp_file,
                geobox=geobox,
                output_path=tif_path,
                mode=mode,
            )

    # The AOI is also kept as a reprojected vector file.
    for shp_file in filter_files_by_keywords(shp_files, ["aoi"]):
        base_name = os.path.splitext(os.path.basename(shp_file))[0]
        aoi_vector_reprojected = os.path.join(
            output_folder, f"{base_name}_reprojected.shp"
        )
        reproject_shapefile(
            shp_file,
            aoi_vector_reprojected,
            geobox.crs.to_epsg(),
        )

    print("Vector processing done!")


In [None]:
rasterize_shp_files(data_raw_folder, processed_data_folder, base_geobox)


## Reproject Raster Data

In [None]:
def reproject_tiff_files_near(input_folder, tif_folder, target_epsg):
    """
    Reproject the categorical rasters using the default ("near") resampling.

    Every .tif/.tiff file in ``input_folder`` whose name contains one of the
    notebook-level ``raster_categorical_variables`` keywords (compared
    case-insensitively) is reprojected with ``reproject_raster_gdal_warp`` and
    saved to ``tif_folder`` as ``<name>_reprojected.tif``.

    Parameters:
    input_folder (str): The path to the folder containing the input .tif files.
    tif_folder (str): The path to the folder where .tif files will be saved.
    target_epsg: Target coordinate reference system, passed on to
        ``reproject_raster_gdal_warp``.

    """
    print("Processing...")
    raster_files = list_files_by_extension(input_folder, [".tiff", ".tif"])

    # File names are lowercased before matching, so lowercase the keywords too.
    keywords = [word.lower() for word in raster_categorical_variables]

    # Keep the files whose name contains any of the keywords
    selected_files = [
        file
        for file in raster_files
        if any(keyword in os.path.basename(file).lower() for keyword in keywords)
    ]

    for raster_file in selected_files:
        # Name the output after the input file (without extension)
        base_name = os.path.splitext(os.path.basename(raster_file))[0]
        tif_path = os.path.join(tif_folder, f"{base_name}_reprojected.tif")
        # Resolution and resampling method are left at the callee's defaults.
        reproject_raster_gdal_warp(
            raster_file,
            tif_path,
            target_epsg,
        )

    print("Done!")


In [None]:
def reproject_tiff_files_bilinear(input_folder, tif_folder, target_epsg):
    """
    Reproject the continuous rasters using bilinear resampling.

    Every .tif/.tiff file in ``input_folder`` whose name contains one of the
    notebook-level ``raster_continuos_variables`` keywords (compared
    case-insensitively) is reprojected with ``reproject_raster_gdal_warp`` and
    saved to ``tif_folder`` as ``<name>_reprojected.tif``.

    Parameters:
    input_folder (str): The path to the folder containing the input .tif files.
    tif_folder (str): The path to the folder where .tif files will be saved.
    target_epsg: Target coordinate reference system, passed on to
        ``reproject_raster_gdal_warp``.

    """
    print("Processing...")

    raster_files = list_files_by_extension(input_folder, [".tiff", ".tif"])

    # File names are lowercased before matching, so lowercase the keywords too.
    keywords = [word.lower() for word in raster_continuos_variables]

    # Keep the files whose name contains any of the keywords
    selected_files = [
        file
        for file in raster_files
        if any(keyword in os.path.basename(file).lower() for keyword in keywords)
    ]

    for raster_file in selected_files:
        # Name the output after the input file (without extension)
        base_name = os.path.splitext(os.path.basename(raster_file))[0]
        tif_path = os.path.join(tif_folder, f"{base_name}_reprojected.tif")
        # Resolution is left at the callee's default.
        reproject_raster_gdal_warp(
            raster_file,
            tif_path,
            target_epsg,
            resampling_method="bilinear",
        )

    print("Done!")


In [None]:
reproject_tiff_files_near(data_raw_folder, processed_data_folder, epsg_code) # for categorical data


In [None]:
reproject_tiff_files_bilinear(data_raw_folder, processed_data_folder, epsg_code) # for continuous


## Calculate distance

In [None]:
def distance_to_edge_gdal_no_mask(
    input_file,
    dist_file,
    values=0,
    nodata=0,
    max_distance_value=4294967295,
    input_nodata=True,
    verbose=False,
):
    """Compute the distance from every pixel to the nearest target pixel.

    Runs ``gdal.ComputeProximity`` on band 1 of ``input_file`` and writes the
    result to ``dist_file``, a UInt32 DEFLATE-compressed GeoTIFF with the same
    size, geotransform and projection as the input. Distances use
    ``DISTUNITS=GEO``.

    Parameters:
    input_file: Path to the input raster.
    dist_file: Path of the distance raster to create.
    values: Target pixel value(s), passed as the ``VALUES`` option.
    nodata: Value passed as the ``NODATA`` option of the proximity algorithm.
    max_distance_value: Value passed as ``MAXDIST``; also set as the nodata
        value of the output band.
    input_nodata: Passed as ``USE_INPUT_NODATA`` ("YES" when True, else "NO").
    verbose: When True, report progress with ``gdal.TermProgress_nocb``.

    Raises:
    FileNotFoundError: If GDAL cannot open ``input_file``.
    RuntimeError: If the output raster cannot be created.
    """

    # Read input file (path-like values are normalised to str for GDAL)
    src_ds = gdal.Open(str(input_file))
    if src_ds is None:
        raise FileNotFoundError(f"Input file {input_file} not found.")
    srcband = src_ds.GetRasterBand(1)

    # Create raster of distance on the same grid as the input
    drv = gdal.GetDriverByName("GTiff")
    dst_ds = drv.Create(
        str(dist_file),
        src_ds.RasterXSize,
        src_ds.RasterYSize,
        1,
        gdal.GDT_UInt32,
        ["COMPRESS=DEFLATE", "PREDICTOR=2", "BIGTIFF=YES"],
    )
    if dst_ds is None:
        raise RuntimeError(f"Could not create output raster {dist_file}.")
    dst_ds.SetGeoTransform(src_ds.GetGeoTransform())
    dst_ds.SetProjection(src_ds.GetProjection())
    dstband = dst_ds.GetRasterBand(1)

    # Build the option strings for the proximity algorithm
    proximity_options = [
        "VALUES=" + str(values),
        "USE_INPUT_NODATA=" + ("YES" if input_nodata else "NO"),
        "MAXDIST=" + str(max_distance_value),
        "NODATA=" + str(nodata),
        "DISTUNITS=GEO",
    ]

    # Compute distance
    cb = gdal.TermProgress_nocb if verbose else 0
    gdal.ComputeProximity(
        srcband,
        dstband,
        proximity_options,
        callback=cb,
    )

    # Set nodata value
    # NOTE(review): the band nodata is max_distance_value, while the algorithm's
    # NODATA option uses `nodata` - confirm this difference is intended.
    dstband.SetNoDataValue(max_distance_value)

    # Flush to disk
    dstband.FlushCache()
    dst_ds.FlushCache()

    # Clean up: dropping the references closes the datasets
    srcband = None
    dstband = None
    del src_ds, dst_ds


In [None]:
def calculate_edge_tif_files(input_folder, output_folder):
    """
    Compute a distance raster for every reprojected forest raster.

    Selects the .tif/.tiff files in ``input_folder`` whose name contains
    "forest", "reprojected" and the notebook-level ``forest_source`` (compared
    case-insensitively) but not "loss", and calls
    ``distance_to_edge_gdal_no_mask`` with its default arguments on each. The
    result is written to ``output_folder`` as ``<name>_edge.tif``.

    Files whose name already ends in ``_edge`` or ``_distance`` are skipped, so
    running this again with the same folder as input and output does not
    process its own outputs.

    Parameters:
    input_folder (str): The path to the folder containing tif files.
    output_folder (str): The path to the folder where .tif files will be saved.
    """
    # List all raster files in the input folder
    raster_files = list_files_by_extension(input_folder, [".tiff", ".tif"])

    # Words that must all be present (file names are lowercased before matching)
    filter_words = ["forest", "reprojected", forest_source.lower()]

    # Words that must not be present
    exclude_words = ["loss"]

    # Suffixes of rasters produced by the distance/edge steps themselves
    derived_suffixes = ("_edge", "_distance")

    filtered_raster_files = []
    for file in raster_files:
        file_name = os.path.basename(file).lower()
        if os.path.splitext(file_name)[0].endswith(derived_suffixes):
            continue
        if all(word in file_name for word in filter_words) and not any(
            exclude_word in file_name for exclude_word in exclude_words
        ):
            filtered_raster_files.append(file)

    # Process each filtered raster file
    for raster_file in filtered_raster_files:
        # Name the output after the input file (without extension)
        base_name = os.path.splitext(os.path.basename(raster_file))[0]
        tif_path = os.path.join(output_folder, f"{base_name}_edge.tif")
        distance_to_edge_gdal_no_mask(raster_file, tif_path)


In [None]:
def calculate_distance_tif_files(input_folder, output_folder):
    """
    Compute a distance raster for every raster matching ``raster_distance``.

    Selects the .tif/.tiff files in ``input_folder`` whose name contains any of
    the notebook-level ``raster_distance`` keywords (compared
    case-insensitively) and calls ``distance_to_edge_gdal_no_mask`` with
    ``values=1`` on each. The result is written to ``output_folder`` as
    ``<name>_distance.tif``.

    Files whose name already ends in ``_edge`` or ``_distance`` are skipped, so
    running this again with the same folder as input and output does not
    process its own outputs.

    Parameters:
    input_folder (str): The path to the folder containing tif files.
    output_folder (str): The path to the folder where .tif files will be saved.
    """
    # List all raster files in the input folder
    raster_files = list_files_by_extension(input_folder, [".tiff", ".tif"])

    # File names are lowercased before matching, so lowercase the keywords too.
    keywords = [word.lower() for word in raster_distance]

    # Suffixes of rasters produced by the distance/edge steps themselves
    derived_suffixes = ("_edge", "_distance")

    filtered_raster_files = []
    for file in raster_files:
        file_name = os.path.basename(file).lower()
        if os.path.splitext(file_name)[0].endswith(derived_suffixes):
            continue
        if any(keyword in file_name for keyword in keywords):
            filtered_raster_files.append(file)

    # Process each filtered raster file
    for raster_file in filtered_raster_files:
        # Name the output after the input file (without extension)
        base_name = os.path.splitext(os.path.basename(raster_file))[0]
        tif_path = os.path.join(output_folder, f"{base_name}_distance.tif")
        # Distance is measured to pixels with value 1
        distance_to_edge_gdal_no_mask(raster_file, tif_path, 1)


In [None]:
calculate_edge_tif_files(processed_data_folder, processed_data_folder)

In [None]:
calculate_distance_tif_files(processed_data_folder, processed_data_folder)