In [1]:
%load_ext autoreload
%autoreload 2

import sys
import logging
from pathlib import Path

scripts_dir = Path("../.").joinpath("src")
if scripts_dir not in sys.path:
    sys.path.insert(0, scripts_dir.resolve().as_posix())

formatter = logging.Formatter(
    "%(asctime)s - %(name)s - \x1b[38;20m %(levelname)s \x1b[0m - %(message)s"
)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger = logging.getLogger(__name__)
logger.addHandler(ch)
logger.setLevel(logging.DEBUG)
%matplotlib inline

mkdir -p failed for path /home/mambauser/.cache/matplotlib: [Errno 13] Permission denied: '/home/mambauser/.cache/matplotlib'
Matplotlib created a temporary cache directory at /tmp/matplotlib-ka2cqne3 because there was an issue with the default path (/home/mambauser/.cache/matplotlib); it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
from typing import Tuple, Iterable, Union, List, Dict, Callable

# import requests # We can download locally the data, is faster

import geopandas as gpd
import pandas as pd

from pyproj import CRS

import numpy as np
import rasterio as rio
from dask.distributed import LocalCluster, Client

from v2.utils import download_file, unzip_file, rm_tree
from v2.processing_grid import ComputationGrid
from v2.raster_processing import calculate_zonal_stats
from v2.vector_utils import add_bbox
from v2.vector_processing import spatial_dissolve, simplify_async, spatial_difference
from v2.raster_processing_v2 import process_raster, window_generator, process_job_chunk
from v2.datasets import RasterDataset


import tqdm
import concurrent.futures
import threading
import multiprocessing

In [3]:
# Paths and sources
data_folder = Path("../data")
sks_data_path = data_folder.joinpath("biomes")
sks_data_path.mkdir(parents=True, exist_ok=True)

## Functions

In [4]:
def lower_case_columns(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    gdf.columns = gdf.columns.str.lower()
    return gdf


def filter_by_extent(
    gdf: gpd.GeoDataFrame, extent: Tuple[float, float, float, float]
) -> gpd.GeoDataFrame:
    minx, miny, maxx, maxy = extent
    return gdf.cx[minx:maxx, miny:maxy].reset_index(drop=True)


def calculate_global(gdf, col_name="iso_3"):
    global__efgs = (
        gdf.groupby("category").agg({"count": "sum"}).reset_index().assign(**{col_name: "GLOB"})
    )
    return pd.concat([global__efgs, gdf], axis=0)

## Processing

### GADM

In [None]:
# To download the url needs to be signed in the browser

local_gadm_path = data_folder.joinpath("gadm/processed/gadm_preprocess.zip")
download_file(
    "https://storage.cloud.google.com/vector-data-raw/vizzuality_processed_data/gadm/preprocess/gadm_preprocess.zip",
    local_gadm_path,
)
unzip_file(local_gadm_path)

In [None]:
gadm_data_simp = gpd.read_file(
    data_folder.joinpath("gadm/processed/preprocess/gadm_preprocess.shp")
)

### wdpa

In [36]:
## this requires the wdpa data be run in the step 5 to generate the tpa_sjoin.shp
## this is the file that contains the dissolved boundaries
wdpa_data_path = data_folder.joinpath("mpa-terrestrial/processed/stats/mpas_sjoin.shp")
# wdpa_data_path = data_folder.joinpath("mpa-terrestrial/processed/tpa_sjoin.shp")


dissolved_gdf = (
    gpd.read_file(wdpa_data_path)
    .pipe(lower_case_columns)
    .pipe(filter_by_extent, (-181, -91, 181, 91))
)
dissolved_gdf

Unnamed: 0,wdpaid,wdpa_pid,pa_def,name,desig_eng,iucn_cat,marine,gis_area,status,status_yr,parent_iso,index_righ,country,gid_0,area_km2,country_es,country_fr,iso_3,geometry
0,17131.0,17131,1,Hamoun,Protected Area,V,0,3022.952813,Designated,1968.0,IRN,0.0,Afghanistan,AFG,644050.28,Afganistán,Afghanistan,AFG,"POLYGON ((61.19578 31.44834, 61.36976 31.42576..."
1,17160.0,17160,1,"Hamun-e-Puzak, south end","Ramsar Site, Wetland of International Importance",Not Reported,0,172.551965,Designated,1975.0,IRN,0.0,Afghanistan,AFG,644050.28,Afganistán,Afghanistan,AFG,"POLYGON ((61.75037 31.33179, 61.72363 31.35944..."
2,1118.0,1118,1,Dasht-i-Nawar,Waterfowl Sanctuary,IV,0,375.359815,Designated,2020.0,AFG,0.0,Afghanistan,AFG,644050.28,Afganistán,Afghanistan,AFG,"POLYGON ((67.76059 33.78497, 67.75989 33.78315..."
3,15133.0,15133,1,Kol-i-Hashmat Khan,Waterfowl Sanctuary,IV,0,1.665554,Designated,2017.0,AFG,0.0,Afghanistan,AFG,644050.28,Afganistán,Afghanistan,AFG,"MULTIPOLYGON (((69.20214 34.49681, 69.2028 34...."
4,555705308.0,555705308,1,Koh-e Baba (Shah Foladi),Protected Landscape,V,0,341.997539,Designated,2019.0,AFG,0.0,Afghanistan,AFG,644050.28,Afganistán,Afghanistan,AFG,"POLYGON ((67.99935 34.66263, 67.9932 34.66009,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289347,2531.0,2531,1,Mana Pools,National Park,II,0,2134.271397,Designated,1975.0,ZWE,0.0,Zimbabwe,ZWE,391234.88,Zimbabue,Zimbabwe,ZWE,"POLYGON ((29.56479 -15.6758, 29.56611 -15.6757..."
289348,2526.0,2526,1,Sapi,Safari Area,VI,0,1200.644367,Designated,1975.0,ZWE,0.0,Zimbabwe,ZWE,391234.88,Zimbabue,Zimbabwe,ZWE,"POLYGON ((29.88011 -15.67272, 29.87637 -15.679..."
289349,62095.0,62095,1,Chiawa,Game Management Area,VI,0,2413.162703,Designated,1989.0,ZMB,0.0,Zimbabwe,ZWE,391234.88,Zimbabue,Zimbabwe,ZWE,"MULTIPOLYGON (((29.10233 -15.86402, 29.09482 -..."
289350,7962.0,7962,1,Lower Zambezi,National Park,II,0,4161.873753,Designated,1983.0,ZMB,0.0,Zimbabwe,ZWE,391234.88,Zimbabue,Zimbabwe,ZWE,"MULTIPOLYGON (((30.2016 -15.65147, 30.20137 -1..."


In [37]:
removed_non_poly = (
    dissolved_gdf[dissolved_gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])]
    .reset_index(drop=True)
    .copy()
)

del dissolved_gdf

### Reclassify the raster

In [9]:
land_cover_classes = {
    1: "Forest",
    2: "Savanna",
    3: "Shrubland",
    4: "Grassland",
    5: "Wetlands/open water",
    6: "Rocky/mountains",
    7: "Desert",
    8: "Artificial",
    255: "Other",
}

In [None]:
def reclass_function(ndata: np.ndarray) -> np.ndarray:
    # Apply the value changes
    ndata = np.where(ndata < 200, 1, ndata)  # forest
    ndata = np.where((ndata >= 200) & (ndata < 300), 2, ndata)  # savanna
    ndata = np.where((ndata >= 300) & (ndata < 400), 3, ndata)  # scrub/shrub
    ndata = np.where((ndata >= 400) & (ndata < 500), 4, ndata)  # grassland
    ndata = np.where(ndata == 501, 5, ndata)  # open water - Wetlands/open water
    ndata = np.where(ndata == 505, 5, ndata)  # open water - Wetlands/open water
    ndata = np.where((ndata >= 500) & (ndata < 600), 5, ndata)  # wetlands - Wetlands/open water
    ndata = np.where(ndata == 984, 5, ndata)  # wetlands - Wetlands/open water
    ndata = np.where(ndata == 910, 5, ndata)  # wetlands - Wetlands/open water
    ndata = np.where((ndata >= 600) & (ndata < 800), 6, ndata)  # rocky/mountains
    ndata = np.where((ndata >= 800) & (ndata < 900), 7, ndata)  # desert
    ndata = np.where((ndata >= 1400) & (ndata < 1500), 8, ndata)  # ag/urban - Artificial

    # Ensure the ndata is within the 8-bit range

    return np.clip(ndata, 0, 255).astype(np.uint8)

In [11]:
def process_raster_v2(
    raster_path: Path,
    output_path: Path,
    func: Callable,
    out_data_profile,
    f_args: Tuple = (),
    f_kwargs: Dict = {},
) -> None:
    num_workers = 200
    window_size_x = 2048
    window_size_y = 2048
    with rio.open(raster_path.as_posix()) as src:
        # Create a destination dataset based on source params. The
        # destination will be tiled, and we'll process the tiles
        # concurrently.
        profile = src.profile.copy()
        profile.update(**out_data_profile)

        with rio.open(output_path.as_posix(), "w", **profile) as dst:
            windows = [window for ij, window in dst.block_windows()]
            # print(dst.block_shapes)
            # windows = list(
            #     window_generator(
            #         src.profile.get("width"),
            #         dst.profile.get("height"),
            #         window_size_x,
            #         window_size_y,
            #     )
            # )

            # We cannot write to the same file from multiple threads
            # without causing race conditions. To safely read/write
            # from multiple threads, we use a lock to protect the
            # DatasetReader/Writer
            read_lock = threading.Lock()
            write_lock = threading.Lock()

            def process(window):
                status_message = {
                    "diagnostics": {},
                    "messages": [f"Processing chunk: {window}"],
                    "return_val": None,
                }
                # read the chunk
                try:
                    status_message["messages"].append("reading data")

                    with read_lock:
                        data = src.read(window=window)

                    status_message["messages"].append("processing data")
                    result = func(data, *f_args, **f_kwargs)

                    status_message["messages"].append("writing data")
                    with write_lock:
                        dst.write(result, window=window)

                    status_message["messages"].append("success in processing chunk")

                except Exception as e:
                    status_message["diagnostics"]["error"] = e
                finally:
                    return status_message

            # We map the process() function over the list of
            # windows.

            futures = []

            with (
                concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor,
                tqdm.tqdm(total=len(windows), desc="Computing raster stats", unit="chunk") as p_bar,
            ):
                for idx, window in enumerate(windows):
                    futures.append(executor.submit(process, window))

                # if idx == break_after:
                #     break

                results = []
                for f in futures:
                    results.append(f.result())
                    p_bar.update(1)

            dst.build_overviews([2, 4, 8, 16, 32, 64], rio.enums.Resampling.mode)
            dst.update_tags(ns="rio_overview", resampling="average")

    # return results

In [12]:
def test_process_raster(
    input_path: Path,
    output_path: Path,
    fcn: Callable,
    out_data_profile: Dict | None = {},
    f_kwargs: Dict = {},
):
    with rio.Env(VSI_CACHE=True, GDAL_CACHEMAX=1024):
        i_raster = RasterDataset(input_path)
        new_profile = i_raster.profile.copy()
        new_profile.update(**out_data_profile) if out_data_profile else None
        o_raster = RasterDataset(output_path, profile=new_profile)

        window_size_y = 1024
        window_size_x = 1024
        print(i_raster.profile.get("width"), i_raster.profile.get("height"))
        print(
            (
                i_raster.profile.get("width")
                // window_size_x
                * i_raster.profile.get("height")
                // window_size_y
            )
            / 20
        )

        # window_chunks = list(
        #     window_generator(
        #         i_raster.profile.get("width"), i_raster.profile.get("height"), window_size_x, window_size_y
        #     )
        # )
        with o_raster._open_reader() as dst:
            window_chunks = [window for ij, window in dst.block_windows()]
        break_after = 100
        futures = []

        with multiprocessing.Manager() as manager:
            # create the shared lock
            lock = manager.Lock()
            with (
                concurrent.futures.ProcessPoolExecutor(max_workers=20) as executor,
                tqdm.tqdm(
                    total=len(window_chunks), desc="Computing raster stats", unit="chunk"
                ) as p_bar,
            ):
                for idx, window in enumerate(window_chunks):
                    futures.append(
                        executor.submit(
                            process_job_chunk, i_raster, o_raster, window, fcn, lock, **f_kwargs
                        )
                    )

                    # if idx == break_after:
                    #     break

                results = []
                for f in futures:
                    results.append(f.result())
                    p_bar.update(1)

    return results

In [35]:
local_biome_raster_path = data_folder.joinpath(
    "biomes/iucn_habitatclassification_composite_lvl1_ver004.tif"
)
reclassified_biome_raster_path = output_path
output_path = local_biome_raster_path.parent.joinpath(local_biome_raster_path.stem + "_reclass.tif")

output_path.unlink(missing_ok=True)

out_data_profile = {
    "dtype": rio.uint8,
    "count": 1,
    "compress": "lzw",
    "tiled": True,
    "blockxsize": 512,
    "blockysize": 512,
}

In [None]:
# To download the url needs to be signed in the browser
download_file(
    "https://storage.cloud.google.com/vector-data-raw/terrestrial/jung_etal_2020/iucn_habitatclassification_composite_lvl1_ver004.tif",
    local_biome_raster_path,
)


In [None]:
## this was the old way of processing the raster
# test_process_raster(local_biome_raster_path, output_path, reclass_function, out_data_profile)

In [19]:
# reclass the original raster
process_raster_v2(local_biome_raster_path, output_path, reclass_function, out_data_profile)

Computing raster stats: 100%|██████████| 306936/306936 [06:54<00:00, 739.67chunk/s] 


In [None]:
# another test to try the same operation in parallel with multi threading

# with LocalCluster(name="test", n_workers=20, processes=True, threads_per_worker=1) as cluster, Client(cluster) as client:
#     print(f"Processing in parallel: {client.dashboard_link}")

#     data = process_raster(
#         local_biome_raster_path,
#         output_path,
#         out_data_profile,
#         reclass_function,
#         dask_client=client,
#         window_size=512,
#     )
#     list(data)

### Computation grids 

In [38]:
wdpa_grid = ComputationGrid(
    bounds=[-180, -90, 180, 90], max_cell_size=10, crs=CRS.from_epsg(4326), grid_type="sparse"
)
region_grid = ComputationGrid(
    bounds=[-180, -90, 180, 90], max_cell_size=10, crs=CRS.from_epsg(4326), grid_type="dense"
)

In [39]:
wdpa_grid.create_gdf_density_based_grid(removed_non_poly, 5000)

Unnamed: 0,geometry
0,"POLYGON ((-170 -90, -170 -80, -180 -80, -180 -..."
1,"POLYGON ((-170 -50, -170 -40, -180 -40, -180 -..."
2,"POLYGON ((-170 -40, -170 -30, -180 -30, -180 -..."
3,"POLYGON ((-170 -30, -170 -20, -180 -20, -180 -..."
4,"POLYGON ((-170 -20, -170 -10, -180 -10, -180 -..."
...,...
436,"POLYGON ((180 -20, 180 -10, 170 -10, 170 -20, ..."
437,"POLYGON ((180 -10, 180 0, 170 0, 170 -10, 180 ..."
438,"POLYGON ((180 50, 180 60, 170 60, 170 50, 180 ..."
439,"POLYGON ((180 60, 180 70, 170 70, 170 60, 180 ..."


In [40]:
dissolved_gdf = await spatial_dissolve(removed_non_poly, wdpa_grid, "iso_3", "first")

Disolving dataset elements: 100%|██████████| 441/441 [01:13<00:00,  5.99chunk/s]


In [None]:
dissolved_simp_gdf = await simplify_async(dissolved_gdf, True)

simplifying:   0%|          | 0/944 [00:00<?, ?row/s]

In [24]:
dissolved_simp_gdf.to_file(sks_data_path.joinpath("pas_sjoin_dissolved_simp.shp"))

## Raster zonal statistics

#### Process

In [None]:
dissolved_simp_gdf = gpd.read_file(sks_data_path.joinpath("pas_sjoin_dissolved_simp.shp"))

In [25]:
dissolved_simp_gdf.geometry.geom_type.unique()

array(['MultiPolygon', 'Polygon'], dtype=object)

In [26]:
filtered = dissolved_simp_gdf[dissolved_simp_gdf.geom_type.isin(["Polygon", "MultiPolygon"])][
    ["geometry", "iso_3"]
].reset_index(drop=True)

In [27]:
def convert_pix_to_area(df: gpd.GeoDataFrame, cell_size: float, col: str) -> gpd.GeoDataFrame:
    return df.assign({col: lambda x: x.col * cell_size * cell_size})

In [28]:
async def calc_protection(efg_raster_path: str, geometries: list, c_map: dict):
    calc = await calculate_zonal_stats(
        efg_raster_path, geometries, _with="exact", stats=["unique", "frac", "count"], c_map=c_map
    )
    final = (
        pd.concat(calc, axis=0)
        .groupby(["iso_3", "category"])
        .agg({"count": "sum"})
        .reset_index()
        .pipe(calculate_global)
        .rename(columns={"category": "habitats", "count": "protected"})
    )

    return final

In [29]:
async def calc_country_extent(efg_raster_path: str, geometries: list, c_map: dict):
    calc = await calculate_zonal_stats(
        efg_raster_path, geometries, _with="exact", stats=["unique", "frac", "count"], c_map=c_map
    )

    final = (
        pd.concat(calc, axis=0)
        .groupby(["iso_3", "category"])
        .agg({"count": "sum"})
        .reset_index()
        .pipe(calculate_global)
        .rename(columns={"category": "habitats", "count": "total"})
    )

    return final

In [33]:
wdpa_grid = ComputationGrid(
    bounds=[-180, -90, 180, 90], max_cell_size=3, crs=CRS.from_epsg(4326), grid_type="sparse"
)
wdpa_grid.create_gdf_density_based_grid(filtered.explode(), 5000)
list_subset = wdpa_grid.split_gdf_by_grid(filtered, True, 0)
# wdpa_grid.grid_gdf.explore()

In [32]:
protected = await calc_protection(reclassified_biome_raster_path, list_subset, land_cover_classes)

NameError: name 'reclassified_biome_raster_path' is not defined

In [None]:
del list_subset
del filtered
del wdpa_grid

In [31]:
protected

NameError: name 'protected' is not defined

In [42]:
protected

Unnamed: 0,habitats,protected,iso_3
0,Artificial,170986537.442105,GLOB
1,Desert,744586336.870439,GLOB
2,Forest,893482228.65953,GLOB
3,Grassland,493833720.441902,GLOB
4,Other,10395092.909473,GLOB
...,...,...,...
1363,Forest,577626.820158,ZWE
1364,Grassland,21742.292057,ZWE
1365,Savanna,8742367.01956,ZWE
1366,Shrubland,287596.872073,ZWE


In [43]:
gadm_data_simp.rename(columns={"GID_0": "iso_3"}, inplace=True)

In [None]:
region_grid = ComputationGrid(
    bounds=[-180, -90, 180, 90], max_cell_size=10, crs=CRS.from_epsg(4326), grid_type="dense"
)

regions_split = region_grid.split_gdf_by_grid(gadm_data_simp, True, 0)

In [45]:
location_efgs = await calc_country_extent(
    reclassified_biome_raster_path, regions_split, land_cover_classes
)

Computing raster stats: 100%|██████████| 467/467 [02:17<00:00,  3.39chunk/s]


In [46]:
location_efgs

Unnamed: 0,habitats,total,iso_3
0,Artificial,2814392766.029436,GLOB
1,Desert,11035130642.574265,GLOB
2,Forest,4894422034.068147,GLOB
3,Grassland,3532338753.845665,GLOB
4,Other,31294030.987612,GLOB
...,...,...,...
1493,Grassland,338488.994966,ZWE
1494,Rocky/mountains,51.0,ZWE
1495,Savanna,22879058.986613,ZWE
1496,Shrubland,5252410.077386,ZWE


In [47]:
del regions_split
del region_grid

In [None]:
def create_master_data_table(location_efgs: pd.DataFrame, protected: pd.DataFrame) -> pd.DataFrame:
    protection_extent = location_efgs.set_index(["iso_3", "habitats"]).join(
        protected.set_index(["iso_3", "habitats"])
    )
    protection_extent["frac"] = (protection_extent["protected"] / protection_extent["total"]) * 100

    return protection_extent.reset_index()

In [49]:
master_data_protection = create_master_data_table(location_efgs, protected)

In [50]:
def calc_area(df: pd.DataFrame) -> pd.DataFrame:
    df["perc_extent"] = (df["total"] / df["total_area"]) * 100
    return df

In [None]:
test = master_data_protection.join(
    master_data_protection.groupby("iso_3").agg({"total": "sum"}), on="iso_3", rsuffix="_area"
).pipe(calc_area)

In [53]:
test_old = pd.read_csv(sks_data_path.joinpath("master_data_protection_old.csv"))

In [58]:
test[test["iso_3"] == "ESP"]

Unnamed: 0,iso_3,habitats,total,protected,frac,total_area,perc_extent
426,ESP,Artificial,39106089.709432,8067029.669799,20.628577,66412410.121894,58.883708
427,ESP,Desert,150986.99668,85544.142813,56.656629,66412410.121894,0.227348
428,ESP,Forest,8971486.41535,4122894.487791,45.955534,66412410.121894,13.50875
429,ESP,Grassland,3406859.161202,1228491.804244,36.059366,66412410.121894,5.129853
430,ESP,Other,43549.407046,16077.294966,36.917368,66412410.121894,0.065574
431,ESP,Rocky/mountains,28369.999124,24333.198233,85.770881,66412410.121894,0.042718
432,ESP,Savanna,160968.845551,52803.595804,32.803612,66412410.121894,0.242378
433,ESP,Shrubland,14187310.96224,4820682.304261,33.97883,66412410.121894,21.36244
434,ESP,Wetlands/open water,356788.625269,214473.227766,60.112126,66412410.121894,0.537232


In [59]:
test_old[test_old["iso_3"] == "ESP"]

Unnamed: 0,iso_3,habitats,total,protected,frac,total_area,perc_extent
472,ESP,Artificial,392371.0,117356.0,29.909448,671255.0,58.453345
473,ESP,Desert,1521.0,1005.0,66.074951,671255.0,0.22659
474,ESP,Forest,89916.0,49953.0,55.555185,671255.0,13.395207
475,ESP,Grassland,34404.0,15589.0,45.311592,671255.0,5.125325
476,ESP,Other,4595.0,2480.0,53.971708,671255.0,0.684539
477,ESP,Rocky/mountains,311.0,279.0,89.710611,671255.0,0.046331
478,ESP,Savanna,1617.0,812.0,50.21645,671255.0,0.240892
479,ESP,Shrubland,142415.0,62193.0,43.670259,671255.0,21.216229
480,ESP,Wetlands/open water,4105.0,2802.0,68.258222,671255.0,0.611541


In [60]:
test.to_csv(sks_data_path.joinpath("master_data_protection_exact.csv"), index=False)

### Test spain without split between 2 different ways of doing zonal stats

In [None]:
from rasterstats import zonal_stats

from exactextract import exact_extract

In [None]:
dissolved_simp_gdf[dissolved_simp_gdf.gid_0 == "ESP"].explore()

In [None]:
gadm_data_simp[gadm_data_simp["iso_3"] == "ESP"].explore()

In [None]:
params_rasterstats = {
    "vectors": gadm_data_simp[gadm_data_simp["iso_3"] == "ESP"].geometry,
    "raster": local_biome_raster_path,
    "all_touched": True,
    "categorical": True,
    "category_map": land_cover_classes,
}

test_esp_total_rasterstats = zonal_stats(**params_rasterstats)

In [None]:
updated_protected = params_rasterstats.copy()
updated_protected.update(
    {"vectors": dissolved_simp_gdf[dissolved_simp_gdf.gid_0 == "ESP"].geometry}
)
test_esp_protected_rasterstats = zonal_stats(**updated_protected)

In [None]:
test_esp = (
    pd.DataFrame(test_esp_total_rasterstats)
    .reset_index()
    .melt("index", var_name="category", value_name="count")
    .drop(columns=["index"])
    .rename(columns={"count": "total"})
    .join(
        pd.DataFrame(test_esp_protected_rasterstats)
        .reset_index()
        .melt("index", var_name="category", value_name="count")
        .groupby("category")
        .agg({"count": "sum"}),
        on="category",
    )
    .assign(iso_3="ESP")
)

test_esp["frac"] = (test_esp["count"] / test_esp["total"]) * 100
test_esp

In [None]:
##### Exact extract
test_total_exact_extract = exact_extract(
    local_biome_raster_path,
    gadm_data_simp[gadm_data_simp["iso_3"] == "ESP"],
    ["unique", "frac", "count"],
    include_cols="iso_3",
    output="pandas",
).explode(["unique", "frac"])

test_total_exact_extract["unique"] = test_total_exact_extract["unique"].map(land_cover_classes)
test_total_exact_extract["total"] = (
    test_total_exact_extract["count"] * test_total_exact_extract["frac"]
)
display(test_total_exact_extract)

In [None]:
##### Exact extract
test_protected_exact_extract = exact_extract(
    local_biome_raster_path,
    dissolved_simp_gdf[dissolved_simp_gdf.gid_0 == "ESP"],
    ["unique", "frac", "count"],
    include_cols="gid_0",
    output="pandas",
).explode(["unique", "frac"])
#     .groupby("unique")
#     .agg({"frac": "sum", "gid_0": "count"})
# ).reset_index()
test_protected_exact_extract["protected"] = (
    test_protected_exact_extract["count"] * test_protected_exact_extract["frac"]
)
test_protected_exact_extract = (
    test_protected_exact_extract.groupby("unique").agg({"protected": "sum"}).reset_index()
)
test_protected_exact_extract["unique"] = test_protected_exact_extract["unique"].map(
    land_cover_classes
)

In [None]:
test_total_exact_extract.join(test_protected_exact_extract.set_index("unique"), on="unique")

In [None]:
compare_table = (
    test_old[test_old["iso_3"] == "ESP"]
    .join(test_esp.set_index("category"), on="habitats", rsuffix="_test")
    .join(
        test_total_exact_extract.join(
            test_protected_exact_extract.set_index("unique"), on="unique"
        ).set_index("unique"),
        on="habitats",
        rsuffix="_exact",
    )[
        [
            "iso_3",
            "habitats",
            "total",
            "total_test",
            "total_exact",
            "protected",
            "count",
            "protected_exact",
        ]
    ]
)

In [None]:
compare_table["frac_def"] = (compare_table["protected"] / compare_table["total"]) * 100
compare_table["frac_exact"] = (
    compare_table["protected_exact"] / compare_table["total_exact"]
) * 100
compare_table["frac_test"] = (compare_table["count"] / compare_table["total_test"]) * 100

compare_table

# Difference (on the fly - terrestrial analysis)

In [None]:
# TODO - move this section to the onlthefly notebook as this is part of the terrestrial data processing for the only the fly analysis
sks_data_path = data_folder.joinpath("skytruth")

In [None]:
dissolved_simp_gdf = gpd.read_file(sks_data_path.joinpath("mpas_sjoin_dissolved_simp.shp"))
gadm_data_simp = gpd.read_file(sks_data_path.joinpath("gadm_preprocess.shp"))

In [None]:
grid = ComputationGrid(
    bounds=[-180, -90, 180, 90],
    cell_size=20,
    crs=CRS.from_epsg(4326),
)

In [None]:
grid.create_gdf_density_based_grid(dissolved_simp_gdf.explode(), 5000)
grid.grid_gdf.reset_index().explore()

In [None]:
test_area = grid.grid_gdf[grid.grid_gdf.index == 235]

test_gadm = gadm_data_simp.iloc[gadm_data_simp.sindex.intersection(test_area.total_bounds)].clip(
    test_area.total_bounds
)

test_wdpa = dissolved_simp_gdf.iloc[
    dissolved_simp_gdf.sindex.intersection(test_area.total_bounds)
].clip(test_area.total_bounds)
test_filtered = test_wdpa.loc[
    test_wdpa.geometry.geom_type.isin(["MultiPolygon", "Polygon"])
].reset_index()

result_test = gpd.overlay(
    test_gadm,
    test_filtered,
    how="difference",
).reset_index(drop=True)

result_test.explore()

In [None]:
# Spatial difference

diffenrence_dataset = await spatial_difference(gadm_data_simp, dissolved_simp_gdf, grid)

In [None]:
diffenrence_dataset.to_file(sks_data_path.joinpath("gadm_diff.shp"))