In [None]:
%load_ext autoreload
%autoreload 2

import sys
import logging
from pathlib import Path

# Make the project's `src` directory importable from this notebook.
scripts_dir = Path("../.").joinpath("src")
# sys.path holds strings, so the membership test must use the resolved string
# form. Testing the Path object itself never matches and would prepend a
# duplicate entry every time this cell is re-run.
scripts_dir_str = scripts_dir.resolve().as_posix()
if scripts_dir_str not in sys.path:
    sys.path.insert(0, scripts_dir_str)

formatter = logging.Formatter(
    "%(asctime)s - %(name)s - \x1b[38;20m %(levelname)s \x1b[0m - %(message)s"
)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger = logging.getLogger(__name__)
# Remove handlers attached by a previous run of this cell; otherwise each
# re-run adds one more handler and every record is emitted several times.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
logger.addHandler(ch)
logger.setLevel(logging.DEBUG)
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from typing import Tuple, Iterable, Union, List, Dict, Callable

# import requests # We can download locally the data, is faster

import geopandas as gpd
import pandas as pd

from pyproj import CRS

import numpy as np
import rasterio as rio
from dask.distributed import LocalCluster, Client

from v2.utils import download_file, unzip_file, rm_tree
from v2.processing_grid import ComputationGrid
from v2.raster_processing import calculate_zonal_stats
from v2.vector_utils import add_bbox
from v2.vector_processing import spatial_dissolve, simplify_async, spatial_difference
from v2.raster_processing_v2 import process_raster, window_generator, process_job_chunk
from v2.datasets import RasterDataset


import tqdm
import concurrent.futures
import threading
import multiprocessing

In [None]:
# Paths and sources
data_folder = Path("../data")
sks_data_path = data_folder.joinpath("skytruth")
sks_data_path.mkdir(parents=True, exist_ok=True)

## Functions

In [30]:
def lower_case_columns(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Lower-case every column label of ``gdf``.

    The labels are replaced on the frame that was passed in (the input is
    modified in place) and that same frame is returned, so the function can be
    used inside a ``.pipe`` chain.
    """
    lowered_labels = gdf.columns.str.lower()
    gdf.columns = lowered_labels
    return gdf


def filter_by_extent(
    gdf: gpd.GeoDataFrame, extent: Tuple[float, float, float, float]
) -> gpd.GeoDataFrame:
    """Keep the rows of ``gdf`` selected by the bounding box ``extent``.

    Args:
        gdf: Frame to filter.
        extent: Bounding box given as ``(minx, miny, maxx, maxy)``.

    Returns:
        The rows selected by the ``.cx`` coordinate indexer, with a fresh
        0..n-1 index.
    """
    west, south, east, north = extent
    inside_extent = gdf.cx[west:east, south:north]
    return inside_extent.reset_index(drop=True)


def calculate_global(gdf, col_name="iso_3"):
    """Prepend global per-category totals to a per-location table.

    Args:
        gdf: Table with at least a ``category`` column and a numeric ``count``
            column.
        col_name: Column that identifies the location; the added rows get the
            value ``"GLOB"`` in it.

    Returns:
        A new frame holding one ``"GLOB"`` row per category (``count`` summed
        over all rows of ``gdf``) followed by the original rows.
    """
    global_totals = (
        gdf.groupby("category")
        .agg({"count": "sum"})
        .reset_index()
        .assign(**{col_name: "GLOB"})
    )
    # Both inputs carry their own 0..n index; renumber the result so the
    # returned frame has no duplicated index labels.
    return pd.concat([global_totals, gdf], axis=0, ignore_index=True)

## Processing

### GADM

In [None]:
# The download URL only works when you are signed in through the browser.

local_gadm_path = sks_data_path.joinpath("gadm_preprocess.zip")
download_file(
    "https://storage.cloud.google.com/vector-data-raw/vizzuality_processed_data/gadm/preprocess/gadm_preprocess.zip",
    local_gadm_path,
)
unzip_file(local_gadm_path)

In [31]:
gadm_data_simp = gpd.read_file(
    sks_data_path.joinpath("gadm_preprocess/preprocess/gadm_preprocess.shp")
)

### WDPA

In [None]:
## dissolve wdpa data to get the 2024 dissolved boundaries
dissolved_gdf = (
    gpd.read_file(sks_data_path.joinpath("mpas_sjoin.shp"))
    .pipe(lower_case_columns)
    .pipe(filter_by_extent, (-181, -91, 181, 91))
)
dissolved_gdf

In [None]:
removed_non_poly = (
    dissolved_gdf[dissolved_gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])]
    .reset_index(drop=True)
    .copy()
)

del dissolved_gdf

### Reclassify the raster

In [32]:
land_cover_classes = {
    1: "Forest",
    2: "Savanna",
    3: "Shrubland",
    4: "Grassland",
    5: "Wetlands/open water",
    6: "Rocky/mountains",
    7: "Desert",
    8: "Artificial",
    255: "Other",
}

In [None]:
def reclass_function(ndata: np.ndarray) -> np.ndarray:
    """Collapse source habitat codes into the 8-class land-cover scheme.

    Code ranges are mapped to the class ids 1-8; any value left unmapped is
    clipped into 0-255 before the cast, so unmapped codes above 255 end up as
    255.

    Args:
        ndata: Array of source raster codes.

    Returns:
        A new ``uint8`` array with the same shape as ``ndata``.
    """
    # (lower bound inclusive, upper bound exclusive, target class)
    range_to_class = (
        (200, 300, 2),  # savanna
        (300, 400, 3),  # scrub/shrub
        (400, 500, 4),  # grassland
        (500, 600, 5),  # wetlands/open water (includes 501 and 505)
        (600, 800, 6),  # rocky/mountains
        (800, 900, 7),  # desert
        (1400, 1500, 8),  # ag/urban - Artificial
    )

    # Every mask is computed from the untouched input, so the order in which
    # the classes are written does not matter.
    reclassified = np.array(ndata, copy=True)
    reclassified[ndata < 200] = 1  # forest
    for lower, upper, class_id in range_to_class:
        reclassified[(ndata >= lower) & (ndata < upper)] = class_id
    # Two isolated codes that also belong to wetlands/open water.
    reclassified[(ndata == 984) | (ndata == 910)] = 5

    # Force everything into the 8-bit range before the cast.
    return np.clip(reclassified, 0, 255).astype(np.uint8)

In [None]:
def process_raster_v2(
    raster_path: Path,
    output_path: Path,
    func: Callable,
    out_data_profile,
    f_args: Tuple = (),
    f_kwargs: Union[Dict, None] = None,
    num_workers: int = 200,
) -> None:
    """Apply ``func`` to a raster block by block and write the result to disk.

    The destination is created from the source profile updated with
    ``out_data_profile``. Its block windows define the chunks; each chunk is
    read from the source, passed to ``func`` and written to the destination
    from a pool of threads. Overviews are built once all chunks are done.

    Args:
        raster_path: Raster to read.
        output_path: Raster to create (opened in write mode).
        func: Called as ``func(data, *f_args, **f_kwargs)`` where ``data`` is
            the array returned by ``src.read(window=window)``; its return
            value is written to the same window of the destination.
        out_data_profile: Profile entries overriding the source profile.
        f_args: Extra positional arguments for ``func``.
        f_kwargs: Extra keyword arguments for ``func``; ``None`` means none.
        num_workers: Size of the thread pool.

    A failure in one chunk does not stop the run: the exception is recorded
    for that chunk and a warning summarising the failed chunks is logged.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if f_kwargs is None:
        f_kwargs = {}

    with rio.open(raster_path.as_posix()) as src:
        # Create a destination dataset based on source params. The
        # destination will be tiled, and we'll process the tiles
        # concurrently.
        profile = src.profile.copy()
        profile.update(**out_data_profile)

        with rio.open(output_path.as_posix(), "w", **profile) as dst:
            windows = [window for _, window in dst.block_windows()]

            # The dataset reader and writer are shared between threads, so
            # every read and every write is serialised with its own lock.
            read_lock = threading.Lock()
            write_lock = threading.Lock()

            def process(window):
                status_message = {
                    "diagnostics": {},
                    "messages": [f"Processing chunk: {window}"],
                    "return_val": None,
                }
                try:
                    status_message["messages"].append("reading data")
                    with read_lock:
                        data = src.read(window=window)

                    status_message["messages"].append("processing data")
                    result = func(data, *f_args, **f_kwargs)

                    status_message["messages"].append("writing data")
                    with write_lock:
                        dst.write(result, window=window)

                    status_message["messages"].append("success in processing chunk")
                except Exception as e:
                    status_message["diagnostics"]["error"] = e
                # Returned after the try block, not from a `finally`, so that
                # KeyboardInterrupt/SystemExit are not silently swallowed.
                return status_message

            with (
                concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor,
                tqdm.tqdm(total=len(windows), desc="Computing raster stats", unit="chunk") as p_bar,
            ):
                futures = [executor.submit(process, window) for window in windows]

                results = []
                for future in futures:
                    results.append(future.result())
                    p_bar.update(1)

            # Surface chunk failures instead of dropping them silently.
            failed = [r for r in results if "error" in r["diagnostics"]]
            if failed:
                logging.getLogger(__name__).warning(
                    "%d of %d chunks failed; first failure: %s -> %r",
                    len(failed),
                    len(results),
                    failed[0]["messages"][0],
                    failed[0]["diagnostics"]["error"],
                )

            dst.build_overviews([2, 4, 8, 16, 32, 64], rio.enums.Resampling.mode)
            # The tag must name the resampling that was actually used above.
            dst.update_tags(ns="rio_overview", resampling="mode")

In [None]:
def test_process_raster(
    input_path: Path,
    output_path: Path,
    fcn: Callable,
    out_data_profile: Dict | None = None,
    f_kwargs: Dict | None = None,
):
    """Process a raster chunk by chunk in a pool of worker processes.

    Args:
        input_path: Raster to read.
        output_path: Raster to write.
        fcn: Function handed to ``process_job_chunk`` for every window.
        out_data_profile: Profile entries overriding the input profile;
            ``None`` or an empty dict keeps the input profile unchanged.
        f_kwargs: Extra keyword arguments forwarded to ``process_job_chunk``;
            ``None`` means none.

    Returns:
        The list of values returned by ``process_job_chunk``, one per window,
        in submission order.
    """
    # Fresh dict per call instead of a shared mutable default argument.
    f_kwargs = {} if f_kwargs is None else f_kwargs

    with rio.Env(VSI_CACHE=True, GDAL_CACHEMAX=1024):
        i_raster = RasterDataset(input_path)
        new_profile = i_raster.profile.copy()
        if out_data_profile:
            new_profile.update(**out_data_profile)
        o_raster = RasterDataset(output_path, profile=new_profile)

        window_size_y = 1024
        window_size_x = 1024
        width = i_raster.profile.get("width")
        height = i_raster.profile.get("height")
        print(width, height)
        # Rough number of 1024x1024 chunks per worker. The parentheses are
        # required: `a // b * c // d` evaluates left to right and does not
        # give columns * rows.
        print(((width // window_size_x) * (height // window_size_y)) / 20)

        with o_raster._open_reader() as dst:
            window_chunks = [window for _, window in dst.block_windows()]

        with multiprocessing.Manager() as manager:
            # create the shared lock
            lock = manager.Lock()
            with (
                concurrent.futures.ProcessPoolExecutor(max_workers=20) as executor,
                tqdm.tqdm(
                    total=len(window_chunks), desc="Computing raster stats", unit="chunk"
                ) as p_bar,
            ):
                futures = [
                    executor.submit(
                        process_job_chunk, i_raster, o_raster, window, fcn, lock, **f_kwargs
                    )
                    for window in window_chunks
                ]

                results = []
                for future in futures:
                    results.append(future.result())
                    p_bar.update(1)

    return results

In [None]:
local_biome_raster_path = sks_data_path.joinpath(
    "iucn_habitatclassification_composite_lvl1_ver004.tif"
)
output_path = local_biome_raster_path.parent.joinpath(local_biome_raster_path.stem + "_reclass.tif")

output_path.unlink(missing_ok=True)

out_data_profile = {
    "dtype": rio.uint8,
    "count": 1,
    "compress": "lzw",
    "tiled": True,
    "blockxsize": 512,
    "blockysize": 512,
}

In [None]:
# test_process_raster(local_biome_raster_path, output_path, reclass_function, out_data_profile)

In [None]:
# process_raster_v2(local_biome_raster_path, output_path, reclass_function, out_data_profile)

In [None]:
# check this with multi threading

# with LocalCluster(name="test", n_workers=20, processes=True, threads_per_worker=1) as cluster, Client(cluster) as client:
#     print(f"Processing in parallel: {client.dashboard_link}")

#     data = process_raster(
#         local_biome_raster_path,
#         output_path,
#         out_data_profile,
#         reclass_function,
#         dask_client=client,
#         window_size=512,
#     )
#     list(data)

In [None]:
# The download URL only works when you are signed in through the browser.
local_biome_raster_path = sks_data_path.joinpath(
    "iucn_habitatclassification_composite_lvl1_ver004.tif"
)
download_file(
    "https://storage.cloud.google.com/vector-data-raw/terrestrial/jung_etal_2020/iucn_habitatclassification_composite_lvl1_ver004.tif",
    local_biome_raster_path,
)
# Reclassify the original raster. `reclass_function` operates on pixel
# arrays, not on file paths, so it is applied window by window through
# `process_raster_v2`, which writes the result next to the source raster.
reclassified_biome_raster_path = local_biome_raster_path.parent.joinpath(
    local_biome_raster_path.stem + "_reclass.tif"
)
process_raster_v2(
    local_biome_raster_path,
    reclassified_biome_raster_path,
    reclass_function,
    out_data_profile,
)

In [None]:
# reclassified_biome_raster_path = sks_data_path.joinpath(
#     "iucn_habitatclassification_composite_lvl1_ver004_reclass.tif"
# )

### Computation grids 

In [None]:
# wdpa_grid = ComputationGrid(
#     bounds=[-180, -90, 180, 90], max_cell_size=10, crs=CRS.from_epsg(4326), grid_type="sparse"
# )
# region_grid = ComputationGrid(
#     bounds=[-180, -90, 180, 90],
#     max_cell_size=10,
#     crs=CRS.from_epsg(4326),
#     grid_type="dense"
# )

In [None]:
# wdpa_grid.create_gdf_density_based_grid(wdpa_data, 5000)

In [None]:
# dissolved_gdf = await spatial_dissolve(wdpa_data, wdpa_grid, "iso_3", "first")

In [None]:
# dissolved_simp_gdf = await simplify_async(dissolved_gdf, True)

In [None]:
# dissolved_simp_gdf.to_file(sks_data_path.joinpath("mpas_sjoin_dissolved_simp.shp"))

## Raster zonal statistics

#### Process

In [34]:
dissolved_simp_gdf = gpd.read_file(sks_data_path.joinpath("mpas_sjoin_dissolved_simp.shp"))

In [None]:
dissolved_simp_gdf.geometry.geom_type.unique()

array(['MultiPolygon', 'Polygon', None], dtype=object)

In [None]:
filtered = dissolved_simp_gdf[dissolved_simp_gdf.geom_type.isin(["Polygon", "MultiPolygon"])][
    ["geometry", "iso_3"]
].reset_index(drop=True)

In [None]:
def convert_pix_to_area(df: pd.DataFrame, cell_size: float, col: str) -> pd.DataFrame:
    """Scale a pixel-count column by the area of one cell.

    Args:
        df: Table holding the column to convert (a GeoDataFrame works too).
        cell_size: Side length of a raster cell; each value is multiplied by
            ``cell_size * cell_size``.
        col: Name of the column to convert.

    Returns:
        A new frame in which ``col`` is replaced by the scaled values; ``df``
        itself is left untouched.
    """
    # `assign` only accepts keyword arguments, hence the `**` unpacking, and
    # the column has to be looked up by the *value* of `col` (`x[col]`), not
    # through an attribute literally called "col".
    return df.assign(**{col: lambda x: x[col] * cell_size * cell_size})

In [38]:
async def calc_protection(efg_raster_path: str, geometries: list, c_map: dict):
    """Sum the zonal ``count`` per country and habitat for protected areas.

    Runs ``calculate_zonal_stats`` over ``geometries``, concatenates the
    returned frames, sums ``count`` per (``iso_3``, ``category``), adds the
    ``GLOB`` rows through ``calculate_global`` and renames the columns to
    ``habitats`` and ``protected``.
    """
    zonal_frames = await calculate_zonal_stats(
        efg_raster_path,
        geometries,
        _with="exact",
        stats=["unique", "frac", "count"],
        c_map=c_map,
    )

    per_country = (
        pd.concat(zonal_frames, axis=0)
        .groupby(["iso_3", "category"])
        .agg({"count": "sum"})
        .reset_index()
    )
    with_global = calculate_global(per_country)

    return with_global.rename(columns={"category": "habitats", "count": "protected"})

In [39]:
async def calc_country_extent(efg_raster_path: str, geometries: list, c_map: dict):
    """Sum the zonal ``count`` per country and habitat for whole countries.

    Runs ``calculate_zonal_stats`` over ``geometries``, concatenates the
    returned frames, sums ``count`` per (``iso_3``, ``category``), adds the
    ``GLOB`` rows through ``calculate_global`` and renames the columns to
    ``habitats`` and ``total``.
    """
    zonal_frames = await calculate_zonal_stats(
        efg_raster_path,
        geometries,
        _with="exact",
        stats=["unique", "frac", "count"],
        c_map=c_map,
    )

    per_country = (
        pd.concat(zonal_frames, axis=0)
        .groupby(["iso_3", "category"])
        .agg({"count": "sum"})
        .reset_index()
    )
    with_global = calculate_global(per_country)

    return with_global.rename(columns={"category": "habitats", "count": "total"})

In [40]:
wdpa_grid = ComputationGrid(
    bounds=[-180, -90, 180, 90], max_cell_size=3, crs=CRS.from_epsg(4326), grid_type="sparse"
)
wdpa_grid.create_gdf_density_based_grid(filtered.explode(), 5000)
list_subset = wdpa_grid.split_gdf_by_grid(filtered, True, 0)
# wdpa_grid.grid_gdf.explore()

In [41]:
protected = await calc_protection(reclassified_biome_raster_path, list_subset, land_cover_classes)
del list_subset
del filtered
del wdpa_grid

Computing raster stats:   1%|          | 26/2332 [00:03<03:46, 10.17chunk/s] 

Unsupported geometry type.


Computing raster stats:   3%|▎         | 71/2332 [00:04<01:23, 27.05chunk/s]

Mixed-type geometries not supported.


Computing raster stats:   4%|▍         | 97/2332 [00:05<01:06, 33.58chunk/s]

Mixed-type geometries not supported.
Mixed-type geometries not supported.


Computing raster stats:   6%|▌         | 140/2332 [00:06<00:49, 44.01chunk/s]

Mixed-type geometries not supported.


Computing raster stats:   6%|▋         | 151/2332 [00:06<00:47, 46.06chunk/s]

Mixed-type geometries not supported.


Computing raster stats:   7%|▋         | 160/2332 [00:06<00:42, 50.78chunk/s]

Mixed-type geometries not supported.Mixed-type geometries not supported.



Computing raster stats:   7%|▋         | 170/2332 [00:06<00:35, 60.64chunk/s]

Mixed-type geometries not supported.


Computing raster stats:   8%|▊         | 183/2332 [00:07<00:44, 48.74chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  11%|█▏        | 267/2332 [00:08<00:33, 61.43chunk/s]

Mixed-type geometries not supported.

Computing raster stats:  12%|█▏        | 276/2332 [00:08<00:30, 66.40chunk/s]




Computing raster stats:  14%|█▍        | 326/2332 [00:09<00:30, 65.40chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  14%|█▍        | 333/2332 [00:09<00:30, 65.96chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  15%|█▌        | 353/2332 [00:10<00:42, 46.39chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  16%|█▌        | 376/2332 [00:10<00:35, 54.72chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  17%|█▋        | 388/2332 [00:10<00:41, 47.19chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  17%|█▋        | 403/2332 [00:11<00:51, 37.81chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  19%|█▉        | 453/2332 [00:12<00:48, 38.64chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  20%|██        | 468/2332 [00:12<00:48, 38.69chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  20%|██        | 478/2332 [00:13<01:02, 29.89chunk/s]

Unsupported geometry type.


Computing raster stats:  21%|██        | 493/2332 [00:13<00:54, 33.90chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  27%|██▋       | 618/2332 [00:16<00:32, 52.67chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  28%|██▊       | 650/2332 [00:16<00:30, 54.94chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  38%|███▊      | 881/2332 [00:20<00:24, 58.84chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  40%|████      | 942/2332 [00:22<00:36, 37.65chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  41%|████      | 950/2332 [00:22<00:51, 26.82chunk/s]

Mixed-type geometries not supported.
Mixed-type geometries not supported.


Computing raster stats:  42%|████▏     | 971/2332 [00:23<00:41, 32.46chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  45%|████▍     | 1045/2332 [00:25<00:30, 42.52chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  46%|████▌     | 1077/2332 [00:25<00:21, 58.82chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  47%|████▋     | 1102/2332 [00:26<00:24, 49.48chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  48%|████▊     | 1110/2332 [00:26<00:22, 53.55chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  49%|████▉     | 1143/2332 [00:28<00:53, 22.07chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  49%|████▉     | 1146/2332 [00:28<00:52, 22.67chunk/s]

Mixed-type geometries not supported.
Mixed-type geometries not supported.

Computing raster stats:  49%|████▉     | 1150/2332 [00:28<00:46, 25.43chunk/s]


Mixed-type geometries not supported.
Mixed-type geometries not supported.


Computing raster stats:  50%|█████     | 1175/2332 [00:29<00:37, 30.85chunk/s]

Unsupported geometry type.


Computing raster stats:  51%|█████     | 1179/2332 [00:29<00:51, 22.33chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  52%|█████▏    | 1221/2332 [00:30<00:37, 29.70chunk/s]

Mixed-type geometries not supported.
Mixed-type geometries not supported.


Computing raster stats:  53%|█████▎    | 1226/2332 [00:30<00:33, 33.09chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  54%|█████▍    | 1258/2332 [00:31<00:35, 30.23chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  54%|█████▍    | 1262/2332 [00:32<00:39, 26.81chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  58%|█████▊    | 1347/2332 [00:33<00:21, 46.33chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  71%|███████▏  | 1665/2332 [00:36<00:04, 149.01chunk/s]

Mixed-type geometries not supported.

Computing raster stats:  72%|███████▏  | 1681/2332 [00:37<00:04, 132.39chunk/s]




Computing raster stats:  82%|████████▏ | 1912/2332 [00:39<00:03, 114.29chunk/s]

Mixed-type geometries not supported.
Mixed-type geometries not supported.


Computing raster stats:  83%|████████▎ | 1947/2332 [00:39<00:03, 105.36chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  84%|████████▍ | 1958/2332 [00:39<00:04, 82.93chunk/s] 

Unsupported geometry type.

Computing raster stats:  84%|████████▍ | 1970/2332 [00:39<00:03, 91.21chunk/s]


Unsupported geometry type.


Computing raster stats:  86%|████████▌ | 2009/2332 [00:40<00:03, 91.70chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  87%|████████▋ | 2036/2332 [00:40<00:04, 65.78chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  89%|████████▉ | 2072/2332 [00:41<00:03, 71.77chunk/s]

Mixed-type geometries not supported.


Computing raster stats:  99%|█████████▊| 2302/2332 [00:44<00:00, 129.60chunk/s]

Mixed-type geometries not supported.


Computing raster stats: 100%|██████████| 2332/2332 [00:45<00:00, 51.38chunk/s] 


In [42]:
protected

Unnamed: 0,habitats,protected,iso_3
0,Artificial,170986537.442105,GLOB
1,Desert,744586336.870439,GLOB
2,Forest,893482228.65953,GLOB
3,Grassland,493833720.441902,GLOB
4,Other,10395092.909473,GLOB
...,...,...,...
1363,Forest,577626.820158,ZWE
1364,Grassland,21742.292057,ZWE
1365,Savanna,8742367.01956,ZWE
1366,Shrubland,287596.872073,ZWE


In [43]:
# Reassign instead of mutating in place, so the cell reads as an explicit
# transformation of `gadm_data_simp`.
gadm_data_simp = gadm_data_simp.rename(columns={"GID_0": "iso_3"})

In [None]:
region_grid = ComputationGrid(
    bounds=[-180, -90, 180, 90], max_cell_size=10, crs=CRS.from_epsg(4326), grid_type="dense"
)

regions_split = region_grid.split_gdf_by_grid(gadm_data_simp, True, 0)

In [45]:
location_efgs = await calc_country_extent(
    reclassified_biome_raster_path, regions_split, land_cover_classes
)

Computing raster stats: 100%|██████████| 467/467 [02:17<00:00,  3.39chunk/s]


In [46]:
location_efgs

Unnamed: 0,habitats,total,iso_3
0,Artificial,2814392766.029436,GLOB
1,Desert,11035130642.574265,GLOB
2,Forest,4894422034.068147,GLOB
3,Grassland,3532338753.845665,GLOB
4,Other,31294030.987612,GLOB
...,...,...,...
1493,Grassland,338488.994966,ZWE
1494,Rocky/mountains,51.0,ZWE
1495,Savanna,22879058.986613,ZWE
1496,Shrubland,5252410.077386,ZWE


In [47]:
del regions_split
del region_grid

In [None]:
def create_master_data_table(location_efgs: pd.DataFrame, protected: pd.DataFrame) -> pd.DataFrame:
    """Join total and protected extents and compute the protected percentage.

    Both inputs are keyed on (``iso_3``, ``habitats``). Every row of
    ``location_efgs`` is kept; rows without a match in ``protected`` get NaN
    in ``protected`` and ``frac``.

    Returns:
        A frame with the key columns restored as regular columns and a
        ``frac`` column equal to ``protected / total * 100``.
    """
    join_keys = ["iso_3", "habitats"]
    extent_by_key = location_efgs.set_index(join_keys)
    protected_by_key = protected.set_index(join_keys)

    combined = extent_by_key.join(protected_by_key)
    combined = combined.assign(frac=(combined["protected"] / combined["total"]) * 100)

    return combined.reset_index()

In [49]:
master_data_protection = create_master_data_table(location_efgs, protected)

In [50]:
def calc_area(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``perc_extent``: ``total`` as a percentage of ``total_area``.

    The column is written onto the frame that was passed in, and that same
    frame is returned so the function can be used with ``.pipe``.
    """
    share_of_area = df["total"] / df["total_area"]
    df["perc_extent"] = share_of_area * 100
    return df

In [None]:
test = master_data_protection.join(
    master_data_protection.groupby("iso_3").agg({"total": "sum"}), on="iso_3", rsuffix="_area"
).pipe(calc_area)

In [53]:
test_old = pd.read_csv(sks_data_path.joinpath("master_data_protection_old.csv"))

In [58]:
test[test["iso_3"] == "ESP"]

Unnamed: 0,iso_3,habitats,total,protected,frac,total_area,perc_extent
426,ESP,Artificial,39106089.709432,8067029.669799,20.628577,66412410.121894,58.883708
427,ESP,Desert,150986.99668,85544.142813,56.656629,66412410.121894,0.227348
428,ESP,Forest,8971486.41535,4122894.487791,45.955534,66412410.121894,13.50875
429,ESP,Grassland,3406859.161202,1228491.804244,36.059366,66412410.121894,5.129853
430,ESP,Other,43549.407046,16077.294966,36.917368,66412410.121894,0.065574
431,ESP,Rocky/mountains,28369.999124,24333.198233,85.770881,66412410.121894,0.042718
432,ESP,Savanna,160968.845551,52803.595804,32.803612,66412410.121894,0.242378
433,ESP,Shrubland,14187310.96224,4820682.304261,33.97883,66412410.121894,21.36244
434,ESP,Wetlands/open water,356788.625269,214473.227766,60.112126,66412410.121894,0.537232


In [59]:
test_old[test_old["iso_3"] == "ESP"]

Unnamed: 0,iso_3,habitats,total,protected,frac,total_area,perc_extent
472,ESP,Artificial,392371.0,117356.0,29.909448,671255.0,58.453345
473,ESP,Desert,1521.0,1005.0,66.074951,671255.0,0.22659
474,ESP,Forest,89916.0,49953.0,55.555185,671255.0,13.395207
475,ESP,Grassland,34404.0,15589.0,45.311592,671255.0,5.125325
476,ESP,Other,4595.0,2480.0,53.971708,671255.0,0.684539
477,ESP,Rocky/mountains,311.0,279.0,89.710611,671255.0,0.046331
478,ESP,Savanna,1617.0,812.0,50.21645,671255.0,0.240892
479,ESP,Shrubland,142415.0,62193.0,43.670259,671255.0,21.216229
480,ESP,Wetlands/open water,4105.0,2802.0,68.258222,671255.0,0.611541


In [60]:
test.to_csv(sks_data_path.joinpath("master_data_protection_exact.csv"), index=False)

### Test spain without split

In [None]:
from rasterstats import zonal_stats

from exactextract import exact_extract

In [None]:
dissolved_simp_gdf[dissolved_simp_gdf.gid_0 == "ESP"].explore()

In [None]:
gadm_data_simp[gadm_data_simp["iso_3"] == "ESP"].explore()

In [None]:
params_rasterstats = {
    "vectors": gadm_data_simp[gadm_data_simp["iso_3"] == "ESP"].geometry,
    # Use the reclassified raster: `land_cover_classes` is keyed by the
    # reclassified codes (1-8 and 255), so it cannot label the codes of the
    # source raster.
    "raster": reclassified_biome_raster_path,
    "all_touched": True,
    "categorical": True,
    "category_map": land_cover_classes,
}

test_esp_total_rasterstats = zonal_stats(**params_rasterstats)

In [None]:
updated_protected = params_rasterstats.copy()
updated_protected.update(
    {"vectors": dissolved_simp_gdf[dissolved_simp_gdf.gid_0 == "ESP"].geometry}
)
test_esp_protected_rasterstats = zonal_stats(**updated_protected)

In [None]:
test_esp = (
    pd.DataFrame(test_esp_total_rasterstats)
    .reset_index()
    .melt("index", var_name="category", value_name="count")
    .drop(columns=["index"])
    .rename(columns={"count": "total"})
    .join(
        pd.DataFrame(test_esp_protected_rasterstats)
        .reset_index()
        .melt("index", var_name="category", value_name="count")
        .groupby("category")
        .agg({"count": "sum"}),
        on="category",
    )
    .assign(iso_3="ESP")
)

test_esp["frac"] = (test_esp["count"] / test_esp["total"]) * 100
test_esp

In [None]:
##### Exact extract
# Run on the reclassified raster: the `unique` values are mapped through
# `land_cover_classes` below, which only knows the reclassified codes.
test_total_exact_extract = exact_extract(
    reclassified_biome_raster_path,
    gadm_data_simp[gadm_data_simp["iso_3"] == "ESP"],
    ["unique", "frac", "count"],
    include_cols="iso_3",
    output="pandas",
).explode(["unique", "frac"])

test_total_exact_extract["unique"] = test_total_exact_extract["unique"].map(land_cover_classes)
test_total_exact_extract["total"] = (
    test_total_exact_extract["count"] * test_total_exact_extract["frac"]
)
display(test_total_exact_extract)

In [None]:
##### Exact extract
# Run on the reclassified raster: the `unique` values are mapped through
# `land_cover_classes` below, which only knows the reclassified codes.
test_protected_exact_extract = exact_extract(
    reclassified_biome_raster_path,
    dissolved_simp_gdf[dissolved_simp_gdf.gid_0 == "ESP"],
    ["unique", "frac", "count"],
    include_cols="gid_0",
    output="pandas",
).explode(["unique", "frac"])
#     .groupby("unique")
#     .agg({"frac": "sum", "gid_0": "count"})
# ).reset_index()
test_protected_exact_extract["protected"] = (
    test_protected_exact_extract["count"] * test_protected_exact_extract["frac"]
)
test_protected_exact_extract = (
    test_protected_exact_extract.groupby("unique").agg({"protected": "sum"}).reset_index()
)
test_protected_exact_extract["unique"] = test_protected_exact_extract["unique"].map(
    land_cover_classes
)

In [None]:
test_total_exact_extract.join(test_protected_exact_extract.set_index("unique"), on="unique")

In [None]:
compare_table = (
    test_old[test_old["iso_3"] == "ESP"]
    .join(test_esp.set_index("category"), on="habitats", rsuffix="_test")
    .join(
        test_total_exact_extract.join(
            test_protected_exact_extract.set_index("unique"), on="unique"
        ).set_index("unique"),
        on="habitats",
        rsuffix="_exact",
    )[
        [
            "iso_3",
            "habitats",
            "total",
            "total_test",
            "total_exact",
            "protected",
            "count",
            "protected_exact",
        ]
    ]
)

In [None]:
compare_table["frac_def"] = (compare_table["protected"] / compare_table["total"]) * 100
compare_table["frac_exact"] = (
    compare_table["protected_exact"] / compare_table["total_exact"]
) * 100
compare_table["frac_test"] = (compare_table["count"] / compare_table["total_test"]) * 100

compare_table

# Spatial difference

In [None]:
sks_data_path = data_folder.joinpath("skytruth")

In [None]:
dissolved_simp_gdf = gpd.read_file(sks_data_path.joinpath("mpas_sjoin_dissolved_simp.shp"))
# Same location the GADM shapefile is read from in the GADM section of this
# notebook (inside the unzipped `gadm_preprocess/preprocess` folder).
gadm_data_simp = gpd.read_file(
    sks_data_path.joinpath("gadm_preprocess/preprocess/gadm_preprocess.shp")
)

In [None]:
# Grid used to split the spatial-difference computation.
# NOTE(review): every other ComputationGrid in this notebook is built with
# `max_cell_size=` and `grid_type=`; `cell_size=` may be a stale keyword —
# confirm against v2.processing_grid.ComputationGrid.
grid = ComputationGrid(
    bounds=[-180, -90, 180, 90],
    cell_size=20,
    crs=CRS.from_epsg(4326),
)

In [None]:
grid.create_gdf_density_based_grid(dissolved_simp_gdf.explode(), 5000)
grid.grid_gdf.reset_index().explore()

In [None]:
test_area = grid.grid_gdf[grid.grid_gdf.index == 235]

test_gadm = gadm_data_simp.iloc[gadm_data_simp.sindex.intersection(test_area.total_bounds)].clip(
    test_area.total_bounds
)

test_wdpa = dissolved_simp_gdf.iloc[
    dissolved_simp_gdf.sindex.intersection(test_area.total_bounds)
].clip(test_area.total_bounds)
test_filtered = test_wdpa.loc[
    test_wdpa.geometry.geom_type.isin(["MultiPolygon", "Polygon"])
].reset_index()

result_test = gpd.overlay(
    test_gadm,
    test_filtered,
    how="difference",
).reset_index(drop=True)

result_test.explore()

In [None]:
# Spatial difference

diffenrence_dataset = await spatial_difference(gadm_data_simp, dissolved_simp_gdf, grid)

In [None]:
diffenrence_dataset.to_file(sks_data_path.joinpath("gadm_diff.shp"))