In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
import sys
from pathlib import Path
import time
import pandas as pd
import geopandas as gpd
import numpy as np
import json

# Make the project's `src` directory importable so the `helpers` and `pipelines`
# packages imported below can be resolved.
scripts_dir = Path("../../").joinpath("src")
# sys.path holds strings, so compare the resolved string form: testing the Path
# object itself never matches and would prepend a duplicate entry on every re-run.
scripts_path = scripts_dir.resolve().as_posix()
if scripts_path not in sys.path:
    sys.path.insert(0, scripts_path)

from helpers.strapi import Strapi
from helpers.settings import get_settings, Settings
from helpers.file_handler import FileConventionHandler
from helpers.utils import download_and_unzip_if_needed, writeReadGCP

from pipelines.output_schemas import (
    FPLSchema,
    ProtectionLevelSchema,
    MPAsSchema,
    HabitatsSchema,
    LocationSchema,
    ProtectedAreaExtentSchema,
)
from pipelines.processors import (
    add_envelope,
    add_location_iso,
    expand_multiple_locations,
    add_region_iso,
    calculate_eez_area,
    add_bbox,
    add_groups_and_members,
    add_location_name,
    output,
    clean_geometries,
    filter_by_exluding_propossed_mpas,
    spatial_join,
    process_mpa_data,
    assign_iso3,
    calculate_global_area,
    separate_parent_iso,
    calculate_stats_cov,
    coverage_stats,
    mpaatlas_filter_stablishment,
    process_mpaatlas_data,
    calculate_stats,
    fix_monaco,
    batch_export,
    calculate_area,
    define_is_child,
    set_child_id,
    add_child_parent_relationship,
    columns_to_lower,
    extract_wdpaid_mpaatlas,
)

# Log at DEBUG level by default, but keep the chattier third-party libraries at
# WARNING so their messages do not flood the cell outputs.
logging.basicConfig(level=logging.DEBUG)
for noisy_logger in ("requests", "urllib3", "fiona"):
    logging.getLogger(noisy_logger).setLevel(logging.WARNING)

In [None]:
# Step names used to locate files: inputs are read from `prev_step`, and this
# notebook's outputs are written under `current_step`.
prev_step = "preprocess"
current_step = "stats"

# Project settings, handed to the download helper below
mysettings = get_settings()

In [None]:
# One file-convention handler per input pipeline: EEZ (marine) and GADM (land)
pipe_eez = "eez"
pipe_eez_dir = FileConventionHandler(pipe_eez)
pipe_gadm = "gadm"
pipe_gadm_dir = FileConventionHandler(pipe_gadm)

# The combined locations JSON is written under the GADM pipeline's `current_step` path
output_file = pipe_gadm_dir.get_processed_step_path(current_step).joinpath("locations.json")

# Download the EEZ file && unzip it
download_and_unzip_if_needed(pipe_eez_dir, prev_step, mysettings)

# Download the GADM file && unzip it
download_and_unzip_if_needed(pipe_gadm_dir, prev_step, mysettings)

In [None]:
from typing import Union, List
import pandera as pa
from pandera.typing import Index, Series
from pandera.typing.geopandas import GeoDataFrame, GeoSeries
import pandas as pd

class LocationSchemaAll(pa.DataFrameModel):
    """Pandera model for the combined marine + land locations table.

    The merged EEZ/GADM frame is passed through this model before it is
    serialised to ``locations.json``. Every field is declared with
    ``coerce=True``.
    """

    # Location id, carried by the frame index; must be strictly positive
    id: Index[int] = pa.Field(gt=0, coerce=True)
    # Location code (the `iso` column is renamed to `code` upstream)
    code: Series[str] = pa.Field(coerce=True)
    name: Series[str] = pa.Field(coerce=True)
    # Areas renamed from `AREA_KM2` upstream; must be non-negative
    totalMarineArea: Series[float] = pa.Field(ge=0, coerce=True)  # noqa: N815
    totalLandArea: Series[float] = pa.Field(ge=0, coerce=True)  # noqa: N815
    # Location kind.
    # NOTE(review): `unique_values_eq` presumably requires the column's set of
    # unique values to match this list exactly (all four present) — confirm
    # against the pandera docs.
    type: Series[str] = pa.Field(
        unique_values_eq=["country", "worldwide", "region", "highseas"], coerce=True
    )
    # List of integer ids per row (filled from the `groups` processors upstream)
    groups: Series[List[int]] = pa.Field(coerce=True)
    # List of floats per row; `combine_bounds` yields either 4 values or an empty list
    bounds: Series[List[float]] = pa.Field(coerce=True)

def calculate_gadm_area(
    df: "gpd.GeoDataFrame", global_land_area_km2: float = 134954835
) -> "gpd.GeoDataFrame":
    """Build the land locations table: worldwide, per-region and per-country rows.

    The result stacks, in this order, one ``GLOB`` row, one row per ``region``
    and one row per (``iso``, ``region``) pair, each with the summed
    ``AREA_KM2`` and a dissolved geometry, and numbers them from 1.

    Args:
        df: GeoDataFrame with ``iso``, ``region``, ``AREA_KM2`` and ``geometry``
            columns. It must be a GeoDataFrame because ``dissolve`` is used.
        global_land_area_km2: Area assigned to the ``GLOB`` row. The default is
            the constant that used to be hard-coded here.

    Returns:
        GeoDataFrame with a 1-based index named ``id``, a matching ``id``
        column and a ``location_type`` of ``"worldwide"``, ``"region"`` or
        ``"country"``.
    """
    # Worldwide row: fixed area, geometry is the union of every input geometry.
    # NOTE(review): `unary_union` is deprecated in geopandas >= 1.0 in favour of
    # `union_all()`; switch once the pinned geopandas version allows it.
    glob = gpd.GeoDataFrame(
        {
            "iso": "GLOB",
            "AREA_KM2": global_land_area_km2,
            "location_type": "worldwide",
            "region": np.nan,
            "geometry": gpd.GeoSeries([gpd.GeoSeries(df["geometry"]).unary_union]),
        },
        crs="EPSG:4326",
    )

    # NOTE(review): `dissolve` drops groups whose key contains NaN by default
    # (`dropna=True`), so rows without a region do not produce a country row —
    # confirm this is intended.
    terrestrial_areas = (
        df.dissolve(by=["iso", "region"], aggfunc={"AREA_KM2": "sum"})
        .reset_index()
        .assign(location_type="country")
    )
    # Regions reuse the `iso` column for their own code so all rows share one key
    regions_areas = (
        df.dissolve(by=["region"], aggfunc={"AREA_KM2": "sum"})
        .reset_index()
        .rename(columns={"region": "iso"})
        .assign(location_type="region")
    )

    # Row order matters: the ids assigned below are derived from it
    result = (
        pd.concat(
            [
                glob,
                regions_areas,
                terrestrial_areas,
            ],
            ignore_index=True,
        )
        .dropna(subset=["iso"])
        .reset_index(drop=True)
    )
    # Shift the fresh 0-based index to 1-based ids and mirror it into a column
    result.index = result.index + 1
    result.index.name = "id"

    return result.assign(id=result.index)

def add_groups_and_members_land(
    df: "pd.DataFrame | gpd.GeoDataFrame", id_offset: int = 2
) -> "pd.DataFrame | gpd.GeoDataFrame":
    """Add a ``groups`` column listing, for each country, the id(s) of its region.

    For rows whose ``location_type`` is ``"country"`` the value is a list with
    ``position + id_offset`` for every row of ``df`` whose ``iso`` equals the
    country's ``region``. Every other row gets an empty list.

    Args:
        df: Frame with ``iso``, ``region`` and ``location_type`` columns.
        id_offset: Added to the 0-based row position of each matching region
            row. NOTE(review): the default of 2 is one more than the
            position -> id shift applied in ``calculate_gadm_area`` (ids start
            at 1); presumably it lines land regions up with the ids of the
            marine table — confirm before changing.

    Returns:
        A copy of ``df`` with the extra ``groups`` column; ``df`` itself is not
        modified.
    """
    codes = df["iso"]

    def _region_ids(region, location_type) -> list:
        # Only countries belong to a region group
        if location_type != "country":
            return []
        # A missing region (NaN) compares unequal to every code -> empty list
        matches = np.flatnonzero((codes == region).to_numpy())
        return (matches + id_offset).tolist()

    groups = [
        _region_ids(region, location_type)
        for region, location_type in zip(df["region"], df["location_type"])
    ]
    # Wrap in an object Series so pandas stores exactly one list per row
    return df.assign(groups=pd.Series(groups, index=df.index, dtype=object))

def combine_bounds(marine_bounds, land_bounds):
    """Pick the bounding box to keep for a location, preferring the marine one.

    A candidate is accepted only if it is a ``list`` with exactly four items.
    The marine bounds are tried first, then the land bounds; if neither
    qualifies an empty list is returned.
    """
    for candidate in (marine_bounds, land_bounds):
        if isinstance(candidate, list) and len(candidate) == 4:
            return candidate
    return []

def combine_columns(df, col1, col2, new_col):
    """
    Fill the gaps of one column with another and store the result in a new column.

    Values are taken from ``col1``; where ``col1`` is missing the value from
    ``col2`` is used instead (``Series.combine_first``). The new column is
    written into ``df`` itself, and that same object is returned.

    Parameters:
    df (pd.DataFrame): The DataFrame holding both source columns; modified in place.
    col1 (str): Name of the preferred column.
    col2 (str): Name of the fallback column.
    new_col (str): Name of the column that receives the combined values.

    Returns:
    pd.DataFrame: ``df``, now containing ``new_col``.
    """
    preferred = df[col1]
    fallback = df[col2]
    df[new_col] = preferred.combine_first(fallback)
    return df


In [None]:
# Process EEZ data (marine data)
# Read the EEZ shapefile from `prev_step`, run it through the project processors
# and rename the resulting columns to the field names used in the output.
locations = (
    gpd.read_file(pipe_eez_dir.get_step_fmt_file_path(prev_step, "shp"))
    .pipe(add_envelope)
    .pipe(add_location_iso)
    .pipe(expand_multiple_locations)
    .pipe(add_region_iso, 'iso')
    .pipe(calculate_eez_area)
    .pipe(add_bbox)
    .pipe(add_groups_and_members)
    .pipe(add_location_name)
    .rename(
        columns={
            "iso": "code",
            "AREA_KM2": "totalMarineArea",
            "location_type": "type",
        }
    )
).reset_index(drop=True)

# Keep only the output fields; every other column is dropped in place
locations.drop(
    columns=list(
        set(locations.columns) -
        set(["code", "name", "totalMarineArea", "type", "groups", "bounds", "id"])
    ),
    inplace=True,
)


# Create a lookup dictionary for IDs from EEZ data (code -> id)
id_lookup = locations.set_index('code')['id'].to_dict()

# Process GADM data (land data)
# `GID_0` / `area_km2` are first renamed to `iso` / `AREA_KM2`, the column names
# the processing steps below operate on; the final rename mirrors the EEZ one,
# except that the area becomes `totalLandArea`.
locations_land = (
    gpd.read_file(pipe_gadm_dir.get_step_fmt_file_path(prev_step, "shp"))
    .rename(columns={"GID_0": "iso", 'area_km2': 'AREA_KM2'})
    .pipe(add_envelope)
    .pipe(add_region_iso, 'iso')
    .pipe(calculate_gadm_area)
    .pipe(add_bbox)
    .pipe(add_groups_and_members_land)
    .pipe(add_location_name)
    .rename(
        columns={
            "iso": "code",
            "AREA_KM2": "totalLandArea",
            "location_type": "type",
        }
    )
).reset_index(drop=True)

# Reuse the EEZ id for every GADM code that also exists in the marine data
locations_land['id'] = locations_land['code'].map(id_lookup)

# Rows whose code has no EEZ counterpart are still missing an id
nan_mask = locations_land['id'].isna()

# Number those rows consecutively, continuing right after the highest EEZ id
first_new_id = max(id_lookup.values()) + 1
new_ids = pd.Series(
    range(first_new_id, first_new_id + nan_mask.sum()),
    index=locations_land.index[nan_mask],
)

# Fill the gaps (aligned on the row index) and store the ids as integers
locations_land['id'] = locations_land['id'].fillna(new_ids).astype(int)

# Drop unnecessary columns in GADM data: keep only the output fields
# (same set as for the EEZ data, with `totalLandArea` as the area column)
locations_land.drop(
    columns=list(
        set(locations_land.columns) -
        set(["code", "name", "totalLandArea", "type", "groups", "bounds", "id"])
    ),
    inplace=True,
)

# Merge EEZ and GADM datasets
# Columns present in both frames (other than the join keys) get the
# `_marine` / `_land` suffixes and are reconciled further down.
combined_locations = pd.merge(
    locations, locations_land,
    on=['code', 'id'],
    suffixes=('_marine', '_land'),
    how='outer'  # Use 'outer' join to keep all records
)

# Replace NaN values in totalMarineArea and totalLandArea with 0
# (the outer join leaves NaN where a location exists in only one dataset)
combined_locations['totalMarineArea'] = combined_locations['totalMarineArea'].fillna(0)
combined_locations['totalLandArea'] = combined_locations['totalLandArea'].fillna(0)
combined_locations['id'] = combined_locations['id'].astype(int)

# Combine bounding boxes from both datasets (marine bounds win when valid)
combined_locations['bounds'] = combined_locations.apply(lambda row: combine_bounds(row['bounds_marine'], row['bounds_land']), axis=1)

# Combine data from land and marine: marine value first, land value as fallback
combined_locations = combine_columns(combined_locations, 'type_marine', 'type_land', 'type')
combined_locations = combine_columns(combined_locations, 'groups_marine', 'groups_land', 'groups')
combined_locations = combine_columns(combined_locations, 'name_marine', 'name_land', 'name')

# Drop unnecessary columns: the suffixed source columns are no longer needed
combined_locations.drop(
    columns=[col for col in combined_locations.columns if col.endswith('_marine') or col.endswith('_land')],
    inplace=True
)
combined_locations = combined_locations.reset_index(drop=True)

# Index the frame by location id (index named `index`) while keeping the `id`
# column, then order the rows by id
combined_locations['index'] = combined_locations['id']
combined_locations.set_index('index', inplace=True)
combined_locations.sort_index(inplace=True)

# Prepare final JSON output (stored in gadm folder): the combined frame is
# passed through LocationSchemaAll and its rows are keyed by the frame index
output_locations_combined = {
    "version": 2,
    "data": {
        "api::location.location": LocationSchemaAll(pd.DataFrame(combined_locations)).to_dict(
            orient="index"
        )
    },
}

# Write the output to a JSON file (stored in gadm folder)
with open(output_file, "w") as f:
    json.dump(output_locations_combined, f)

# The payload is not used again after it has been written
del output_locations_combined


In [None]:
## Create locations_code (stored in gadm folder)
# Export the id/code pair of every combined location as a CSV lookup table
locations_code = combined_locations.loc[:, ["id", "code"]]
locations_code_file = pipe_gadm_dir.get_processed_step_path(current_step).joinpath(
    "locations_code.csv"
)
locations_code.to_csv(locations_code_file, index=False)
