In [194]:
%load_ext autoreload
%autoreload 2

In [1]:
import logging
import sys
from pathlib import Path
import time
import pandas as pd
import geopandas as gpd
import numpy as np
import json
import dotenv
import os
import logging
from typing import Tuple, List
import sys
from pathlib import Path
import pandas as pd
import geopandas as gpd
import numpy as np
import asyncio
from tqdm.asyncio import tqdm
from itertools import product
from shapely.geometry import box

dotenv.load_dotenv()

scripts_dir = Path(".").joinpath("src")
if scripts_dir not in sys.path:
    sys.path.insert(0, scripts_dir.resolve().as_posix())

from helpers.strapi import Strapi
from helpers.settings import get_settings, Settings
from helpers.file_handler import FileConventionHandler
from helpers.utils import download_and_unzip_if_needed, writeReadGCP

from pipelines.output_schemas import (
    FPLSchema,
    ProtectionLevelSchema,
    MPAsSchema,
    HabitatsSchema,
    LocationSchema,
    ProtectedAreaExtentSchema,
)
from pipelines.processors import (
    add_envelope,
    add_location_iso,
    expand_multiple_locations,
    add_region_iso,
    calculate_eez_area,
    add_bbox,
    add_groups_and_members,
    add_location_name,
    output,
    clean_geometries,
    filter_by_exluding_propossed_mpas,
    spatial_join,
    process_mpa_data,
    assign_iso3,
    calculate_global_area,
    separate_parent_iso,
    calculate_stats_cov,
    coverage_stats,
    mpaatlas_filter_stablishment,
    process_mpaatlas_data,
    calculate_stats,
    fix_monaco,
    batch_export,
    calculate_area,
    define_is_child,
    set_child_id,
    add_child_parent_relationship,
    columns_to_lower,
    extract_wdpaid_mpaatlas,
    simplify_async,
    process_tpa_data,
    get_matches,
    repair_geometry, 
    arrange_dimensions, 
)
from pipelines.utils import background

logging.basicConfig(level=logging.DEBUG)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("fiona").setLevel(logging.WARNING)
logger = logging.getLogger("notebook")

In [2]:
mysettings = get_settings()
prev_step = "preprocess"
current_step = "stats"

In [3]:
from typing import List, Dict
import pandera as pa
from pandera.typing import Index, Series

def change_ata_to_abnj(df):
    """
    Changes values in the parent_iso column from 'ATA' to 'ABNJ' as there is no 'ATA' stats in Protected Planet.
    """
    # Count the occurrences of 'ATA'
    count_changes = df['parent_iso'].value_counts().get('ATA', 0)
    
    # Replace 'ATA' with 'ABNJ'
    df['parent_iso'] = df['parent_iso'].replace('ATA', 'ABNJ')

    return df


def add_total_marine_area(df):
    # Read the JSON file
    with open(scripts_dir.joinpath('data_commons/data/locations_all.json'), 'r') as f:
        locations_data = json.load(f)
    
    # Access the nested dictionary
    locations_dict = locations_data.get('data', {}).get('api::location.location', {})
    
    # Create a lookup dictionary from the nested dictionary
    marine_area_lookup = {item['code']: item['total_marine_area'] for item in locations_dict.values()}
    
    # Identify the column that contains the word 'iso'
    iso_column = [col for col in df.columns if 'iso' in col][0]

    # Perform the mapping using the identified column
    df['total_marine_area'] = df[iso_column].map(marine_area_lookup)
    
    return df

def add_total_terrestrial_area(df):
    # Read the JSON file
    with open(scripts_dir.joinpath('data_commons/data/locations_all.json'), 'r') as f:
        locations_data = json.load(f)
    
    # Access the nested dictionary
    locations_dict = locations_data.get('data', {}).get('api::location.location', {})
    
    # Create a lookup dictionary from the nested dictionary
    marine_area_lookup = {item['code']: item['total_terrestrial_area'] for item in locations_dict.values()}
    
    # Identify the column that contains the word 'iso'
    iso_column = [col for col in df.columns if 'iso' in col][0]

    # Perform the mapping using the identified column
    df['total_terrestrial_area'] = df[iso_column].map(marine_area_lookup)
    
    return df

def add_mpa_oecm_percentages(df):
    # Calculate the total protectedAreasCount for each year and iso_3
    total_counts = df.groupby(['year', 'iso_3'])['protectedAreasCount'].transform('sum')

    # Calculate the counts for PA_DEF == 0 and PA_DEF == 1
    df['oecm_count'] = df['protectedAreasCount'].where(df['PA_DEF'] == 0, 0)
    df['pa_count'] = df['protectedAreasCount'].where(df['PA_DEF'] == 1, 0)

    # Calculate the percentages
    df['oecms'] = df.groupby(['year', 'iso_3'])['oecm_count'].transform('sum') / total_counts * 100
    df['pas'] = df.groupby(['year', 'iso_3'])['pa_count'].transform('sum') / total_counts * 100

    # Aggregate the results and fill NaN values with 0
    final_df = df.groupby(['year', 'iso_3']).agg(
        area=('area', 'sum'),
        protected_areas_count=('protectedAreasCount', 'sum'),
        oecms=('oecms', 'first'),
        pas=('pas', 'first')
    ).reset_index().fillna(0)

    return final_df

def calculate_pa_def_percentages(df: pd.DataFrame, iso_col: str = "iso_3") -> pd.DataFrame:
    """
    Calculate the percentages for each PA_DEF value.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the cumulative counts of PA_DEF values.
    iso_col (str): The column name for the iso_3 values. Default is "iso_3".

    Returns:
    pd.DataFrame: A DataFrame with the percentages of PA_DEF values for each iso_3 and each year.
    """
    
    df['protected_areas_count'] = df['0'] + df['1']
    df['oecms'] = (df['0'] / df['protected_areas_count']) * 100
    df['pas'] = (df['1'] / df['protected_areas_count']) * 100

    df = df.drop(columns=['0', '1'], errors='ignore')

    return df

def calculate_coverage_percentage_mpatlas(df):
    df['percentage'] = (df['area_km2'] / df['total_marine_area']) * 100
    return df

def calculate_coverage_percentage_pa(df):
    if 'total_marine_area' in df.columns:
        df['coverage'] = (df['protected_area'] / df['total_marine_area']) * 100
    elif 'total_terrestrial_area' in df.columns:
        df['coverage'] = (df['protected_area'] / df['total_terrestrial_area']) * 100
    else:
        df['coverage'] = np.nan

    return df

def calculate_global_contribution(df):
    if 'total_marine_area' in df.columns:
        df['global_contribution'] = (df['protected_area'] / 361000000) * 100
    elif 'total_terrestrial_area' in df.columns:
        df['global_contribution'] = (df['protected_area'] / 134954835) * 100
    else:
        df['global_contribution'] = np.nan
    return df

def add_is_last_year(df):
    # Find the latest year for each iso_3
    latest_years = df.groupby('iso_3')['year'].transform('max')
    
    # Create the is_last_year column
    df['is_last_year'] = df['year'] == latest_years
    
    return df

def add_environment(df):
    """
    Adds a column 'environment' based on the presence of 'totalMarineArea' or 'totalLandArea'.

    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame: The DataFrame with the 'environment' column added.
    """
    if 'total_marine_area' in df.columns:
        df['environment'] = 'marine'
    elif 'total_terrestrial_area' in df.columns:
        df['environment'] = 'terrestrial'
    else:
        df['environment'] = 'unknown' 
    
    return df

def coverage_stats2(
    df: pd.DataFrame,
    area_col: str = "area",
    sort_vals: List[str] = ["iso_3", "year"],
) -> pd.DataFrame:
    """only relevant to get the coverage numbers for mpa"""
    return df.assign(
        protected_area=(
            df.sort_values(by=sort_vals)[area_col]
            - df.sort_values(by=sort_vals)
            .groupby(sort_vals)[area_col]
            .shift(-1, fill_value=0)
            .reset_index(drop=True)
        ).round(2),
    )

def process_mpaatlas_data(gdf: gpd.GeoDataFrame) -> pd.DataFrame:
    return (
        gdf.dissolve(by=["protecti_1", "iso_3"], aggfunc={"name": "count"})
        .reset_index()
        .pipe(calculate_area, "area_km2", None)
        .drop(columns=["geometry"])
    )

def separate_parent_iso(df: pd.DataFrame, iso_column="iso_3", separator=";") -> pd.DataFrame:
    df[iso_column] = (
        df[iso_column].str.replace(" ", "").str.replace(":", separator).str.split(separator)
    )
    return df.explode(iso_column)

def output2(
    df: pd.DataFrame, iso_column: str, rep_d: dict, rename: Dict[str, str], drop_cols: List[str]
) -> pd.DataFrame:
    """Output function formatter for the data.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        iso_column (str): The column containing the ISO codes.
        rep_d (dict): A dictionary of values to replace.
        rename (Dict[str, str]): A dictionary of columns to rename.
        drop_cols (List[str]): A list of columns to drop.

    Returns:
        pd.DataFrame: The processed DataFrame.
    """
    if iso_column:
        locations_code = pd.read_csv(
            scripts_dir.joinpath("data_commons/data/locations_code_all.csv"),
            keep_default_na=False,
            na_values=[]
        )
        df = df.join(locations_code.set_index("code"), on=iso_column, how="left")
    return (
        df.replace(rep_d)
        .rename(columns=rename)
        .drop(columns=drop_cols)
        .assign(
            id=df.index + 1,
        )
        .set_index("id")
    )

def set_child_id_pa(
    df: pd.DataFrame | gpd.GeoDataFrame, columns: list[str] = ["wdpa_pid"]
) -> pd.DataFrame | gpd.GeoDataFrame:
    return df.assign(child_id=df[columns].bfill(axis=1)[columns[0]])

def calculate_global_area_pa(
    df: pd.DataFrame,
    gby_col: list,
    agg_ops: Dict[str, str] = {"area": "sum"},
    iso_column="iso_3",
) -> pd.DataFrame:
    global_area = df.groupby([*gby_col]).agg(agg_ops).reset_index().assign(**{iso_column: "GLOB"})
    return pd.concat([global_area, df], ignore_index=True)


def cumulative_pa_def_counts(df: pd.DataFrame, year_col: str = "STATUS_YR", pa_def_col: str = "PA_DEF", iso_col: str = "iso_3", start_year: int = 2010) -> pd.DataFrame:
    """
    Calculate the cumulative number of PA_DEF values for each iso_3 and each year starting from a given year.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    year_col (str): The column name for the year. Default is "STATUS_YR".
    pa_def_col (str): The column name for the PA_DEF values. Default is "PA_DEF".
    iso_col (str): The column name for the iso_3 values. Default is "iso_3".
    start_year (int): The starting year for cumulative counts. Default is 2010.

    Returns:
    pd.DataFrame: A DataFrame with cumulative counts of PA_DEF values for each iso_3 and each year.
    """
    
    results = []
    years = sorted(df[year_col].unique())

    for year in years:
        if year < start_year:
            continue
        cumulative_data = df[df[year_col] <= year]
        pa_def_counts = cumulative_data.groupby([iso_col, pa_def_col]).size().unstack(fill_value=0)
        pa_def_counts['year'] = year
        results.append(pa_def_counts.reset_index())

    final_results = pd.concat(results, ignore_index=True)
    final_results = final_results.fillna(0)
    final_results = final_results.groupby([iso_col, 'year']).sum().reset_index()

    final_results['protected_areas_count'] = final_results['0'] + final_results['1']

    return final_results

def calculate_stats_pa(
    df: pd.DataFrame, gby_col: list, iso_column: str, ops: dict[str, str] = {"protected_area": "sum"}
) -> pd.DataFrame:
    # Group by the specified columns and region, then aggregate
    regions = (
        df.groupby([*gby_col, "region"])
        .agg(ops)
        .reset_index()
        .rename(columns={"region": iso_column})
    )

    # Group by the specified columns and iso_column, then aggregate
    countries = df.groupby([*gby_col, iso_column]).agg(ops).reset_index()

    # Concatenate the results
    return pd.concat([regions, countries], ignore_index=True)

def calculate_stats_cov_pa(df: pd.DataFrame, gby_col: list, iso_column: str):
    return calculate_stats_pa(df, gby_col, iso_column, {"protected_area": "sum", "protected_areas_count": "sum", "1": "sum", "0": "sum"})




class NewProtectedAreaExtentSchema(pa.DataFrameModel):
    id: Index[int] = pa.Field(gt=0, coerce=True)
    location: Series[int] = pa.Field(gt=0, coerce=True)
    protected_area: Series[float] = pa.Field(ge=0, coerce=True)
    protected_areas_count: Series[int] = pa.Field(ge=0, coerce=True)
    oecms: Series[float] = pa.Field(ge=0, le=100, coerce=True)
    pas: Series[float] = pa.Field(ge=0, le=100, coerce=True)
    coverage: Series[float] = pa.Field(ge=0, le=100, coerce=True)
    global_contribution: Series[float] = pa.Field(ge=0, le=100, coerce=True)
    year: Series[int] = pa.Field(ge=2000, coerce=True)
    is_last_year: Series[bool] = pa.Field(coerce=True)
    environment: Series[str] = pa.Field(isin=["marine", "terrestrial"], coerce=True)

class NewProtectionLevelSchema(pa.DataFrameModel):
    id: Index[int] = pa.Field(gt=0, coerce=True)
    location: Series[int] = pa.Field(gt=0, coerce=True)
    mpaa_protection_level: Series[int] = pa.Field(ge=0, coerce=True)
    year: Series[int] = pa.Field(gt=1900, coerce=True)
    area: Series[float] = pa.Field(ge=0, coerce=True)
    percentage: Series[float] = pa.Field(ge=0, le=100, coerce=True)

class PAsSchema(pa.DataFrameModel):
    id: Index[int] = pa.Field(gt=0, coerce=True)
    wdpaid: Series[pd.Int64Dtype] = pa.Field(coerce=True, nullable=True)
    child_id: Series[str] = pa.Field(coerce=True)
    name: Series[str] = pa.Field(coerce=True)
    year: Series[pd.Int32Dtype] = pa.Field(gt=1700, nullable=True)
    area: Series[float] = pa.Field(ge=0, coerce=True)
    bbox: Series[List[float]] = pa.Field(coerce=True)
    location: Series[int] = pa.Field(ge=0, coerce=True)
    protection_status: Series[int] = pa.Field(ge=0, nullable=True)
    mpaa_establishment_stage: Series[pd.Int32Dtype] = pa.Field(ge=0, nullable=True, coerce=True)
    mpaa_protection_level: Series[pd.Int32Dtype] = pa.Field(ge=0, nullable=True, coerce=True)
    pa_iucn_category: Series[pd.Int32Dtype] = pa.Field(coerce=True, nullable=True)
    designation: Series[str] = pa.Field(coerce=True, nullable=True)
    is_child: Series[bool] = pa.Field(coerce=True)
    children: Series[List[int]] = pa.Field(coerce=True, nullable=True)
    data_source: Series[int] = pa.Field(coerce=True)
    coverage: Series[float] = pa.Field(ge=0, le=100, nullable=True)
    environment: Series[str] = pa.Field(isin=["marine", "terrestrial"], coerce=True)



In [None]:
# Code for pa terrestrial processing

def split_by_year(
    gdf: gpd.GeoDataFrame, year_col: str = "STATUS_YR", year_val: int = 2010
) -> List[gpd.GeoDataFrame]:
    """Split data by year. relevant for MPA data.(coverage indicator)"""
    prior_2010 = (
        gdf[gdf[year_col] <= year_val][["iso_3", "STATUS_YR", "geometry"]]
        .dissolve(
            by=["iso_3"],
        )
        .assign(year=2010)
        .reset_index()
    )

    after_2010 = (
        gdf[gdf["STATUS_YR"] > 2010][["iso_3", "STATUS_YR", "geometry"]]
        .rename(columns={"STATUS_YR": "year"})
    )
    return [prior_2010, after_2010]


def create_grid(bounds: Tuple[float, float, float, float], cell_size: int = 1) -> gpd.GeoDataFrame:
    """Create a grid of cells for a given GeoDataFrame"""
    minx, miny, maxx, maxy = bounds
    x = np.arange(minx, maxx, cell_size)
    y = np.arange(miny, maxy, cell_size)
    polygons = [
        {
            "geometry": box(i, j, i + cell_size, j + cell_size),
            "cell_id": f"{i}_{j}",
        }
        for i, j in product(x, y)
    ]
    return gpd.GeoDataFrame(polygons)


def subdivide_grid(
    grid_gdf: gpd.GeoDataFrame, gdf: gpd.GeoDataFrame, max_cellsize: float, max_complexity: int
) -> List:
    subdivided_elements = []
    for grid_element in grid_gdf.geometry:
        candidates = get_matches(grid_element, gdf)
        density = len(candidates)
        if density > max_complexity:
            
            subdivision_cellsize = max_cellsize / 2
            # Subdivide the grid element recursively
            subgrid = create_grid(grid_element.bounds, subdivision_cellsize)
            subdivided_elements.extend(
                subdivide_grid(subgrid, gdf, subdivision_cellsize, max_complexity)
            )
        elif density > 0:
            subdivided_elements.append(grid_element)

    return subdivided_elements


def create_density_based_grid(
    gdf: gpd.GeoDataFrame, max_cellsize: int = 10, max_complexity: int = 10000
) -> gpd.GeoDataFrame:
    # Get the bounds of the GeoDataFrame
    minx, miny, maxx, maxy = gdf.total_bounds

    # Create an initial grid
    grid_gdf = create_grid((minx, miny, maxx, maxy), max_cellsize)

    # Subdivide grid elements based on density and complexity
    subdivided_elements = subdivide_grid(grid_gdf, gdf, max_cellsize, max_complexity)

    return gpd.GeoDataFrame(geometry=subdivided_elements)


#  TODO: refactor this so old function mantains functionality for marine areas

def split_gdf_by_grid(gdf: gpd.GeoDataFrame, grid_gdf: gpd.GeoDataFrame):
    result = []
    gdf["already_processed"] = False
    for geometry in grid_gdf.geometry:
        candidates = get_matches(geometry, gdf)
        subset = gdf.loc[candidates.index][~gdf["already_processed"]]
        gdf.loc[subset.index, "already_processed"] = True
        if not subset.empty:
            result.append(subset.drop(columns=["already_processed"]).reset_index(drop=True).copy())
    return result


@background
def spatial_join_chunk(df_large_chunk, df_small, pbar):
    try:
        bbox = df_large_chunk.total_bounds

        candidates = get_matches(box(*bbox), df_small.geometry)
        if len(candidates) > 0:
            subset = df_small.loc[candidates.index].clip(box(*bbox))

            result = (
                gpd.overlay(df_large_chunk, subset).reset_index(drop=True)
                .clip(subset.geometry)
                .reset_index(drop=True)
            )
            result.geometry = result.geometry.apply(repair_geometry)
        else:
            result = gpd.GeoDataFrame(columns=df_large_chunk.columns)
        return result
    except Exception as e:
        logging.error(e)
        return gpd.GeoDataFrame()
    finally:
        pbar.update(1)


async def spatial_join(
    geodataframe_a: gpd.GeoDataFrame, geodataframe_b: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """Create spatial join between two GeoDataFrames."""
    # we build the spatial index for the larger GeoDataFrame
    smaller_dim, larger_dim = arrange_dimensions(geodataframe_a, geodataframe_b)

    logger.info(f"Processing {len(larger_dim)} elements")

    grid = create_density_based_grid(larger_dim, max_cellsize=10, max_complexity=5000)

    logger.info(f"grid created with {len(grid)} cells")

    list_of_chunks = split_gdf_by_grid(larger_dim, grid)

    logger.info(f"grid split into {len(list_of_chunks)} chunks")

    with tqdm(total=len(list_of_chunks)) as pbar:  # we create a progress bar
        new_df = await asyncio.gather(
            *(spatial_join_chunk(chunk, smaller_dim, pbar) for chunk in list_of_chunks)
        )

    return gpd.GeoDataFrame(pd.concat(new_df, ignore_index=True), crs=smaller_dim.crs)


@background
def spatial_dissolve_chunk(geometry, gdf, pbar):
    try:
        logger.info("Processing chunk")
        candidates = get_matches(
            geometry,
            gdf.geometry,
        )
        subset = gdf.loc[candidates.index]

        result = pd.concat(
            subset.clip(geometry).pipe(split_by_year, year_col="STATUS_YR"), ignore_index=True
        ).copy()

        data_chunk = [
            (
                result[result["year"] <= 2010]
                .reset_index()
                .pipe(calculate_area, "area", None)
                .drop(columns=["geometry"])
            )
        ]
        for year in range(2011, 2025):
            data_chunk.append(
                result[result["year"] <= year]
                .dissolve(
                    by=["iso_3"],
                )
                .assign(year=year)
                .reset_index()
                .pipe(calculate_area, "area", None)
                .drop(columns=["geometry"])
            )

        return pd.concat(data_chunk, ignore_index=True)
    except Exception as e:
        logging.error(e)
        return gpd.GeoDataFrame()
    finally:
        pbar.update(1)

async def process_grid(gdf):
    grid_gdf = create_density_based_grid(gdf, max_cellsize=10, max_complexity=5000)
    logger.info(f"grid created with {grid_gdf.shape[0]} cells")

    with tqdm(total=grid_gdf.shape[0], desc="Processing grid elements") as pbar:
        jobs = [spatial_dissolve_chunk(geometry, gdf, pbar) for geometry in grid_gdf.geometry.values]
        result = await asyncio.gather(*jobs)
    return result

### Coverage stats - Mpas

We are going to use the intermediate data from eez, in order to create a dataset that can be used as a land mask.
The steps are:
1. Load eez
2. Spatial inner Join the eez dataset with the Mpas one
3. Assign the location iso
4. dissolve by location iso and cummulative year
5. calculate the area for global regions and eez countries
6. prepare the data to be ingested in strapi
7. upload the data to strapi

In [4]:
pipe = "mpa"
strapi_collection = ""

pipe_dir_eez = FileConventionHandler("eez")
pipe_dir_mpas = FileConventionHandler(pipe)
output_file = pipe_dir_mpas.get_processed_step_path(current_step).joinpath(
    "mpa_landmask_strapi.csv"
)

# Download the EEZ file && unzip it
download_and_unzip_if_needed(pipe_dir_eez, prev_step, mysettings)
# Download the mpas file && unzip it
download_and_unzip_if_needed(pipe_dir_mpas, prev_step, mysettings)

# Load the data
eez = gpd.read_file(pipe_dir_eez.get_step_fmt_file_path(prev_step, "shp")).pipe(clean_geometries)
mpas = gpd.read_file(pipe_dir_mpas.get_step_fmt_file_path(prev_step, "shp")).pipe(clean_geometries)

/home/sofia/dev/skytruth-30x30/data/data/eez/processed/eez_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/eez/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/mpa_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/preprocess


In [5]:
eez_mpas_data_join = await spatial_join(eez, mpas.pipe(filter_by_exluding_propossed_mpas))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [08:22<00:00,  1.78s/it]


In [6]:
eez_mpas_data_join.head(1)

Unnamed: 0,WDPAID,WDPA_PID,PA_DEF,NAME,DESIG_ENG,IUCN_CAT,STATUS,STATUS_YR,PARENT_ISO,GIS_M_AREA,geometry,index_right,GEONAME,MRGID,AREA_KM2,POL_TYPE,ISO_SOV1,ISO_SOV2,ISO_SOV3
0,555624810.0,555624810_D,1,Ross Sea Region Marine Protected Area,Marine Protected Area (CCAMLR),Not Reported,Designated,2017.0,ABNJ,326507.190744,"POLYGON ((150 -62.5, 150.90909 -62.5, 151.8181...",0.0,High Seas,63203.0,212881389.0,High Seas,ABNJ,,


In [None]:
# # To get an idea of the spatial join results
# eez_mpas_data_join.pipe(add_location_iso).pipe(assign_iso3).to_file(
#     pipe_dir_mpas.get_processed_step_path(current_step).joinpath("mpas_sjoin.shp"), driver="ESRI Shapefile"
# )

INFO:pyogrio._io:Created 17,697 records


In [7]:
final_data = await process_mpa_data(
    eez_mpas_data_join.pipe(add_location_iso).pipe(assign_iso3),
    range(2011, time.localtime().tm_year + 1),
    ["PA_DEF", "iso_3"],
    {"protectedAreasCount": "sum"},
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [03:30<00:00, 15.01s/it]


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 392/392 [10:13<00:00, 104.86s/it]INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
INFO:notebook:Processing chunk
Processing grid elements:   0%|▎                                                                                                                                    | 1/425 [00:02<17:53,  2.53s/it]INFO:notebook:Processing chunk
Processing grid ele

In [16]:
coverage = (
    final_data.pipe(calculate_global_area, ["year", "PA_DEF"], {"area": "sum"}, "iso_3")
    .pipe(separate_parent_iso, "iso_3")
    .pipe(add_region_iso, "iso_3")
    .replace(
        {
            "iso_3": {
                "ATA": "ABNJ",
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
                "GIB": "GBR",
            }
        }
    )
    .pipe(calculate_stats_cov, ["year", "PA_DEF"], "iso_3").astype({"PA_DEF": int})
#     .pipe(add_mpa_oecm_percentages)
#     .pipe(add_total_marine_area)
#     .pipe(coverage_stats2)
#     .pipe(calculate_coverage_percentage_pa)
#     .pipe(calculate_global_contribution)
#     .pipe(add_is_last_year)
#     .pipe(add_environment)
# )


# NewProtectedAreaExtentSchema(
#     coverage.pipe(
#         output,
#         "iso_3",
#         {},
#         {},
#         ["area", "iso_3", 'total_marine_area'],
#     )
# ).to_csv(
#     output_file,
#     index=True,
)
coverage.head(2)

Unnamed: 0,year,PA_DEF,iso_3,area,protectedAreasCount
0,2010,0,AF,206.100207,10.0
1,2010,0,AS,31956.310701,24.0


In [17]:
coverage.iso_3.unique()

array(['AF', 'AS', 'EU', 'SA', 'NA', 'WA', 'COL', 'ESP', 'GLOB', 'MAR',
       'PHL', 'ABNJ', 'AGO', 'ALB', 'ARE', 'ARG', 'ATG', 'AUS', 'AZE',
       'BEL', 'BGD', 'BGR', 'BHS', 'BLZ', 'BRA', 'BRB', 'BRN', 'CAN',
       'CHL', 'CHN', 'CMR', 'COD', 'COG', 'COM', 'CPV', 'CRI', 'CUB',
       'CYP', 'DEU', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'EST',
       'FIN', 'FJI', 'FRA', 'GAB', 'GBR', 'GEO', 'GIN', 'GMB', 'GNB',
       'GNQ', 'GRC', 'GRD', 'GTM', 'HND', 'HRV', 'HTI', 'IDN', 'IRL',
       'IRN', 'ISL', 'ISR', 'ITA', 'JAM', 'JPN', 'KAZ', 'KEN', 'KHM',
       'KIR', 'KNA', 'KOR', 'KWT', 'LBN', 'LBR', 'LCA', 'LKA', 'LTU',
       'LVA', 'MCO', 'MDG', 'MDV', 'MEX', 'MHL', 'MLT', 'MMR', 'MNE',
       'MOZ', 'MRT', 'MUS', 'MYS', 'NAM', 'NGA', 'NIC', 'NLD', 'NOR',
       'NZL', 'OMN', 'PAK', 'PAN', 'PER', 'PLW', 'PNG', 'POL', 'PRT',
       'QAT', 'ROU', 'RUS', 'SAU', 'SDN', 'SEN', 'SLB', 'SLE', 'SLV',
       'STP', 'SUR', 'SVN', 'SWE', 'SYC', 'THA', 'TKM', 'TLS', 'TON',
       'TTO', 'TU

In [72]:
remote_path = 'vizzuality_processed_data/strapi_tables/mpa_coverage.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [None]:
# strapi_collection = "protection-coverage-stat"

In [None]:
# strapi.deleteCollectionData(strapi_collection, list(range(1, 2300)))

<helpers.strapi.Strapi at 0x7fda8ddb8860>

In [None]:
# strapi.importCollectionData(
#     strapi_collection,
#     output_file,
# )

### Mpa atlas - country stats Fully or highly protected

We are going to use the intermediate data from eez, in order to create a dataset that can be used as a land mask.
The steps are:
1. Load eez
2. Spatial inner Join the eez dataset with the Mpaatlas one
3. iso assign using the sovereign one provided by mpaatlas
4. dissolve by location
5. calculate the area for global regions and eez countries ussing mollwide projection
6. prepare the data to be ingested in strapi
7. upload the data to strapi

In [73]:
pipe = "mpaatlas"
strapi_collection = "mpaa-protection-level-stat"

pipe_dir_eez = FileConventionHandler("eez")
pipe_dir_mpaatlas = FileConventionHandler(pipe)
output_file = pipe_dir_mpaatlas.get_processed_step_path(current_step).joinpath(
    "mpaatlas_protection_level.csv"
)

# Download the EEZ file && unzip it
download_and_unzip_if_needed(pipe_dir_eez, prev_step, mysettings)
# Download the mpas file && unzip it
download_and_unzip_if_needed(pipe_dir_mpaatlas, prev_step, mysettings)

# Load the data
eez = gpd.read_file(pipe_dir_eez.get_step_fmt_file_path(prev_step, "shp")).pipe(clean_geometries)
mpaatlas_intermediate = gpd.read_file(
    pipe_dir_mpaatlas.get_step_fmt_file_path(prev_step, "shp")
).pipe(clean_geometries)

/home/sofia/dev/skytruth-30x30/data/data/eez/processed/eez_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/eez/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/mpaatlas_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/preprocess


In [74]:
eez_mpaatlas_data_join = await spatial_join(
    eez, mpaatlas_intermediate.pipe(mpaatlas_filter_stablishment)
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [00:29<00:00,  9.59it/s]


<class 'shapely.geometry.base.GeometrySequence'>


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [00:29<00:00,  2.95s/it]

<class 'shapely.geometry.base.GeometrySequence'>


In [None]:
# To get an idea of the spatial join results
# eez_mpaatlas_data_join.to_file(
#     pipe_dir_mpaatlas.get_processed_step_path(current_step).joinpath("mpaatlas_sjoin.shp"),
#     driver="ESRI Shapefile",
# )

In [75]:
eez_mpaatlas_data_join.dissolve(by=["protecti_1", "location_i"], aggfunc={"name": "count"}).reset_index().to_file(
pipe_dir_mpaatlas.get_processed_step_path(current_step).joinpath("mpaatlas_sjoin_dissolved.shp"),
driver="ESRI Shapefile",
)

INFO:pyogrio._io:Created 54 records


In [79]:
result = (
    eez_mpaatlas_data_join.rename(columns={"location_i": "iso_3"})
    .pipe(process_mpaatlas_data)  
    .pipe(calculate_global_area, gby_col=["protecti_1"], iso_column="iso_3")
    .pipe(separate_parent_iso)
    .replace(
        {
            "location_i": {
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
            }
        }
    )
    .pipe(add_region_iso, iso_column="iso_3")
    .pipe(calculate_stats, gby_col=["protecti_1"], iso_column="iso_3")
    .query('protecti_1 != "less protected or unknown"')
    .pipe(fix_monaco, iso_column="iso_3", area_column="area_km2")
    .pipe(add_total_marine_area)
    .pipe(calculate_coverage_percentage_mpatlas)
    .pipe(
        output,
        iso_column="iso_3",
        rep_d={
            "protecti_1": {
                "fully or highly protected": 1,
            }
        },
        rename={"protecti_1": "mpaa_protection_level", "area_km2": "area"},
        drop_cols=["total_marine_area", "iso_3"],
    )
)

NewProtectionLevelSchema(result[~result.location.isna()].assign(year=2024)).to_csv(
    output_file, index=True
)

In [82]:
remote_path = 'vizzuality_processed_data/strapi_tables/mpaatlas_protection_level.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [None]:
# strapi_collection = "mpaa-protection-level-stat"

In [None]:
# strapi.deleteCollectionData(strapi_collection, list(range(1, 300)))

<helpers.strapi.Strapi at 0x7fda8ddb8860>

In [None]:
# strapi.importCollectionData(
#     strapi_collection,
#     output_file,
# )

### Protected seas  - fishing protection level

In [83]:
pipe = "protectedseas"
strapi_collection = "fishing-protection-level-stat"

pipe_dir = FileConventionHandler(pipe)
input_file = pipe_dir.get_processed_step_path(prev_step).joinpath("protectedseas_stats.xlsx")
output_file = pipe_dir.get_processed_step_path(current_step).joinpath("lfp.csv")

# Download the protected seas file && unzip it
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name="vizzuality_processed_data/protectedseas/preprocess/protectedseas_stats.xlsx",
    file=input_file,
    operation="r",
)

# Load the data
protectedseas_intermediate = pd.read_excel(input_file)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [84]:
protectedseas_intermediate[
    (
        protectedseas_intermediate.iso_ter.isna()
        & protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(True)
    )
    | (
        protectedseas_intermediate.iso_ter.isna()
        & protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(False)
        & ~protectedseas_intermediate.iso_sov.isin(
            protectedseas_intermediate[
                protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(True)
            ].iso_sov.unique()
        )
    )
][protectedseas_intermediate.iso_sov.eq("ESP")]

Unnamed: 0,iso_ter,iso_sov,includes_multi_jurisdictional_areas,lfp,area_sqkm,total_area,pct_total
320,,ESP,True,5,142.97301,1011023.776,0.014141
321,,ESP,True,4,1639.682076,1011023.776,0.16218
322,,ESP,True,3,214532.8498,1011023.776,21.219367
323,,ESP,True,2,15064.13277,1011023.776,1.489988
324,,ESP,True,1,779644.1388,1011023.776,77.114323


In [85]:
final = (
    protectedseas_intermediate[
        (
            protectedseas_intermediate.iso_ter.isna()
            & protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(True)
        )
        | (
            protectedseas_intermediate.iso_ter.isna()
            & protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(False)
            & ~protectedseas_intermediate.iso_sov.isin(
                protectedseas_intermediate[
                    protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(True)
                ].iso_sov.unique()
            )
        )
    ].replace(
        {
            "lfp": {
                5: "highly",
                4: "highly",
                3: "moderately",
                2: "less",
                1: "less",
            },
        }
    ).groupby(["iso_sov", "lfp"]).agg({"area_sqkm": "sum", "total_area": "max"}).reset_index()
    .pipe(
        calculate_global_area,
        gby_col=["lfp"],
        iso_column="iso_sov",
        agg_ops={"area_sqkm": "sum", "total_area": "sum"},
    )
    .pipe(add_region_iso, iso_column="iso_sov")
    .pipe(
        calculate_stats,
        gby_col=["lfp"],
        ops={"area_sqkm": "sum", "total_area": "sum"},
        iso_column="iso_sov",
    )
    .pipe(lambda x: x.assign(pct=round((x.area_sqkm / x.total_area)*100, 2)))
    .pipe(
        output,
        iso_column="iso_sov",
        rep_d={
            "lfp": {
                "highly": 1,
                "moderately": 2,
                "less": 3,
            }
        },
        rename={"lfp": "fishing_protection_level", "area_sqkm": "area"},
        drop_cols=["iso_sov", "total_area"],
    )
)
FPLSchema(final[final.location.notna()]).to_csv(output_file, index=True)

In [88]:
remote_path = 'vizzuality_processed_data/strapi_tables/lfp.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [None]:
# strapi.deleteCollectionData(strapi_collection, list(range(1, 500)))

In [None]:
# strapi.importCollectionData(
#     strapi_collection,
#     output_file,
# )

### Country detail table data - all together WIP

  1- lower case the columns   
2- separate location that its regime is in dispute or on join regime  
3- calcualte area for mpaatlas data  
4- rename columns for merge  
5- merge maaatlas and mpa data identifying the source  
6- identify child resources and set them as childs  
7- calculate bbox  
8- set child resources  
9- prepare output for batch export  
10- upload data to strapi  

In [89]:
pipe = "pa"
strapi_collection_pas = "pa"

pipe_dir = FileConventionHandler(pipe)
pipe_dir_mpaatlas = FileConventionHandler("mpaatlas")
output_file_mpas = pipe_dir.get_processed_step_path(current_step).joinpath("pa_detail.csv")

# Download the protected atlas file && unzip it
download_and_unzip_if_needed(pipe_dir, prev_step, mysettings)
# Download the mpaatlas file 
download_and_unzip_if_needed(pipe_dir_mpaatlas, prev_step, mysettings)

/home/sofia/dev/skytruth-30x30/data/data/pa/processed/pa_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/pa/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/mpaatlas_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/preprocess


PosixPath('/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/preprocess')

In [100]:
# Load the data
pa_intermediate = gpd.read_file(pipe_dir.get_step_fmt_file_path(prev_step, "gpkg")).pipe(
    clean_geometries
)
mpaatlas_intermediate = gpd.read_file(
    pipe_dir_mpaatlas.get_step_fmt_file_path(prev_step, "shp")
).pipe(clean_geometries)

In [101]:
init_table = (
    pd.concat(
        [
            (
                pa_intermediate.pipe(columns_to_lower)
                .pipe(separate_parent_iso, iso_column="parent_iso")
                .pipe(change_ata_to_abnj)
                .rename(
                    columns={
                        "parent_iso": "iso",
                        "status_yr": "year",
                        "gis_m_area": "area_km2_marine",
                        "gis_area": "area_km2_terrestrial",
                    }
                )
                .drop(columns=['status'])
                .assign(source="protected_planet")
            ),
            (
                mpaatlas_intermediate.pipe(calculate_area)
                .pipe(extract_wdpaid_mpaatlas)
                .pipe(separate_parent_iso, iso_column="location_i")
                .rename(
                    columns={
                        "location_i": "iso",
                        "wdpa_id": "wdpa_pid",
                        "designatio": "desig_eng",
                    }
                )
                .assign(source="mpaatlas")
                .astype({"mpa_zone_i": "Int64"})
            ),
        ],
        ignore_index=True,
    )
    .reset_index(drop=True)
    .replace(
        {
            "iso": {
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
            }
        }
    )
    .sort_values(by=["wdpa_pid", "source"], ascending=[True, False])
)

KeyboardInterrupt: 

In [96]:
#  to be run if things change a lot in the future
iucn_cat = pd.DataFrame(
    {"slug": init_table.iucn_cat.dropna().unique(), "name": init_table.iucn_cat.dropna().unique()},
    index=pd.Index(np.arange(1, len(init_table.iucn_cat.dropna().unique()) + 1)),
)
iucn_cat.to_csv(pipe_dir.get_processed_step_path(current_step).joinpath("iucn_categories.csv"), index=True)

iucn_cat = pd.read_csv(
    pipe_dir.get_processed_step_path(current_step).joinpath("iucn_categories.csv"), index_col=0
)

In [104]:
def define_is_child(
    gdf: pd.DataFrame | gpd.GeoDataFrame,
    gby: str = "wdpaid",
    env_col: str = "environment",
    sort_by: dict[str, bool] = {"wdpa_pid": True, "source": False},
    col_name: str = "is_child",
) -> pd.DataFrame | gpd.GeoDataFrame:
    return gdf.assign(
        **{
            col_name: np.where(
                gdf.index.isin(
                    gdf.sort_values(by=list(sort_by.keys()), ascending=list(sort_by.values()))
                    .groupby([gby, env_col])  # Group by wdpaid and environment
                    .nth(slice(1, None))
                    .index
                ),
                True,
                False,
            )
        }
    )

In [110]:
pa_table = (
    init_table.pipe(add_bbox, "bbox")
    .pipe(define_is_child)
    .pipe(set_child_id)
    .sort_values(by=["wdpaid", "is_child"], ascending=[True, True])
    .reset_index(drop=True)
    # .pipe(add_total_areas)
    # .pipe(calculate_coverage_percentage_pa)
    # .pipe(add_environment)
    # .pipe(
    #     output,
    #     iso_column="iso",
    #     rep_d={
    #         "status": {
    #             "Adopted": 4,
    #             "implemented": 6,
    #             "Established": 6,
    #             "Designated": 5,
    #             "Proposed": 3,
    #             "Inscribed": 3,
    #             "unknown": 1,
    #         },
    #         "pa_def": {"0": 2, "1": 1},
    #         "year": {0: pd.NA},
    #         "iucn_cat": dict(
    #             iucn_cat[["slug"]]
    #             .reset_index(drop=False)
    #             .iloc[:, [1, 0]]
    #             .to_dict(orient="tight")["data"]
    #         ),
    #         "source": {"protected_planet": 3, "mpaatlas": 1},
    #         "protection": {
    #             "full": 3,
    #             "light": 4,
    #             "incompatible": 5,
    #             "high": 6,
    #             "minimal": 7,
    #             "unknown": 8,
    #             "unknown/to be determined": 8,
    #         },
    #         "establishm": {
    #             "actively managed": 4,
    #             "implemented": 6,
    #             "designated": 5,
    #             "Designated": 5,
    #             "proposed or committed": 3,
    #             "Proposed": 3,
    #             "Inscribed": 3,
    #             "Established": 5,
    #             "Adopted": 5,
    #             "unknown": 1,
    #         },
    #     },
    #     rename={
    #         "pa_def": "protection_status",
    #         "area_km2": "area",
    #         "iucn_cat": "pa_iucn_category",
    #         "desig_eng": "designation",
    #         "protection": "mpaa_protection_level",
    #         "establishm": "mpaa_establishment_stage",
    #         "source": "data_source",
    #     },
    #     drop_cols=["geometry", "protecti_1","mpa_zone_i", "iso", "total_marine_area"]
    # )
    .pipe(add_child_parent_relationship)
    # .astype(
    #     {
    #         "year": "Int32",
    #         "pa_iucn_category": "Int64",
    #         "protection_status": "Int64",
    #     }
    # )
    # .query("coverage <= 100") 
    # .sort_index()
)
pa_table.head(5)

KeyError: "Columns not found: 'data_source'"

In [106]:
pa_table[pa_table['name'] == 'Ivvavik National Park Of Canada']

Unnamed: 0,wdpaid,wdpa_pid,pa_def,name,desig_eng,iucn_cat,marine,year,iso,geometry,source,mpa_zone_i,establishm,protection,protecti_1,area_km2,environment,bbox,is_child,child_id
19056,100672,100672_A,1,Ivvavik National Park Of Canada,National Park,II,0,1984.0,CAN,"MULTIPOLYGON (((-140.83302 69.63132, -140.8350...",protected_planet,,,,,9695.837607,terrestrial,"[-141.000000001, 68.556807999, -138.1338199979...",False,100672_A
19057,100672,100672_B,1,Ivvavik National Park Of Canada,National Park,II,1,1984.0,CAN,"MULTIPOLYGON (((-139.78657 69.59821, -139.7872...",protected_planet,,,,,79.375056,terrestrial,"[-140.894068268, 69.19278843000001, -138.37542...",True,100672_B
19058,100672,100672_B,1,Ivvavik National Park Of Canada,National Park,II,1,1984.0,CAN,"MULTIPOLYGON (((-139.78657 69.59821, -139.7872...",protected_planet,,,,,52.17008,marine,"[-140.894068268, 69.19278843000001, -138.37542...",True,100672_B


In [107]:
mpa_table[mpa_table['name'] == 'Ivvavik National Park Of Canada']

Unnamed: 0_level_0,wdpaid,wdpa_pid,protection_status,name,designation,pa_iucn_category,year,area,data_source,mpaa_establishment_stage,mpaa_protection_level,bbox,is_child,child_id,coverage,environment,location,children
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2137,100672,100672_A,1,Ivvavik National Park Of Canada,National Park,2,1984,39.201811,3,,,"[-141.000000001, 68.556807999, -138.1338199979...",False,100672_A,0.00068,marine,29.0,[2138]
2138,100672,100672_B,1,Ivvavik National Park Of Canada,National Park,2,1984,52.17008,3,,,"[-140.894068268, 69.19278843000001, -138.37542...",True,100672_B,0.000905,marine,29.0,


In [109]:
tpa_table[tpa_table['name'] == 'Ivvavik National Park Of Canada']

Unnamed: 0_level_0,wdpaid,wdpa_pid,protection_status,name,designation,pa_iucn_category,area,year,data_source,bbox,is_child,child_id,coverage,environment,location,children
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
17036,100672,100672_A,1,Ivvavik National Park Of Canada,National Park,2,9695.837607,1984,3,"[-141.000000001, 68.556807999, -138.1338199979...",False,100672_A,0.097898,terrestrial,29.0,[17037]
17037,100672,100672_B,1,Ivvavik National Park Of Canada,National Park,2,79.375056,1984,3,"[-140.894068268, 69.19278843000001, -138.37542...",True,100672_B,0.000801,terrestrial,29.0,


In [None]:
PAsSchema(mpa_table[mpa_table.location.notna()]).to_csv(output_file_mpas, index=True)

In [None]:
# todo investigate the issue with area as null

In [None]:
# batch_export(
#     mpa_table[mpa_table.area.notna()],
#     5000,
#     PAsSchema,
#     pipe_dir.get_processed_step_path(current_step),
#     "mpa_detail",
#     format="json",
#     strapi_colection=strapi_collection_mpas,
# )

In [None]:
# # This code is to be able to identify groups that has wdpa_pid so in the future if needed we could combine the group geometries to generate a wdpa coverage geometry
# init_table[
#     (
#         init_table.sort_values(by=["wdpaid", "source"], ascending=[True, False])
#         .groupby("wdpaid")
#         .transform("size")
#         .gt(1)
#     )
#     & (init_table.wdpa_pid.str.extract(r"([A-Za-z]+)", expand=False).notna())
# ].groupby("wdpaid")
# .geometry.apply(lambda x: x.union_all())

#### upload data to strapi

In [None]:
# strapi.deleteCollectionData("mpa", list(range(1, 20914)))

<helpers.strapi.Strapi at 0x7fda8ddb8860>

In [None]:
# for i in range(0, 4):
#     strapi.importCollectionData(
#         strapi_collection_mpas,
#         mpa_folder.joinpath(f"mpa_detail_{i}.csv"),
#     )

### Country mpas detail table data

  1- lower case the columns   
2- separate location that its regime is in dispute or on join regime  
3- calcualte area for mpaatlas data  
4- rename columns for merge  
5- merge maaatlas and mpa data identifying the source  
6- identify child resources and set them as childs  
7- calculate bbox  
8- set child resources  
9- prepare output for batch export  
10- upload data to strapi  

In [256]:
pipe = "mpa"
strapi_collection_mpas = "mpa"

pipe_dir = FileConventionHandler(pipe)
pipe_dir_mpaatlas = FileConventionHandler("mpaatlas")
output_file_mpas = pipe_dir.get_processed_step_path(current_step).joinpath("mpa_detail.csv")

# Download the protected atlas file && unzip it
download_and_unzip_if_needed(pipe_dir, prev_step, mysettings)
# Download the mpaatlas file 
download_and_unzip_if_needed(pipe_dir_mpaatlas, prev_step, mysettings)

/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/mpa_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/mpaatlas_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/preprocess


PosixPath('/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/preprocess')

In [257]:
# Load the data
mpa_intermediate = gpd.read_file(pipe_dir.get_step_fmt_file_path(prev_step, "shp")).pipe(
    clean_geometries
)
mpaatlas_intermediate = gpd.read_file(
    pipe_dir_mpaatlas.get_step_fmt_file_path(prev_step, "shp")
).pipe(clean_geometries)

In [258]:
init_table = (
    pd.concat(
        [
            (
                mpa_intermediate.pipe(columns_to_lower)
                .pipe(separate_parent_iso, iso_column="parent_iso")
                .pipe(change_ata_to_abnj)
                .rename(
                    columns={
                        "parent_iso": "iso",
                        "status_yr": "year",
                        "gis_m_area": "area_km2",
                    }
                ).drop(columns=['status'])
            ).assign(source="protected_planet"),
            (
                mpaatlas_intermediate.pipe(calculate_area)
                .pipe(extract_wdpaid_mpaatlas)
                .pipe(separate_parent_iso, iso_column="location_i")
                .rename(
                    columns={
                        "location_i": "iso",
                        "wdpa_id": "wdpa_pid",
                        "designatio": "desig_eng",
                    }
                )
            ).assign(source="mpaatlas"
            ).astype({"mpa_zone_i": "Int64"}),
        ],
        ignore_index=True,
    )
    .reset_index(drop=True)
    .replace(
        {
            "iso": {
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
            }
        }
    )
    .sort_values(by=["wdpa_pid", "wdpa_pid", "source"], ascending=[True, True, False])
)

In [12]:
#  to be run if things change a lot in the future
# iucn_cat = pd.DataFrame(
#     {"slug": init_table.iucn_cat.dropna().unique(), "name": init_table.iucn_cat.dropna().unique()},
#     index=pd.Index(np.arange(1, len(init_table.iucn_cat.dropna().unique()) + 1)),
# )
# iucn_cat.to_csv(pipe_dir.get_processed_step_path(current_step).joinpath("iucn_categories.csv"), index=True)

iucn_cat = pd.read_csv(
    pipe_dir.get_processed_step_path(current_step).joinpath("iucn_categories.csv"), index_col=0
)

In [259]:
mpa_table = (
    init_table.pipe(add_bbox, "bbox")
    .pipe(define_is_child)
    .pipe(set_child_id)
    .sort_values(by=["wdpaid", "is_child"], ascending=[True, True])
    .reset_index(drop=True)
    .pipe(add_total_marine_area)
    .pipe(calculate_coverage_percentage_pa)
    .pipe(add_environment)
    .pipe(
        output,
        iso_column="iso",
        rep_d={
            "status": {
                "Adopted": 4,
                "implemented": 6,
                "Established": 6,
                "Designated": 5,
                "Proposed": 3,
                "Inscribed": 3,
                "unknown": 1,
            },
            "pa_def": {"0": 2, "1": 1},
            "year": {0: pd.NA},
            "iucn_cat": dict(
                iucn_cat[["slug"]]
                .reset_index(drop=False)
                .iloc[:, [1, 0]]
                .to_dict(orient="tight")["data"]
            ),
            "source": {"protected_planet": 3, "mpaatlas": 1},
            "protection": {
                "full": 3,
                "light": 4,
                "incompatible": 5,
                "high": 6,
                "minimal": 7,
                "unknown": 8,
                "unknown/to be determined": 8,
            },
            "establishm": {
                "actively managed": 4,
                "implemented": 6,
                "designated": 5,
                "Designated": 5,
                "proposed or committed": 3,
                "Proposed": 3,
                "Inscribed": 3,
                "Established": 5,
                "Adopted": 5,
                "unknown": 1,
            },
        },
        rename={
            "pa_def": "protection_status",
            "area_km2": "area",
            "iucn_cat": "pa_iucn_category",
            "desig_eng": "designation",
            "protection": "mpaa_protection_level",
            "establishm": "mpaa_establishment_stage",
            "source": "data_source",
        },
        drop_cols=["geometry", "protecti_1","mpa_zone_i", "iso", "total_marine_area"]
    )
    .pipe(add_child_parent_relationship)
    .astype(
        {
            "year": "Int32",
            "pa_iucn_category": "Int64",
            "protection_status": "Int64",
        }
    )
    .query("coverage <= 100") 
    .sort_index()
)

In [260]:
# Validate and save
PAsSchema(mpa_table[mpa_table.location.notna()]).to_csv(output_file_mpas, index=True)

In [None]:
# todo investigate the issue with area as null

In [None]:
# batch_export(
#     mpa_table[mpa_table.area.notna()],
#     5000,
#     PAsSchema,
#     pipe_dir.get_processed_step_path(current_step),
#     "mpa_detail",
#     format="json",
#     strapi_colection=strapi_collection_mpas,
# )

In [None]:
# # This code is to be able to identify groups that has wdpa_pid so in the future if needed we could combine the group geometries to generate a wdpa coverage geometry
# init_table[
#     (
#         init_table.sort_values(by=["wdpaid", "source"], ascending=[True, False])
#         .groupby("wdpaid")
#         .transform("size")
#         .gt(1)
#     )
#     & (init_table.wdpa_pid.str.extract(r"([A-Za-z]+)", expand=False).notna())
# ].groupby("wdpaid")
# .geometry.apply(lambda x: x.union_all())

#### upload data to strapi

In [None]:
# strapi.deleteCollectionData("mpa", list(range(1, 20914)))

<helpers.strapi.Strapi at 0x7fda8ddb8860>

In [None]:
# for i in range(0, 4):
#     strapi.importCollectionData(
#         strapi_collection_mpas,
#         mpa_folder.joinpath(f"mpa_detail_{i}.csv"),
#     )

### Terrestrial pas - detail table data

1- lower case the columns   
2- separate location that its regime is in dispute or on join regime 
3- remove ATA and ABNJ because Protected planet doesn't include stats for ATA and ABNJ is marine 
4- rename columns for merge   
5- identify child resources and set them as childs  
6- calculate bbox  
7- set child resources  
8- prepare output for batch export  
9- upload data to strapi  

In [4]:
pipe = "mpa-terrestrial"
strapi_collection_mpas = "mpa-terrestrial"

pipe_dir = FileConventionHandler(pipe)
pipe_dir_gadm = FileConventionHandler("gadm")
output_file_tpas = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_detail.csv")

# Download the protected atlas file && unzip it
download_and_unzip_if_needed(pipe_dir, prev_step, mysettings)
# Download the mpaatlas file 
download_and_unzip_if_needed(pipe_dir_gadm, prev_step, mysettings)

/home/sofia/dev/skytruth-30x30/data/data/mpa-terrestrial/processed/mpa-terrestrial_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpa-terrestrial/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/gadm_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/preprocess


PosixPath('/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/preprocess')

In [9]:
tpa_intermediate = gpd.read_file(pipe_dir.get_step_fmt_file_path(prev_step, "gpkg")).pipe(
    clean_geometries
)

In [13]:
iucn_cat = pd.read_csv(
    pipe_dir.get_processed_step_path(current_step).joinpath("iucn_categories.csv"), index_col=0
)

In [10]:
init_table = (
    pd.concat(
        [
            (
                tpa_intermediate.pipe(columns_to_lower)
                .pipe(separate_parent_iso, iso_column="parent_iso")
                .query("parent_iso != 'ATA' and parent_iso != 'ABNJ'")
                .rename(
                    columns={
                        "parent_iso": "iso",
                        "status_yr": "year",
                        "gis_area": "protected_area",
                    }
                ).drop(columns=['status'])
            ).assign(source="protected_planet"),
        ],
        ignore_index=True,
    )
    .reset_index(drop=True)
    .replace(
        {
            "iso": {
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
            }
        }
    )
    .sort_values(by=["wdpa_pid", "wdpa_pid", "source"], ascending=[True, True, False])
)

In [25]:
tpa_table = (
    init_table.pipe(add_bbox, "bbox")
    .pipe(define_is_child)
    .pipe(set_child_id_pa)
    .sort_values(by=["wdpaid", "is_child"], ascending=[True, True])
    .reset_index(drop=True)
    .pipe(add_total_terrestrial_area)
    .pipe(calculate_coverage_percentage_pa)
    .pipe(add_environment)
    .pipe(
        output2,
        iso_column="iso",
        rep_d={
            "pa_def": {"0": 2, "1": 1},
            "year": {0: pd.NA},
            "iucn_cat": dict(
                iucn_cat[["slug"]]
                .reset_index(drop=False)
                .iloc[:, [1, 0]]
                .to_dict(orient="tight")["data"]
            ),
            "source": {"protected_planet": 3},
        },
        rename={
            "pa_def": "protection_status",
            "protected_area": "area",
            "iucn_cat": "pa_iucn_category",
            "desig_eng": "designation",
            "source": "data_source",
        },
        drop_cols=["geometry", "iso", "marine", "total_terrestrial_area"]
    )
    .pipe(add_child_parent_relationship)
    .astype(
        {
            "year": "Int32",
            "pa_iucn_category": "Int64",
            "protection_status": "Int64",
        }
    )
    .query("coverage <= 100") 
    .sort_index()
)

  df.replace(rep_d)


In [26]:
# Add col mpaa_protection_level and mpa_establishment_stage to the table to validate it
tpa_table['mpaa_protection_level'] = np.nan
tpa_table['mpaa_establishment_stage'] = np.nan

In [16]:
# Validate and save
PAsSchema(tpa_table[tpa_table.location.notna()]).to_csv(output_file_tpas, index=True)

In [None]:
# batch_export(
#     mpa_table[mpa_table.area.notna()],
#     5000,
#     PAsSchema,
#     pipe_dir.get_processed_step_path(current_step),
#     "mpa_detail",
#     format="json",
#     strapi_colection=strapi_collection_mpas,
# )

In [None]:
# # This code is to be able to identify groups that has wdpa_pid so in the future if needed we could combine the group geometries to generate a wdpa coverage geometry
# init_table[
#     (
#         init_table.sort_values(by=["wdpaid", "source"], ascending=[True, False])
#         .groupby("wdpaid")
#         .transform("size")
#         .gt(1)
#     )
#     & (init_table.wdpa_pid.str.extract(r"([A-Za-z]+)", expand=False).notna())
# ].groupby("wdpaid")
# .geometry.apply(lambda x: x.union_all())

In [None]:
# strapi.deleteCollectionData("mpa", list(range(1, 20914)))

<helpers.strapi.Strapi at 0x7fda8ddb8860>

In [None]:
# for i in range(0, 4):
#     strapi.importCollectionData(
#         strapi_collection_mpas,
#         mpa_folder.joinpath(f"mpa_detail_{i}.csv"),
#     )

### PA coverage - terrestrial

In [4]:
pipe = "mpa-terrestrial"
step = "preprocess"
strapi_collection_mpas = "mpa-terrestrial"

pipe_dir = FileConventionHandler(pipe)
pipe_dir_gadm = FileConventionHandler("gadm")

working_folder = FileConventionHandler(pipe)
input_path = working_folder.pipe_raw_path
temp_working_path = working_folder.get_temp_file_path(step)
output_file_sjoin = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_sjoin.shp")
output_file_dissolve = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_dissolve.csv")
output_file_tpas = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_detail.csv")

# Download the protected atlas file && unzip it
download_and_unzip_if_needed(pipe_dir, prev_step, mysettings)
# Download the mpaatlas file 
download_and_unzip_if_needed(pipe_dir_gadm, prev_step, mysettings)

/home/sofia/dev/skytruth-30x30/data/data/mpa-terrestrial/processed/mpa-terrestrial_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpa-terrestrial/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/gadm_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/preprocess


PosixPath('/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/preprocess')

In [9]:
# # Load the data
# wdpa = gpd.read_file(pipe_dir.get_step_fmt_file_path(prev_step, "gpkg")).pipe(
#     clean_geometries
# )
# gadm = gpd.read_file(pipe_dir_gadm.get_step_fmt_file_path(prev_step, "shp")).pipe(clean_geometries)

# gadm.sindex
# wdpa.sindex

<geopandas.sindex.SpatialIndex at 0x7f706660eb40>

In [None]:
# # Spatial join using overlay
# wdpa_subset = wdpa[
#     ~(
#         (wdpa.bounds.minx < -181)
#         | (wdpa.bounds.miny < -91)
#         | (wdpa.bounds.maxx > 181)
#         | (wdpa.bounds.maxy > 91)
#     )
# ].reset_index(drop=True)

# sjoin_gdf = await spatial_join(wdpa_subset, gadm)
# sjoin_gdf.rename(columns={"GID_0": "iso_3"}, inplace=True)

In [11]:
# # test that we have not produce duplicates
# sjoin_gdf.loc[sjoin_gdf.duplicated(subset=["WDPA_PID", "iso_3"], keep=False)].sort_values(
#     "WDPA_PID"
# )

Unnamed: 0,WDPAID,WDPA_PID,PA_DEF,NAME,DESIG_ENG,IUCN_CAT,MARINE,GIS_AREA,STATUS,STATUS_YR,PARENT_ISO,COUNTRY,iso_3,area_km2,geometry


In [12]:
# sjoin_gdf = filter_by_exluding_propossed_mpas(sjoin_gdf)
# len(sjoin_gdf)

289352

In [13]:
# # Save the spatial join
# sjoin_gdf.to_file(output_file_sjoin, driver="ESRI Shapefile")

INFO:pyogrio._io:Created 289,352 records


In [5]:
sjoin_gdf = gpd.read_file(output_file_sjoin)
sjoin_gdf["STATUS_YR"] = sjoin_gdf["STATUS_YR"].astype("Int64")

In [15]:
# # Calculate wdpa cumulative counts and pa and oecm percentages
cumulative_counts = cumulative_pa_def_counts(sjoin_gdf)
cumulative_counts

PA_DEF,iso_3,year,1,0,protected_areas_count
0,AFG,2010,10,0.0,10.0
1,AFG,2011,10,0.0,10.0
2,AFG,2012,10,0.0,10.0
3,AFG,2013,10,0.0,10.0
4,AFG,2014,10,0.0,10.0
...,...,...,...,...,...
2884,ZWE,2020,229,0.0,229.0
2885,ZWE,2021,229,0.0,229.0
2886,ZWE,2022,229,0.0,229.0
2887,ZWE,2023,229,0.0,229.0


In [None]:
# # Dissolve geometries to calculate the coverage
# data = await process_grid(sjoin_gdf)

In [17]:
# tpa = pd.concat(data, ignore_index=True).drop(columns=['STATUS_YR', 'index']).rename(columns={'area': 'protected_area'})
# tpa.head(5)

Unnamed: 0,iso_3,year,protected_area
0,ATA,2017,1395.028044
1,ATA,2018,1395.028044
2,ATA,2019,1395.028044
3,ATA,2020,1395.028044
4,ATA,2021,1395.028044


In [20]:
# # Group by 'iso_3' and 'year' and sum the 'area'
# tpa_grouped = tpa.groupby(['iso_3', 'year'], as_index=False)['protected_area'].sum()
# tpa_grouped.reset_index(drop=True, inplace=True)
# tpa_grouped.head(5)

Unnamed: 0,iso_3,year,protected_area
0,AFG,2010,1078.918622
1,AFG,2011,1078.918622
2,AFG,2012,1078.918622
3,AFG,2013,1078.918622
4,AFG,2014,1078.918622


In [21]:
# # save to csv
# tpa_grouped.to_csv(output_file_dissolve, index=False)

In [17]:
tpa_grouped = pd.read_csv(output_file_dissolve)
tpa_grouped.head(5)

Unnamed: 0,iso_3,year,protected_area
0,AFG,2010,1078.918622
1,AFG,2011,1078.918622
2,AFG,2012,1078.918622
3,AFG,2013,1078.918622
4,AFG,2014,1078.918622


In [63]:
# Add pa and oecm counts to the coverage table
coverage = (
    pd.merge(tpa_grouped, cumulative_counts, on=['iso_3', 'year'], how='left')
    .pipe(add_region_iso, "iso_3")
    .pipe(calculate_stats_cov_pa, ["year"], "iso_3")
    .pipe(calculate_pa_def_percentages)
    .pipe(add_total_terrestrial_area)
    .pipe(calculate_coverage_percentage_pa)
    .pipe(calculate_global_contribution)
    .pipe(add_is_last_year)
    .pipe(add_environment)
)

NewProtectedAreaExtentSchema(
    coverage.pipe(
        output2,
        "iso_3",
        {},
        {},
        ["iso_3", 'total_terrestrial_area'],
    )
).to_csv(
    output_file_tpas,
    index=True,
)

coverage

Unnamed: 0,year,iso_3,protected_area,protected_areas_count,oecms,pas,total_terrestrial_area,coverage,global_contribution,is_last_year,environment
0,2010,AF,3.636311e+06,7272.0,0.0,100.0,29993094.71,12.123827,2.694465,False,terrestrial
1,2010,AS,2.040713e+06,24761.0,0.0,100.0,31625555.58,6.452734,1.512145,False,terrestrial
2,2010,AT,1.108333e+02,2.0,0.0,100.0,12088229.65,0.000917,0.000082,False,terrestrial
3,2010,EU,4.303722e+06,116101.0,0.0,100.0,30037571.37,14.327795,3.189009,False,terrestrial
4,2010,,2.006295e+06,52176.0,0.0,100.0,19371151.92,10.357127,1.486642,False,terrestrial
...,...,...,...,...,...,...,...,...,...,...,...
2989,2024,YEM,5.145397e+03,15.0,0.0,100.0,453741.18,1.133994,0.003813,True,terrestrial
2990,2024,ZAF,1.143850e+05,1631.0,0.0,100.0,1221327.52,9.365631,0.084758,True,terrestrial
2991,2024,ZMB,2.929805e+05,557.0,0.0,100.0,753990.33,38.857330,0.217095,True,terrestrial
2992,2024,ZNC,2.779983e+00,8.0,0.0,100.0,3314.08,0.083884,0.000002,True,terrestrial


### Combine marine and terrestrial - Detail table

In [17]:
pipe_mar = "mpa"
pipe_ter = "mpa-terrestrial"
pipe_pa = "pa"
step = "preprocess"


pipe_dir_mar = FileConventionHandler(pipe_mar)
pipe_dir_ter = FileConventionHandler(pipe_ter)
pipe_dir_pa = FileConventionHandler(pipe_pa)

input_path_mar = pipe_dir_mar.get_processed_step_path(current_step).joinpath("mpa_detail.csv")
input_path_ter = pipe_dir_ter.get_processed_step_path(current_step).joinpath("tpa_detail.csv")
output_file_pa = pipe_dir_pa.get_processed_step_path(current_step).joinpath("pa_detail.csv")

In [29]:
mpa_table = pd.read_csv(input_path_mar)
tpa_table = pd.read_csv(input_path_ter)

In [30]:
# Create final table with all the data
final_table = pd.concat([mpa_table, tpa_table])
final_table.index = range(1, len(final_table) + 1)
final_table.index.name = 'id'
final_table.head(2)

Unnamed: 0_level_0,id,wdpaid,wdpa_pid,protection_status,name,designation,pa_iucn_category,year,area,data_source,mpaa_establishment_stage,mpaa_protection_level,bbox,is_child,child_id,coverage,environment,location,children
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,1,1.0,1,1.0,Diamond Reef and Salt Fish Tail Reef,Marine Reserve,1.0,1973.0,14.636135,3,,,"[-61.88691617799998, 17.184972703000028, -61.8...",False,1,0.013119,marine,15,
2,2,2.0,2,1.0,Palaster Reef,Marine Reserve,1.0,1973.0,3.845623,3,,,"[-61.771742115999984, 17.520006550999994, -61....",False,2,0.003447,marine,15,


In [31]:
PAsSchema(final_table[final_table.location.notna()]).to_csv(output_file_pa, index=True)

SchemaError: Error while coercing 'bbox' to type typing.List[float]: Could not coerce <class 'pandas.core.series.Series'> data_container into type typing.List[float]:
         index failure_case
0            1         <NA>
1            2         <NA>
2            3         <NA>
3            4         <NA>
4            5         <NA>
...        ...          ...
306118  306119         <NA>
306119  306120         <NA>
306120  306121         <NA>
306121  306122         <NA>
306122  306123         <NA>

[306123 rows x 2 columns]

In [None]:
remote_path = 'vizzuality_processed_data/strapi_tables/pa.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file_pas,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [65]:
pipe_mar = "mpa"
pipe_ter = "mpa-terrestrial"
step = "preprocess"


pipe_dir_mar = FileConventionHandler(pipe_mar)
pipe_dir_ter = FileConventionHandler(pipe_ter)

input_path_mar = pipe_dir_mar.get_processed_step_path(current_step).joinpath("mpa_detail.csv")
input_path_ter = pipe_dir_ter.get_processed_step_path(current_step).joinpath("tpa_detail.csv")
output_file_pa = pipe_dir_ter.get_processed_step_path(current_step).joinpath("pa_detail.csv")

In [66]:
ter = pd.read_csv(input_path_ter)
mar = pd.read_csv(input_path_mar)

In [67]:
ter.columns

Index(['id', 'year', 'protected_area', 'protected_areas_count', 'oecms', 'pas',
       'coverage', 'global_contribution', 'is_last_year', 'environment',
       'location'],
      dtype='object')

In [68]:
mar.columns

Index(['id', 'wdpaid', 'wdpa_pid', 'protection_status', 'name', 'designation',
       'pa_iucn_category', 'year', 'area', 'data_source',
       'mpaa_establishment_stage', 'mpaa_protection_level', 'bbox', 'is_child',
       'child_id', 'coverage', 'environment', 'location', 'children'],
      dtype='object')

In [61]:
locations_code.isna().sum()

location    0
code        0
dtype: int64

In [57]:
# show rows with null values in locations_code
locations_code[locations_code.isna().any(axis=1)]

Unnamed: 0,location,code
6,7,


In [None]:
coverage = (
    final_data.pipe(calculate_global_area, ["year", "PA_DEF"], {"area": "sum"}, "iso_3")
    .pipe(separate_parent_iso, "iso_3")
    .pipe(add_region_iso, "iso_3")
    .replace(
        {
            "iso_3": {
                "ATA": "ABNJ",
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
                "GIB": "GBR",
            }
        }
    )
    .pipe(calculate_stats_cov, ["year", "PA_DEF"], "iso_3").astype({"PA_DEF": int})
    .pipe(add_pa_oecm_percentages)
    .pipe(add_total_marine_area)
    .pipe(coverage_stats2)
    .pipe(calculate_coverage_percentage_mpa)
    .pipe(calculate_global_contribution)
    .pipe(add_is_last_year)
    .pipe(add_environment)
)


NewProtectedAreaExtentSchema(
    coverage.pipe(
        output,
        "iso_3",
        {},
        {},
        ["area", "iso_3", 'total_marine_area'],
    )
).to_csv(
    output_file,
    index=True,
)
coverage.head(2)

In [None]:
result_oecms = (
    sjoin_gdf.groupby(["iso_3", "PA_DEF"])
    .agg({"PA_DEF": "count"})
    .rename(columns={"PA_DEF": "count"})
    .reset_index()
    .pivot(index="GID_0", columns="PA_DEF", values="count")
    .fillna(0)
    .reset_index()
    .rename(columns={"0": "oecm", "1": "pa"})
)
# ).reset_index().pivot(index="iso_3", columns="PA_DEF", values="count").reset_index(names=["PA_DEF"], level=0, drop=True)

In [None]:
result_oecms["oecm_perc"] = result_oecms["oecm"] / (result_oecms["oecm"] + result_oecms["pa"])

In [None]:
result_oecms.sort_values("pa", ascending=False).head(10)

PA_DEF,iso_3,oecm,pa,oecm_perc
180,USA,0.0,50674.0,0.0
161,SWE,0.0,30813.0,0.0
44,DEU,0.0,23703.0,0.0
55,EST,0.0,20579.0,0.0
57,FIN,0.0,18427.0,0.0
29,CAN,2.0,12566.0,0.000159
61,GBR,0.0,11712.0,0.0
9,AUS,0.0,11154.0,0.0
30,CHE,0.0,10632.0,0.0
130,NZL,0.0,10205.0,0.0


In [None]:
result_area = pd.concat(data)[['iso_3', 'year', 'area']].groupby(['iso_3', 'year']).sum().reset_index()

In [None]:
result = result_area.merge(result_oecms, on="iso_3")

In [None]:
# save sjoin_gdf to file
sjoin_gdf.to_file(pipe_dir.get_processed_step_path(current_step).joinpath("tpa_sjoin.shp"), driver="ESRI Shapefile")

In [9]:
sjoin_gdf = gpd.read_file(pipe_dir.get_processed_step_path(current_step).joinpath("tpa_sjoin.shp")).pipe(clean_geometries)

In [10]:
sjoin_gdf.columns

Index(['WDPAID', 'WDPA_PID', 'PA_DEF', 'NAME', 'DESIG_ENG', 'IUCN_CAT',
       'MARINE', 'GIS_AREA', 'STATUS', 'STATUS_YR', 'PARENT_ISO', 'index_righ',
       'COUNTRY', 'GID_0', 'area_km2', 'geometry'],
      dtype='object')

In [218]:
result_oecms = (
    sjoin_gdf.groupby(["GID_0", "PA_DEF"])
    .agg({"PA_DEF": "count"})
    .rename(columns={"PA_DEF": "count"})
    .reset_index()
    .pivot(index="GID_0", columns="PA_DEF", values="count")
    .fillna(0)
    .reset_index()
    .rename(columns={"0": "oecm", "1": "pa"})
)
# ).reset_index().pivot(index="iso_3", columns="PA_DEF", values="count").reset_index(names=["PA_DEF"], level=0, drop=True)

result_oecms.head(10)

PA_DEF,GID_0,oecm,pa
0,AFG,0.0,25.0
1,AGO,0.0,37.0
2,ALB,0.0,117.0
3,AND,0.0,23.0
4,ARE,0.0,54.0
5,ARG,0.0,403.0
6,ARM,0.0,68.0
7,ATA,0.0,9.0
8,ATG,0.0,10.0
9,AUS,0.0,11234.0


In [None]:
# async def process_mpa_data(
#     gdf: gpd.GeoDataFrame, loop: list[int], by: list[str], aggfunc: dict
# ) -> pd.DataFrame:
#     """process protected planet data. relevant for acc coverage extent by year indicator."""
#     # we split the data by =< year so we can acumulate the coverage
#     base = split_by_year(gdf)

#     result_to_iter = pd.concat(base, ignore_index=True).copy()

#     with tqdm(total=len(loop)) as pbar:  # we create a progress bar
#         new_df = await asyncio.gather(
#             *(spatial_dissolve_chunk(year, result_to_iter, pbar, by, aggfunc) for year in loop)
#         )
#     return pd.concat(
#         [base[0].pipe(calculate_area, "area", None).drop(columns=["geometry"]), *new_df],
#         ignore_index=True,
#     )

In [None]:
# final_data = await process_mpa_data(
#     eez_mpas_data_join.pipe(add_location_iso).pipe(assign_iso3),
#     range(2011, time.localtime().tm_year + 1),
#     ["PA_DEF", "iso_3"],
#     {"protectedAreasCount": "sum"},
# )
# coverage = (
#     final_data.pipe(calculate_global_area, ["year", "PA_DEF"], {"area": "sum"}, "iso_3")
#     .pipe(separate_parent_iso, "iso_3")
#     .pipe(add_region_iso, "iso_3")
#     .replace(
#         {
#             "iso_3": {
#                 "ATA": "ABNJ",
#                 "COK": "NZL",
#                 "IOT": "GBR",
#                 "NIU": "NZL",
#                 "SHN": "GBR",
#                 "SJM": "NOR",
#                 "UMI": "USA",
#                 "NCL": "FRA",
#                 "GIB": "GBR",
#             }
#         }
#     )
#     .pipe(calculate_stats_cov, ["year", "PA_DEF"], "iso_3").astype({"PA_DEF": int})
#     .pipe(add_pa_oecm_percentages)
#     .pipe(add_total_marine_area)
#     .pipe(coverage_stats2)
#     .pipe(calculate_coverage_percentage_mpa)
#     .pipe(calculate_global_contribution)
#     .pipe(add_is_last_year)
#     .pipe(add_environment)
# )


# NewProtectedAreaExtentSchema(
#     coverage.pipe(
#         output,
#         "iso_3",
#         {},
#         {},
#         ["area", "iso_3", 'total_marine_area'],
#     )
# ).to_csv(
#     output_file,
#     index=True,
# )
# coverage.head(2)