In [194]:
%load_ext autoreload
%autoreload 2

In [1]:
import logging
import sys
from pathlib import Path
import time
import pandas as pd
import geopandas as gpd
import numpy as np
import json
import dotenv
import os
import logging
from typing import Tuple, List, Union
import sys
from pathlib import Path
import pandas as pd
import geopandas as gpd
import numpy as np
import asyncio
from tqdm.asyncio import tqdm
from itertools import product
from shapely.geometry import box

dotenv.load_dotenv()

scripts_dir = Path(".").joinpath("src")
if scripts_dir not in sys.path:
    sys.path.insert(0, scripts_dir.resolve().as_posix())

from helpers.strapi import Strapi
from helpers.settings import get_settings, Settings
from helpers.file_handler import FileConventionHandler
from helpers.utils import download_and_unzip_if_needed, writeReadGCP, make_archive

from pipelines.output_schemas import (
    FPLSchema,
    ProtectionLevelSchema,
    MPAsSchema,
    HabitatsSchema,
    LocationSchema,
    ProtectedAreaExtentSchema,
)
from pipelines.processors import (
    add_envelope,
    add_location_iso,
    expand_multiple_locations,
    add_region_iso,
    calculate_eez_area,
    add_bbox,
    add_groups_and_members,
    add_location_name,
    output,
    clean_geometries,
    filter_by_exluding_propossed_mpas,
    spatial_join,
    process_mpa_data,
    assign_iso3,
    calculate_global_area,
    separate_parent_iso,
    calculate_stats_cov,
    coverage_stats,
    mpaatlas_filter_stablishment,
    process_mpaatlas_data,
    calculate_stats,
    fix_monaco,
    batch_export,
    calculate_area,
    define_is_child,
    set_child_id,
    add_child_parent_relationship,
    columns_to_lower,
    extract_wdpaid_mpaatlas,
    simplify_async,
    process_tpa_data,
    get_matches,
    repair_geometry, 
    arrange_dimensions, 
)
from pipelines.utils import background

logging.basicConfig(level=logging.DEBUG)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("fiona").setLevel(logging.WARNING)
logger = logging.getLogger("notebook")

In [2]:
mysettings = get_settings()
prev_step = "preprocess"
current_step = "stats"

In [3]:
# # Strapi setup
# strapi = Strapi(url=mysettings.STRAPI_URL)
# strapi.login(jwt=mysettings.STRAPI_JWT)

#### General functions

In [3]:
from typing import List, Dict
import pandera as pa
from pandera.typing import Index, Series

def change_ata_to_abnj(df):
    """
    Changes values in the parent_iso column from 'ATA' to 'ABNJ' as there is no 'ATA' stats in Protected Planet.
    """
    # Count the occurrences of 'ATA'
    count_changes = df['parent_iso'].value_counts().get('ATA', 0)
    
    # Replace 'ATA' with 'ABNJ'
    df['parent_iso'] = df['parent_iso'].replace('ATA', 'ABNJ')

    return df


def add_total_marine_area(df):
    # Read the JSON file
    with open(scripts_dir.joinpath('data_commons/data/locations_all.json'), 'r') as f:
        locations_data = json.load(f)
    
    # Access the nested dictionary
    locations_dict = locations_data.get('data', {}).get('api::location.location', {})
    
    # Create a lookup dictionary from the nested dictionary
    marine_area_lookup = {item['code']: item['total_marine_area'] for item in locations_dict.values()}
    
    # Identify the column that contains the word 'iso'
    iso_column = [col for col in df.columns if 'iso' in col][0]

    # Perform the mapping using the identified column
    df['total_marine_area'] = df[iso_column].map(marine_area_lookup)
    
    return df

def add_total_terrestrial_area(df):
    # Read the JSON file
    with open(scripts_dir.joinpath('data_commons/data/locations_all.json'), 'r') as f:
        locations_data = json.load(f)
    
    # Access the nested dictionary
    locations_dict = locations_data.get('data', {}).get('api::location.location', {})
    
    # Create a lookup dictionary from the nested dictionary
    marine_area_lookup = {item['code']: item['total_terrestrial_area'] for item in locations_dict.values()}
    
    # Identify the column that contains the word 'iso'
    iso_column = [col for col in df.columns if 'iso' in col][0]

    # Perform the mapping using the identified column
    df['total_terrestrial_area'] = df[iso_column].map(marine_area_lookup)
    
    return df

def add_mpa_oecm_percentages(df):
    # Calculate the total protectedAreasCount for each year and iso_3
    total_counts = df.groupby(['year', 'iso_3'])['protectedAreasCount'].transform('sum')

    # Calculate the counts for PA_DEF == 0 and PA_DEF == 1
    df['oecm_count'] = df['protectedAreasCount'].where(df['PA_DEF'] == 0, 0)
    df['pa_count'] = df['protectedAreasCount'].where(df['PA_DEF'] == 1, 0)

    # Calculate the percentages
    df['oecms'] = df.groupby(['year', 'iso_3'])['oecm_count'].transform('sum') / total_counts * 100
    df['pas'] = df.groupby(['year', 'iso_3'])['pa_count'].transform('sum') / total_counts * 100

    # Aggregate the results and fill NaN values with 0
    final_df = df.groupby(['year', 'iso_3']).agg(
        area=('area', 'sum'),
        protected_areas_count=('protectedAreasCount', 'sum'),
        oecms=('oecms', 'first'),
        pas=('pas', 'first')
    ).reset_index().fillna(0)

    return final_df

def calculate_pa_def_percentages(df: pd.DataFrame, iso_col: str = "iso_3") -> pd.DataFrame:
    """
    Calculate the percentages for each PA_DEF value.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the cumulative counts of PA_DEF values.
    iso_col (str): The column name for the iso_3 values. Default is "iso_3".

    Returns:
    pd.DataFrame: A DataFrame with the percentages of PA_DEF values for each iso_3 and each year.
    """
    
    df['protected_areas_count'] = df['0'] + df['1']
    df['oecms'] = (df['0'] / df['protected_areas_count']) * 100
    df['pas'] = (df['1'] / df['protected_areas_count']) * 100

    df = df.drop(columns=['0', '1'], errors='ignore')

    return df

def calculate_coverage_percentage_mpatlas(df):
    df['percentage'] = (df['area_km2'] / df['total_marine_area']) * 100
    return df

def calculate_coverage_percentage_pa(df):
    if 'total_marine_area' in df.columns:
        df['coverage'] = (df['protected_area'] / df['total_marine_area']) * 100
    elif 'total_terrestrial_area' in df.columns:
        df['coverage'] = (df['protected_area'] / df['total_terrestrial_area']) * 100
    else:
        df['coverage'] = np.nan

    return df

def calculate_global_contribution(df):
    if 'total_marine_area' in df.columns:
        df['global_contribution'] = (df['protected_area'] / 361000000) * 100
    elif 'total_terrestrial_area' in df.columns:
        df['global_contribution'] = (df['protected_area'] / 134954835) * 100
    else:
        df['global_contribution'] = np.nan
    return df

def add_is_last_year(df):
    # Find the latest year for each iso_3
    latest_years = df.groupby('iso_3')['year'].transform('max')
    
    # Create the is_last_year column
    df['is_last_year'] = (df['year'] == latest_years).astype(int)
    
    return df

def add_environment(df):
    """
    Adds a column 'environment' based on the presence of 'totalMarineArea' or 'totalLandArea'.

    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame: The DataFrame with the 'environment' column added.
    """
    if 'total_marine_area' in df.columns:
        df['environment'] = 1
    elif 'total_terrestrial_area' in df.columns:
        df['environment'] = 2
    else:
        df['environment'] = 0
    
    return df

def coverage_stats2(
    df: pd.DataFrame,
    area_col: str = "area",
    sort_vals: List[str] = ["iso_3", "year"],
) -> pd.DataFrame:
    """only relevant to get the coverage numbers for mpa"""
    return df.assign(
        protected_area=(
            df.sort_values(by=sort_vals)[area_col]
            - df.sort_values(by=sort_vals)
            .groupby(sort_vals)[area_col]
            .shift(-1, fill_value=0)
            .reset_index(drop=True)
        ).round(2),
    )

def process_mpaatlas_data(gdf: gpd.GeoDataFrame) -> pd.DataFrame:
    return (
        gdf.dissolve(by=["protecti_1", "iso_3"], aggfunc={"name": "count"})
        .reset_index()
        .pipe(calculate_area, "area_km2", None)
        .drop(columns=["geometry"])
    )

def separate_parent_iso(df: pd.DataFrame, iso_column="iso_3", separator=";") -> pd.DataFrame:
    df[iso_column] = (
        df[iso_column].str.replace(" ", "").str.replace(":", separator).str.split(separator)
    )
    return df.explode(iso_column)

def output2(
    df: pd.DataFrame, iso_column: str, rep_d: dict, rename: Dict[str, str], drop_cols: List[str]
) -> pd.DataFrame:
    """Output function formatter for the data.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        iso_column (str): The column containing the ISO codes.
        rep_d (dict): A dictionary of values to replace.
        rename (Dict[str, str]): A dictionary of columns to rename.
        drop_cols (List[str]): A list of columns to drop.

    Returns:
        pd.DataFrame: The processed DataFrame.
    """
    if iso_column:
        locations_code = pd.read_csv(
            scripts_dir.joinpath("data_commons/data/locations_code_all.csv"),
            keep_default_na=False,
            na_values=[]
        )
        df = df.join(locations_code.set_index("code"), on=iso_column, how="left")
    return (
        df.replace(rep_d)
        .rename(columns=rename)
        .drop(columns=drop_cols)
        .assign(
            id=df.index + 1,
        )
        .set_index("id")
    )

def set_child_id_pa(
    df: pd.DataFrame | gpd.GeoDataFrame, columns: list[str] = ["wdpa_pid"]
) -> pd.DataFrame | gpd.GeoDataFrame:
    return df.assign(child_id=df[columns].bfill(axis=1)[columns[0]])


def calculate_global_area_tpa(
    df: pd.DataFrame,
    gby_col: list,
    agg_ops: Dict[str, str] = {"protected_area": "sum", "1": "sum", "0": "sum", "protected_areas_count": "sum"},
    iso_column="iso_3",
) -> pd.DataFrame:
    global_area = df.groupby(gby_col).agg(agg_ops).reset_index().assign(**{iso_column: "GLOB"})
    return pd.concat([global_area, df], ignore_index=True)

def cumulative_pa_def_counts(df: pd.DataFrame, year_col: str = "STATUS_YR", pa_def_col: str = "PA_DEF", iso_col: str = "iso_3", start_year: int = 2010) -> pd.DataFrame:
    """
    Calculate the cumulative number of PA_DEF values for each iso_3 and each year starting from a given year.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    year_col (str): The column name for the year. Default is "STATUS_YR".
    pa_def_col (str): The column name for the PA_DEF values. Default is "PA_DEF".
    iso_col (str): The column name for the iso_3 values. Default is "iso_3".
    start_year (int): The starting year for cumulative counts. Default is 2010.

    Returns:
    pd.DataFrame: A DataFrame with cumulative counts of PA_DEF values for each iso_3 and each year.
    """
    
    results = []
    years = sorted(df[year_col].unique())

    for year in years:
        if year < start_year:
            continue
        cumulative_data = df[df[year_col] <= year]
        pa_def_counts = cumulative_data.groupby([iso_col, pa_def_col]).size().unstack(fill_value=0)
        pa_def_counts['year'] = year
        results.append(pa_def_counts.reset_index())

    final_results = pd.concat(results, ignore_index=True)
    final_results = final_results.fillna(0)
    final_results = final_results.groupby([iso_col, 'year']).sum().reset_index()

    final_results['protected_areas_count'] = final_results['0'] + final_results['1']

    return final_results

def calculate_global_area_tpa(
    df: pd.DataFrame,
    gby_col: list,
    agg_ops: Dict[str, str] = {"protected_area": "sum", "1": "sum", "0": "sum", "protected_areas_count": "sum"},
    iso_column="iso_3",
) -> pd.DataFrame:
    # Ensure the columns to be aggregated exist in the DataFrame
    missing_cols = [col for col in agg_ops.keys() if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in DataFrame: {missing_cols}")
    
    # Group by the specified columns and aggregate using the provided operations
    global_area = df.groupby(gby_col).agg(agg_ops).reset_index().assign(**{iso_column: "GLOB"})
    
    # Concatenate the global area DataFrame with the original DataFrame
    return pd.concat([global_area, df], ignore_index=True)

def calculate_stats_pa(
    df: pd.DataFrame, gby_col: list, iso_column: str, ops: dict[str, str] = {"protected_area": "sum"}
) -> pd.DataFrame:
    # Group by the specified columns and region, then aggregate
    regions = (
        df.groupby([*gby_col, "region"])
        .agg(ops)
        .reset_index()
        .rename(columns={"region": iso_column})
    )

    # Group by the specified columns and iso_column, then aggregate
    countries = df.groupby([*gby_col, iso_column]).agg(ops).reset_index()

    # Concatenate the results
    return pd.concat([regions, countries], ignore_index=True)

def calculate_stats_cov_pa(df: pd.DataFrame, gby_col: list, iso_column: str):
    return calculate_stats_pa(df, gby_col, iso_column, {"protected_area": "sum", "protected_areas_count": "sum", "1": "sum", "0": "sum"})


def add_region_iso2(
    df: pd.DataFrame | gpd.GeoDataFrame, iso_column
) -> pd.DataFrame | gpd.GeoDataFrame:
    
    with open(scripts_dir.joinpath('data_commons/data/regions_data2.json'), 'r') as f:
        regions = json.load(f)

    def find_region_iso(iso: str) -> Union[str, None]:
        filtered_regions = list(filter(lambda x: iso in x["country_iso_3s"], regions.get("data")))
        return filtered_regions[0]["region_iso"] if len(filtered_regions) > 0 else None

    return df.assign(region=lambda row: row[iso_column].apply(find_region_iso))

def define_childs_ids(group) -> tuple:
    if len(group) > 1:
        parent_id = group[group.is_child.eq(False)].index.values[0]
        children_ids = group[group.is_child.eq(True)].index.tolist()
        return parent_id, children_ids
    else:
        return pd.NA, pd.NA

def add_child_parent_relationship(
    df: pd.DataFrame | gpd.GeoDataFrame,
    gby: str = "wdpaid",
    cols: list = ["wdpaid", "wdpa_pid", "is_child", "data_source"],
) -> pd.DataFrame | gpd.GeoDataFrame:
    
    # Get parent and children IDs for each group
    groups = df.groupby(gby)[cols].apply(define_childs_ids)
    
    # Extract parent and children information
    relationship_df = pd.DataFrame(
        [[a, b] for a, b in groups.values], 
        columns=["parent", "children"]
    ).dropna(subset=["parent"]).set_index("parent")
    
    # Assign children IDs to the 'children' column
    df["children"] = pd.Series(relationship_df["children"], index=relationship_df.index).reindex(df.index)
    
    # Assign parent IDs to the 'parent' column for the children
    df["parent"] = pd.NA 
    for parent, children in relationship_df.itertuples(index=True):
        df.loc[children, "parent"] = parent
    
    return df


class NewProtectedAreaExtentSchema(pa.DataFrameModel):
    id: Index[int] = pa.Field(gt=0, coerce=True)
    location: Series[int] = pa.Field(gt=0, coerce=True)
    protected_area: Series[float] = pa.Field(ge=0, coerce=True)
    protected_areas_count: Series[int] = pa.Field(ge=0, coerce=True)
    oecms: Series[float] = pa.Field(ge=0, le=100, coerce=True)
    pas: Series[float] = pa.Field(ge=0, le=100, coerce=True)
    coverage: Series[float] = pa.Field(ge=0, le=100, coerce=True)
    global_contribution: Series[float] = pa.Field(ge=0, le=100, coerce=True)
    year: Series[int] = pa.Field(ge=2000, coerce=True)
    is_last_year: Series[int] = pa.Field(isin=[0, 1], coerce=True)
    environment: Series[int] = pa.Field(isin=[1, 2], coerce=True)

class NewProtectionLevelSchema(pa.DataFrameModel):
    id: Index[int] = pa.Field(gt=0, coerce=True)
    location: Series[int] = pa.Field(gt=0, coerce=True)
    mpaa_protection_level: Series[int] = pa.Field(ge=0, coerce=True)
    year: Series[int] = pa.Field(gt=1900, coerce=True)
    area: Series[float] = pa.Field(ge=0, coerce=True)
    percentage: Series[float] = pa.Field(ge=0, le=100, coerce=True)

class PAsSchema(pa.DataFrameModel):
    id: Index[int] = pa.Field(gt=0, coerce=True)
    wdpaid: Series[pd.Int64Dtype] = pa.Field(coerce=True, nullable=True)
    # child_id: Series[str] = pa.Field(coerce=True)
    name: Series[str] = pa.Field(coerce=True)
    year: Series[pd.Int32Dtype] = pa.Field(gt=1700, nullable=True)
    area: Series[float] = pa.Field(ge=0, coerce=True)
    bbox: Series[List[float]] = pa.Field(coerce=True)
    location: Series[int] = pa.Field(ge=0, coerce=True)
    protection_status: Series[int] = pa.Field(ge=0, nullable=True)
    mpaa_establishment_stage: Series[pd.Int32Dtype] = pa.Field(ge=0, nullable=True, coerce=True)
    mpaa_protection_level: Series[pd.Int32Dtype] = pa.Field(ge=0, nullable=True, coerce=True)
    iucn_category: Series[pd.Int32Dtype] = pa.Field(coerce=True, nullable=True)
    designation: Series[str] = pa.Field(coerce=True, nullable=True)
    parent: Series[pd.Int64Dtype] = pa.Field(coerce=True, nullable=True)
    children: Series[List[int]] = pa.Field(coerce=True, nullable=True)
    data_source: Series[int] = pa.Field(coerce=True)
    coverage: Series[float] = pa.Field(ge=0, le=100, nullable=True)
    environment: Series[int] = pa.Field(isin=[1, 2], coerce=True)

class PAsSchemaChunk1(pa.DataFrameModel):
    id: Index[int] = pa.Field(gt=0, coerce=True)
    wdpaid: Series[pd.Int64Dtype] = pa.Field(coerce=True, nullable=True)
    # child_id: Series[str] = pa.Field(coerce=True)
    name: Series[str] = pa.Field(coerce=True)
    year: Series[pd.Int32Dtype] = pa.Field(gt=1700, nullable=True)
    area: Series[float] = pa.Field(ge=0, coerce=True)
    bbox: Series[List[float]] = pa.Field(coerce=True)
    location: Series[int] = pa.Field(ge=0, coerce=True)
    protection_status: Series[int] = pa.Field(ge=0, nullable=True)
    mpaa_establishment_stage: Series[pd.Int32Dtype] = pa.Field(ge=0, nullable=True, coerce=True)
    mpaa_protection_level: Series[pd.Int32Dtype] = pa.Field(ge=0, nullable=True, coerce=True)
    iucn_category: Series[pd.Int32Dtype] = pa.Field(coerce=True, nullable=True)
    designation: Series[str] = pa.Field(coerce=True, nullable=True)
    children: Series[List[int]] = pa.Field(coerce=True, nullable=True)
    data_source: Series[int] = pa.Field(coerce=True)
    coverage: Series[float] = pa.Field(ge=0, le=100, nullable=True)
    environment: Series[int] = pa.Field(isin=[1, 2], coerce=True)

class PAsSchemaChunk2(pa.DataFrameModel):
    id: Index[int] = pa.Field(gt=0, coerce=True)
    parent: Series[pd.Int64Dtype] = pa.Field(coerce=True, nullable=True)

#### Code for terrestrial processing

In [None]:
# # Code for pa terrestrial processing

# def split_by_year(
#     gdf: gpd.GeoDataFrame, year_col: str = "STATUS_YR", year_val: int = 2010
# ) -> List[gpd.GeoDataFrame]:
#     """Split data by year. relevant for MPA data.(coverage indicator)"""
#     prior_2010 = (
#         gdf[gdf[year_col] <= year_val][["iso_3", "STATUS_YR", "geometry"]]
#         .dissolve(
#             by=["iso_3"],
#         )
#         .assign(year=2010)
#         .reset_index()
#     )

#     after_2010 = (
#         gdf[gdf["STATUS_YR"] > 2010][["iso_3", "STATUS_YR", "geometry"]]
#         .rename(columns={"STATUS_YR": "year"})
#     )
#     return [prior_2010, after_2010]


# def create_grid(bounds: Tuple[float, float, float, float], cell_size: int = 1) -> gpd.GeoDataFrame:
#     """Create a grid of cells for a given GeoDataFrame"""
#     minx, miny, maxx, maxy = bounds
#     x = np.arange(minx, maxx, cell_size)
#     y = np.arange(miny, maxy, cell_size)
#     polygons = [
#         {
#             "geometry": box(i, j, i + cell_size, j + cell_size),
#             "cell_id": f"{i}_{j}",
#         }
#         for i, j in product(x, y)
#     ]
#     return gpd.GeoDataFrame(polygons)


# def subdivide_grid(
#     grid_gdf: gpd.GeoDataFrame, gdf: gpd.GeoDataFrame, max_cellsize: float, max_complexity: int
# ) -> List:
#     subdivided_elements = []
#     for grid_element in grid_gdf.geometry:
#         candidates = get_matches(grid_element, gdf)
#         density = len(candidates)
#         if density > max_complexity:
            
#             subdivision_cellsize = max_cellsize / 2
#             # Subdivide the grid element recursively
#             subgrid = create_grid(grid_element.bounds, subdivision_cellsize)
#             subdivided_elements.extend(
#                 subdivide_grid(subgrid, gdf, subdivision_cellsize, max_complexity)
#             )
#         elif density > 0:
#             subdivided_elements.append(grid_element)

#     return subdivided_elements


# def create_density_based_grid(
#     gdf: gpd.GeoDataFrame, max_cellsize: int = 10, max_complexity: int = 10000
# ) -> gpd.GeoDataFrame:
#     # Get the bounds of the GeoDataFrame
#     minx, miny, maxx, maxy = gdf.total_bounds

#     # Create an initial grid
#     grid_gdf = create_grid((minx, miny, maxx, maxy), max_cellsize)

#     # Subdivide grid elements based on density and complexity
#     subdivided_elements = subdivide_grid(grid_gdf, gdf, max_cellsize, max_complexity)

#     return gpd.GeoDataFrame(geometry=subdivided_elements)


# #  TODO: refactor this so old function mantains functionality for marine areas

# def split_gdf_by_grid(gdf: gpd.GeoDataFrame, grid_gdf: gpd.GeoDataFrame):
#     result = []
#     gdf["already_processed"] = False
#     for geometry in grid_gdf.geometry:
#         candidates = get_matches(geometry, gdf)
#         subset = gdf.loc[candidates.index][~gdf["already_processed"]]
#         gdf.loc[subset.index, "already_processed"] = True
#         if not subset.empty:
#             result.append(subset.drop(columns=["already_processed"]).reset_index(drop=True).copy())
#     return result


# @background
# def spatial_join_chunk(df_large_chunk, df_small, pbar):
#     try:
#         bbox = df_large_chunk.total_bounds

#         candidates = get_matches(box(*bbox), df_small.geometry)
#         if len(candidates) > 0:
#             subset = df_small.loc[candidates.index].clip(box(*bbox))

#             result = (
#                 gpd.overlay(df_large_chunk, subset).reset_index(drop=True)
#                 .clip(subset.geometry)
#                 .reset_index(drop=True)
#             )
#             result.geometry = result.geometry.apply(repair_geometry)
#         else:
#             result = gpd.GeoDataFrame(columns=df_large_chunk.columns)
#         return result
#     except Exception as e:
#         logging.error(e)
#         return gpd.GeoDataFrame()
#     finally:
#         pbar.update(1)


# async def spatial_join(
#     geodataframe_a: gpd.GeoDataFrame, geodataframe_b: gpd.GeoDataFrame
# ) -> gpd.GeoDataFrame:
#     """Create spatial join between two GeoDataFrames."""
#     # we build the spatial index for the larger GeoDataFrame
#     smaller_dim, larger_dim = arrange_dimensions(geodataframe_a, geodataframe_b)

#     logger.info(f"Processing {len(larger_dim)} elements")

#     grid = create_density_based_grid(larger_dim, max_cellsize=10, max_complexity=5000)

#     logger.info(f"grid created with {len(grid)} cells")

#     list_of_chunks = split_gdf_by_grid(larger_dim, grid)

#     logger.info(f"grid split into {len(list_of_chunks)} chunks")

#     with tqdm(total=len(list_of_chunks)) as pbar:  # we create a progress bar
#         new_df = await asyncio.gather(
#             *(spatial_join_chunk(chunk, smaller_dim, pbar) for chunk in list_of_chunks)
#         )

#     return gpd.GeoDataFrame(pd.concat(new_df, ignore_index=True), crs=smaller_dim.crs)


# @background
# def spatial_dissolve_chunk(geometry, gdf, pbar):
#     try:
#         logger.info("Processing chunk")
#         candidates = get_matches(
#             geometry,
#             gdf.geometry,
#         )
#         subset = gdf.loc[candidates.index]

#         result = pd.concat(
#             subset.clip(geometry).pipe(split_by_year, year_col="STATUS_YR"), ignore_index=True
#         ).copy()

#         data_chunk = [
#             (
#                 result[result["year"] <= 2010]
#                 .reset_index()
#                 .pipe(calculate_area, "area", None)
#                 .drop(columns=["geometry"])
#             )
#         ]
#         for year in range(2011, 2025):
#             data_chunk.append(
#                 result[result["year"] <= year]
#                 .dissolve(
#                     by=["iso_3"],
#                 )
#                 .assign(year=year)
#                 .reset_index()
#                 .pipe(calculate_area, "area", None)
#                 .drop(columns=["geometry"])
#             )

#         return pd.concat(data_chunk, ignore_index=True)
#     except Exception as e:
#         logging.error(e)
#         return gpd.GeoDataFrame()
#     finally:
#         pbar.update(1)

# async def process_grid(gdf):
#     grid_gdf = create_density_based_grid(gdf, max_cellsize=10, max_complexity=5000)
#     logger.info(f"grid created with {grid_gdf.shape[0]} cells")

#     with tqdm(total=grid_gdf.shape[0], desc="Processing grid elements") as pbar:
#         jobs = [spatial_dissolve_chunk(geometry, gdf, pbar) for geometry in grid_gdf.geometry.values]
#         result = await asyncio.gather(*jobs)
#     return result

### Coverage stats - Mpas

We are going to use the intermediate data from eez, in order to create a dataset that can be used as a land mask.
The steps are:
1. Load eez
2. Spatial inner Join the eez dataset with the Mpas one
3. Assign the location iso
4. dissolve by location iso and cummulative year
5. calculate the area for global regions and eez countries
6. prepare the data to be ingested in strapi
7. upload the data to strapi

In [130]:
pipe = "mpa"
strapi_collection = ""

pipe_dir_eez = FileConventionHandler("eez")
pipe_dir_mpas = FileConventionHandler(pipe)
output_file = pipe_dir_mpas.get_processed_step_path(current_step).joinpath(
    "mpa_coverage.csv"
)

# Download the EEZ file && unzip it
download_and_unzip_if_needed(pipe_dir_eez, prev_step, mysettings)
# Download the mpas file && unzip it
download_and_unzip_if_needed(pipe_dir_mpas, prev_step, mysettings)

# Load the data
eez = gpd.read_file(pipe_dir_eez.get_step_fmt_file_path(prev_step, "shp")).pipe(clean_geometries)
mpas = gpd.read_file(pipe_dir_mpas.get_step_fmt_file_path(prev_step, "shp")).pipe(clean_geometries)

/home/sofia/dev/skytruth-30x30/data/data/eez/processed/eez_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/eez/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/mpa_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/preprocess


In [131]:
eez_mpas_data_join = await spatial_join(eez, mpas.pipe(filter_by_exluding_propossed_mpas))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [08:11<00:00,  1.74s/it]


In [132]:
final_data = await process_mpa_data(
    eez_mpas_data_join.pipe(add_location_iso).pipe(assign_iso3),
    range(2011, time.localtime().tm_year + 1),
    ["PA_DEF", "iso_3"],
    {"protectedAreasCount": "sum"},
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [03:22<00:00, 14.49s/it]


In [133]:
# save final data
final_data.to_csv(pipe_dir_mpas.get_processed_step_path(prev_step).joinpath("mpa_preprocessed.csv"), index=False)


In [134]:
final_data2 = final_data.copy()

coverage = (
    final_data2.pipe(calculate_global_area, ["year", "PA_DEF"], {"area": "sum"}, "iso_3")
    .pipe(separate_parent_iso, "iso_3")
    .pipe(add_region_iso, "iso_3")
    .replace(
        {
            "iso_3": {
                "ATA": "ABNJ",
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
                "GIB": "GBR",
            }
        }
    )
    .pipe(calculate_stats_cov, ["year", "PA_DEF"], "iso_3").astype({"PA_DEF": int})
    .pipe(add_mpa_oecm_percentages)
    .pipe(add_total_marine_area)
    .pipe(coverage_stats2)
    .pipe(calculate_coverage_percentage_pa)
    .pipe(calculate_global_contribution)
    .pipe(add_is_last_year)
    .pipe(add_environment)
)
coverage.head(2)


Unnamed: 0,year,iso_3,area,protected_areas_count,oecms,pas,total_marine_area,protected_area,coverage,global_contribution,is_last_year,environment
0,2010,ABNJ,996236.125498,29.0,0.0,100.0,212881389.0,996236.13,0.467977,0.275966,0,1
1,2010,AF,129790.939474,427.0,2.34192,97.65808,14878058.0,129790.94,0.872365,0.035953,0,1


In [135]:
NewProtectedAreaExtentSchema(
    coverage.pipe(
        output,
        "iso_3",
        {},
        {},
        ["area", "iso_3", 'total_marine_area'],
    )
).to_csv(
    output_file,
    index=True,
)

In [136]:
remote_path = 'vizzuality_processed_data/strapi_tables/mpa_coverage.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [None]:
# strapi_collection = "protection-coverage-stat"

In [None]:
# strapi.deleteCollectionData(strapi_collection, list(range(1, 2300)))

<helpers.strapi.Strapi at 0x7fda8ddb8860>

In [None]:
# strapi.importCollectionData(
#     strapi_collection,
#     output_file,
# )

### Coverage stats - terrestrial

In [118]:
pipe = "mpa-terrestrial"
step = "preprocess"
strapi_collection_mpas = "mpa-terrestrial"

pipe_dir = FileConventionHandler(pipe)
pipe_dir_gadm = FileConventionHandler("gadm")

working_folder = FileConventionHandler(pipe)
input_path = working_folder.pipe_raw_path
temp_working_path = working_folder.get_temp_file_path(step)
output_file_sjoin = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_sjoin.shp")
output_file_dissolve = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_dissolve.csv")
output_file_tpas = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_coverage.csv")

# Download the protected atlas file && unzip it
download_and_unzip_if_needed(pipe_dir, prev_step, mysettings)
# Download the mpaatlas file 
download_and_unzip_if_needed(pipe_dir_gadm, prev_step, mysettings)

/home/sofia/dev/skytruth-30x30/data/data/mpa-terrestrial/processed/mpa-terrestrial_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpa-terrestrial/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/gadm_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/preprocess


PosixPath('/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/preprocess')

In [None]:
# # Load the data
# wdpa = gpd.read_file(pipe_dir.get_step_fmt_file_path(prev_step, "gpkg")).pipe(
#     clean_geometries
# )
# gadm = gpd.read_file(pipe_dir_gadm.get_step_fmt_file_path(prev_step, "shp")).pipe(clean_geometries)

# gadm.sindex
# wdpa.sindex

<geopandas.sindex.SpatialIndex at 0x7f706660eb40>

In [None]:
# # Spatial join using overlay
# wdpa_subset = wdpa[
#     ~(
#         (wdpa.bounds.minx < -181)
#         | (wdpa.bounds.miny < -91)
#         | (wdpa.bounds.maxx > 181)
#         | (wdpa.bounds.maxy > 91)
#     )
# ].reset_index(drop=True)

# sjoin_gdf = await spatial_join(wdpa_subset, gadm)
# sjoin_gdf.rename(columns={"GID_0": "iso_3"}, inplace=True)

In [None]:
# # test that we have not produce duplicates
# sjoin_gdf.loc[sjoin_gdf.duplicated(subset=["WDPA_PID", "iso_3"], keep=False)].sort_values(
#     "WDPA_PID"
# )

Unnamed: 0,WDPAID,WDPA_PID,PA_DEF,NAME,DESIG_ENG,IUCN_CAT,MARINE,GIS_AREA,STATUS,STATUS_YR,PARENT_ISO,COUNTRY,iso_3,area_km2,geometry


In [None]:
# sjoin_gdf = filter_by_exluding_propossed_mpas(sjoin_gdf)
# len(sjoin_gdf)

289352

In [None]:
# # Save the spatial join
# sjoin_gdf.to_file(output_file_sjoin, driver="ESRI Shapefile")

INFO:pyogrio._io:Created 289,352 records


In [119]:
sjoin_gdf = gpd.read_file(output_file_sjoin)
sjoin_gdf["STATUS_YR"] = sjoin_gdf["STATUS_YR"].astype("Int64")

In [120]:
# # Calculate wdpa cumulative counts and pa and oecm percentages
cumulative_counts = cumulative_pa_def_counts(sjoin_gdf)
cumulative_counts

PA_DEF,iso_3,year,1,0,protected_areas_count
0,AFG,2010,10,0.0,10.0
1,AFG,2011,10,0.0,10.0
2,AFG,2012,10,0.0,10.0
3,AFG,2013,10,0.0,10.0
4,AFG,2014,10,0.0,10.0
...,...,...,...,...,...
2884,ZWE,2020,229,0.0,229.0
2885,ZWE,2021,229,0.0,229.0
2886,ZWE,2022,229,0.0,229.0
2887,ZWE,2023,229,0.0,229.0


In [None]:
# # Dissolve geometries to calculate the coverage
# data = await process_grid(sjoin_gdf)

In [None]:
# tpa = pd.concat(data, ignore_index=True).drop(columns=['STATUS_YR', 'index']).rename(columns={'area': 'protected_area'})
# tpa.head(5)

In [None]:
# # Group by 'iso_3' and 'year' and sum the 'area'
# tpa_grouped = tpa.groupby(['iso_3', 'year'], as_index=False)['protected_area'].sum()
# tpa_grouped.reset_index(drop=True, inplace=True)
# tpa_grouped.head(5)

In [None]:
# # save to csv
# tpa_grouped.to_csv(output_file_dissolve, index=False)

In [122]:
tpa_grouped = pd.read_csv(output_file_dissolve)
tpa_grouped.head(5)

Unnamed: 0,iso_3,year,protected_area
0,AFG,2010,1078.918622
1,AFG,2011,1078.918622
2,AFG,2012,1078.918622
3,AFG,2013,1078.918622
4,AFG,2014,1078.918622


In [123]:
# Add pa and oecm counts to the coverage table
coverage = (
    pd.merge(tpa_grouped, cumulative_counts, on=['iso_3', 'year'], how='left')
    .pipe(calculate_global_area_tpa, ["year"])
    .pipe(add_region_iso2, "iso_3")
    .pipe(calculate_stats_cov_pa, ["year"], "iso_3")
    .pipe(calculate_pa_def_percentages)
    .pipe(add_total_terrestrial_area)
    .pipe(calculate_coverage_percentage_pa)
    .pipe(calculate_global_contribution)
    .pipe(add_is_last_year)
    .pipe(add_environment)
)

NewProtectedAreaExtentSchema(
    coverage.pipe(
        output2,
        "iso_3",
        {},
        {},
        ["iso_3", 'total_terrestrial_area'],
    )
).to_csv(
    output_file_tpas,
    index=True,
)

coverage

Unnamed: 0,year,iso_3,protected_area,protected_areas_count,oecms,pas,total_terrestrial_area,coverage,global_contribution,is_last_year,environment
0,2010,AF,3.636311e+06,7272.0,0.0,100.0,29993094.71,12.123827,2.694465,0,2
1,2010,AS,2.051386e+06,24782.0,0.0,100.0,31625555.58,6.486481,1.520053,0,2
2,2010,AT,1.108333e+02,2.0,0.0,100.0,12088229.65,0.000917,0.000082,0,2
3,2010,EU,4.306080e+06,116128.0,0.0,100.0,30037571.37,14.335645,3.190756,0,2
4,2010,,2.006295e+06,52176.0,0.0,100.0,19371151.92,10.357127,1.486642,0,2
...,...,...,...,...,...,...,...,...,...,...,...
3004,2024,YEM,5.145397e+03,15.0,0.0,100.0,453741.18,1.133994,0.003813,1,2
3005,2024,ZAF,1.143850e+05,1631.0,0.0,100.0,1221327.52,9.365631,0.084758,1,2
3006,2024,ZMB,2.929805e+05,557.0,0.0,100.0,753990.33,38.857330,0.217095,1,2
3007,2024,ZNC,2.779983e+00,8.0,0.0,100.0,3314.08,0.083884,0.000002,1,2


In [41]:
remote_path = 'vizzuality_processed_data/strapi_tables/tpa_coverage.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file_tpas,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


### Coverage stats - all

In [124]:
pipe = "pa"
pipe_tpa = "mpa-terrestrial"
pipe_mpa = "mpa"
step = "preprocess"

pipe_dir = FileConventionHandler(pipe)
pipe_dir_tpa = FileConventionHandler(pipe_tpa)
pipe_dir_mpa = FileConventionHandler(pipe_mpa)

input_path_tpas = pipe_dir_tpa.get_processed_step_path(current_step).joinpath("tpa_coverage.csv")
input_path_mpas = pipe_dir_mpa.get_processed_step_path(current_step).joinpath("mpa_coverage.csv")

output_file = pipe_dir.get_processed_step_path(current_step).joinpath("protection_coverage_stats.csv")

In [125]:
tpa = pd.read_csv(input_path_tpas)
mpa = pd.read_csv(input_path_mpas)

In [126]:
# concatenate the two dataframes
final_data = pd.concat([tpa, mpa], ignore_index=True)
final_data.index = range(1, len(final_data) + 1)
final_data['id'] = final_data.index
final_data[final_data['id'] == 1]

Unnamed: 0,id,year,protected_area,protected_areas_count,oecms,pas,coverage,global_contribution,is_last_year,environment,location
1,1,2010,3636311.0,7272,0.0,100.0,12.123827,2.694465,0,2,3


In [127]:
NewProtectedAreaExtentSchema(final_data).to_csv(output_file, index=True)

In [128]:
remote_path = 'vizzuality_processed_data/strapi_tables/protection_coverage_stats.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


### Mpa atlas - country stats Fully or highly protected

We are going to use the intermediate data from eez, in order to create a dataset that can be used as a land mask.
The steps are:
1. Load eez
2. Spatial inner Join the eez dataset with the Mpaatlas one
3. iso assign using the sovereign one provided by mpaatlas
4. dissolve by location
5. calculate the area for global regions and eez countries ussing mollwide projection
6. prepare the data to be ingested in strapi
7. upload the data to strapi

In [22]:
pipe = "mpaatlas"
strapi_collection = "mpaa-protection-level-stat"

pipe_dir_eez = FileConventionHandler("eez")
pipe_dir_mpaatlas = FileConventionHandler(pipe)
output_file = pipe_dir_mpaatlas.get_processed_step_path(current_step).joinpath(
    "mpaatlas_protection_level.csv"
)

# Download the EEZ file && unzip it
download_and_unzip_if_needed(pipe_dir_eez, prev_step, mysettings)
# Download the mpas file && unzip it
download_and_unzip_if_needed(pipe_dir_mpaatlas, prev_step, mysettings)

# Load the data
eez = gpd.read_file(pipe_dir_eez.get_step_fmt_file_path(prev_step, "shp")).pipe(clean_geometries)
mpaatlas_intermediate = gpd.read_file(
    pipe_dir_mpaatlas.get_step_fmt_file_path(prev_step, "shp")
).pipe(clean_geometries)

/home/sofia/dev/skytruth-30x30/data/data/eez/processed/eez_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/eez/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/mpaatlas_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/preprocess


In [None]:
eez_mpaatlas_data_join = await spatial_join(
    eez, mpaatlas_intermediate.pipe(mpaatlas_filter_stablishment)
)

In [None]:
# To get an idea of the spatial join results
# eez_mpaatlas_data_join.to_file(
#     pipe_dir_mpaatlas.get_processed_step_path(current_step).joinpath("mpaatlas_sjoin.shp"),
#     driver="ESRI Shapefile",
# )

In [75]:
eez_mpaatlas_data_join.dissolve(by=["protecti_1", "location_i"], aggfunc={"name": "count"}).reset_index().to_file(
pipe_dir_mpaatlas.get_processed_step_path(current_step).joinpath("mpaatlas_sjoin_dissolved.shp"),
driver="ESRI Shapefile",
)

INFO:pyogrio._io:Created 54 records


In [79]:
result = (
    eez_mpaatlas_data_join.rename(columns={"location_i": "iso_3"})
    .pipe(process_mpaatlas_data)  
    .pipe(calculate_global_area, gby_col=["protecti_1"], iso_column="iso_3")
    .pipe(separate_parent_iso)
    .replace(
        {
            "location_i": {
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
            }
        }
    )
    .pipe(add_region_iso, iso_column="iso_3")
    .pipe(calculate_stats, gby_col=["protecti_1"], iso_column="iso_3")
    .query('protecti_1 != "less protected or unknown"')
    .pipe(fix_monaco, iso_column="iso_3", area_column="area_km2")
    .pipe(add_total_marine_area)
    .pipe(calculate_coverage_percentage_mpatlas)
    .pipe(
        output,
        iso_column="iso_3",
        rep_d={
            "protecti_1": {
                "fully or highly protected": 1,
            }
        },
        rename={"protecti_1": "mpaa_protection_level", "area_km2": "area"},
        drop_cols=["total_marine_area", "iso_3"],
    )
)

NewProtectionLevelSchema(result[~result.location.isna()].assign(year=2024)).to_csv(
    output_file, index=True
)

In [82]:
remote_path = 'vizzuality_processed_data/strapi_tables/mpaatlas_protection_level.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [None]:
# strapi_collection = "mpaa-protection-level-stat"

In [None]:
# strapi.deleteCollectionData(strapi_collection, list(range(1, 300)))

<helpers.strapi.Strapi at 0x7fda8ddb8860>

In [None]:
# strapi.importCollectionData(
#     strapi_collection,
#     output_file,
# )

### Protected seas  - fishing protection level

In [158]:
pipe = "protectedseas"
strapi_collection = "fishing-protection-level-stat"

pipe_dir = FileConventionHandler(pipe)
input_file = pipe_dir.get_processed_step_path(prev_step).joinpath("protectedseas_stats.xlsx")
output_file = pipe_dir.get_processed_step_path(current_step).joinpath("lfp.csv")

# Download the protected seas file && unzip it
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name="vizzuality_processed_data/protectedseas/preprocess/protectedseas_stats.xlsx",
    file=input_file,
    operation="r",
)

# Load the data
protectedseas_intermediate = pd.read_excel(input_file)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [159]:
protectedseas_intermediate[
    (
        protectedseas_intermediate.iso_ter.isna()
        & protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(True)
    )
    | (
        protectedseas_intermediate.iso_ter.isna()
        & protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(False)
        & ~protectedseas_intermediate.iso_sov.isin(
            protectedseas_intermediate[
                protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(True)
            ].iso_sov.unique()
        )
    )
][protectedseas_intermediate.iso_sov.eq("ESP")]

Unnamed: 0,iso_ter,iso_sov,includes_multi_jurisdictional_areas,lfp,area_sqkm,total_area,pct_total
320,,ESP,True,5,142.97301,1011023.776,0.014141
321,,ESP,True,4,1639.682076,1011023.776,0.16218
322,,ESP,True,3,214532.8498,1011023.776,21.219367
323,,ESP,True,2,15064.13277,1011023.776,1.489988
324,,ESP,True,1,779644.1388,1011023.776,77.114323


In [160]:
final = (
    protectedseas_intermediate[
        (
            protectedseas_intermediate.iso_ter.isna()
            & protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(True)
        )
        | (
            protectedseas_intermediate.iso_ter.isna()
            & protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(False)
            & ~protectedseas_intermediate.iso_sov.isin(
                protectedseas_intermediate[
                    protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(True)
                ].iso_sov.unique()
            )
        )
    ].replace(
        {
            "lfp": {
                5: "highly",
                4: "highly",
                3: "moderately",
                2: "less",
                1: "less",
            },
        }
    ).groupby(["iso_sov", "lfp"]).agg({"area_sqkm": "sum", "total_area": "max"}).reset_index()
    .pipe(
        calculate_global_area,
        gby_col=["lfp"],
        iso_column="iso_sov",
        agg_ops={"area_sqkm": "sum", "total_area": "sum"},
    )
    .pipe(add_region_iso, iso_column="iso_sov")
    .pipe(
        calculate_stats,
        gby_col=["lfp"],
        ops={"area_sqkm": "sum", "total_area": "sum"},
        iso_column="iso_sov",
    )
    .pipe(lambda x: x.assign(pct=round((x.area_sqkm / x.total_area)*100, 2)))
    .pipe(
        output,
        iso_column="iso_sov",
        rep_d={
            "lfp": {
                "highly": 1,
                "moderately": 2,
                "less": 3,
            }
        },
        rename={"lfp": "fishing_protection_level", "area_sqkm": "area"},
        drop_cols=["iso_sov", "total_area"],
    )
)
FPLSchema(final[final.location.notna()]).to_csv(output_file, index=True)

In [161]:
remote_path = 'vizzuality_processed_data/strapi_tables/lfp.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [None]:
# strapi.deleteCollectionData(strapi_collection, list(range(1, 500)))

In [None]:
# strapi.importCollectionData(
#     strapi_collection,
#     output_file,
# )

  1- lower case the columns   
2- separate location that its regime is in dispute or on join regime  
3- calcualte area for mpaatlas data  
4- rename columns for merge  
5- merge maaatlas and mpa data identifying the source  
6- identify child resources and set them as childs  
7- calculate bbox  
8- set child resources  
9- prepare output for batch export  
10- upload data to strapi  

### Country mpas detail table data

  1- lower case the columns   
2- separate location that its regime is in dispute or on join regime  
3- calcualte area for mpaatlas data  
4- rename columns for merge  
5- merge maaatlas and mpa data identifying the source  
6- identify child resources and set them as childs  
7- calculate bbox  
8- set child resources  
9- prepare output for batch export  
10- upload data to strapi  

In [7]:
pipe = "mpa"
strapi_collection_mpas = "mpa"

pipe_dir = FileConventionHandler(pipe)
pipe_dir_mpaatlas = FileConventionHandler("mpaatlas")
output_file_mpas = pipe_dir.get_processed_step_path(current_step).joinpath("mpa_detail.csv")

# Download the protected atlas file && unzip it
download_and_unzip_if_needed(pipe_dir, prev_step, mysettings)
# Download the mpaatlas file 
download_and_unzip_if_needed(pipe_dir_mpaatlas, prev_step, mysettings)

/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/mpa_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/mpaatlas_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/preprocess


PosixPath('/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/preprocess')

In [8]:
# Load the data
mpa_intermediate = gpd.read_file(pipe_dir.get_step_fmt_file_path(prev_step, "shp")).pipe(
    clean_geometries
)
mpaatlas_intermediate = gpd.read_file(
    pipe_dir_mpaatlas.get_step_fmt_file_path(prev_step, "shp")
).pipe(clean_geometries)

In [9]:
init_table = (
    pd.concat(
        [
            (
                mpa_intermediate.pipe(columns_to_lower)
                .pipe(separate_parent_iso, iso_column="parent_iso")
                .pipe(change_ata_to_abnj)
                .rename(
                    columns={
                        "parent_iso": "iso",
                        "status_yr": "year",
                        "gis_m_area": "area_km2",
                    }
                ).drop(columns=['status'])
            ).assign(source="protected_planet"),
            (
                mpaatlas_intermediate.pipe(calculate_area)
                .pipe(extract_wdpaid_mpaatlas)
                .pipe(separate_parent_iso, iso_column="location_i")
                .rename(
                    columns={
                        "location_i": "iso",
                        "wdpa_id": "wdpa_pid",
                        "designatio": "desig_eng",
                    }
                )
            ).assign(source="mpaatlas")
            .assign(pa_def=1)
            .astype({"mpa_zone_i": "Int64"}),
        ],
        ignore_index=True,
    )
    .reset_index(drop=True)
    .replace(
        {
            "iso": {
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
            }
        }
    )
    .sort_values(by=["wdpa_pid", "wdpa_pid", "source"], ascending=[True, True, False])
)

In [10]:
#  to be run if things change a lot in the future
# iucn_cat = pd.DataFrame(
#     {"slug": init_table.iucn_cat.dropna().unique(), "name": init_table.iucn_cat.dropna().unique()},
#     index=pd.Index(np.arange(1, len(init_table.iucn_cat.dropna().unique()) + 1)),
# )
# iucn_cat.to_csv(pipe_dir.get_processed_step_path(current_step).joinpath("iucn_categories.csv"), index=True)

iucn_cat = pd.read_csv(
    pipe_dir.get_processed_step_path(current_step).joinpath("iucn_categories.csv"), index_col=0
)

In [11]:
mpa_table = (
    init_table.pipe(add_bbox, "bbox")
    .pipe(define_is_child)
    .pipe(set_child_id)
    .sort_values(by=["wdpaid", "is_child"], ascending=[True, True])
    .reset_index(drop=True)
    .pipe(add_total_marine_area)
    .rename(columns={"area_km2": "protected_area"})
    .pipe(calculate_coverage_percentage_pa)
    .pipe(add_environment)
    .pipe(
        output,
        iso_column="iso",
        rep_d={
            "status": {
                "Adopted": 4,
                "implemented": 6,
                "Established": 6,
                "Designated": 5,
                "Proposed": 3,
                "Inscribed": 3,
                "unknown": 1,
            },
            "pa_def": {"0": 2, "1": 1},
            "year": {0: pd.NA},
            "iucn_cat": dict(
                iucn_cat[["slug"]]
                .reset_index(drop=False)
                .iloc[:, [1, 0]]
                .to_dict(orient="tight")["data"]
            ),
            "source": {"protected_planet": 3, "mpaatlas": 1},
            "protection": {
                "full": 3,
                "light": 4,
                "incompatible": 5,
                "high": 6,
                "minimal": 7,
                "unknown": 8,
                "unknown/to be determined": 8,
            },
            "establishm": {
                "actively managed": 4,
                "implemented": 6,
                "designated": 5,
                "Designated": 5,
                "proposed or committed": 3,
                "Proposed": 3,
                "Inscribed": 3,
                "Established": 5,
                "Adopted": 5,
                "unknown": 1,
            },
        },
        rename={
            "pa_def": "protection_status",
            "protected_area": "area",
            "iucn_cat": "iucn_category",
            "desig_eng": "designation",
            "protection": "mpaa_protection_level",
            "establishm": "mpaa_establishment_stage",
            "source": "data_source",
        },
        drop_cols=["geometry", "protecti_1","mpa_zone_i", "iso", "total_marine_area"]
    )
    .astype(
        {
            "year": "Int32",
            "iucn_category": "Int64",
            "protection_status": "Int64",
        }
    )
    .query("coverage <= 100") 
    .sort_index()
)

  return df.assign(child_id=df[columns].bfill(axis=1)[columns[0]])
  df.replace(rep_d)


In [14]:
mpa_table[mpa_table["wdpaid"] == 170]

Unnamed: 0_level_0,wdpaid,wdpa_pid,protection_status,name,designation,iucn_category,year,area,data_source,mpaa_establishment_stage,mpaa_protection_level,bbox,is_child,child_id,coverage,environment,location
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
18,170,170.0,1,Isla del Coco,National Park,2.0,2022,54819.042632,3,,,"[-88.987016503, 4.529014728999982, -86.3670124...",False,170.0,9.150798,1,39.0
19,170,170.0,1,Isla del Coco - Zona Minima Intervencion,National Park,,1978,1950.5,1,4.0,3.0,"[-87.29513967897267, 5.298053442111269, -86.82...",True,170.0,0.325592,1,39.0
20,170,170.0,1,Isla del Coco - Zona Media Intervencion,National Park,,1978,0.93,1,4.0,3.0,"[-87.1038528170242, 5.492165352309547, -87.030...",True,170.0,0.000155,1,39.0
21,170,170.0,1,Isla del Coco - Zona Baja Intervencion,National Park,,1978,70.72,1,4.0,3.0,"[-87.11119966572133, 5.482019746658279, -86.95...",True,170.0,0.011805,1,39.0
22,170,170.0,1,Isla del Coco - 2022 Expansion,National Park,,2021,55081.21,1,5.0,8.0,"[-88.987, 4.529, -86.367, 6.237]",True,170.0,9.194561,1,39.0


In [None]:
# # Validate and save
# PAsSchema(mpa_table[mpa_table.location.notna()]).to_csv(output_file_mpas, index=True)

### Country pas - detail table data

1- lower case the columns   
2- separate location that its regime is in dispute or on join regime 
3- remove ATA and ABNJ because Protected planet doesn't include stats for ATA and ABNJ is marine 
4- rename columns for merge   
5- identify child resources and set them as childs  
6- calculate bbox  
7- set child resources  
8- prepare output for batch export  
9- upload data to strapi  

In [12]:
pipe = "mpa-terrestrial"
strapi_collection_mpas = "mpa-terrestrial"

pipe_dir = FileConventionHandler(pipe)
pipe_dir_gadm = FileConventionHandler("gadm")
output_file_tpas = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_detail.csv")

# # Download the protected atlas file && unzip it
# download_and_unzip_if_needed(pipe_dir, prev_step, mysettings)
# # Download the gadm file 
# download_and_unzip_if_needed(pipe_dir_gadm, prev_step, mysettings)

In [13]:
tpa_intermediate = gpd.read_file(pipe_dir.get_step_fmt_file_path(prev_step, "gpkg"))

In [14]:
iucn_cat = pd.read_csv(
    pipe_dir.get_processed_step_path(current_step).joinpath("iucn_categories.csv"), index_col=0
)

In [15]:
init_table = (
    pd.concat(
        [
            (
                tpa_intermediate.pipe(columns_to_lower)
                .pipe(separate_parent_iso, iso_column="parent_iso")
                .query("parent_iso != 'ATA' and parent_iso != 'ABNJ'")
                .rename(
                    columns={
                        "parent_iso": "iso",
                        "status_yr": "year",
                        "gis_area": "protected_area",
                    }
                ).drop(columns=['status'])
            ).assign(source="protected_planet"),
        ],
        ignore_index=True,
    )
    .reset_index(drop=True)
    .replace(
        {
            "iso": {
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
            }
        }
    )
)

In [16]:
tpa_table = (
    init_table.pipe(add_bbox, "bbox")
    .pipe(define_is_child)
    .pipe(set_child_id_pa)
    .sort_values(by=["wdpaid", "is_child"], ascending=[True, True])
    .reset_index(drop=True)
    .pipe(add_total_terrestrial_area)
    .pipe(calculate_coverage_percentage_pa)
    .pipe(add_environment)
    .pipe(
        output2,
        iso_column="iso",
        rep_d={
            "pa_def": {"0": 2, "1": 1},
            "year": {0: pd.NA},
            "iucn_cat": dict(
                iucn_cat[["slug"]]
                .reset_index(drop=False)
                .iloc[:, [1, 0]]
                .to_dict(orient="tight")["data"]
            ),
            "source": {"protected_planet": 3},
        },
        rename={
            "pa_def": "protection_status",
            "protected_area": "area",
            "iucn_cat": "iucn_category",
            "desig_eng": "designation",
            "source": "data_source",
        },
        drop_cols=["geometry", "iso", "marine", "total_terrestrial_area"]
    )
    .astype(
        {
            "year": "Int32",
            "iucn_category": "Int64",
            "protection_status": "Int64",
        }
    )
    .query("coverage <= 100") 
    .sort_index()
)

  df.replace(rep_d)


In [17]:
# Add col mpaa_protection_level and mpa_establishment_stage to the table to validate it
tpa_table['mpaa_protection_level'] = np.nan
tpa_table['mpaa_establishment_stage'] = np.nan

In [17]:
# # Validate and save
# PAsSchema(tpa_table[tpa_table.location.notna()]).to_csv(output_file_tpas, index=True)

### Country marine and terrestrial - Detail table

In [18]:
pipe_mar = "mpa"
pipe_ter = "mpa-terrestrial"
pipe_pa = "pa"
step = "preprocess"
strapi_collection_pas = "pa"


pipe_dir_mar = FileConventionHandler(pipe_mar)
pipe_dir_ter = FileConventionHandler(pipe_ter)
pipe_dir_pa = FileConventionHandler(pipe_pa)

input_path_mar = pipe_dir_mar.get_processed_step_path(current_step).joinpath("mpa_detail.csv")
input_path_ter = pipe_dir_ter.get_processed_step_path(current_step).joinpath("tpa_detail.csv")
output_file_pa = pipe_dir_pa.get_processed_step_path(current_step).joinpath("pa_detail.csv")

In [19]:
final_table = pd.concat([mpa_table, tpa_table], ignore_index=True)
final_table.index = final_table.index + 1
final_table.index.name = 'id'
final_table = final_table.pipe(add_child_parent_relationship).drop(columns=['wdpa_pid', 'is_child', 'child_id']).sort_values(by=['parent'])

In [20]:
len(final_table)

306123

Note! When uploading the tables the schema doesn't work. I need to run the code to generate them and then it works.

In [83]:
# # Create final table with all the data
# mpa_table2 = pd.read_csv(input_path_mar)
# tpa_table2 = pd.read_csv(input_path_ter)
# final_table = pd.concat([mpa_table2, tpa_table2])
# final_table.index = range(1, len(final_table) + 1)
# final_table.index.name = 'id'
# final_table.drop(columns=['id'], inplace=True)
# final_table.head(2)

In [61]:
PAsSchema(final_table[final_table.location.notna()]).to_csv(output_file_pa, index=True)

In [23]:
# Divide table into two tables
final_table1 = final_table.drop(columns=['parent'])
final_table2 = final_table[['parent']]

In [45]:
# batch_export(
#     final_table1[final_table1.area.notna()],
#     4000,
#     PAsSchemaChunk1,
#     pipe_dir_pa.get_processed_step_path(current_step).joinpath("chunks1"),
#     "pa_detail",
#     format="json",
#     strapi_colection=strapi_collection_pas,
# )

batch_export(
    final_table2,
    10000,
    PAsSchemaChunk2,
    pipe_dir_pa.get_processed_step_path(current_step).joinpath("chunks2"),
    "pa_detail",
    format="json",
    strapi_colection=strapi_collection_pas,
)

In [40]:
# zip data
make_archive(pipe_dir_pa.get_processed_step_path(current_step).joinpath("chunks1"), pipe_dir_pa.get_processed_step_path(current_step).joinpath("chunks1.zip"))
make_archive(pipe_dir_pa.get_processed_step_path(current_step).joinpath("chunks2"), pipe_dir_pa.get_processed_step_path(current_step).joinpath("chunks2.zip"))

In [42]:
# LOAD
## load zipped file to GCS
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name='vizzuality_processed_data/strapi_tables/pa_chunks1.zip',
    file=pipe_dir_pa.get_processed_step_path(current_step).joinpath("chunks1.zip"),
    operation="w",
)

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name='vizzuality_processed_data/strapi_tables/pa_chunks2.zip',
    file=pipe_dir_pa.get_processed_step_path(current_step).joinpath("chunks2.zip"),
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token
DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [None]:
# strapi.deleteCollectionData("pa", list(range(1, 20914)))

In [None]:
# for i in range(0, 4):
#     strapi.importCollectionData(
#         strapi_collection_mpas,
#         mpa_folder.joinpath(f"mpa_detail_{i}.csv"),
#     )

In [62]:
# Retrive the ids left out in the batch process
left_out_ids = range(4000, 306124, 4000)
left_out_rows = final_table.loc[left_out_ids]
len(left_out_rows)

76

In [64]:
# Import all cols but parent
left_out_rows1 = left_out_rows.drop(columns=['parent'])

left_out_rows1 = left_out_rows1.reset_index()
left_out_rows1.index = left_out_rows1['id']

output_file = pipe_dir_pa.get_processed_step_path(current_step).joinpath("left_out_rows1.json")
left_out_rows1.to_json(output_file, orient="index")


In [65]:
# Import all cols but parent
left_out_rows2 = left_out_rows[['parent']]

left_out_rows2 = left_out_rows2.reset_index()
left_out_rows2.index = left_out_rows2['id']

output_file = pipe_dir_pa.get_processed_step_path(current_step).joinpath("left_out_rows2.json")
left_out_rows2.to_json(output_file, orient="index")

In [67]:
# zip data
make_archive(pipe_dir_pa.get_processed_step_path(current_step).joinpath("left_out_rows1.json"), pipe_dir_pa.get_processed_step_path(current_step).joinpath("left_out_rows1.zip"))
make_archive(pipe_dir_pa.get_processed_step_path(current_step).joinpath("left_out_rows2.json"), pipe_dir_pa.get_processed_step_path(current_step).joinpath("left_out_rows2.zip"))

In [68]:
## load zipped file to GCS
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name='vizzuality_processed_data/strapi_tables/pa_left_out_rows1.zip',
    file=pipe_dir_pa.get_processed_step_path(current_step).joinpath("left_out_rows1.zip"),
    operation="w",
)

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name='vizzuality_processed_data/strapi_tables/pa_left_out_rows2.zip',
    file=pipe_dir_pa.get_processed_step_path(current_step).joinpath("left_out_rows2.zip"),
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token
DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


### Habitats

In [4]:
pipe = "terrestrial-habitats"
collection_name = "terrestrial_habitats"

pipe_dir = FileConventionHandler(pipe)
input_file_ter = pipe_dir.get_processed_step_path(prev_step).joinpath("master_data_protection.csv")
input_file_mar = pipe_dir.get_processed_step_path(prev_step).joinpath("habitats6.csv")
output_file = pipe_dir.get_processed_step_path(current_step).joinpath("habitats_all.csv")

# Download the terrestrial habitats table from the bucket
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name="vizzuality_processed_data/habitats/preprocess/master_data_protection.csv",
    file=input_file_ter,
    operation="r",
)

# Download the marine habitats table from the bucket
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name="vizzuality_processed_data/processed_statistic_tables/habitats6.csv",
    file=input_file_mar,
    operation="r",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token
DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [5]:
habitat_mar = pd.read_csv(input_file_mar, na_values=['', 'NaN', 'NULL'])
habitat_mar['environment'] = 1
habitat_mar['location_id'] = habitat_mar['location_id'].fillna('NA')

In [26]:
habitat_ter = pd.read_csv(input_file_ter).drop(columns=['frac', 'perc_extent', 'total_area']).rename(columns ={'habitats':'habitat_name','total': 'pixel_habitat', 'protected': 'pixel_protected'})
habitat_ter

Unnamed: 0,iso_3,habitat_name,pixel_habitat,pixel_protected
0,GLOB,Artificial,28259249.0,2776193.0
1,GLOB,Desert,111106481.0,7778007.0
2,GLOB,Forest,49125087.0,10353320.0
3,GLOB,Grassland,35459546.0,5491398.0
4,GLOB,Other,864004.0,291628.0
...,...,...,...,...
1728,ZWE,Other,0.0,0.0
1729,ZWE,Rocky/mountains,0.0,0.0
1730,ZWE,Savanna,231134.0,97790.0
1731,ZWE,Shrubland,52656.0,4262.0


In [27]:
# Remove all rows where "habitat_name" is "Other"
habitat_ter = habitat_ter[habitat_ter['habitat_name'] != 'Other'].copy()

# calculate total_area by adding up "total" per iso_3
habitat_ter['total_pixels'] = habitat_ter.groupby('iso_3')['pixel_habitat'].transform('sum')
habitat_ter

Unnamed: 0,iso_3,habitat_name,pixel_habitat,pixel_protected,total_pixels
0,GLOB,Artificial,28259249.0,2776193.0,267352678.0
1,GLOB,Desert,111106481.0,7778007.0,267352678.0
2,GLOB,Forest,49125087.0,10353320.0,267352678.0
3,GLOB,Grassland,35459546.0,5491398.0,267352678.0
5,GLOB,Rocky/mountains,3571486.0,478102.0,267352678.0
...,...,...,...,...,...
1727,ZWE,Grassland,3417.0,263.0,418294.0
1729,ZWE,Rocky/mountains,0.0,0.0,418294.0
1730,ZWE,Savanna,231134.0,97790.0,418294.0
1731,ZWE,Shrubland,52656.0,4262.0,418294.0


In [28]:
# Assign territories to their soveraign countries
with open(scripts_dir.joinpath('data_commons/data/dependency_to_parent.json'), 'r') as json_file:
    dependency_to_parent = json.load(json_file)

mapping = {key: value[0] for key, value in dependency_to_parent.items()}

habitat_ter['iso_3'] = habitat_ter['iso_3'].map(mapping).fillna(habitat_ter['iso_3'])

In [29]:
# groupby country and habitats and sum the pixeles
habitat_ter_grouped = habitat_ter.groupby(['iso_3', 'habitat_name']).sum().reset_index()
habitat_ter_grouped

Unnamed: 0,iso_3,habitat_name,pixel_habitat,pixel_protected,total_pixels
0,AFG,Artificial,56625.0,938.0,782480.0
1,AFG,Desert,274553.0,3639.0,782480.0
2,AFG,Forest,3928.0,219.0,782480.0
3,AFG,Grassland,292319.0,9275.0,782480.0
4,AFG,Rocky/mountains,137716.0,16183.0,782480.0
...,...,...,...,...,...
1505,ZWE,Grassland,3417.0,263.0,418294.0
1506,ZWE,Rocky/mountains,0.0,0.0,418294.0
1507,ZWE,Savanna,231134.0,97790.0,418294.0
1508,ZWE,Shrubland,52656.0,4262.0,418294.0


In [30]:
# Calculate the percentage of protected pixels and the percentage of extent of the habitat
habitat_ter_grouped['protected_perc'] = habitat_ter_grouped['pixel_protected']/habitat_ter_grouped['pixel_habitat']*100
habitat_ter_grouped['habitat_perc'] = habitat_ter_grouped['pixel_habitat']/habitat_ter_grouped['total_pixels']*100
habitat_ter_grouped

Unnamed: 0,iso_3,habitat_name,pixel_habitat,pixel_protected,total_pixels,protected_perc,habitat_perc
0,AFG,Artificial,56625.0,938.0,782480.0,1.656512,7.236607
1,AFG,Desert,274553.0,3639.0,782480.0,1.325427,35.087542
2,AFG,Forest,3928.0,219.0,782480.0,5.575356,0.501994
3,AFG,Grassland,292319.0,9275.0,782480.0,3.172904,37.358016
4,AFG,Rocky/mountains,137716.0,16183.0,782480.0,11.750995,17.599939
...,...,...,...,...,...,...,...
1505,ZWE,Grassland,3417.0,263.0,418294.0,7.696810,0.816890
1506,ZWE,Rocky/mountains,0.0,0.0,418294.0,,0.000000
1507,ZWE,Savanna,231134.0,97790.0,418294.0,42.308791,55.256351
1508,ZWE,Shrubland,52656.0,4262.0,418294.0,8.094044,12.588275


In [31]:
# Add country's terrestrial area
add_total_terrestrial_area(habitat_ter_grouped)

Unnamed: 0,iso_3,habitat_name,pixel_habitat,pixel_protected,total_pixels,protected_perc,habitat_perc,total_terrestrial_area
0,AFG,Artificial,56625.0,938.0,782480.0,1.656512,7.236607,644050.28
1,AFG,Desert,274553.0,3639.0,782480.0,1.325427,35.087542,644050.28
2,AFG,Forest,3928.0,219.0,782480.0,5.575356,0.501994,644050.28
3,AFG,Grassland,292319.0,9275.0,782480.0,3.172904,37.358016,644050.28
4,AFG,Rocky/mountains,137716.0,16183.0,782480.0,11.750995,17.599939,644050.28
...,...,...,...,...,...,...,...,...
1505,ZWE,Grassland,3417.0,263.0,418294.0,7.696810,0.816890,391234.88
1506,ZWE,Rocky/mountains,0.0,0.0,418294.0,,0.000000,391234.88
1507,ZWE,Savanna,231134.0,97790.0,418294.0,42.308791,55.256351,391234.88
1508,ZWE,Shrubland,52656.0,4262.0,418294.0,8.094044,12.588275,391234.88


In [32]:
# Estimate the total area and the protected area based on pixels proportions and the total terrestrial area
habitat_ter_grouped['total_habitat_area'] = habitat_ter_grouped['total_terrestrial_area']*habitat_ter_grouped['habitat_perc']/100
habitat_ter_grouped['protected_habitat_area'] = habitat_ter_grouped['total_terrestrial_area']*habitat_ter_grouped['protected_perc']/100

In [33]:
habitat_ter_grouped[habitat_ter_grouped['iso_3'] == 'AUT']

Unnamed: 0,iso_3,habitat_name,pixel_habitat,pixel_protected,total_pixels,protected_perc,habitat_perc,total_terrestrial_area,total_habitat_area,protected_habitat_area
71,AUT,Artificial,56023.0,17428.0,126396.0,31.108652,44.323396,83709.48,37102.884569,26040.89066
72,AUT,Desert,799.0,703.0,126396.0,87.984981,0.63214,83709.48,529.161323,73651.770263
73,AUT,Forest,39594.0,17631.0,126396.0,44.529474,31.325358,83709.48,26222.294623,37275.391268
74,AUT,Grassland,16498.0,9748.0,126396.0,59.08595,13.052628,83709.48,10926.287233,49460.541341
75,AUT,Rocky/mountains,1534.0,1090.0,126396.0,71.056063,1.213646,83709.48,1015.936757,59480.660495
76,AUT,Shrubland,10911.0,4511.0,126396.0,41.343598,8.632393,83709.48,7226.131652,34608.51107
77,AUT,Wetlands/open water,1037.0,870.0,126396.0,83.895853,0.820437,83709.48,686.783844,70228.782642


In [95]:
# Add regions
habitat_ter_grouped = add_region_iso2(habitat_ter_grouped, 'iso_3')

regions = habitat_ter_grouped.groupby(['region', 'habitat_name']).agg({
    'total_area': 'sum',
    'protected_area': 'sum'
}).reset_index()

regions.rename(columns={'region': 'location_id'}, inplace=True)
habitat_ter_grouped.drop(columns=['pixel_habitat', 'pixel_protected', 'total_pixel_area', 'protect_perc', 'extent_perc', 'total_terrestrial_area', 'region'], inplace=True)
habitat_ter_grouped = habitat_ter_grouped.rename(columns = {'iso_3':'location_id'})

In [96]:
# Concatenate regions and habitat_ter_grouped dataframes
habitats_terrestrial = pd.concat([regions, habitat_ter_grouped], ignore_index=True)

# fill protected_area and total_area with 0 if they are NaN
habitats_terrestrial['protected_area'] = habitats_terrestrial['protected_area'].fillna(0)
habitats_terrestrial['total_area'] = habitats_terrestrial['total_area'].fillna(0)

In [97]:
# Add year and environment columns
habitats_terrestrial['year'] = 2024
habitats_terrestrial['environment'] = 2

In [119]:
# Concatenate terrestrial and marine habitats
habitats_all = pd.concat([habitats_terrestrial, habitat_mar], ignore_index=True).rename(columns={'habitat_name': 'habitat'})
habitats_all['habitat'] = habitats_all['habitat'].str.lower()
habitats_all.head(10)

Unnamed: 0,location_id,habitat,total_area,protected_area,year,environment
0,AF,artificial,2924001.0,233152.7,2024,2
1,AF,desert,9872960.0,672698.4,2024,2
2,AF,forest,4458009.0,971304.0,2024,2
3,AF,grassland,2035644.0,228349.1,2024,2
4,AF,rocky/mountains,238469.1,47745.87,2024,2
5,AF,savanna,8387535.0,1910888.0,2024,2
6,AF,shrubland,1766346.0,203718.7,2024,2
7,AF,wetlands/open water,310130.4,59319.74,2024,2
8,AS,artificial,8041755.0,269935.0,2024,2
9,AS,desert,3538487.0,315082.3,2024,2


In [120]:
# change habitat to have the id of the habitat
habitat_dict = {
    'mangroves': 5,
    'seamounts': 6,
    'artificial': 43,
    'forest': 45,
    'grassland': 46,
    'wetlands/open water': 50,
    'seagrasses': 2,
    'cold-water corals': 4,
    'desert': 44,
    'rocky/mountains': 47,
    'savanna': 48,
    'shrubland': 49,
    'saltmarshes': 1,
    'warm-water corals': 3
}

habitats_all['habitat'] = habitats_all['habitat'].replace(habitat_dict)
habitats_all.head(10)

Unnamed: 0,location_id,habitat,total_area,protected_area,year,environment
0,AF,43,2924001.0,233152.7,2024,2
1,AF,44,9872960.0,672698.4,2024,2
2,AF,45,4458009.0,971304.0,2024,2
3,AF,46,2035644.0,228349.1,2024,2
4,AF,47,238469.1,47745.87,2024,2
5,AF,48,8387535.0,1910888.0,2024,2
6,AF,49,1766346.0,203718.7,2024,2
7,AF,50,310130.4,59319.74,2024,2
8,AS,43,8041755.0,269935.0,2024,2
9,AS,44,3538487.0,315082.3,2024,2


In [121]:
output2(habitats_all, 'location_id', {}, {}, ['location_id']).to_csv(output_file, index=True)

In [122]:
a = pd.read_csv(output_file)
a

Unnamed: 0,id,habitat,total_area,protected_area,year,environment,location
0,1,43,2.924001e+06,233152.675055,2024,2,3.0
1,2,44,9.872960e+06,672698.366583,2024,2,3.0
2,3,45,4.458009e+06,971303.987441,2024,2,3.0
3,4,46,2.035644e+06,228349.125359,2024,2,3.0
4,5,47,2.384691e+05,47745.870360,2024,2,3.0
...,...,...,...,...,...,...,...
2168,2169,5,7.429267e+04,21277.220000,2020,1,4.0
2169,2170,5,1.246190e+03,732.143750,2020,1,6.0
2170,2171,5,2.415419e+03,2097.740000,2020,1,7.0
2171,2172,5,3.989344e+04,27151.740000,2020,1,8.0


In [123]:
# Upload csv to bucket
remote_path = 'vizzuality_processed_data/strapi_tables/habitats.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token
