In [42]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
import logging
import sys
from pathlib import Path
import time
import pandas as pd
import geopandas as gpd
import numpy as np
import json
import dotenv

dotenv.load_dotenv()

# Make the project's `src` folder importable.
# sys.path holds plain strings, so the membership test has to use the resolved
# POSIX string: a Path object never compares equal to a str, which made the old
# guard always true and inserted a duplicate entry on every re-run of the cell.
scripts_dir = Path(".").joinpath("src")
scripts_dir_posix = scripts_dir.resolve().as_posix()
if scripts_dir_posix not in sys.path:
    sys.path.insert(0, scripts_dir_posix)

from helpers.strapi import Strapi
from helpers.settings import get_settings, Settings
from helpers.file_handler import FileConventionHandler
from helpers.utils import download_and_unzip_if_needed, writeReadGCP

from pipelines.output_schemas import (
    FPLSchema,
    ProtectionLevelSchema,
    MPAsSchema,
    HabitatsSchema,
    LocationSchema,
    ProtectedAreaExtentSchema,
)
from pipelines.processors import (
    add_envelope,
    add_location_iso,
    expand_multiple_locations,
    add_region_iso,
    calculate_eez_area,
    add_bbox,
    add_groups_and_members,
    add_location_name,
    output,
    clean_geometries,
    filter_by_exluding_propossed_mpas,
    spatial_join,
    process_mpa_data,
    assign_iso3,
    calculate_global_area,
    separate_parent_iso,
    calculate_stats_cov,
    coverage_stats,
    mpaatlas_filter_stablishment,
    process_mpaatlas_data,
    calculate_stats,
    fix_monaco,
    batch_export,
    calculate_area,
    define_is_child,
    set_child_id,
    add_child_parent_relationship,
    columns_to_lower,
    extract_wdpaid_mpaatlas,
)

# Verbose (DEBUG) logging for the notebook; chatty third-party loggers are capped at WARNING.
logging.basicConfig(level=logging.DEBUG)
for _noisy_logger in ("requests", "urllib3", "fiona"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

In [2]:
# Project settings; the GCS credentials and bucket name used for uploads are read from here.
mysettings = get_settings()
# Pipeline step names: inputs are read from `prev_step`, outputs are written under `current_step`.
prev_step = "preprocess"
current_step = "stats"

In [3]:
# File-convention handlers for the two input pipelines
# (EEZ feeds the marine columns, GADM feeds the terrestrial columns).
pipe_eez = "eez"
pipe_eez_dir = FileConventionHandler(pipe_eez)
pipe_gadm = "gadm"
pipe_gadm_dir = FileConventionHandler(pipe_gadm)

# Destination of the combined locations table written by this notebook (stored in the gadm folder).
output_file = pipe_gadm_dir.get_processed_step_path(current_step).joinpath("locations_all.json")

# Download the preprocessed EEZ file and unzip it (if needed)
download_and_unzip_if_needed(pipe_eez_dir, prev_step, mysettings)

# Download the preprocessed GADM file and unzip it (if needed);
# being the last expression, its return value is what the cell displays.
download_and_unzip_if_needed(pipe_gadm_dir, prev_step, mysettings)

/home/sofia/dev/skytruth-30x30/data/data/eez/processed/eez_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/eez/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/gadm_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/preprocess


PosixPath('/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/preprocess')

In [4]:
# Local paths of the auxiliary CSVs: country-name translations and national commitments.
# NOTE(review): the GCS download calls below are commented out, so both files must
# already exist under `input_path` for the later cells to run on a fresh checkout.
working_folder = FileConventionHandler(pipe_gadm)
input_path = working_folder.pipe_raw_path
input_path

# Blob name of the translations file in the bucket; locally it keeps the same file name.
translations_csv_url = "vizzuality_processed_data/gadm/preprocess/locations_translated.csv"
translations_csv_output = input_path.joinpath(translations_csv_url.split("/")[-1])

# writeReadGCP(
#     credentials=mysettings.GCS_KEYFILE_JSON,
#     bucket_name=mysettings.GCS_BUCKET,
#     blob_name=translations_csv_url,
#     file=translations_csv_output,
#     operation="r",
# )

# Blob name of the national commitments file (it has no folder prefix, so the split is a no-op).
country_commitments_url = "30x30 National Commitments - MPAtlas Country Targets.csv"
country_commitments_output = input_path.joinpath(country_commitments_url.split("/")[-1])

# writeReadGCP(
#     credentials=mysettings.GCS_KEYFILE_JSON,
#     bucket_name=mysettings.GCS_BUCKET,
#     blob_name=country_commitments_url,
#     file=country_commitments_output,
#     operation="r",
# )

#### Functions

In [22]:
from typing import List, Union
import pandera as pa
from pandera.typing import Index, Series
import pandas as pd

class LocationSchemaAll(pa.DataFrameModel):
    """Pandera model for the combined (marine + terrestrial) locations table.

    The notebook validates the final table with this model right before it is
    serialised to JSON. Every field is declared with ``coerce=True``.
    """

    # Index of the table: strictly positive integers.
    id: Index[int] = pa.Field(gt=0, coerce=True)
    code: Series[str] = pa.Field(coerce=True)
    # Location name plus its Spanish and French translations.
    name: Series[str] = pa.Field(coerce=True)
    name_es: Series[str] = pa.Field(coerce=True)
    name_fr: Series[str] = pa.Field(coerce=True)
    # Areas are non-negative integers (missing areas are filled with 0 upstream in the notebook).
    total_marine_area: Series[int] = pa.Field(ge=0, coerce=True)  # noqa: N815
    total_terrestrial_area: Series[int] = pa.Field(ge=0, coerce=True)  # noqa: N815
    # The set of distinct values in the column must be exactly these four.
    type: Series[str] = pa.Field(
        unique_values_eq=["country", "worldwide", "region", "highseas"], coerce=True
    )
    # Ids of the groups (regions) a location belongs to; an empty list when it has none.
    groups: Series[List[int]] = pa.Field(coerce=True)
    # Bounding boxes as lists of floats; nullable because a location may lack a marine or land part.
    marine_bounds: Series[List[float]] = pa.Field(coerce=True, nullable=True)
    terrestrial_bounds: Series[List[float]] = pa.Field(coerce=True, nullable=True)
    # National commitments as nullable integers; most locations have none.
    marine_target: Series[pd.Int64Dtype] = pa.Field(nullable=True, coerce=True)
    marine_target_year: Series[pd.Int64Dtype] = pa.Field(coerce=True, nullable=True)

def round_to_list(bounds):
    """Round every value of ``bounds`` to 5 decimal places and return them as a list."""
    rounded = np.round(bounds, decimals=5)
    return [value for value in rounded]

def add_bbox(df: gpd.GeoDataFrame, col_name: str = "bounds") -> gpd.GeoDataFrame:
    """Return ``df`` with an extra column holding each geometry's rounded bounds.

    Note: this definition shadows the ``add_bbox`` imported from
    ``pipelines.processors`` at the top of the notebook.

    Parameters:
    df (gpd.GeoDataFrame): Frame whose ``geometry`` bounds are computed row by row.
    col_name (str): Name of the column that receives the bounds list.

    Returns:
    gpd.GeoDataFrame: A new frame with ``col_name`` added (or replaced).
    """
    bounds_per_row = df.geometry.bounds.apply(round_to_list, axis=1)
    return df.assign(**{col_name: bounds_per_row})

def add_translations(df, translations_csv_path):
    """Left-join the Spanish and French names onto ``df``.

    The CSV is read with default NA parsing disabled, so codes such as ``NA``
    stay as text instead of becoming missing values.

    Parameters:
    df (pd.DataFrame): Frame with an ``iso`` column used as the join key.
    translations_csv_path: Path (or buffer) of a CSV with ``code``, ``name_es`` and ``name_fr`` columns.

    Returns:
    pd.DataFrame: ``df`` plus the ``code``, ``name_es`` and ``name_fr`` columns.
    """
    wanted_columns = ['code', 'name_es', 'name_fr']
    translations_df = pd.read_csv(translations_csv_path, keep_default_na=False, na_values=[])
    return df.merge(
        translations_df[wanted_columns],
        how='left',
        left_on='iso',
        right_on='code',
    )

def calculate_gadm_area(df: pd.DataFrame, global_area_km2: int = 134954835) -> pd.DataFrame:
    """Build the land-area table: one worldwide row, one row per region, one per country.

    Parameters:
    df: Geo frame with ``iso``, ``region``, ``AREA_KM2`` and ``geometry`` columns.
    global_area_km2: Area assigned to the worldwide (``GLOB``) row. Defaults to the
        value that was previously hard-coded in this function.

    Returns:
    A frame ordered as worldwide, regions, countries, whose index (named ``id``)
    starts at 1 and is also exposed as an ``id`` column. ``AREA_KM2`` is rounded
    and cast to ``int``.
    """
    all_geometries = gpd.GeoSeries(df["geometry"])
    # `unary_union` is deprecated in recent geopandas in favour of `union_all()`;
    # keep the old attribute as a fallback for versions that predate the new method.
    if hasattr(all_geometries, "union_all"):
        world_geometry = all_geometries.union_all()
    else:
        world_geometry = all_geometries.unary_union

    glob = gpd.GeoDataFrame(
        {
            "iso": "GLOB",
            "AREA_KM2": global_area_km2,
            "location_type": "worldwide",
            "region": np.nan,
            "geometry": gpd.GeoSeries([world_geometry]),
        },
        crs="EPSG:4326",
    )

    # One row per (iso, region) pair, summing the areas of the dissolved pieces.
    terrestrial_areas = (
        df
        .dissolve(by=["iso", "region"], aggfunc={"AREA_KM2": "sum"})
        .reset_index()
        .assign(location_type="country")
    )
    # One row per region; the region code becomes the row's `iso`.
    regions_areas = (
        df
        .dissolve(by=["region"], aggfunc={"AREA_KM2": "sum"})
        .reset_index()
        .rename(columns={"region": "iso"})
        .assign(location_type="region")
    )
    result = (
        pd.concat(
            [
                glob,
                regions_areas,
                terrestrial_areas,
            ],
            ignore_index=True,
        )
        .dropna(subset=["iso"])
        .reset_index(drop=True)
    )
    # Ids are 1-based positions in the concatenated table.
    result.index = result.index + 1
    result.index.name = "id"

    # Round AREA_KM2 to integers
    result["AREA_KM2"] = result["AREA_KM2"].round().astype(int)

    return result.assign(id=result.index)

def add_groups_and_members_land(df: pd.DataFrame | gpd.GeoDataFrame) -> pd.DataFrame | gpd.GeoDataFrame:
    """Add a ``groups`` column listing, for each country, the id(s) of its region row.

    Non-country rows get an empty list. A region's id is computed as the position of
    the row whose ``iso`` equals the country's ``region``, plus 2.
    NOTE(review): the ``+ 2`` offset presumably aligns these ids with the ids of the
    EEZ table rather than with this frame's own 1-based ids; confirm.
    """

    def region_ids(record):
        if record["location_type"] != "country":
            return []
        positions = np.where(df.iso == record["region"])[0]
        return (positions + 2).tolist()

    return df.assign(groups=df[["region", "location_type"]].apply(region_ids, axis=1))

def combine_columns(df, col1, col2, new_col):
    """
    Coalesce two columns into a new one, preferring the first.

    Values come from ``col1``; where ``col1`` is missing, the value of ``col2`` is used
    (``Series.combine_first``). The result is written into ``df`` in place.

    Parameters:
    df (pd.DataFrame): The DataFrame holding both source columns; it is modified in place.
    col1 (str): Name of the preferred column.
    col2 (str): Name of the fallback column.
    new_col (str): Name of the column that receives the combined values.

    Returns:
    pd.DataFrame: The same DataFrame object, now containing ``new_col``.
    """
    preferred = df[col1]
    fallback = df[col2]
    df[new_col] = preferred.combine_first(fallback)
    return df


def add_region_iso_2(
    df: pd.DataFrame | gpd.GeoDataFrame, iso_column
) -> pd.DataFrame | gpd.GeoDataFrame:
    """Add a ``region`` column by looking each ISO code up in ``regions_data2.json``.

    Parameters:
    df: Frame containing the column named by ``iso_column``.
    iso_column: Name of the column holding the country ISO codes.

    Returns:
    A new frame with a ``region`` column: the ``region_iso`` of the first region whose
    ``country_iso_3s`` contains the code, or ``None`` when no region matches.
    """
    regions = pd.read_json(scripts_dir.joinpath("data_commons/data/regions_data2.json"))

    def find_region_iso(iso: str) -> Union[str, None]:
        matches = [entry for entry in regions.get("data") if iso in entry["country_iso_3s"]]
        if len(matches) > 0:
            return matches[0]["region_iso"]
        return None

    return df.assign(region=df[iso_column].apply(find_region_iso))

def add_location_name_2(df: pd.DataFrame | gpd.GeoDataFrame) -> pd.DataFrame | gpd.GeoDataFrame:
    """Add a ``name`` column by mapping ``df.iso`` through ``iso_map2.json``.

    Parameters:
    df: Frame with an ``iso`` column.

    Returns:
    A new frame with a ``name`` column; codes missing from the map get ``np.nan``.
    """
    # JSON is UTF-8 by specification; open it explicitly as such so that any
    # non-ASCII names do not depend on the platform's default encoding.
    with open(scripts_dir.joinpath('data_commons/data/iso_map2.json'), 'r', encoding='utf-8') as f:
        iso_map = json.load(f)

    def get_name(iso):
        return iso_map.get(iso, np.nan)

    return df.assign(name=df.iso.apply(get_name))


#### Note: GADM includes some extra ISO codes that had to be added to the regions_data.json file (provided by Protected Planet) in order to process the terrestrial stats:

'XCA': Caspian Sea, included in Asia & Pacific region

'XKO': Kosovo, included in Europe region

'ZNC': Northern Cyprus, included in Europe region

In [23]:
# Process EEZ data (marine data)
# Each step is applied in order with `.pipe`; the steps come from `pipelines.processors`
# unless a function of the same name was redefined earlier in this notebook.
locations = (
    gpd.read_file(pipe_eez_dir.get_step_fmt_file_path(prev_step, "shp"))
    .pipe(add_envelope)
    .pipe(add_location_iso)
    .pipe(expand_multiple_locations)
    .pipe(add_region_iso, 'iso')
    .pipe(calculate_eez_area)
    .pipe(add_bbox)
    .pipe(add_groups_and_members)
    .pipe(add_location_name)
    .pipe(add_translations, translations_csv_output)
    .rename(
        columns={

            "AREA_KM2": "total_marine_area",
            "location_type": "type",
            "bounds":'marine_bounds'
        }
    )
).reset_index(drop=True)

# Keep only the columns needed for the locations table: every column that is not in
# the whitelist below is dropped in place.
locations.drop(
    columns=list(
        set(locations.columns) -
        set(["code", "name", "name_es", "name_fr", "total_marine_area", "marine_bounds", "type", "groups", "id"])
    ),
    inplace=True,
)


id_lookup = locations.set_index('code')['id'].to_dict() # Create a lookup dictionary for IDs from EEZ data

# Process GADM data (terrestrial data)
# Same shape of pipeline as the EEZ one, using the notebook-local *_2 / *_land helpers.
locations_land = (
    gpd.read_file(pipe_gadm_dir.get_step_fmt_file_path(prev_step, "shp"))
    .rename(columns={"GID_0": "iso", 'area_km2': 'AREA_KM2'})
    .pipe(add_envelope)
    .pipe(add_region_iso_2, 'iso') # add_region_iso_2 is used instead of add_region_iso because gadm includes new iso codes
    .pipe(calculate_gadm_area)
    .pipe(add_bbox)
    .pipe(add_groups_and_members_land)
    .pipe(add_location_name_2)
    .pipe(add_translations, translations_csv_output)
    .rename(
        columns={
            "AREA_KM2": "total_terrestrial_area",
            "location_type": "type",
            "bounds": "terrestrial_bounds"
        }
    )
).reset_index(drop=True)

# The `id` produced by the GADM pipeline is overwritten here so that both tables share ids.
locations_land['id'] = locations_land['code'].map(id_lookup) # Apply the EEZ IDs to the GADM dataset

nan_mask = locations_land['id'].isna() # Identify the NaN values in the id column

# New ids continue after the highest EEZ id and are handed out in row order.
new_ids = pd.Series(
    range(max(id_lookup.values()) + 1, max(id_lookup.values()) + 1 + nan_mask.sum()),
    index=locations_land[nan_mask].index
) # Generate new IDs for any GADM rows without an EEZ match

locations_land['id'] = locations_land['id'].fillna(new_ids).astype(int) # Assign the new IDs to the NaN values in the id column

# Keep only the columns needed for the locations table (everything else is dropped in place).
locations_land.drop(
    columns=list(
        set(locations_land.columns) -
        set(["code", "name", "name_es", "name_fr", "total_terrestrial_area", "type", "groups", "terrestrial_bounds", "id"])
    ),
    inplace=True,
)

# Merge EEZ and GADM datasets
# Outer join on (code, id): locations present in only one dataset are kept. Columns that
# exist on both sides receive the `_marine` / `_land` suffixes.
combined_locations = pd.merge(
    locations, locations_land,
    on=['code', 'id'],
    suffixes=('_marine', '_land'),
    how='outer'  
)

# Combine data from land and marine for each base column
# (the marine value wins; the land value is only used where the marine one is missing)
base_columns = ['type', 'groups', 'name', 'name_es', 'name_fr']
for base_col in base_columns:
    marine_col = f"{base_col}_marine"
    land_col = f"{base_col}_land"
    combined_locations = combine_columns(combined_locations, marine_col, land_col, base_col)


# Fill NaN values with 0 for each area column (a location missing from one dataset has no area there)
columns_to_fill = ['total_marine_area', 'total_terrestrial_area']
for col in columns_to_fill:
    combined_locations[col] = combined_locations[col].fillna(0).astype(int)

# Force the id column to be an integer
combined_locations['id'] = combined_locations['id'].astype(int)


# Drop the suffixed helper columns now that their values live in the base columns
combined_locations.drop(
    columns=[col for col in combined_locations.columns if col.endswith('_marine') or col.endswith('_land')],
    inplace=True
)
combined_locations = combined_locations.reset_index(drop=True)

# Display the merged table
combined_locations


  "geometry": gpd.GeoSeries([gpd.GeoSeries(df["geometry"]).unary_union]),


Unnamed: 0,total_marine_area,id,marine_bounds,code,total_terrestrial_area,terrestrial_bounds,type,groups,name,name_es,name_fr
0,212881389,2,"[-180.0, -76.80012, 180.0, 90.0]",ABNJ,0,,highseas,[],Areas Beyond National Jurisdiction,Áreas fuera de la jurisdicción nacional,Zones au-delà de la juridiction nationale
1,14878058,3,"[-28.84709, -50.31506, 75.85287, 38.80087]",AF,29993095,"[-25.3618, -34.83514, 63.50347, 37.55986]",region,[],Africa,África,Afrique
2,0,168,,AFG,644050,"[60.50487, 29.36157, 74.89413, 38.49041]",country,[4],Afghanistan,Afganistán,Afghanistan
3,495866,10,"[8.19586, -17.27214, 13.86517, -5.02988]",AGO,1251701,"[11.6687, -18.04208, 24.08007, -4.37259]",country,[3],Angola,Angola,Angola
4,12165,11,"[18.32149, 39.64039, 20.02083, 42.0112]",ALB,28690,"[19.26416, 39.6507, 21.04909, 42.66043]",country,[6],Albania,Albania,Albanie
...,...,...,...,...,...,...,...,...,...,...,...
206,527384,166,"[41.08194, 8.95275, 57.946, 16.64959]",YEM,453741,"[41.81458, 12.10819, 54.53542, 19.0]",country,[9],Yemen,Yemen,Yémen
207,1547576,167,"[13.34802, -50.31506, 42.8475, -26.86206]",ZAF,1221328,"[16.45189, -34.83514, 32.89125, -22.12503]",country,[3],South Africa,Sudáfrica,Afrique du Sud
208,0,209,,ZMB,753990,"[21.98004, -18.07918, 33.71244, -8.27198]",country,[3],Zambia,Zambia,Zambie
209,0,210,,ZNC,3314,"[32.602, 35.00272, 34.60792, 35.71208]",country,[6],Northern Cyprus,Chipre del Norte,Chypre du Nord


In [7]:
# Add the national commitments (only marine for now)
# Steps: read the sheet (its header is on the second row), keep the first six columns
# of the rows flagged 'Y' for the 30% national target, drop the fully/highly column,
# turn the percentage text into integers and, where the target is 30 and no year is
# given, default the year to '2030'.
commit = (
    pd.read_csv(country_commitments_output, header=1)
    .pipe(lambda raw: raw.iloc[:, :6][raw['30% National Target'] == 'Y'])
    .drop(columns=["% Fully/Highly*"])
    .assign(
        **{'% National Target': lambda d: d['% National Target'].str.replace('%', '').astype(int)}
    )
    .assign(
        **{
            'By Year': lambda d: d['By Year'].fillna(
                d['% National Target'].apply(lambda pct: '2030' if pct == 30 else None)
            )
        }
    )
)


In [8]:
# Include the national commitments in the combined_locations table.
# This cell rebinds `combined_locations`, so target columns left over from a previous
# execution are dropped first; otherwise re-running the cell would merge the targets a
# second time and leave duplicated `marine_target*` columns. On a first run the drop is a no-op.
combined_locations = combined_locations.drop(
    columns=['marine_target', 'marine_target_year'], errors='ignore'
).merge(
    commit[['Iso Code', '% National Target', 'By Year']],
    left_on='code', right_on='Iso Code', how='left',
)

combined_locations = combined_locations.rename(
    columns={'% National Target': 'marine_target', 'By Year': 'marine_target_year'}
).drop(columns=['Iso Code'])

# Nullable integers, so locations without a commitment keep a missing value.
combined_locations['marine_target'] = combined_locations['marine_target'].astype(pd.Int64Dtype())
combined_locations['marine_target_year'] = combined_locations['marine_target_year'].astype(pd.Int64Dtype())

combined_locations = combined_locations.reset_index(drop=True)

# Add marine_target and marine_target_year to the combined_locations table for code 'GLOB'
is_global = combined_locations['code'] == 'GLOB'
combined_locations.loc[is_global, 'marine_target'] = 30
combined_locations.loc[is_global, 'marine_target_year'] = 2030


# Force the index to have the values in id column (so they follow the order they had in the previous table)
combined_locations['index'] = combined_locations['id']
combined_locations = combined_locations.set_index('index').sort_index()

combined_locations

Unnamed: 0_level_0,total_marine_area,id,marine_bounds,code,total_terrestrial_area,terrestrial_bounds,type,groups,name,name_es,name_fr,marine_target,marine_target_year
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,361000000,1,"[-180.0, -85.5625, 180.0, 90.0]",GLOB,134954835,"[-180.0, -90.0, 180.0, 83.65833]",worldwide,[],Global,Global,Global,30,2030
2,212881389,2,"[-180.0, -76.80012, 180.0, 90.0]",ABNJ,0,,highseas,[],Areas Beyond National Jurisdiction,Áreas fuera de la jurisdicción nacional,Zones au-delà de la juridiction nationale,,
3,14878058,3,"[-28.84709, -50.31506, 75.85287, 38.80087]",AF,29993095,"[-25.3618, -34.83514, 63.50347, 37.55986]",region,[],Africa,África,Afrique,,
4,54088687,4,"[-180.0, -58.44947, 180.0, 47.73081]",AS,31625556,"[-180.0, -55.11694, 180.0, 53.56086]",region,[],Asia & Pacific,Asia y Pacífico,Asie et Pacifique,,
5,9618978,5,"[-180.0, -85.5625, 180.0, -57.18865]",AT,12088230,"[-180.0, -90.0, 180.0, -59.59375]",region,[],Antarctica,Antártida,Antarctique,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,0,207,,XCA,371055,"[46.75388, 36.5723, 54.04378, 47.01562]",country,[4],Caspian Sea,Mar Caspio,Mer Caspienne,,
208,0,208,,XKO,10819,"[19.97939, 41.84826, 21.79305, 43.24613]",country,[6],Kosovo,Kosovo,Kosovo,,
209,0,209,,ZMB,753990,"[21.98004, -18.07918, 33.71244, -8.27198]",country,[3],Zambia,Zambia,Zambie,,
210,0,210,,ZNC,3314,"[32.602, 35.00272, 34.60792, 35.71208]",country,[6],Northern Cyprus,Chipre del Norte,Chypre du Nord,,


In [9]:
# Prepare final JSON output: validate the table with LocationSchemaAll and key the
# records by the table index.
locations_records = LocationSchemaAll(pd.DataFrame(combined_locations)).to_dict(orient="index")
output_locations_combined = {
    "version": 2,
    "data": {"api::location.location": locations_records},
}

# Write the output to a JSON file (stored in gadm folder)
with open(output_file, "w") as f:
    f.write(json.dumps(output_locations_combined))

# Release the (potentially large) export structures
del output_locations_combined, locations_records

In [16]:
## Create locations_code (id -> code table, with `id` exported as `location`)
locations_code = combined_locations[['id', 'code']].rename(columns={'id': 'location'})

## Save it in the gadm folder and in the data_commons/data folder
locations_code_targets = (
    pipe_gadm_dir.get_processed_step_path(current_step).joinpath('locations_code_all.csv'),
    scripts_dir.joinpath('data_commons/data/locations_code_all.csv'),
)
for locations_code_path in locations_code_targets:
    locations_code.to_csv(locations_code_path, index=False)

In [17]:
# Upload files to bucket
remote_path_code = 'vizzuality_processed_data/strapi_tables/location_code.csv'
remote_path_table = 'vizzuality_processed_data/strapi_tables/locations.json'

# (remote blob name, local file) pairs, uploaded in this order
uploads = [
    (remote_path_code, scripts_dir.joinpath('data_commons/data/locations_code_all.csv')),
    (remote_path_table, output_file),
]
for remote_blob, local_file in uploads:
    writeReadGCP(
        credentials=mysettings.GCS_KEYFILE_JSON,
        bucket_name=mysettings.GCS_BUCKET,
        blob_name=remote_blob,
        file=local_file,
        operation="w",
    )

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token
DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token
