In [1]:
# TODO: should we save every output as [GeoParquet](https://geoparquet.org/) in the future to improve read performance (roughly a 30% reduction in read time)?

In [234]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [262]:
from logging import getLogger
import shutil
from pathlib import Path
import geopandas as gpd
import pandas as pd
import requests

# Make the project's `src` directory (helpers, pipelines) importable from this notebook.
scripts_dir = Path("../..").joinpath("src")
import sys

# sys.path holds strings, so the membership test must use the resolved string
# form. Comparing the Path object itself never matches and would prepend a
# duplicate entry every time this cell is re-run.
scripts_dir_str = scripts_dir.resolve().as_posix()
if scripts_dir_str not in sys.path:
    sys.path.insert(0, scripts_dir_str)

from helpers.utils import downloadFile, rm_tree, make_archive, writeReadGCP
from helpers.settings import get_settings
from helpers.file_handler import FileConventionHandler
from pipelines.utils import watch
from pipelines.processors import (
    set_wdpa_id,
    protection_level,
    status,
    create_year,
    calculate_area,
    get_mpas,
    set_location_iso,
    set_fps_classes,
    filter_by_methodology,
    filter_by_terrestrial,
    transform_points,
    clean_geometries,
    simplify_async,
)

In [182]:
# Project settings object; the upload step reads GCS_KEYFILE_JSON and
# GCS_BUCKET from it.
mysettings = get_settings()

### eez_intermediate

In [13]:
# Parameters of the EEZ preprocess step
pipe = "eez"  # pipeline name handed to FileConventionHandler
step = "preprocess"  # step name used to build temp/output/remote paths
force_clean = True  # re-download inputs and wipe previously unpacked folders

In [14]:
# Data sources
# Both archives are requested from the same Marine Regions endpoint with the
# same request headers and the same registration form; only the archive name
# and the shapefile name differ between the two datasets.
_marineregions_url = "https://www.marineregions.org/download_file.php"

# NOTE(review): the session cookie below is hard-coded in the notebook;
# confirm whether it is still needed or should come from configuration.
_marineregions_headers = {
    "content-type": "application/x-www-form-urlencoded",
    "cookie": "PHPSESSID=29190501b4503e4b33725cd6bd01e2c6; vliz_webc=vliz_webc2; jwplayer.captionLabel=Off",
    "dnt": "1",
    "origin": "https://www.marineregions.org",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
}

# Registration form sent along with each download request.
_marineregions_form = {
    "name": "Jason",
    "organisation": "skytruth",
    "email": "hello@skytruth.com",
    "country": "Spain",
    "user_category": "academia",
    "purpose_category": "Conservation",
    "agree": "1",
}

## EEZ
EEZ_url = _marineregions_url
EEZ_file_name = "eez_v11.shp"
EEZ_params = {"name": "World_EEZ_v11_20191118.zip"}
EEZ_headers = dict(_marineregions_headers)  # independent copy per dataset
EEZ_body = dict(_marineregions_form)

## High seas
hs_url = _marineregions_url
hs_file_name = "High_seas_v1.shp"
hs_params = {"name": "World_High_Seas_v1_20200826.zip"}
hs_headers = dict(_marineregions_headers)
hs_body = dict(_marineregions_form)

In [15]:
# Resolve every path used by this pipe/step through the project's file
# convention helper (see helpers.file_handler for the exact layout).
working_folder = FileConventionHandler(pipe)
input_path = working_folder.pipe_raw_path  # folder the raw archives are downloaded into
temp_working_path = working_folder.get_temp_file_path(step)  # scratch folder the archives are unpacked into

output_path = working_folder.get_processed_step_path(step)  # folder that is zipped after the shapefile is written
output_file = working_folder.get_step_fmt_file_path(step, "shp")  # shapefile written by the save step
# NOTE(review): meaning of the third positional argument (True) is not visible here — confirm in FileConventionHandler
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)  # blob name used for the GCS upload

In [64]:
# Extract data
## download files EEZ & High seas
# Arguments are passed positionally as (url, destination folder, body, params,
# headers); see helpers.utils.downloadFile for their exact meaning.
# `overwrite=force_clean` is forwarded so a forced run can replace an existing file
# (presumably — confirm in downloadFile).
downloadFile(
    EEZ_url,
    input_path,
    EEZ_body,
    EEZ_params,
    EEZ_headers,
    overwrite=force_clean,
)
# Last expression of the cell: its return value is what the notebook displays.
downloadFile(hs_url, input_path, hs_body, hs_params, hs_headers, overwrite=force_clean)

PosixPath('/home/mambauser/data/eez/raw/World_High_Seas_v1_20200826.zip')

In [71]:
## unzip file if needed & load data
# Path.glob yields entries in filesystem order, which is not guaranteed to be
# stable. The transform step addresses the High Seas frame as
# unziped_folders[0], so the archives are ordered explicitly: High Seas first,
# EEZ second, any other zip afterwards in name order.
archive_priority = {hs_params["name"]: 0, EEZ_params["name"]: 1}
zip_paths = sorted(
    input_path.glob("*.zip"),
    key=lambda zip_path: (
        archive_priority.get(zip_path.name, len(archive_priority)),
        zip_path.name,
    ),
)

# Despite its name this list holds one GeoDataFrame per archive, not folders.
unziped_folders = []
for path in zip_paths:
    unziped_folder = temp_working_path.joinpath(path.stem)
    print(unziped_folder)

    # Start from an empty folder when a clean run is requested.
    if unziped_folder.exists() and force_clean:
        rm_tree(unziped_folder)

    shutil.unpack_archive(path, unziped_folder)

    # Load every shapefile of the archive except the "boundaries" layers and
    # stack them into a single frame.
    files = [
        gpd.read_file(file)
        for file in unziped_folder.rglob("*.shp")
        if "boundaries" not in file.stem
    ]
    unziped_folders.append(pd.concat(files))

/home/mambauser/data/eez/raw/temp_preprocess/World_High_Seas_v1_20200826
/home/mambauser/data/eez/raw/temp_preprocess/World_EEZ_v11_20191118


In [72]:
# Report the size of each loaded frame (position, row count, column count).
for idx in range(len(unziped_folders)):
    gdf = unziped_folders[idx]
    print(f"GeoDataFrame {idx} has {len(gdf)} rows and {len(gdf.columns)} columns")

GeoDataFrame 0 has 1 rows and 6 columns
GeoDataFrame 1 has 281 rows and 32 columns


In [73]:
# Transform data
## Give the High Seas frame the same column names as the EEZ frame.
## The High Seas frame is expected at position 0 of `unziped_folders`.
high_seas_column_names = {"name": "GEONAME", "area_km2": "AREA_KM2", "mrgid": "MRGID"}
high_seas = unziped_folders[0].rename(columns=high_seas_column_names)
unziped_folders[0] = high_seas.assign(POL_TYPE="High Seas", ISO_SOV1="ABNJ")

# merge datasets
df = pd.concat(unziped_folders, ignore_index=True)

# Keep only the shared schema; every other column is dropped.
columns_to_keep = {
    "MRGID",
    "GEONAME",
    "POL_TYPE",
    "ISO_SOV1",
    "ISO_SOV2",
    "ISO_SOV3",
    "AREA_KM2",
    "geometry",
}
df = df.drop(columns=[column for column in df.columns if column not in columns_to_keep])

In [75]:
# save data
# Wrap the merged frame with the CRS of the first loaded dataset and write it
# out as a shapefile.
eez_output = gpd.GeoDataFrame(df, crs=unziped_folders[0].crs)
eez_output.to_file(filename=output_file.as_posix(), driver="ESRI Shapefile")

# zip data
make_archive(output_path, zipped_output_file)

  ogr_write(


In [76]:
# clean unzipped files
# Remove the scratch folder and the unzipped output folder when they exist.
if temp_working_path.exists():
    rm_tree(temp_working_path)
if output_path.exists():
    rm_tree(output_path)

In [13]:
# LOAD
## load zipped file to GCS
# Credentials and bucket come from the project settings; operation="w"
# presumably selects the write (upload) direction — confirm in
# helpers.utils.writeReadGCP.
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=zipped_output_file,
    operation="w",
)

### Countries gadm intermediate

In [78]:
# Parameters of the GADM preprocess step
pipe = "gadm"  # pipeline name handed to FileConventionHandler
step = "preprocess"  # step name used to build temp/output/remote paths
force_clean = True  # re-download inputs and wipe previously unpacked folders

In [263]:
# Re-resolve every path for the GADM pipe; these assignments replace the
# values the EEZ section stored under the same names.
working_folder = FileConventionHandler(pipe)
input_path = working_folder.pipe_raw_path  # folder the raw archive is downloaded into
temp_working_path = working_folder.get_temp_file_path(step)  # scratch folder the archive is unpacked into

output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "shp")
# NOTE(review): meaning of the third positional argument (True) is not visible here — confirm in FileConventionHandler
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

In [110]:
# GADM 4.1 "levels" archive; the local file name is the last URL segment.
gadm_url = "https://geodata.ucdavis.edu/gadm/gadm4.1/gadm_410-levels.zip"
gadm_file_name = gadm_url.rsplit("/", 1)[-1]

In [111]:
# Download data
# Fetch the GADM archive into the raw input folder under `gadm_file_name`.
# The return value is kept in `input_file`; the unzip step below locates the
# archive by globbing `input_path` instead of using it.
input_file = downloadFile(
    gadm_url,
    input_path,
    overwrite=force_clean,
    file=gadm_file_name,
)

In [183]:
# Check if there is a zip file in the input_path
zip_file = next(input_path.glob("*.zip"), None)
if zip_file:
    unziped_folder = temp_working_path.joinpath(zip_file.stem)
    print(f"Processing: {unziped_folder}")

    # Start from an empty folder when a clean run is requested.
    if unziped_folder.exists() and force_clean:
        shutil.rmtree(unziped_folder)
        print(f"Removed existing folder: {unziped_folder}")

    # Unpack the archive
    shutil.unpack_archive(zip_file, unziped_folder)
    print(f"Unpacked {zip_file} to {unziped_folder}")
else:
    # Say so explicitly: without an archive `unziped_folder` is not updated
    # here, and the following cell would search whatever folder an earlier
    # section left in that variable.
    print(f"No zip file found in {input_path}")


Processing: /home/mambauser/data/gadm/raw/temp_preprocess/gadm_410-levels
Removed existing folder: /home/mambauser/data/gadm/raw/temp_preprocess/gadm_410-levels
Unpacked /home/mambauser/data/gadm/raw/gadm_410-levels.zip to /home/mambauser/data/gadm/raw/temp_preprocess/gadm_410-levels


In [304]:
# Add columns for translated names
data = [
    {"GID_0": "AFG", "COUNTRY_ES": "Afganistán", "COUNTRY_FR": "Afghanistan"},
    {"GID_0": "XAD", "COUNTRY_ES": "Akrotiri y Dhekelia", "COUNTRY_FR": "Akrotiri et Dhekelia"},
    {"GID_0": "ALB", "COUNTRY_ES": "Albania", "COUNTRY_FR": "Albanie"},
    {"GID_0": "DZA", "COUNTRY_ES": "Argelia", "COUNTRY_FR": "Algérie"},
    {"GID_0": "ASM", "COUNTRY_ES": "Samoa Americana", "COUNTRY_FR": "Samoa américaines"},
    {"GID_0": "AND", "COUNTRY_ES": "Andorra", "COUNTRY_FR": "Andorre"},
    {"GID_0": "AGO", "COUNTRY_ES": "Angola", "COUNTRY_FR": "Angola"},
    {"GID_0": "AIA", "COUNTRY_ES": "Anguila", "COUNTRY_FR": "Anguilla"},
    {"GID_0": "ATA", "COUNTRY_ES": "Antártida", "COUNTRY_FR": "Antarctique"},
    {"GID_0": "ATG", "COUNTRY_ES": "Antigua y Barbuda", "COUNTRY_FR": "Antigua-et-Barbuda"},
    {"GID_0": "ARG", "COUNTRY_ES": "Argentina", "COUNTRY_FR": "Argentine"},
    {"GID_0": "ARM", "COUNTRY_ES": "Armenia", "COUNTRY_FR": "Arménie"},
    {"GID_0": "ABW", "COUNTRY_ES": "Aruba", "COUNTRY_FR": "Aruba"},
    {"GID_0": "AUS", "COUNTRY_ES": "Australia", "COUNTRY_FR": "Australie"},
    {"GID_0": "AUT", "COUNTRY_ES": "Austria", "COUNTRY_FR": "Autriche"},
    {"GID_0": "AZE", "COUNTRY_ES": "Azerbaiyán", "COUNTRY_FR": "Azerbaïdjan"},
    {"GID_0": "BHS", "COUNTRY_ES": "Bahamas", "COUNTRY_FR": "Bahamas"},
    {"GID_0": "BHR", "COUNTRY_ES": "Baréin", "COUNTRY_FR": "Bahreïn"},
    {"GID_0": "BGD", "COUNTRY_ES": "Bangladés", "COUNTRY_FR": "Bangladesh"},
    {"GID_0": "BRB", "COUNTRY_ES": "Barbados", "COUNTRY_FR": "Barbade"},
    {"GID_0": "BLR", "COUNTRY_ES": "Bielorrusia", "COUNTRY_FR": "Biélorussie"},
    {"GID_0": "BEL", "COUNTRY_ES": "Bélgica", "COUNTRY_FR": "Belgique"},
    {"GID_0": "BLZ", "COUNTRY_ES": "Belice", "COUNTRY_FR": "Belize"},
    {"GID_0": "BEN", "COUNTRY_ES": "Benín", "COUNTRY_FR": "Bénin"},
    {"GID_0": "BMU", "COUNTRY_ES": "Bermudas", "COUNTRY_FR": "Bermudes"},
    {"GID_0": "BTN", "COUNTRY_ES": "Bután", "COUNTRY_FR": "Bhoutan"},
    {"GID_0": "BOL", "COUNTRY_ES": "Bolivia", "COUNTRY_FR": "Bolivie"},
    {"GID_0": "BES", "COUNTRY_ES": "Bonaire, San Eustaquio y Saba", "COUNTRY_FR": "Bonaire, Saint-Eustache et Saba"},
    {"GID_0": "BIH", "COUNTRY_ES": "Bosnia y Herzegovina", "COUNTRY_FR": "Bosnie-Herzégovine"},
    {"GID_0": "BWA", "COUNTRY_ES": "Botsuana", "COUNTRY_FR": "Botswana"},
    {"GID_0": "BVT", "COUNTRY_ES": "Isla Bouvet", "COUNTRY_FR": "Île Bouvet"},
    {"GID_0": "BRA", "COUNTRY_ES": "Brasil", "COUNTRY_FR": "Brésil"},
    {"GID_0": "IOT", "COUNTRY_ES": "Territorio Británico del Océano Índico", "COUNTRY_FR": "Territoire britannique de l'océan Indien"},
    {"GID_0": "VGB", "COUNTRY_ES": "Islas Vírgenes Británicas", "COUNTRY_FR": "Îles Vierges britanniques"},
    {"GID_0": "BRN", "COUNTRY_ES": "Brunéi", "COUNTRY_FR": "Brunei"},
    {"GID_0": "BGR", "COUNTRY_ES": "Bulgaria", "COUNTRY_FR": "Bulgarie"},
    {"GID_0": "BFA", "COUNTRY_ES": "Burkina Faso", "COUNTRY_FR": "Burkina Faso"},
    {"GID_0": "BDI", "COUNTRY_ES": "Burundi", "COUNTRY_FR": "Burundi"},
    {"GID_0": "CPV", "COUNTRY_ES": "Cabo Verde", "COUNTRY_FR": "Cap-Vert"},
    {"GID_0": "KHM", "COUNTRY_ES": "Camboya", "COUNTRY_FR": "Cambodge"},
    {"GID_0": "CMR", "COUNTRY_ES": "Camerún", "COUNTRY_FR": "Cameroun"},
    {"GID_0": "CAN", "COUNTRY_ES": "Canadá", "COUNTRY_FR": "Canada"},
    {"GID_0": "XCA", "COUNTRY_ES": "Mar Caspio", "COUNTRY_FR": "Mer Caspienne"},
    {"GID_0": "CYM", "COUNTRY_ES": "Islas Caimán", "COUNTRY_FR": "Îles Caïmans"},
    {"GID_0": "CAF", "COUNTRY_ES": "República Centroafricana", "COUNTRY_FR": "République centrafricaine"},
    {"GID_0": "TCD", "COUNTRY_ES": "Chad", "COUNTRY_FR": "Tchad"},
    {"GID_0": "CHL", "COUNTRY_ES": "Chile", "COUNTRY_FR": "Chili"},
    {"GID_0": "CHN", "COUNTRY_ES": "China", "COUNTRY_FR": "Chine"},
    {"GID_0": "CXR", "COUNTRY_ES": "Isla de Navidad", "COUNTRY_FR": "Île Christmas"},
    {"GID_0": "XCL", "COUNTRY_ES": "Isla Clipperton", "COUNTRY_FR": "Île Clipperton"},
    {"GID_0": "CCK", "COUNTRY_ES": "Islas Cocos", "COUNTRY_FR": "Îles Cocos"},
    {"GID_0": "COL", "COUNTRY_ES": "Colombia", "COUNTRY_FR": "Colombie"},
    {"GID_0": "COM", "COUNTRY_ES": "Comoras", "COUNTRY_FR": "Comores"},
    {"GID_0": "COK", "COUNTRY_ES": "Islas Cook", "COUNTRY_FR": "Îles Cook"},
    {"GID_0": "CRI", "COUNTRY_ES": "Costa Rica", "COUNTRY_FR": "Costa Rica"},
    {"GID_0": "HRV", "COUNTRY_ES": "Croacia", "COUNTRY_FR": "Croatie"},
    {"GID_0": "CUB", "COUNTRY_ES": "Cuba", "COUNTRY_FR": "Cuba"},
    {"GID_0": "CUW", "COUNTRY_ES": "Curazao", "COUNTRY_FR": "Curaçao"},
    {"GID_0": "CYP", "COUNTRY_ES": "Chipre", "COUNTRY_FR": "Chypre"},
    {"GID_0": "CZE", "COUNTRY_ES": "Chequia", "COUNTRY_FR": "Tchéquie"},
    {"GID_0": "CIV", "COUNTRY_ES": "Costa de Marfil", "COUNTRY_FR": "Côte d'Ivoire"},
    {"GID_0": "COD", "COUNTRY_ES": "República Democrática del Congo", "COUNTRY_FR": "République démocratique du Congo"},
    {"GID_0": "DNK", "COUNTRY_ES": "Dinamarca", "COUNTRY_FR": "Danemark"},
    {"GID_0": "DJI", "COUNTRY_ES": "Yibuti", "COUNTRY_FR": "Djibouti"},
    {"GID_0": "DMA", "COUNTRY_ES": "Dominica", "COUNTRY_FR": "Dominique"},
    {"GID_0": "DOM", "COUNTRY_ES": "República Dominicana", "COUNTRY_FR": "République dominicaine"},
    {"GID_0": "ECU", "COUNTRY_ES": "Ecuador", "COUNTRY_FR": "Équateur"},
    {"GID_0": "EGY", "COUNTRY_ES": "Egipto", "COUNTRY_FR": "Égypte"},
    {"GID_0": "SLV", "COUNTRY_ES": "El Salvador", "COUNTRY_FR": "Salvador"},
    {"GID_0": "GNQ", "COUNTRY_ES": "Guinea Ecuatorial", "COUNTRY_FR": "Guinée équatoriale"},
    {"GID_0": "ERI", "COUNTRY_ES": "Eritrea", "COUNTRY_FR": "Érythrée"},
    {"GID_0": "EST", "COUNTRY_ES": "Estonia", "COUNTRY_FR": "Estonie"},
    {"GID_0": "ETH", "COUNTRY_ES": "Etiopía", "COUNTRY_FR": "Éthiopie"},
    {"GID_0": "FLK", "COUNTRY_ES": "Islas Malvinas", "COUNTRY_FR": "Îles Malouines"},
    {"GID_0": "FRO", "COUNTRY_ES": "Islas Feroe", "COUNTRY_FR": "Îles Féroé"},
    {"GID_0": "FJI", "COUNTRY_ES": "Fiyi", "COUNTRY_FR": "Fidji"},
    {"GID_0": "FIN", "COUNTRY_ES": "Finlandia", "COUNTRY_FR": "Finlande"},
    {"GID_0": "FRA", "COUNTRY_ES": "Francia", "COUNTRY_FR": "France"},
    {"GID_0": "GUF", "COUNTRY_ES": "Guayana Francesa", "COUNTRY_FR": "Guyane française"},
    {"GID_0": "PYF", "COUNTRY_ES": "Polinesia Francesa", "COUNTRY_FR": "Polynésie française"},
    {"GID_0": "ATF", "COUNTRY_ES": "Territorios Australes Franceses", "COUNTRY_FR": "Terres australes françaises"},
    {"GID_0": "GAB", "COUNTRY_ES": "Gabón", "COUNTRY_FR": "Gabon"},
    {"GID_0": "GMB", "COUNTRY_ES": "Gambia", "COUNTRY_FR": "Gambie"},
    {"GID_0": "GEO", "COUNTRY_ES": "Georgia", "COUNTRY_FR": "Géorgie"},
    {"GID_0": "DEU", "COUNTRY_ES": "Alemania", "COUNTRY_FR": "Allemagne"},
    {"GID_0": "GHA", "COUNTRY_ES": "Ghana", "COUNTRY_FR": "Ghana"},
    {"GID_0": "GIB", "COUNTRY_ES": "Gibraltar", "COUNTRY_FR": "Gibraltar"},
    {"GID_0": "GRC", "COUNTRY_ES": "Grecia", "COUNTRY_FR": "Grèce"},
    {"GID_0": "GRL", "COUNTRY_ES": "Groenlandia", "COUNTRY_FR": "Groenland"},
    {"GID_0": "GRD", "COUNTRY_ES": "Granada", "COUNTRY_FR": "Grenade"},
    {"GID_0": "GLP", "COUNTRY_ES": "Guadalupe", "COUNTRY_FR": "Guadeloupe"},
    {"GID_0": "GUM", "COUNTRY_ES": "Guam", "COUNTRY_FR": "Guam"},
    {"GID_0": "GTM", "COUNTRY_ES": "Guatemala", "COUNTRY_FR": "Guatemala"},
    {"GID_0": "GGY", "COUNTRY_ES": "Guernesey", "COUNTRY_FR": "Guernesey"},
    {"GID_0": "GIN", "COUNTRY_ES": "Guinea", "COUNTRY_FR": "Guinée"},
    {"GID_0": "GNB", "COUNTRY_ES": "Guinea-Bisáu", "COUNTRY_FR": "Guinée-Bissau"},
    {"GID_0": "GUY", "COUNTRY_ES": "Guyana", "COUNTRY_FR": "Guyana"},
    {"GID_0": "HTI", "COUNTRY_ES": "Haití", "COUNTRY_FR": "Haïti"},
    {"GID_0": "HMD", "COUNTRY_ES": "Isla Heard y McDonald", "COUNTRY_FR": "Île Heard et îles McDonald"},  
    {"GID_0": "HND", "COUNTRY_ES": "Honduras", "COUNTRY_FR": "Honduras"},
    {"GID_0": "HUN", "COUNTRY_ES": "Hungría", "COUNTRY_FR": "Hongrie"},
    {"GID_0": "ISL", "COUNTRY_ES": "Islandia", "COUNTRY_FR": "Islande"},
    {"GID_0": "IND", "COUNTRY_ES": "India", "COUNTRY_FR": "Inde"},
    {"GID_0": "IDN", "COUNTRY_ES": "Indonesia", "COUNTRY_FR": "Indonésie"},
    {"GID_0": "IRN", "COUNTRY_ES": "Irán", "COUNTRY_FR": "Iran"},
    {"GID_0": "IRQ", "COUNTRY_ES": "Irak", "COUNTRY_FR": "Irak"},
    {"GID_0": "IRL", "COUNTRY_ES": "Irlanda", "COUNTRY_FR": "Irlande"},
    {"GID_0": "IMN", "COUNTRY_ES": "Isla de Man", "COUNTRY_FR": "Île de Man"},
    {"GID_0": "ISR", "COUNTRY_ES": "Israel", "COUNTRY_FR": "Israël"},
    {"GID_0": "ITA", "COUNTRY_ES": "Italia", "COUNTRY_FR": "Italie"},
    {"GID_0": "JAM", "COUNTRY_ES": "Jamaica", "COUNTRY_FR": "Jamaïque"},
    {"GID_0": "JPN", "COUNTRY_ES": "Japón", "COUNTRY_FR": "Japon"},
    {"GID_0": "JEY", "COUNTRY_ES": "Jersey", "COUNTRY_FR": "Jersey"},
    {"GID_0": "JOR", "COUNTRY_ES": "Jordania", "COUNTRY_FR": "Jordanie"},
    {"GID_0": "KAZ", "COUNTRY_ES": "Kazajistán", "COUNTRY_FR": "Kazakhstan"},
    {"GID_0": "KEN", "COUNTRY_ES": "Kenia", "COUNTRY_FR": "Kenya"},
    {"GID_0": "KIR", "COUNTRY_ES": "Kiribati", "COUNTRY_FR": "Kiribati"},
    {"GID_0": "XKO", "COUNTRY_ES": "Kosovo", "COUNTRY_FR": "Kosovo"},
    {"GID_0": "KWT", "COUNTRY_ES": "Kuwait", "COUNTRY_FR": "Koweït"},
    {"GID_0": "KGZ", "COUNTRY_ES": "Kirguistán", "COUNTRY_FR": "Kirghizistan"},
    {"GID_0": "LAO", "COUNTRY_ES": "Laos", "COUNTRY_FR": "Laos"},
    {"GID_0": "LVA", "COUNTRY_ES": "Letonia", "COUNTRY_FR": "Lettonie"},
    {"GID_0": "LBN", "COUNTRY_ES": "Líbano", "COUNTRY_FR": "Liban"},
    {"GID_0": "LSO", "COUNTRY_ES": "Lesoto", "COUNTRY_FR": "Lesotho"},
    {"GID_0": "LBR", "COUNTRY_ES": "Liberia", "COUNTRY_FR": "Liberia"},
    {"GID_0": "LBY", "COUNTRY_ES": "Libia", "COUNTRY_FR": "Libye"},
    {"GID_0": "LIE", "COUNTRY_ES": "Liechtenstein", "COUNTRY_FR": "Liechtenstein"},
    {"GID_0": "LTU", "COUNTRY_ES": "Lituania", "COUNTRY_FR": "Lituanie"},
    {"GID_0": "LUX", "COUNTRY_ES": "Luxemburgo", "COUNTRY_FR": "Luxembourg"},
    {"GID_0": "MDG", "COUNTRY_ES": "Madagascar", "COUNTRY_FR": "Madagascar"},
    {"GID_0": "MWI", "COUNTRY_ES": "Malaui", "COUNTRY_FR": "Malawi"},
    {"GID_0": "MYS", "COUNTRY_ES": "Malasia", "COUNTRY_FR": "Malaisie"},
    {"GID_0": "MDV", "COUNTRY_ES": "Maldivas", "COUNTRY_FR": "Maldives"},
    {"GID_0": "MLI", "COUNTRY_ES": "Malí", "COUNTRY_FR": "Mali"},
    {"GID_0": "MLT", "COUNTRY_ES": "Malta", "COUNTRY_FR": "Malte"},
    {"GID_0": "MHL", "COUNTRY_ES": "Islas Marshall", "COUNTRY_FR": "Îles Marshall"},
    {"GID_0": "MTQ", "COUNTRY_ES": "Martinica", "COUNTRY_FR": "Martinique"},
    {"GID_0": "MRT", "COUNTRY_ES": "Mauritania", "COUNTRY_FR": "Mauritanie"},
    {"GID_0": "MUS", "COUNTRY_ES": "Mauricio", "COUNTRY_FR": "Maurice"},
    {"GID_0": "MYT", "COUNTRY_ES": "Mayotte", "COUNTRY_FR": "Mayotte"},
    {"GID_0": "FSM", "COUNTRY_ES": "Micronesia", "COUNTRY_FR": "Micronésie"},
    {"GID_0": "MDA", "COUNTRY_ES": "Moldavia", "COUNTRY_FR": "Moldavie"},
    {"GID_0": "MCO", "COUNTRY_ES": "Mónaco", "COUNTRY_FR": "Monaco"},
    {"GID_0": "MNG", "COUNTRY_ES": "Mongolia", "COUNTRY_FR": "Mongolie"},
    {"GID_0": "MNE", "COUNTRY_ES": "Montenegro", "COUNTRY_FR": "Monténégro"},
    {"GID_0": "MSR", "COUNTRY_ES": "Montserrat", "COUNTRY_FR": "Montserrat"},
    {"GID_0": "MAR", "COUNTRY_ES": "Marruecos", "COUNTRY_FR": "Maroc"},
    {"GID_0": "MOZ", "COUNTRY_ES": "Mozambique", "COUNTRY_FR": "Mozambique"},
    {"GID_0": "MMR", "COUNTRY_ES": "Myanmar", "COUNTRY_FR": "Myanmar"},
    {"GID_0": "MEX", "COUNTRY_ES": "México", "COUNTRY_FR": "Mexique"},
    {"GID_0": "NAM", "COUNTRY_ES": "Namibia", "COUNTRY_FR": "Namibie"},
    {"GID_0": "NRU", "COUNTRY_ES": "Nauru", "COUNTRY_FR": "Nauru"},
    {"GID_0": "NPL", "COUNTRY_ES": "Nepal", "COUNTRY_FR": "Népal"},
    {"GID_0": "NLD", "COUNTRY_ES": "Países Bajos", "COUNTRY_FR": "Pays-Bas"},
    {"GID_0": "NCL", "COUNTRY_ES": "Nueva Caledonia", "COUNTRY_FR": "Nouvelle-Calédonie"},
    {"GID_0": "NZL", "COUNTRY_ES": "Nueva Zelanda", "COUNTRY_FR": "Nouvelle-Zélande"},
    {"GID_0": "NIC", "COUNTRY_ES": "Nicaragua", "COUNTRY_FR": "Nicaragua"},
    {"GID_0": "NER", "COUNTRY_ES": "Níger", "COUNTRY_FR": "Niger"},
    {"GID_0": "NGA", "COUNTRY_ES": "Nigeria", "COUNTRY_FR": "Nigéria"},
    {"GID_0": "NIU", "COUNTRY_ES": "Niue", "COUNTRY_FR": "Niue"},
    {"GID_0": "NFK", "COUNTRY_ES": "Isla Norfolk", "COUNTRY_FR": "Île Norfolk"},
    {"GID_0": "PRK", "COUNTRY_ES": "Corea del Norte", "COUNTRY_FR": "Corée du Nord"},
    {"GID_0": "MKD", "COUNTRY_ES": "Macedonia del Norte", "COUNTRY_FR": "Macédoine du Nord"},
    {"GID_0": "ZNC", "COUNTRY_ES": "Chipre del Norte", "COUNTRY_FR": "Chypre du Nord"},
    {"GID_0": "MNP", "COUNTRY_ES": "Islas Marianas del Norte", "COUNTRY_FR": "Îles Mariannes du Nord"},
    {"GID_0": "NOR", "COUNTRY_ES": "Noruega", "COUNTRY_FR": "Norvège"},
    {"GID_0": "OMN", "COUNTRY_ES": "Omán", "COUNTRY_FR": "Oman"},
    {"GID_0": "PAK", "COUNTRY_ES": "Pakistán", "COUNTRY_FR": "Pakistan"},
    {"GID_0": "PLW", "COUNTRY_ES": "Palaos", "COUNTRY_FR": "Palaos"},
    {"GID_0": "PSE", "COUNTRY_ES": "Palestina", "COUNTRY_FR": "Palestine"},
    {"GID_0": "PAN", "COUNTRY_ES": "Panamá", "COUNTRY_FR": "Panama"},
    {"GID_0": "PNG", "COUNTRY_ES": "Papúa Nueva Guinea", "COUNTRY_FR": "Papouasie-Nouvelle-Guinée"},
    {"GID_0": "XPI", "COUNTRY_ES": "Islas Paracelso", "COUNTRY_FR": "Îles Paracels"},
    {"GID_0": "PRY", "COUNTRY_ES": "Paraguay", "COUNTRY_FR": "Paraguay"},
    {"GID_0": "PER", "COUNTRY_ES": "Perú", "COUNTRY_FR": "Pérou"},
    {"GID_0": "PHL", "COUNTRY_ES": "Filipinas", "COUNTRY_FR": "Philippines"},
    {"GID_0": "PCN", "COUNTRY_ES": "Islas Pitcairn", "COUNTRY_FR": "Îles Pitcairn"},
    {"GID_0": "POL", "COUNTRY_ES": "Polonia", "COUNTRY_FR": "Pologne"},
    {"GID_0": "PRT", "COUNTRY_ES": "Portugal", "COUNTRY_FR": "Portugal"},
    {"GID_0": "PRI", "COUNTRY_ES": "Puerto Rico", "COUNTRY_FR": "Porto Rico"},
    {"GID_0": "QAT", "COUNTRY_ES": "Catar", "COUNTRY_FR": "Qatar"},
    {"GID_0": "COG", "COUNTRY_ES": "República del Congo", "COUNTRY_FR": "République du Congo"},
    {"GID_0": "ROU", "COUNTRY_ES": "Rumania", "COUNTRY_FR": "Roumanie"},
    {"GID_0": "RUS", "COUNTRY_ES": "Rusia", "COUNTRY_FR": "Russie"},
    {"GID_0": "RWA", "COUNTRY_ES": "Ruanda", "COUNTRY_FR": "Rwanda"},
    {"GID_0": "REU", "COUNTRY_ES": "Reunión", "COUNTRY_FR": "La Réunion"},
    {"GID_0": "SHN", "COUNTRY_ES": "Santa Elena, Ascensión y Tristán de Acuña", "COUNTRY_FR": "Sainte-Hélène, Ascension et Tristan da Cunha"},
    {"GID_0": "KNA", "COUNTRY_ES": "San Cristóbal y Nieves", "COUNTRY_FR": "Saint-Christophe-et-Niévès"},
    {"GID_0": "LCA", "COUNTRY_ES": "Santa Lucía", "COUNTRY_FR": "Sainte-Lucie"},
    {"GID_0": "SPM", "COUNTRY_ES": "San Pedro y Miquelón", "COUNTRY_FR": "Saint-Pierre-et-Miquelon"},
    {"GID_0": "VCT", "COUNTRY_ES": "San Vicente y las Granadinas", "COUNTRY_FR": "Saint-Vincent-et-les-Grenadines"},
    {"GID_0": "BLM", "COUNTRY_ES": "San Bartolomé", "COUNTRY_FR": "Saint-Barthélemy"},
    {"GID_0": "MAF", "COUNTRY_ES": "San Martín", "COUNTRY_FR": "Saint-Martin"},
    {"GID_0": "WSM", "COUNTRY_ES": "Samoa", "COUNTRY_FR": "Samoa"},
    {"GID_0": "SMR", "COUNTRY_ES": "San Marino", "COUNTRY_FR": "Saint-Marin"},
    {"GID_0": "SAU", "COUNTRY_ES": "Arabia Saudita", "COUNTRY_FR": "Arabie Saoudite"},
    {"GID_0": "SEN", "COUNTRY_ES": "Senegal", "COUNTRY_FR": "Sénégal"},
    {"GID_0": "SRB", "COUNTRY_ES": "Serbia", "COUNTRY_FR": "Serbie"},
    {"GID_0": "SYC", "COUNTRY_ES": "Seychelles", "COUNTRY_FR": "Seychelles"},
    {"GID_0": "SLE", "COUNTRY_ES": "Sierra Leona", "COUNTRY_FR": "Sierra Leone"},
    {"GID_0": "SGP", "COUNTRY_ES": "Singapur", "COUNTRY_FR": "Singapour"},
    {"GID_0": "SXM", "COUNTRY_ES": "Sint Maarten", "COUNTRY_FR": "Saint-Martin"},
    {"GID_0": "SVK", "COUNTRY_ES": "Eslovaquia", "COUNTRY_FR": "Slovaquie"},
    {"GID_0": "SVN", "COUNTRY_ES": "Eslovenia", "COUNTRY_FR": "Slovénie"},
    {"GID_0": "SLB", "COUNTRY_ES": "Islas Salomón", "COUNTRY_FR": "Îles Salomon"},
    {"GID_0": "SOM", "COUNTRY_ES": "Somalia", "COUNTRY_FR": "Somalie"},
    {"GID_0": "ZAF", "COUNTRY_ES": "Sudáfrica", "COUNTRY_FR": "Afrique du Sud"},
    {"GID_0": "SGS", "COUNTRY_ES": "Georgia del Sur y las Islas Sandwich del Sur", "COUNTRY_FR": "Géorgie du Sud et îles Sandwich du Sud"},
    {"GID_0": "KOR", "COUNTRY_ES": "Corea del Sur", "COUNTRY_FR": "Corée du Sud"},
    {"GID_0": "SSD", "COUNTRY_ES": "Sudán del Sur", "COUNTRY_FR": "Soudan du Sud"},
    {"GID_0": "ESP", "COUNTRY_ES": "España", "COUNTRY_FR": "Espagne"},
    {"GID_0": "XSP", "COUNTRY_ES": "Islas Spratly", "COUNTRY_FR": "Îles Spratleys"},
    {"GID_0": "LKA", "COUNTRY_ES": "Sri Lanka", "COUNTRY_FR": "Sri Lanka"},
    {"GID_0": "SDN", "COUNTRY_ES": "Sudán", "COUNTRY_FR": "Soudan"},
    {"GID_0": "SUR", "COUNTRY_ES": "Surinam", "COUNTRY_FR": "Suriname"},
    {"GID_0": "SJM", "COUNTRY_ES": "Svalbard y Jan Mayen", "COUNTRY_FR": "Svalbard et Jan Mayen"},
    {"GID_0": "SWZ", "COUNTRY_ES": "Suazilandia", "COUNTRY_FR": "Swaziland"},
    {"GID_0": "SWE", "COUNTRY_ES": "Suecia", "COUNTRY_FR": "Suède"},
    {"GID_0": "CHE", "COUNTRY_ES": "Suiza", "COUNTRY_FR": "Suisse"},
    {"GID_0": "SYR", "COUNTRY_ES": "Siria", "COUNTRY_FR": "Syrie"},
    {"GID_0": "STP", "COUNTRY_ES": "Santo Tomé y Príncipe", "COUNTRY_FR": "São Tomé-et-Principe"},
    {"GID_0": "TWN", "COUNTRY_ES": "Taiwán", "COUNTRY_FR": "Taïwan"},
    {"GID_0": "TJK", "COUNTRY_ES": "Tayikistán", "COUNTRY_FR": "Tadjikistan"},
    {"GID_0": "TZA", "COUNTRY_ES": "Tanzania", "COUNTRY_FR": "Tanzanie"},
    {"GID_0": "THA", "COUNTRY_ES": "Tailandia", "COUNTRY_FR": "Thaïlande"},
    {"GID_0": "TLS", "COUNTRY_ES": "Timor-Leste", "COUNTRY_FR": "Timor-Leste"},
    {"GID_0": "TGO", "COUNTRY_ES": "Togo", "COUNTRY_FR": "Togo"},
    {"GID_0": "TKL", "COUNTRY_ES": "Tokelau", "COUNTRY_FR": "Tokelau"},
    {"GID_0": "TON", "COUNTRY_ES": "Tonga", "COUNTRY_FR": "Tonga"},
    {"GID_0": "TTO", "COUNTRY_ES": "Trinidad y Tobago", "COUNTRY_FR": "Trinité-et-Tobago"},
    {"GID_0": "TUN", "COUNTRY_ES": "Túnez", "COUNTRY_FR": "Tunisie"},
    {"GID_0": "TUR", "COUNTRY_ES": "Turquía", "COUNTRY_FR": "Turquie"},
    {"GID_0": "TKM", "COUNTRY_ES": "Turkmenistán", "COUNTRY_FR": "Turkménistan"},
    {"GID_0": "TCA", "COUNTRY_ES": "Islas Turcas y Caicos", "COUNTRY_FR": "Îles Turques-et-Caïques"},
    {"GID_0": "TUV", "COUNTRY_ES": "Tuvalu", "COUNTRY_FR": "Tuvalu"},
    {"GID_0": "UGA", "COUNTRY_ES": "Uganda", "COUNTRY_FR": "Ouganda"},
    {"GID_0": "UKR", "COUNTRY_ES": "Ucrania", "COUNTRY_FR": "Ukraine"},
    {"GID_0": "ARE", "COUNTRY_ES": "Emiratos Árabes Unidos", "COUNTRY_FR": "Émirats Arabes Unis"},
    {"GID_0": "GBR", "COUNTRY_ES": "Reino Unido", "COUNTRY_FR": "Royaume-Uni"},
    {"GID_0": "USA", "COUNTRY_ES": "Estados Unidos", "COUNTRY_FR": "États-Unis"},
    {"GID_0": "UMI", "COUNTRY_ES": "Islas Ultramarinas Menores de los Estados Unidos", "COUNTRY_FR": "Îles mineures éloignées des États-Unis"},
    {"GID_0": "URY", "COUNTRY_ES": "Uruguay", "COUNTRY_FR": "Uruguay"},
    {"GID_0": "UZB", "COUNTRY_ES": "Uzbekistán", "COUNTRY_FR": "Ouzbékistan"},
    {"GID_0": "VUT", "COUNTRY_ES": "Vanuatu", "COUNTRY_FR": "Vanuatu"},
    {"GID_0": "VAT", "COUNTRY_ES": "Ciudad del Vaticano", "COUNTRY_FR": "Cité du Vatican"},
    {"GID_0": "VEN", "COUNTRY_ES": "Venezuela", "COUNTRY_FR": "Venezuela"},
    {"GID_0": "VNM", "COUNTRY_ES": "Vietnam", "COUNTRY_FR": "Vietnam"},
    {"GID_0": "VIR", "COUNTRY_ES": "Islas Vírgenes de los Estados Unidos", "COUNTRY_FR": "Îles Vierges des États-Unis"},
    {"GID_0": "WLF", "COUNTRY_ES": "Wallis y Futuna", "COUNTRY_FR": "Wallis-et-Futuna"},
    {"GID_0": "ESH", "COUNTRY_ES": "Sahara Occidental", "COUNTRY_FR": "Sahara occidental"},
    {"GID_0": "YEM", "COUNTRY_ES": "Yemen", "COUNTRY_FR": "Yémen"},
    {"GID_0": "ZMB", "COUNTRY_ES": "Zambia", "COUNTRY_FR": "Zambie"},
    {"GID_0": "ZWE", "COUNTRY_ES": "Zimbabue", "COUNTRY_FR": "Zimbabwe"},
    {"GID_0": "ALA", "COUNTRY_ES": "Islas Åland", "COUNTRY_FR": "Îles Åland"}]

# Translation lookup table: one row per GID_0 with the Spanish (COUNTRY_ES)
# and French (COUNTRY_FR) country names listed above.
# NOTE: this reuses the name `df`, which the EEZ section also assigns.
df = pd.DataFrame(data)

In [286]:
# Read the ADM_0 layer from the first GeoPackage found in the unpacked folder
geopackage_file = next(unziped_folder.rglob("*.gpkg"), None)

if geopackage_file is None:
    print("No GeoPackage file found in the unzipped folder.")
else:
    print(f"Found GeoPackage: {geopackage_file}")

    # Specify the layer to read
    layer_name = "ADM_0"
    gdf = gpd.read_file(geopackage_file, layer=layer_name)
    print(f"Selected layer: {layer_name}")

Found GeoPackage: /home/mambauser/data/gadm/raw/temp_preprocess/gadm_410-levels/gadm_410-levels.gpkg
Selected layer: ADM_0


In [287]:
# Lookup used by update_gid_0_and_country: maps a GID_0 code to the
# (GID_0, English country name) pair that replaces it.
# Dependent territories point at their sovereign country (e.g. "ABW" -> "NLD").
# Several entries map a code to itself (e.g. "KGZ" -> "KGZ"); for those the
# code is unchanged and only the COUNTRY name is taken from this table.
# NOTE(review): confirm the self-mapped entries are intentional.
dependency_to_parent = {
    "ABW": ("NLD", "Netherlands"),
    "AIA": ("GBR", "United Kingdom"),
    "ALA": ("FIN", "Finland"),
    "ASM": ("USA", "United States"),
    "ATF": ("FRA", "France"),
    "BES": ("NLD", "Netherlands"),
    "BLM": ("FRA", "France"),
    "BMU": ("GBR", "United Kingdom"),
    "BVT": ("NOR", "Norway"),
    "CCK": ("AUS", "Australia"),
    "COK": ("NZL", "New Zealand"),
    "CUW": ("NLD", "Netherlands"),
    "CXR": ("AUS", "Australia"),
    "CYM": ("GBR", "United Kingdom"),
    "FLK": ("GBR", "United Kingdom"),
    "FRO": ("DNK", "Denmark"),
    "GGY": ("GBR", "United Kingdom"),
    "GLP": ("FRA", "France"),
    "GRL": ("DNK", "Denmark"),
    "GUF": ("FRA", "France"),
    "GUM": ("USA", "United States"),
    "HMD": ("AUS", "Australia"),
    "IMN": ("GBR", "United Kingdom"),
    "IOT": ("GBR", "United Kingdom"),
    "JEY": ("GBR", "United Kingdom"),
    "KGZ": ("KGZ", "Kyrgyzstan"),
    "LAO": ("LAO", "Laos"),
    "LIE": ("LIE", "Liechtenstein"),
    "LSO": ("LSO", "Lesotho"),
    "LUX": ("LUX", "Luxembourg"),
    "MAF": ("FRA", "France"),
    "MDA": ("MDA", "Moldova"),
    "MKD": ("MKD", "North Macedonia"),
    "MLI": ("MLI", "Mali"),
    "MNG": ("MNG", "Mongolia"),
    "MNP": ("USA", "United States"),
    "MSR": ("GBR", "United Kingdom"),
    "MTQ": ("FRA", "France"),
    "MWI": ("MWI", "Malawi"),
    "MYT": ("FRA", "France"),
    "NCL": ("FRA", "France"),
    "NER": ("NER", "Niger"),
    "NFK": ("AUS", "Australia"),
    "NIU": ("NZL", "New Zealand"),
    "NPL": ("NPL", "Nepal"),
    "PCN": ("GBR", "United Kingdom"),
    "PRI": ("USA", "United States"),
    "PRY": ("PRY", "Paraguay"),
    "PYF": ("FRA", "France"),
    "REU": ("FRA", "France"),
    "RWA": ("RWA", "Rwanda"),
    "SGS": ("GBR", "United Kingdom"),
    "SHN": ("GBR", "United Kingdom"),
    "SJM": ("NOR", "Norway"),
    "SMR": ("SMR", "San Marino"),
    "SPM": ("FRA", "France"),
    "SRB": ("SRB", "Serbia"),
    "SSD": ("SSD", "South Sudan"),
    "SVK": ("SVK", "Slovakia"),
    "SWZ": ("SWZ", "Eswatini"),
    "SXM": ("NLD", "Netherlands"),
    "TCA": ("GBR", "United Kingdom"),
    "TCD": ("TCD", "Chad"),
    "TJK": ("TJK", "Tajikistan"),
    "TKL": ("NZL", "New Zealand"),
    "UGA": ("UGA", "Uganda"),
    "UMI": ("USA", "United States"),
    "UZB": ("UZB", "Uzbekistan"),
    "VAT": ("VAT", "Vatican City"),
    "VGB": ("GBR", "United Kingdom"),
    "VIR": ("USA", "United States"),
    "WLF": ("FRA", "France"),
    "ZMB": ("ZMB", "Zambia"),
    "ZWE": ("ZWE", "Zimbabwe"),
}

def update_gid_0_and_country(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Updates the GID_0 and COUNTRY values in the GeoDataFrame for dependent territories
    with the GID_0 and COUNTRY of their sovereign parent countries.

    The lookup comes from the module-level ``dependency_to_parent`` dict, which maps a
    GID_0 code to a ``(parent_gid_0, parent_country_name)`` tuple. Codes missing from
    that dict keep their GID_0, and their COUNTRY is taken from the first row that
    carries the same (updated) GID_0.

    Parameters:
    gdf (gpd.GeoDataFrame): The input GeoDataFrame with 'GID_0' and 'COUNTRY' columns.
        It is modified in place.

    Returns:
    gpd.GeoDataFrame: The same GeoDataFrame object, with updated 'GID_0' and 'COUNTRY'
        values for dependent territories.
    """
    # Build both lookups once instead of rebuilding a dict for every row.
    gid_to_parent_gid = {gid: parent[0] for gid, parent in dependency_to_parent.items()}
    parent_gid_to_name = {parent[0]: parent[1] for parent in dependency_to_parent.values()}

    # Map GID_0 to the updated values (codes without an entry keep their own value)
    gdf['GID_0'] = gdf['GID_0'].map(lambda gid: gid_to_parent_gid.get(gid, gid))

    # Fallback name for codes without a parent entry: the COUNTRY of the first row
    # sharing that GID_0. It is computed once for the whole frame, so a missing GID_0
    # yields a missing COUNTRY instead of an IndexError on an empty selection.
    first_country_by_gid = (
        gdf[['GID_0', 'COUNTRY']]
        .drop_duplicates(subset='GID_0')
        .set_index('GID_0')['COUNTRY']
    )

    # Update COUNTRY based on the updated GID_0
    parent_country = gdf['GID_0'].map(parent_gid_to_name)
    gdf['COUNTRY'] = parent_country.fillna(gdf['GID_0'].map(first_country_by_gid))

    return gdf

# Apply the remapping to `gdf` (defined in an earlier cell). The function mutates its
# argument and returns it, so `gdf_updated` and `gdf` refer to the same object.
gdf_updated = update_gid_0_and_country(gdf)


In [None]:
# Dissolve by country, turn COUNTRY back into a regular column, then calculate area
gdf_updated = (
    gdf_updated
    .dissolve(by='COUNTRY')
    .reset_index()
    .pipe(calculate_area)
)

In [309]:
# Add translations
# `df` is defined in an earlier cell; per the output below, the merge contributes the
# COUNTRY_ES and COUNTRY_FR columns.
# NOTE(review): `merge` defaults to an inner join, so rows whose GID_0 has no match in
# `df` are dropped — confirm that is intended.
gdf_updated = gdf_updated.merge(df, on='GID_0')
gdf_updated

Unnamed: 0,COUNTRY,geometry,GID_0,area_km2,COUNTRY_ES,COUNTRY_FR
0,Afghanistan,"MULTIPOLYGON (((63.61425 29.46993, 63.60868 29...",AFG,644050.28,Afganistán,Afghanistan
1,Akrotiri and Dhekelia,"MULTIPOLYGON (((33.00764 34.62583, 33.00708 34...",XAD,233.64,Akrotiri y Dhekelia,Akrotiri et Dhekelia
2,Albania,"MULTIPOLYGON (((19.27804 40.50524, 19.28189 40...",ALB,28689.62,Albania,Albanie
3,Algeria,"MULTIPOLYGON (((2.84535 36.74691, 2.84597 36.7...",DZA,2311455.23,Argelia,Algérie
4,Andorra,"POLYGON ((1.61725 42.62406, 1.63334 42.62553, ...",AND,450.35,Andorra,Andorre
...,...,...,...,...,...,...
201,Vietnam,"MULTIPOLYGON (((103.46895 9.25602, 103.46736 9...",VNM,330364.87,Vietnam,Vietnam
202,Western Sahara,"MULTIPOLYGON (((-16.83569 22.15403, -16.83597 ...",ESH,267892.77,Sahara Occidental,Sahara occidental
203,Yemen,"MULTIPOLYGON (((42.1618 15.03042, 42.16236 15....",YEM,453741.18,Yemen,Yémen
204,Zambia,"POLYGON ((25.87834 -17.97218, 25.87034 -17.970...",ZMB,753990.33,Zambia,Zambie


In [None]:
# Run the project's async `simplify_async` helper on the merged frame. Top-level `await`
# relies on the notebook kernel's running event loop.
final_gadm = await simplify_async(gdf_updated)

In [312]:
# Save the file
# Written as an ESRI Shapefile to `output_file` (resolved in an earlier cell).
final_gadm.to_file(output_file.as_posix(), driver="ESRI Shapefile")

In [313]:
# zip data
# Presumably archives the `output_path` folder into `zipped_output_file` — see
# helpers.utils.make_archive for the exact behaviour.
make_archive(output_path, zipped_output_file)

In [314]:
# load zipped file to GCS
# Credentials and bucket come from the project settings object; operation="w" presumably
# selects upload (the operation="r" calls in this notebook fetch blobs) — confirm in
# helpers.utils.writeReadGCP.
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=zipped_output_file,
    operation="w",
)

### Mpa Atlas intermediate

In [4]:
# Pipe params
pipe = "mpaatlas"  # handed to FileConventionHandler to resolve this pipeline's folders
step = "preprocess"  # stage name used when building temp/output/remote paths
force_clean = True  # forwarded to downloadFile as `overwrite`; also gates the "already exists" notice

In [5]:
# Data source
# Local file name for the download, followed by the endpoint it is fetched from.
mpaatlas_file_name = "mpatlas_assess_zone.geojson"
mpaatlas_url = "https://guide.mpatlas.org/api/v1/zone/geojson"

In [6]:
# Resolve every path used by this pipe/step through the project's file-convention helper.
working_folder = FileConventionHandler(pipe)
# Folder the raw download is written to (passed to downloadFile below).
input_path = working_folder.pipe_raw_path
# Scratch folder; removed by the cleanup cell at the end of this section.
temp_working_path = working_folder.get_temp_file_path(step)

# Folder that gets archived and later removed, the shapefile written into it,
# the archive that is uploaded, and the blob name it is uploaded under.
output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "shp")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

In [7]:
# Download data
# Arguments are spelled out by keyword, matching the other download cells in this notebook.
input_file = downloadFile(
    url=mpaatlas_url,
    output_path=input_path,
    overwrite=force_clean,
    file=mpaatlas_file_name,
)

In [8]:
# Informational only: the notice does not stop the rest of the cell from running.
if not force_clean and zipped_output_file.exists():
    print(f"File {zipped_output_file} already exists")

# Transform data
gdf = gpd.read_file(input_file)

df = gdf.pipe(set_wdpa_id).pipe(protection_level).pipe(status).pipe(create_year)

# Drop, in place, every column that is not in the list below.
columns_to_keep = [
    "wdpa_id",
    "mpa_zone_id",
    "name",
    "designation",
    "sovereign",
    "establishment_stage",
    "protection_mpaguide_level",
    "protection_level",
    "year",
    "geometry",
]
df.drop(columns=df.columns.difference(columns_to_keep), inplace=True)
df.rename(columns={"sovereign": "location_id", "wdpa_pid": "wdpa_id"}, inplace=True)

In [9]:
# Save data: rebuild a GeoDataFrame carrying the source CRS, write it as a shapefile,
# then archive the output folder.
mpaatlas_out = gpd.GeoDataFrame(df, crs=gdf.crs)
mpaatlas_out.to_file(
    filename=output_file.as_posix(),
    driver="ESRI Shapefile",
    encoding="utf-8",
)

make_archive(output_path, zipped_output_file)

  ).to_file(filename=output_file.as_posix(), driver="ESRI Shapefile", encoding="utf-8")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [10]:
# LOAD
## load zipped file to GCS
# Sends `zipped_output_file` to the bucket under `remote_path`; operation="w" presumably
# means upload — confirm in helpers.utils.writeReadGCP.
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=zipped_output_file,
    operation="w",
)

In [13]:
# clean unzipped files: remove the scratch and processed folders when they exist
for leftover_dir in (temp_working_path, output_path):
    if leftover_dir.exists():
        rm_tree(leftover_dir)

### Protected seas intermediate

In [22]:
# DEPRECATED
pipe = "protectedseas"  # handed to FileConventionHandler to resolve this pipeline's folders
step = "preprocess"  # stage name used when building temp/output/remote paths
force_clean = True  # gates the "already exists" notice further down

In [23]:
# Resolve the raw-data folder for *this* pipe before using it. The working-folder cell
# for this section only runs after this one, so without this line `input_path` would
# still be whatever the previously executed pipeline section left behind and the
# downloads would land in the wrong folder.
input_path = FileConventionHandler(pipe).pipe_raw_path

# Blob names inside the bucket, and the local files they are downloaded to.
ps_csv_url = "ProtectedSeas/ProtectedSeas_06142023.csv"
ps_csv_output = input_path.joinpath(ps_csv_url.split("/")[-1])

ps_geometries_url = (
    "ProtectedSeas/ProtectedSeas_ProtectedSeas_06142023_shp_ProtectedSeas_06142023_shp.zip"
)
ps_geometries_output = input_path.joinpath(ps_geometries_url.split("/")[-1])

In [24]:
# Resolve every path used by this pipe/step through the project's file-convention helper.
working_folder = FileConventionHandler(pipe)
# Raw-data folder for this pipe.
input_path = working_folder.pipe_raw_path
# Scratch folder the geometry archive is unpacked into; removed by the cleanup cell.
temp_working_path = working_folder.get_temp_file_path(step)

# Folder that gets archived and later removed, the shapefile written into it,
# the archive that is uploaded, and the blob name it is uploaded under.
output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "shp")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

In [25]:
# NOTE(review): this only prints a notice; the following cells still run when the
# archive already exists and force_clean is False.
if not force_clean and zipped_output_file.exists():
    print(f"File {zipped_output_file} already exists")

In [26]:
## get the data
# Fetch the attribute table (CSV) and the zipped geometries from the bucket, in that order.
for blob_name, local_file in (
    (ps_csv_url, ps_csv_output),
    (ps_geometries_url, ps_geometries_output),
):
    writeReadGCP(
        credentials=mysettings.GCS_KEYFILE_JSON,
        bucket_name=mysettings.GCS_BUCKET,
        blob_name=blob_name,
        file=local_file,
        operation="r",
    )

In [27]:
# unzip shapefile into the scratch folder (archive format is inferred from the file name)
shutil.unpack_archive(
    filename=ps_geometries_output,
    extract_dir=temp_working_path,
)

In [28]:
# transform data
# TODO: Modify the preprocessing steps so we do not eliminate the geometries that do not
# intersect with MPAs - due to a change in the processing methodology
data_table = (
    pd.read_csv(ps_csv_output)
    .pipe(get_mpas)
    .pipe(set_location_iso)
    .pipe(set_fps_classes)
)

# Drop, in place, every column that is not in the list below.
kept_columns = [
    "site_id",
    "iso",
    "FPS_cat",
    "site_name",
    "country",
    "wdpa_id",
    "removal_of_marine_life_is_prohibited",
    "total_area",
]
data_table.drop(columns=data_table.columns.difference(kept_columns), inplace=True)

data_table.rename(columns={"removal_of_marine_life_is_prohibited": "FPS"}, inplace=True)

# load geometries (they are merged with the table in the next cell)
gdf = gpd.read_file(ps_geometries_output)

  return df[mask1][mask2].reset_index()


In [29]:
# save data
# Inner-join geometries with the attribute table on the site id, then drop the
# upper-case duplicates that came from the shapefile side.
ps_joined = gdf.merge(data_table, how="inner", left_on="SITE_ID", right_on="site_id")
ps_joined = ps_joined.drop(columns=["SITE_ID", "SITE_NAME"])
ps_joined.to_file(
    filename=output_file.as_posix(),
    driver="ESRI Shapefile",
    encoding="utf-8",
)

# zip data
make_archive(output_path, zipped_output_file)

In [30]:
# clean unzipped files
if temp_working_path.exists():
    rm_tree(temp_working_path)
if output_path.exists():
    rm_tree(output_path)

In [31]:
# LOAD
## load zipped file to GCS
# NOTE(review): in this section the cleanup cell runs *before* this upload, unlike the
# other pipelines — confirm `zipped_output_file` does not live inside `output_path`,
# otherwise it has already been removed at this point.
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=zipped_output_file,
    operation="w",
)

### Mpas protected planet intermediate

In [11]:
# Pipe params
pipe = "mpa"  # handed to FileConventionHandler to resolve this pipeline's folders
step = "preprocess"  # stage name used when building temp/output/remote paths
force_clean = True  # forwarded to downloadFile as `overwrite`

In [12]:
# Endpoint and form body POSTed to Protected Planet to request the marine shapefile export.
mpa_url = "https://www.protectedplanet.net/downloads"
mpa_body = dict(domain="general", format="shp", token="marine", id=21961)

In [13]:
# Resolve every path used by this pipe/step through the project's file-convention helper.
working_folder = FileConventionHandler(pipe)
# Folder the raw download is written to (passed to downloadFile below).
input_path = working_folder.pipe_raw_path
# Scratch folder the archives are unpacked into; removed by the cleanup cell.
temp_working_path = working_folder.get_temp_file_path(step)

# Folder that gets archived and later removed, the shapefile written into it,
# the archive that is uploaded, and the blob name it is uploaded under.
output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "shp")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

In [14]:
# download data
# Ask Protected Planet for the export; the JSON reply carries the actual download URL
# and the dataset title. The timeout keeps the cell from hanging on a stalled connection.
r = requests.post(url=mpa_url, data=mpa_body, timeout=60)
r.raise_for_status()

# Decode the payload once instead of re-parsing the body for every field.
download_info = r.json()
print(download_info)

download_url = download_info.get("url")
if not download_url:
    # Fail here, with the payload in the message, rather than inside downloadFile(url=None).
    raise ValueError(f"Protected Planet did not return a download URL: {download_info}")
input_file_name = f'{download_info.get("title")}.zip'

input_file = downloadFile(
    url=download_url,
    output_path=input_path,
    overwrite=force_clean,
    file=input_file_name,
)

{'id': 'marine-shp', 'title': 'WDPA_WDOECM_Aug2024_Public_marine_shp', 'url': 'https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_WDOECM_Aug2024_Public_marine_shp.zip', 'hasFailed': False, 'token': 'marine'}


In [15]:
# unzip file twice due to how data is provisioned by Protected Planet:
# the outer archive contains further zip files, each unpacked into a folder named after it
shutil.unpack_archive(filename=input_file, extract_dir=temp_working_path, format="zip")

for file in temp_working_path.glob("*.zip"):
    shutil.unpack_archive(
        filename=file,
        extract_dir=temp_working_path.joinpath(file.stem),
        format="zip",
    )

In [16]:
# load data & transform it: one processed frame per shapefile found one level below
# the scratch folder
unziped_folders = []
for file in temp_working_path.glob("*/*.shp"):
    df = gpd.read_file(file).pipe(filter_by_methodology).pipe(transform_points).pipe(clean_geometries)
    unziped_folders.append(df)

# merge datasets, reusing the CRS of the first frame
gdf = gpd.GeoDataFrame(
    pd.concat(unziped_folders, ignore_index=True),
    crs=unziped_folders[0].crs,
)

# Drop, in place, every column that is not in the list below.
kept_columns = [
    "geometry",
    "WDPAID",
    "WDPA_PID",
    "PA_DEF",
    "NAME",
    "PARENT_ISO",
    "DESIG_ENG",
    "IUCN_CAT",
    "STATUS",
    "STATUS_YR",
    "GIS_M_AREA",
    "AREA_KM2",
]
gdf.drop(columns=gdf.columns.difference(kept_columns), inplace=True)
gdf["WDPAID"] = pd.to_numeric(gdf["WDPAID"], downcast="integer")

In [17]:
# save data & zip it
# The frame is written as an ESRI Shapefile to `output_file`; `make_archive` then
# presumably packs `output_path` into `zipped_output_file` (see helpers.utils).
gdf.to_file(filename=output_file, driver="ESRI Shapefile", encoding="utf-8")

make_archive(output_path, zipped_output_file)

In [18]:
# LOAD
## load zipped file to GCS
# Sends `zipped_output_file` to the bucket under `remote_path`; operation="w" presumably
# means upload — confirm in helpers.utils.writeReadGCP.
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=zipped_output_file,
    operation="w",
)

In [19]:
# clean unzipped files: remove the scratch and processed folders when they exist
for leftover_dir in (temp_working_path, output_path):
    if leftover_dir.exists():
        rm_tree(leftover_dir)

### Mpas protected planet intermediate terrestrial

In [4]:
# Pipe params
pipe = "mpa-terrestrial"  # handed to FileConventionHandler to resolve this pipeline's folders
step = "preprocess"  # stage name used when building temp/output/remote paths
force_clean = True  # forwarded to downloadFile as `overwrite`

In [5]:
# Endpoint and form body POSTed to Protected Planet to request the full WDPA shapefile export.
mpa_url = "https://www.protectedplanet.net/downloads"
mpa_body = dict(domain="general", format="shp", token="wdpa", id=76011)

In [6]:
# Resolve every path used by this pipe/step through the project's file-convention helper.
working_folder = FileConventionHandler(pipe)
# Folder the raw download is written to (passed to downloadFile below).
input_path = working_folder.pipe_raw_path
# Scratch folder the archives are unpacked into; removed by the cleanup cell.
temp_working_path = working_folder.get_temp_file_path(step)

# Unlike the other pipelines, this one writes a GeoPackage ("gpkg") rather than a shapefile.
# NOTE(review): `zipped_output_file` is not used by the save/upload cells of this section.
output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "gpkg")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

In [7]:
# download data
# Ask Protected Planet for the export; the JSON reply carries the actual download URL
# and the dataset title. The timeout keeps the cell from hanging on a stalled connection.
r = requests.post(url=mpa_url, data=mpa_body, timeout=60)
r.raise_for_status()

# Decode the payload once instead of re-parsing the body for every field.
download_info = r.json()
print(download_info)

download_url = download_info.get("url")
if not download_url:
    # Fail here, with the payload in the message, rather than inside downloadFile(url=None).
    raise ValueError(f"Protected Planet did not return a download URL: {download_info}")
input_file_name = f'{download_info.get("title")}.zip'

input_file = downloadFile(
    url=download_url,
    output_path=input_path,
    overwrite=force_clean,
    file=input_file_name,
)

{'id': 'wdpa-shp', 'title': 'WDPA_Sep2024_Public_shp', 'url': 'https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Sep2024_Public_shp.zip', 'hasFailed': False, 'token': 'wdpa'}


In [9]:
# unzip file twice due to how data is provisioned by Protected Planet:
# the outer archive contains further zip files, each unpacked into a folder named after it
shutil.unpack_archive(filename=input_file, extract_dir=temp_working_path, format="zip")

for file in temp_working_path.glob("*.zip"):
    shutil.unpack_archive(
        filename=file,
        extract_dir=temp_working_path.joinpath(file.stem),
        format="zip",
    )

In [10]:
# load data & transform it: one processed frame per shapefile found one level below
# the scratch folder
unziped_folders = []
for file in temp_working_path.glob("*/*.shp"):
    df = (
        gpd.read_file(file)
        .pipe(filter_by_methodology)
        .pipe(filter_by_terrestrial)
        .pipe(transform_points)
        .pipe(clean_geometries)
    )
    unziped_folders.append(df)

# merge datasets, reusing the CRS of the first frame
gdf = gpd.GeoDataFrame(
    pd.concat(unziped_folders, ignore_index=True),
    crs=unziped_folders[0].crs,
)

# Drop, in place, every column that is not in the list below.
kept_columns = [
    "geometry",
    "WDPAID",
    "WDPA_PID",
    "PA_DEF",
    "NAME",
    "PARENT_ISO",
    "DESIG_ENG",
    "IUCN_CAT",
    "STATUS",
    "STATUS_YR",
    "GIS_AREA",
    "MARINE",
]
gdf.drop(columns=gdf.columns.difference(kept_columns), inplace=True)
gdf["WDPAID"] = pd.to_numeric(gdf["WDPAID"], downcast="integer")

In [None]:
# Run the project's async `simplify_async` helper on the merged frame. Top-level `await`
# relies on the notebook kernel's running event loop.
final_wdpa_terrestrial = await simplify_async(gdf)

In [11]:
# save data
# NOTE(review): only the GeoPackage is written here; no archive is created in this cell.
# NOTE(review): the layer is literally called "name" — confirm that is intentional.
final_wdpa_terrestrial.to_file(
    filename=output_file,
    driver="GPKG",
    layer="name",
    encoding="utf-8",
)

In [12]:
output_file

PosixPath('/home/mambauser/data/mpa-terrestrial/processed/preprocess/mpa-terrestrial_preprocess.gpkg')

In [12]:
# LOAD
## load the GeoPackage to GCS
# NOTE(review): this sends `output_file` (the .gpkg), not `zipped_output_file` — confirm
# that `remote_path` is the blob name expected for an unzipped GeoPackage.
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

: 

In [None]:
# clean unzipped files
if temp_working_path.exists():
    rm_tree(temp_working_path)
if output_path.exists():
    rm_tree(output_path)

### Habitats

In [4]:
# Pipe params
pipe = "habitats"  # handed to FileConventionHandler to resolve this pipeline's folders
step = "preprocess"  # stage name used when building temp/output/remote paths
force_clean = True  # forwarded to downloadFile as `overwrite`

In [5]:
import os

habitats_download_url = "https://habitats.oceanplus.org/downloads/global_statistics.zip"
Mangroves_download_url = "https://mangrove-atlas-api.herokuapp.com/admin/widget_protected_areas.csv"
# The session cookie is a credential, so it is read from the environment instead of being
# committed with the notebook. Export MANGROVE_ATLAS_API_SESSION with the value of the
# `_mangrove_atlas_api_session` cookie before running the mangroves download; when it is
# not set, no Cookie header is sent.
# NOTE(review): the cookie that used to be hardcoded here should be treated as leaked
# and invalidated.
_mangrove_session = os.environ.get("MANGROVE_ATLAS_API_SESSION", "")
mangroves_request_headers = (
    {"Cookie": f"_mangrove_atlas_api_session={_mangrove_session}"} if _mangrove_session else {}
)
seamounts_download_url = "https://datadownload-production.s3.amazonaws.com/ZSL002_ModelledSeamounts2011_v1.zip"

In [6]:
# Resolve every path used by this pipe/step through the project's file-convention helper.
working_folder = FileConventionHandler(pipe)
# Raw-data folder; each habitat source gets its own sub-folder below it.
input_path = working_folder.pipe_raw_path
# Scratch folder the downloaded archives are unpacked into.
temp_working_path = working_folder.get_temp_file_path(step)

# Output locations for this step, resolved the same way as in the other pipelines.
output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "shp")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

#### Seamounts

In [8]:
# Seamounts get their own sub-folder under the raw-data folder (created when missing).
input_seamounts_path = input_path / "seamounts"
input_seamounts_path.mkdir(parents=True, exist_ok=True)

# download data
input_file_name = "seamounts.zip"
input_file = downloadFile(
    url=seamounts_download_url,
    output_path=input_seamounts_path,
    overwrite=force_clean,
    file=input_file_name,
)

In [9]:
# unzip data into the scratch folder
shutil.unpack_archive(filename=input_file, extract_dir=temp_working_path, format="zip")

In [13]:
temp_working_path

PosixPath('/home/mambauser/data/habitats/raw/temp_preprocess')

In [24]:
# Read the first file below the scratch folder whose name ends in "SeamountsBaseArea.shp".
seamounts_base_area_shp = next(temp_working_path.rglob("*SeamountsBaseArea.shp"))
first = gpd.read_file(seamounts_base_area_shp)

In [25]:
first

Unnamed: 0,PEAKID,DEPTH,HEIGHT,LONG,LAT,AREA2D,FILTER,geometry
0,26000,-2547,1148,2.762500,84.979736,982.028337,0,"POLYGON ((2.91249 84.82976, 2.76249 84.79636, ..."
1,26157,-3084,1296,9.143056,84.935292,348.473055,0,"POLYGON ((9.99309 84.93526, 9.25139 84.82696, ..."
2,26158,-3043,1342,9.183333,84.938070,367.540380,0,"POLYGON ((9.07499 85.04636, 9.18329 85.03806, ..."
3,26228,-3142,1379,8.748611,84.907514,299.443636,0,"POLYGON ((9.79859 84.90756, 8.83199 84.82416, ..."
4,26229,-3146,1383,8.887500,84.913070,309.588492,0,"POLYGON ((8.88749 84.83806, 8.81249 84.83806, ..."
...,...,...,...,...,...,...,...,...
33447,4999430,-298,1376,-142.295833,-74.566097,819.608801,0,"POLYGON ((-142.29582 -74.72444, -142.46251 -74..."
33448,4999462,-295,1274,-142.250000,-74.570264,777.598079,1,"POLYGON ((-142.25001 -74.72864, -142.41671 -74..."
33449,4999913,-348,3288,-164.179167,-74.766097,1000.023088,1,"POLYGON ((-164.01251 -74.93274, -164.17921 -74..."
33450,5000862,-2739,1060,-158.162500,-75.141097,814.426234,0,"POLYGON ((-158.16251 -75.28274, -158.30421 -75..."


In [None]:
if not force_clean and zipped_output_file.exists():
    print(f"File {zipped_output_file} already exists")