## Data download and preprocessing

This notebook handles data downloading and preprocessing, preparing it for use in tiles.ipynb, locations.ipynb, and/or precalculations.ipynb.

### Set up

In [None]:
%load_ext autoreload
%autoreload 2

In [5]:
from logging import getLogger
import shutil
from pathlib import Path
import geopandas as gpd
import pandas as pd
import requests
import json
import dotenv  

dotenv.load_dotenv()

# Make the local `src` directory importable so the `helpers` and `pipelines`
# imports below resolve.
scripts_dir = Path(".").joinpath("src")
import sys
# sys.path holds strings: comparing the Path object itself never matches and
# would prepend a duplicate entry every time this cell is re-run.
scripts_dir_str = scripts_dir.resolve().as_posix()
if scripts_dir_str not in sys.path:
    sys.path.insert(0, scripts_dir_str)

from helpers.utils import downloadFile, rm_tree, make_archive, writeReadGCP
from helpers.settings import get_settings
from helpers.file_handler import FileConventionHandler
from pipelines.utils import watch
from pipelines.processors import (
    set_wdpa_id,
    protection_level,
    status,
    create_year,
    calculate_area,
    get_mpas,
    set_location_iso,
    set_fps_classes,
    filter_by_methodology,
    filter_by_terrestrial,
    transform_points,
    clean_geometries,
    simplify_async,
)

In [6]:
# Project settings object; the GCS cells below read GCS_KEYFILE_JSON and GCS_BUCKET from it.
mysettings = get_settings()

### eez_intermediate

In [4]:
# Parameters for the EEZ section
pipe = "eez"          # dataset key handed to FileConventionHandler
step = "preprocess"   # step name used to derive temp/output/remote paths
force_clean = True    # passed as `overwrite` to downloads; also gates removal of old extracts

In [14]:
# Data sources
# NOTE(review): the request headers embed a hard-coded session cookie; consider
# loading it from the environment instead of committing it with the notebook.

## EEZ
EEZ_url = "https://www.marineregions.org/download_file.php"
EEZ_file_name = "eez_v11.shp"
EEZ_params = {"name": "World_EEZ_v11_20191118.zip"}
EEZ_headers = {
    "content-type": "application/x-www-form-urlencoded",
    "cookie": "PHPSESSID=29190501b4503e4b33725cd6bd01e2c6; vliz_webc=vliz_webc2; jwplayer.captionLabel=Off",
    "dnt": "1",
    "origin": "https://www.marineregions.org",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
}
# Form fields posted with the download request.
EEZ_body = dict(
    name="Jason",
    organisation="skytruth",
    email="hello@skytruth.com",
    country="Spain",
    user_category="academia",
    purpose_category="Conservation",
    agree="1",
)

## High seas
# Same endpoint, headers and form as the EEZ request; only the archive name
# differs. Independent copies are kept so either dict can be edited alone.
hs_url = EEZ_url
hs_file_name = "High_seas_v1.shp"
hs_params = {"name": "World_High_Seas_v1_20200826.zip"}
hs_headers = dict(EEZ_headers)
hs_body = dict(EEZ_body)

In [15]:
# Resolve every path used by the EEZ section from the pipe/step pair.
working_folder = FileConventionHandler(pipe)

# inputs
input_path = working_folder.pipe_raw_path
temp_working_path = working_folder.get_temp_file_path(step)

# outputs
output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "shp")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

In [None]:
# Extract data
## download files EEZ & High seas
# Both archives go through the same call, EEZ first and high seas second.
for url, body, params, headers in (
    (EEZ_url, EEZ_body, EEZ_params, EEZ_headers),
    (hs_url, hs_body, hs_params, hs_headers),
):
    downloadFile(url, input_path, body, params, headers, overwrite=force_clean)

In [None]:
## unzip file if needed & load data
# One concatenated frame is appended per archive found in input_path.
unziped_folders = []
for path in input_path.glob("*.zip"):
    unziped_folder = temp_working_path.joinpath(path.stem)
    print(unziped_folder)

    # start from a clean extraction folder when a forced refresh is requested
    if unziped_folder.exists() and force_clean:
        rm_tree(unziped_folder)

    shutil.unpack_archive(path, unziped_folder)

    # read every shapefile in the archive except the "boundaries" layers
    files = []
    for file in unziped_folder.rglob("*.shp"):
        if "boundaries" in file.stem:
            continue
        files.append(gpd.read_file(file))
    unziped_folders.append(pd.concat(files))

In [None]:
# Quick sanity check: size of each frame loaded from the archives.
for idx, gdf in enumerate(unziped_folders):
    n_rows, n_cols = len(gdf), len(gdf.columns)
    print(f"GeoDataFrame {idx} has {n_rows} rows and {n_cols} columns")

In [73]:
# Transform data
## set the same structure for both datasets updating the high seas one
# The archives are discovered with glob(), whose order is not guaranteed, so
# the high-seas frame is located by its schema (lower-case "name", no
# "GEONAME") rather than by position. If no frame matches, index 0 is used,
# as before.
high_seas_idx = next(
    (
        i
        for i, frame in enumerate(unziped_folders)
        if "name" in frame.columns and "GEONAME" not in frame.columns
    ),
    0,
)

# Build a new list instead of overwriting unziped_folders, so re-running this
# cell always starts from the frames as they were loaded.
harmonised_frames = [
    (
        frame.rename(
            columns={"name": "GEONAME", "area_km2": "AREA_KM2", "mrgid": "MRGID"},
        ).assign(
            POL_TYPE="High Seas",
            ISO_SOV1="ABNJ",
        )
        if i == high_seas_idx
        else frame
    )
    for i, frame in enumerate(unziped_folders)
]

# merge datasets
df = pd.concat(harmonised_frames, ignore_index=True)

# keep only the shared attribute columns plus the geometry
keep_columns = {
    "MRGID",
    "GEONAME",
    "POL_TYPE",
    "ISO_SOV1",
    "ISO_SOV2",
    "ISO_SOV3",
    "AREA_KM2",
    "geometry",
}
df = df.drop(columns=[col for col in df.columns if col not in keep_columns])

In [None]:
# save data
# The encoding is set explicitly so attribute text is written as UTF-8
# whatever the I/O engine's default is, as the other shapefile exports in
# this notebook already do.
gpd.GeoDataFrame(
    df,
    crs=unziped_folders[0].crs,
).to_file(filename=output_file.as_posix(), driver="ESRI Shapefile", encoding="utf-8")

# zip data
make_archive(output_path, zipped_output_file)

In [76]:
# clean unzipped files
# NOTE(review): this cell sits before the upload cell; if zipped_output_file
# lives under output_path the upload needs to run first - confirm.
for folder in (temp_working_path, output_path):
    if folder.exists():
        rm_tree(folder)

In [13]:
# LOAD
## load zipped file to GCS
# operation="w" sends the local archive to `remote_path` in the configured bucket.
writeReadGCP(
    operation="w",
    file=zipped_output_file,
    blob_name=remote_path,
    bucket_name=mysettings.GCS_BUCKET,
    credentials=mysettings.GCS_KEYFILE_JSON,
)

### Countries gadm intermediate

In [3]:
# Parameters for the GADM countries section
pipe = "gadm"         # dataset key handed to FileConventionHandler
step = "preprocess"   # step name used to derive temp/output/remote paths
force_clean = True    # passed as `overwrite` to the download; also gates removal of old extracts

In [4]:
# Resolve every path used by the GADM section from the pipe/step pair.
working_folder = FileConventionHandler(pipe)

# inputs
input_path = working_folder.pipe_raw_path
temp_working_path = working_folder.get_temp_file_path(step)

# outputs
output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "shp")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

In [4]:
# GADM 4.1 "levels" archive: the local file name mirrors the last URL segment.
gadm_file_name = "gadm_410-levels.zip"
gadm_url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/{gadm_file_name}"

In [5]:
# Download data
# The returned value is kept as `input_file` for later cells.
input_file = downloadFile(gadm_url, input_path, file=gadm_file_name, overwrite=force_clean)

In [5]:
# Check if there is a zip file in the input_path
# NOTE(review): when no archive is found nothing happens here, and the next
# cell would reuse whatever `unziped_folder` an earlier section left behind.
zip_file = next(input_path.glob("*.zip"), None)
if zip_file is not None:
    unziped_folder = temp_working_path.joinpath(zip_file.stem)
    print(f"Processing: {unziped_folder}")

    # drop a previous extraction when a forced refresh is requested
    if force_clean and unziped_folder.exists():
        shutil.rmtree(unziped_folder)
        print(f"Removed existing folder: {unziped_folder}")

    # Unpack the archive
    shutil.unpack_archive(zip_file, unziped_folder)
    print(f"Unpacked {zip_file} to {unziped_folder}")


Processing: /home/sofia/dev/skytruth-30x30/data/data/gadm/raw/temp_preprocess/gadm_410-levels
Removed existing folder: /home/sofia/dev/skytruth-30x30/data/data/gadm/raw/temp_preprocess/gadm_410-levels
Unpacked /home/sofia/dev/skytruth-30x30/data/data/gadm/raw/gadm_410-levels.zip to /home/sofia/dev/skytruth-30x30/data/data/gadm/raw/temp_preprocess/gadm_410-levels


In [6]:
# Select data adm_0, dissolve and save as shp
# Only the ADM_0 layer of the first GeoPackage found in the extracted archive is loaded.
geopackage_file = next(unziped_folder.rglob("*.gpkg"), None)

if geopackage_file is None:
    # NOTE(review): `gdf` is left untouched on this path, so later cells would
    # run on a frame from an earlier section.
    print("No GeoPackage file found in the unzipped folder.")
else:
    print(f"Found GeoPackage: {geopackage_file}")

    # Specify the layer to read
    layer_name = "ADM_0"
    gdf = gpd.read_file(geopackage_file, layer=layer_name)
    print(f"Selected layer: {layer_name}")

Found GeoPackage: /home/sofia/dev/skytruth-30x30/data/data/gadm/raw/temp_preprocess/gadm_410-levels/gadm_410-levels.gpkg
Selected layer: ADM_0


In [7]:
def update_gid_0_and_country(
    gdf: "gpd.GeoDataFrame",
    mapping_path: "str | Path | None" = None,
) -> "gpd.GeoDataFrame":
    """
    Reassigns dependent territories to their sovereign parent country.

    Each row whose GID_0 is a key of the dependency mapping gets the parent's
    GID_0. COUNTRY is then set to the parent's name for every parent code, and
    to the COUNTRY of the first row carrying that GID_0 for all other codes.

    The mapping file is a JSON object of the form
    ``{dependency_code: [parent_code, parent_name]}``.

    Parameters:
    gdf (gpd.GeoDataFrame): The input GeoDataFrame with 'GID_0' and 'COUNTRY' columns.
        It is not modified.
    mapping_path (str | Path | None): Location of the dependency mapping. Defaults to
        'data_commons/data/dependency_to_parent.json' under `scripts_dir`.

    Returns:
    gpd.GeoDataFrame: A new GeoDataFrame with updated 'GID_0' and 'COUNTRY' values.
    """
    # Load the dependency_to_parent mapping
    if mapping_path is None:
        mapping_path = scripts_dir.joinpath('data_commons/data/dependency_to_parent.json')
    with open(mapping_path, 'r', encoding='utf-8') as json_file:
        dependency_to_parent = json.load(json_file)

    # Build both lookups once instead of once per row.
    parent_code = {dep: parent[0] for dep, parent in dependency_to_parent.items()}
    parent_name = {parent[0]: parent[1] for parent in dependency_to_parent.values()}

    # Map GID_0 to the updated values (codes without a parent keep their own value)
    new_gid_0 = gdf['GID_0'].map(lambda code: parent_code.get(code, code))

    # Fallback name for codes that are not a parent: the COUNTRY of the first
    # row carrying that (updated) GID_0.
    first_country = {}
    for code, country in zip(new_gid_0, gdf['COUNTRY']):
        first_country.setdefault(code, country)

    # Parent names take precedence over the fallback.
    new_country = new_gid_0.map({**first_country, **parent_name})

    # assign() returns a new frame, so re-running the calling cell is safe.
    return gdf.assign(GID_0=new_gid_0, COUNTRY=new_country)


def add_translations(df, translations_csv_path):
    """
    Left-joins Spanish and French names onto `df` by country code.

    Parameters:
    df: Frame with a 'GID_0' column.
    translations_csv_path: CSV with 'code', 'name_es' and 'name_fr' columns.

    Returns:
    `df` with 'code', 'name_es' and 'name_fr' appended; rows without a match
    get missing values in those columns.
    """
    # Default NA parsing is disabled so codes such as "NA" stay strings.
    lookup = pd.read_csv(translations_csv_path, na_values=[], keep_default_na=False)
    wanted = lookup.loc[:, ['code', 'name_es', 'name_fr']]
    return df.merge(wanted, how='left', left_on='GID_0', right_on='code')

In [8]:
# Assign territories to their parent countries, merge all geometries that
# share a COUNTRY into one row, then compute the area of each country.
gdf_updated = (
    update_gid_0_and_country(gdf)
    .dissolve(by='COUNTRY')
    .reset_index()
    .pipe(calculate_area)
)

In [9]:
# Download country translations
working_folder = FileConventionHandler(pipe)
input_path = working_folder.pipe_raw_path

# Despite its name this is the blob path inside the bucket, not a URL.
translations_csv_url = "vizzuality_processed_data/gadm/preprocess/locations_translated.csv"
# The local copy keeps the blob's file name.
translations_csv_output = input_path.joinpath(translations_csv_url.rsplit("/", 1)[-1])

# operation="r" fetches the blob into the local file
writeReadGCP(
    operation="r",
    file=translations_csv_output,
    blob_name=translations_csv_url,
    bucket_name=mysettings.GCS_BUCKET,
    credentials=mysettings.GCS_KEYFILE_JSON,
)

translations_path = input_path.joinpath('locations_translated.csv')

In [10]:
# Add translations for country names
# 'code' is only the join key and duplicates GID_0, so it is dropped after the merge.
gdf_with_names = add_translations(gdf_updated, translations_path)
gdf_translated = gdf_with_names.drop(columns=['code'])
gdf_translated.head(1)

Unnamed: 0,COUNTRY,geometry,GID_0,area_km2,name_es,name_fr
0,Afghanistan,"MULTIPOLYGON (((63.61425 29.46993, 63.60868 29...",AFG,644050.28,Afganistán,Afghanistan


In [11]:
# simplify_async is awaited directly; top-level await relies on the notebook's event loop.
final_gadm = await simplify_async(gdf_translated)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 204/204 [05:40<00:00,  1.67s/it]


In [12]:
# Save the file
# The frame carries translated names with accented characters (name_es /
# name_fr), so the encoding is set explicitly, as the other shapefile exports
# in this notebook already do.
final_gadm.to_file(output_file.as_posix(), driver="ESRI Shapefile", encoding="utf-8")

In [13]:
# zip data
# Archives the processed step folder (output_path) into zipped_output_file.
make_archive(output_path, zipped_output_file)

In [14]:
# load zipped file to GCS
# operation="w" sends the local archive to `remote_path` in the configured bucket.
writeReadGCP(
    operation="w",
    file=zipped_output_file,
    blob_name=remote_path,
    bucket_name=mysettings.GCS_BUCKET,
    credentials=mysettings.GCS_KEYFILE_JSON,
)

### MPAtlas intermediate

In [12]:
# Parameters for the MPAtlas section
pipe = "mpaatlas"     # dataset key handed to FileConventionHandler
step = "preprocess"   # step name used to derive temp/output/remote paths
force_clean = True    # passed as `overwrite` to the download

In [13]:
# Data source
# GeoJSON endpoint and the file name the response is stored under locally.
mpaatlas_file_name = "mpatlas_assess_zone.geojson"
mpaatlas_url = "https://guide.mpatlas.org/api/v1/zone/geojson"

In [14]:
# Resolve every path used by the MPAtlas section from the pipe/step pair.
working_folder = FileConventionHandler(pipe)

# inputs
input_path = working_folder.pipe_raw_path
temp_working_path = working_folder.get_temp_file_path(step)

# outputs
output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "shp")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

In [15]:
# Download data
# The returned value is read with geopandas in the transform cell.
input_file = downloadFile(mpaatlas_url, input_path, file=mpaatlas_file_name, overwrite=force_clean)

In [16]:
# NOTE(review): this check only reports an existing output; the transform
# below still runs and the archive is rebuilt either way.
if not force_clean and zipped_output_file.exists():
    print(f"File {zipped_output_file} already exists")

# Transform data
gdf = gpd.read_file(input_file)

# attribute columns kept in the intermediate file
keep_columns = {
    "wdpa_id",
    "mpa_zone_id",
    "name",
    "designation",
    "sovereign",
    "establishment_stage",
    "protection_mpaguide_level",
    "protection_level",
    "year",
    "geometry",
}

df = (gdf
      .pipe(set_wdpa_id)
      .pipe(protection_level)
      .pipe(status)
      .pipe(create_year))

# drop() and rename() return new frames, so `gdf` is never changed through an
# alias of `df`. The previous "wdpa_pid" rename is omitted: that column is not
# in keep_columns, so it was always dropped before the rename could apply.
df = (
    df
    .drop(columns=[col for col in df.columns if col not in keep_columns])
    .rename(columns={"sovereign": "location_id"})
)

In [17]:
# save data
# Re-wrap with the CRS of the frame read from the GeoJSON, write it as a
# UTF-8 shapefile, then archive the step folder.
mpaatlas_gdf = gpd.GeoDataFrame(df, crs=gdf.crs)
mpaatlas_gdf.to_file(
    filename=output_file.as_posix(),
    driver="ESRI Shapefile",
    encoding="utf-8",
)

make_archive(output_path, zipped_output_file)

  ).to_file(filename=output_file.as_posix(), driver="ESRI Shapefile", encoding="utf-8")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [10]:
# LOAD
## load zipped file to GCS
# operation="w" sends the local archive to `remote_path` in the configured bucket.
writeReadGCP(
    operation="w",
    file=zipped_output_file,
    blob_name=remote_path,
    bucket_name=mysettings.GCS_BUCKET,
    credentials=mysettings.GCS_KEYFILE_JSON,
)

In [13]:
# clean unzipped files
# Remove the temp and processed step folders if they exist.
for folder in (temp_working_path, output_path):
    if folder.exists():
        rm_tree(folder)

### MPAs Protected Planet intermediate

In [3]:
# Parameters for the Protected Planet marine (MPA) section
pipe = "mpa"          # dataset key handed to FileConventionHandler
step = "preprocess"   # step name used to derive temp/output/remote paths
force_clean = True    # overwrite flag for downloads

In [6]:
# Protected Planet download endpoint and the form posted to request the marine shapefile export.
mpa_url = "https://www.protectedplanet.net/downloads"
mpa_body = dict(domain="general", format="shp", token="marine", id=21961)

In [6]:
# Resolve every path used by the MPA section from the pipe/step pair.
working_folder = FileConventionHandler(pipe)
input_path = working_folder.pipe_raw_path
temp_working_path = working_folder.get_temp_file_path(step)

output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "shp")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

# download data
# With this step commented out, `input_file` in the unzip cell was whatever an
# earlier section left in the kernel (the MPAtlas GeoJSON on a fresh run). The
# request is restored so the section works after Restart & Run All.
r = requests.post(url=mpa_url, data=mpa_body)
r.raise_for_status()

# the response JSON carries the archive URL and the title used as file name
download_info = r.json()
download_url = download_info.get("url")
input_file_name = f'{download_info.get("title")}.zip'
print(download_info)

input_file = downloadFile(
    url=download_url,
    output_path=input_path,
    overwrite=force_clean,
    file=input_file_name,
)

In [9]:
# unzip file twice due how data is provisioned by protected planet
# first pass: the downloaded archive
shutil.unpack_archive(input_file, temp_working_path, "zip")

# second pass: each nested archive goes into a folder named after it
for file in temp_working_path.glob("*.zip"):
    nested_target = temp_working_path.joinpath(file.stem)
    shutil.unpack_archive(file, nested_target, "zip")

In [10]:
# load data & Transform it
# attribute columns kept in the intermediate file
keep_columns = {
    "geometry",
    "WDPAID",
    "WDPA_PID",
    "PA_DEF",
    "NAME",
    "PARENT_ISO",
    "DESIG_ENG",
    "IUCN_CAT",
    "STATUS",
    "STATUS_YR",
    "GIS_M_AREA",
    "AREA_KM2",
}

# every shapefile one level below the temp folder goes through the same chain
unziped_folders = []
for file in temp_working_path.glob("*/*.shp"):
    df = gpd.read_file(file)
    df = df.pipe(filter_by_methodology).pipe(transform_points).pipe(clean_geometries)
    unziped_folders.append(df)

# merge datasets
merged = pd.concat(unziped_folders, ignore_index=True)
gdf = gpd.GeoDataFrame(merged, crs=unziped_folders[0].crs)

gdf = gdf.drop(columns=[col for col in gdf.columns if col not in keep_columns])
gdf = gdf.assign(WDPAID=pd.to_numeric(gdf["WDPAID"], downcast="integer"))

In [11]:
# save data & zip it
# Write the merged frame as a UTF-8 shapefile, then archive the step folder.
gdf.to_file(
    driver="ESRI Shapefile",
    encoding="utf-8",
    filename=output_file,
)

make_archive(output_path, zipped_output_file)

In [11]:
# LOAD
## load zipped file to GCS
# operation="w" sends the local archive to `remote_path` in the configured bucket.
writeReadGCP(
    operation="w",
    file=zipped_output_file,
    blob_name=remote_path,
    bucket_name=mysettings.GCS_BUCKET,
    credentials=mysettings.GCS_KEYFILE_JSON,
)

In [12]:
# clean unzipped files
# Remove the temp and processed step folders if they exist.
for folder in (temp_working_path, output_path):
    if folder.exists():
        rm_tree(folder)

### PAs Protected Planet intermediate (terrestrial)

In [7]:
# Parameters for the Protected Planet terrestrial section
pipe = "mpa-terrestrial"  # dataset key handed to FileConventionHandler
step = "preprocess"       # step name used to derive temp/output/remote paths
force_clean = True        # overwrite flag for downloads

In [14]:
# Protected Planet download endpoint and the form posted to request the full WDPA shapefile export.
mpa_url = "https://www.protectedplanet.net/downloads"
mpa_body = dict(domain="general", format="shp", token="wdpa", id=76011)

In [15]:
working_folder = FileConventionHandler(pipe)
# input_path must be set for this pipe: the download cell passes it as the
# output folder, and with the assignment commented out it still pointed at
# the previous section's raw folder.
input_path = working_folder.pipe_raw_path
# Default archive location; replaced by the download cell's return value when that cell runs.
input_file = input_path.joinpath("WDPA_Sep2024_Public_shp.zip")
temp_working_path = working_folder.get_temp_file_path(step)

output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "gpkg")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

In [16]:
# download data
# Ask Protected Planet for the export; the response JSON carries the archive
# URL and the title used as the local file name.
r = requests.post(url=mpa_url, data=mpa_body)
r.raise_for_status()

download_info = r.json()
download_url = download_info.get("url")
input_file_name = f'{download_info.get("title")}.zip'
print(download_info)

input_file = downloadFile(
    file=input_file_name,
    overwrite=force_clean,
    output_path=input_path,
    url=download_url,
)

{'id': 'wdpa-shp', 'title': 'WDPA_Oct2024_Public_shp', 'url': 'https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Oct2024_Public_shp.zip', 'hasFailed': False, 'token': 'wdpa'}


In [16]:
# unzip file twice due how data is provisioned by protected planet
# first pass: the downloaded archive
shutil.unpack_archive(input_file, temp_working_path, "zip")

# second pass: each nested archive goes into a folder named after it
for file in temp_working_path.glob("*.zip"):
    nested_target = temp_working_path.joinpath(file.stem)
    shutil.unpack_archive(file, nested_target, "zip")

In [17]:
# load data & Transform it
# attribute columns kept in the intermediate file
keep_columns = {
    "geometry",
    "WDPAID",
    "WDPA_PID",
    "PA_DEF",
    "NAME",
    "PARENT_ISO",
    "DESIG_ENG",
    "IUCN_CAT",
    "STATUS",
    "STATUS_YR",
    "GIS_AREA",
    "MARINE",
}

# every shapefile one level below the temp folder goes through the same chain
unziped_folders = []
for file in temp_working_path.glob("*/*.shp"):
    df = gpd.read_file(file)
    df = (
        df
        .pipe(filter_by_methodology)
        .pipe(filter_by_terrestrial)
        .pipe(transform_points)
        .pipe(clean_geometries)
    )
    unziped_folders.append(df)

# merge datasets
merged = pd.concat(unziped_folders, ignore_index=True)
gdf = gpd.GeoDataFrame(merged, crs=unziped_folders[0].crs)

gdf = gdf.drop(columns=[col for col in gdf.columns if col not in keep_columns])
gdf = gdf.assign(WDPAID=pd.to_numeric(gdf["WDPAID"], downcast="integer"))

In [18]:
# simplify_async is awaited directly; top-level await relies on the notebook's event loop.
final_wdpa_terrestrial = await simplify_async(gdf)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 292261/292261 [03:34<00:00, 1362.25it/s]


In [19]:
# save data as a GeoPackage (this section has no zip step)
# NOTE(review): the layer is literally called "name" - confirm that downstream readers expect it.
final_wdpa_terrestrial.to_file(
    driver="GPKG",
    layer="name",
    encoding="utf-8",
    filename=output_file,
)

In [None]:
# LOAD
## load GeoPackage to GCS
# The file sent here is the GeoPackage itself (output_file), not a zip archive.
writeReadGCP(
    operation="w",
    file=output_file,
    blob_name=remote_path,
    bucket_name=mysettings.GCS_BUCKET,
    credentials=mysettings.GCS_KEYFILE_JSON,
)

In [13]:
# # clean unzipped files
# rm_tree(temp_working_path) if temp_working_path.exists() else None
# rm_tree(output_path) if output_path.exists() else None

### Protected planet intermediate all

In [43]:
# Parameters for the Protected Planet "all protected areas" section
pipe = "pa"           # dataset key handed to FileConventionHandler
step = "preprocess"   # step name used to derive temp/output/remote paths
force_clean = True    # overwrite flag for downloads

In [44]:
# Protected Planet download endpoint and the form posted to request the full WDPA shapefile export.
mpa_url = "https://www.protectedplanet.net/downloads"
mpa_body = dict(domain="general", format="shp", token="wdpa", id=76011)

In [45]:
# Resolve every path used by the PA section from the pipe/step pair.
working_folder = FileConventionHandler(pipe)

# inputs
input_path = working_folder.pipe_raw_path
temp_working_path = working_folder.get_temp_file_path(step)

# outputs (written as a GeoPackage in this section)
output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "gpkg")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

In [46]:
# download data
# Ask Protected Planet for the export; the response JSON carries the archive
# URL and the title used as the local file name.
r = requests.post(url=mpa_url, data=mpa_body)
r.raise_for_status()

download_info = r.json()
download_url = download_info.get("url")
input_file_name = f'{download_info.get("title")}.zip'
print(download_info)

# With the download commented out, `input_file` in the unzip cell was whatever
# the terrestrial section left in the kernel. It is restored so this section
# defines its own input and works after Restart & Run All.
input_file = downloadFile(
    url=download_url,
    output_path=input_path,
    overwrite=force_clean,
    file=input_file_name,
)

{'id': 'wdpa-shp', 'title': 'WDPA_Sep2024_Public_shp', 'url': 'https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Sep2024_Public_shp.zip', 'hasFailed': False, 'token': 'wdpa'}


In [47]:
# unzip file twice due how data is provisioned by protected planet
# first pass: the downloaded archive
shutil.unpack_archive(input_file, temp_working_path, "zip")

# second pass: each nested archive goes into a folder named after it
for file in temp_working_path.glob("*.zip"):
    nested_target = temp_working_path.joinpath(file.stem)
    shutil.unpack_archive(file, nested_target, "zip")

In [68]:
# load data & Transform it
# attribute columns kept in the intermediate file
keep_columns = {
    "geometry",
    "WDPAID",
    "WDPA_PID",
    "PA_DEF",
    "NAME",
    "PARENT_ISO",
    "DESIG_ENG",
    "IUCN_CAT",
    "STATUS",
    "STATUS_YR",
    "GIS_AREA",
    "GIS_M_AREA",
    "MARINE",
}

# every shapefile one level below the temp folder goes through the same chain
unziped_folders = []
for file in temp_working_path.glob("*/*.shp"):
    df = gpd.read_file(file)
    df = df.pipe(filter_by_methodology).pipe(transform_points).pipe(clean_geometries)
    unziped_folders.append(df)

# merge datasets
merged = pd.concat(unziped_folders, ignore_index=True)
gdf = gpd.GeoDataFrame(merged, crs=unziped_folders[0].crs)

gdf = gdf.drop(columns=[col for col in gdf.columns if col not in keep_columns])
gdf = gdf.assign(WDPAID=pd.to_numeric(gdf["WDPAID"], downcast="integer"))

In [70]:
# simplify_async is awaited directly; top-level await relies on the notebook's event loop.
final_wdpa = await simplify_async(gdf)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 298912/298912 [03:53<00:00, 1277.78it/s]


<class 'shapely.geometry.base.GeometrySequence'>


  1%|▉                                                                                                                                                            | 1817/298912 [00:11<03:42, 1338.09it/s]

<class 'shapely.geometry.base.GeometrySequence'>


  1%|█▉                                                                                                                                                           | 3731/298912 [00:12<03:25, 1433.85it/s]

'Polygon' object has no attribute 'geoms'


  1%|██▏                                                                                                                                                          | 4223/298912 [00:12<01:23, 3536.39it/s]

<class 'shapely.geometry.base.GeometrySequence'>


  4%|██████                                                                                                                                                      | 11698/298912 [00:15<04:00, 1191.93it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 21%|████████████████████████████████                                                                                                                            | 61318/298912 [00:27<03:03, 1298.19it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 22%|██████████████████████████████████▉                                                                                                                         | 66972/298912 [00:29<02:28, 1566.84it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 32%|██████████████████████████████████████████████████▌                                                                                                         | 96777/298912 [00:35<01:04, 3139.64it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 34%|█████████████████████████████████████████████████████▏                                                                                                     | 102462/298912 [00:37<00:46, 4270.30it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 36%|███████████████████████████████████████████████████████▍                                                                                                   | 106818/298912 [00:38<01:33, 2059.87it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 41%|██████████████████████████████████████████████████████████████▉                                                                                            | 121477/298912 [00:41<01:20, 2212.60it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 43%|███████████████████████████████████████████████████████████████████                                                                                        | 129353/298912 [00:44<01:10, 2404.69it/s]

'Polygon' object has no attribute 'geoms'


 46%|██████████████████████████████████████████████████████████████████████▊                                                                                    | 136616/298912 [00:46<01:06, 2457.71it/s]

'Polygon' object has no attribute 'geoms'


 50%|████████████████████████████████████████████████████████████████████████████▊                                                                              | 148130/298912 [00:50<01:02, 2399.55it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 51%|██████████████████████████████████████████████████████████████████████████████▍                                                                            | 151376/298912 [00:51<01:09, 2121.47it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 54%|███████████████████████████████████████████████████████████████████████████████████                                                                        | 160280/298912 [00:53<01:55, 1197.48it/s]

'Polygon' object has no attribute 'geoms'


 55%|█████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 164997/298912 [00:54<01:16, 1760.31it/s]

'Polygon' object has no attribute 'geoms'


 56%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 166577/298912 [00:55<01:03, 2072.04it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 58%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                 | 172769/298912 [00:56<01:01, 2037.28it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 58%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 174238/298912 [00:57<00:30, 4024.73it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                  | 201035/298912 [01:03<00:17, 5566.19it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                | 205073/298912 [01:04<00:17, 5454.62it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 210501/298912 [01:05<00:27, 3184.24it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 225210/298912 [01:08<00:17, 4259.37it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 225947/298912 [01:09<00:48, 1498.54it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                    | 229699/298912 [01:10<00:23, 2896.14it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                        | 252333/298912 [01:15<00:15, 3001.02it/s]

'Polygon' object has no attribute 'geoms'


 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 277458/298912 [01:21<00:07, 2831.56it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 281117/298912 [01:22<00:12, 1449.83it/s]

<class 'shapely.geometry.base.GeometrySequence'>
<class 'shapely.geometry.base.GeometrySequence'>
<class 'shapely.geometry.base.GeometrySequence'>


 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 284183/298912 [01:23<00:04, 3294.97it/s]

<class 'shapely.geometry.base.GeometrySequence'>


 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 296106/298912 [01:32<00:00, 3532.30it/s]

<class 'shapely.geometry.base.GeometrySequence'>


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 298912/298912 [03:53<00:00,  2.59it/s]

In [71]:
# save data as a GeoPackage (this section has no zip step)
# NOTE(review): the layer is literally called "name" - confirm that downstream readers expect it.
final_wdpa.to_file(
    driver="GPKG",
    layer="name",
    encoding="utf-8",
    filename=output_file,
)

In [75]:
# LOAD
## load GeoPackage to GCS
# The file sent here is the GeoPackage itself (output_file), not a zip archive.
writeReadGCP(
    operation="w",
    file=output_file,
    blob_name=remote_path,
    bucket_name=mysettings.GCS_BUCKET,
    credentials=mysettings.GCS_KEYFILE_JSON,
)

In [None]:
# clean unzipped files
# Remove the temp and processed step folders if they exist.
for folder in (temp_working_path, output_path):
    if folder.exists():
        rm_tree(folder)

### Habitats

In [4]:
# Parameters for the habitats section
pipe = "habitats"     # dataset key handed to FileConventionHandler
step = "preprocess"   # step name used to derive temp/output/remote paths
force_clean = True    # overwrite flag for downloads

In [5]:
# Download locations for the habitat inputs.
habitats_download_url = "https://habitats.oceanplus.org/downloads/global_statistics.zip"
Mangroves_download_url = "https://mangrove-atlas-api.herokuapp.com/admin/widget_protected_areas.csv"
# NOTE(review): the request headers defined next hard-code a session cookie for
# the mangrove API; consider loading it from the environment instead of
# committing it with the notebook.
mangroves_request_headers = {
    "Cookie": "_mangrove_atlas_api_session=fJuobvI2fH42WfGfMtRTp%2BksIDdPEpY6DG8uCuITsENtrRGG4AA3nYEeAI7dytzpK%2F0dGIHq84O54MRr6eiPgiwCYXp2XP4IzXM40dFt%2FI6hoB0WXC%2Fwrd81XreNnMZiSEE6IVT5R0fqMcmsZdPn53u0A1d4CGU3FfliOZuWkckBuA%2F7C4upBGuSS8817LqOh1slG%2BsEOGp3nk7WX4fMoPbsHWtARfFwdfoAHz448LO7uWuZdyiu7YOrS0ZxOZEb9JZ8hcUJph4pBFofZLpOvtQQutgZY21T5bhQ7Kwfl56e6Qr0SZ%2B8sIzMfky3h%2FjOA6DNTLoy%2BZLiZBAgFHlTYm2JwlwqWgAZU8D7cE7Zn%2Fxgf3LFF9pZ9Fe3QG4c8LIwH%2FxqjEd8GsZAhBMgBWbxubigQ9gZssZt6CIO--7qiVsTAT8JAKj1jU--U7TI%2Fz9c151bfD8iZdkBDw%3D%3D"
}
seamounts_download_url = "https://datadownload-production.s3.amazonaws.com/ZSL002_ModelledSeamounts2011_v1.zip"

In [6]:
# Resolve every path used by the habitats section from the pipe/step pair.
working_folder = FileConventionHandler(pipe)

# inputs
input_path = working_folder.pipe_raw_path
temp_working_path = working_folder.get_temp_file_path(step)

# outputs
output_path = working_folder.get_processed_step_path(step)
output_file = working_folder.get_step_fmt_file_path(step, "shp")
zipped_output_file = working_folder.get_step_fmt_file_path(step, "zip", True)
remote_path = working_folder.get_remote_path(step)

#### Seamounts

In [8]:
# Seamounts get their own sub-folder under the raw habitats path.
input_seamounts_path = input_path.joinpath("seamounts")
input_seamounts_path.mkdir(exist_ok=True, parents=True)

# download data
input_file_name = "seamounts.zip"
input_file = downloadFile(
    file=input_file_name,
    overwrite=force_clean,
    output_path=input_seamounts_path,
    url=seamounts_download_url,
)

In [9]:
# unzip data
# Extract the seamounts archive straight into the temp working folder.
shutil.unpack_archive(input_file, temp_working_path, "zip")

In [None]:
temp_working_path

In [24]:
# Load the first shapefile whose name ends in "SeamountsBaseArea.shp".
seamounts_base_area_shp = next(temp_working_path.rglob("*SeamountsBaseArea.shp"))
first = gpd.read_file(seamounts_base_area_shp)

In [None]:
first

In [None]:
if not force_clean and zipped_output_file.exists():
    print(f"File {zipped_output_file} already exists")