## Pre-calculations for 30x30 widgets and tables data 
This notebook performs the calculations needed for the data displayed in the widgets and tables of the 30x30 platform. It should be run after executing the intermediate.ipynb and locations.ipynb notebooks.

### Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path
import time
import pandas as pd
import geopandas as gpd
import numpy as np
import json
import dotenv
import sys
from pathlib import Path
import pandas as pd
import geopandas as gpd
import numpy as np
import asyncio
from tqdm.asyncio import tqdm
from itertools import product
from shapely.geometry import box

dotenv.load_dotenv()

# Make the local ``src`` directory importable so the project imports below resolve.
# sys.path holds plain strings, so the membership test has to use the resolved
# string form: testing the Path object itself never matches, which prepended a
# duplicate entry every time this cell was re-run.
scripts_dir = Path(".").joinpath("src")
scripts_path = scripts_dir.resolve().as_posix()
if scripts_path not in sys.path:
    sys.path.insert(0, scripts_path)

from helpers.strapi import Strapi
from helpers.settings import get_settings, Settings
from helpers.file_handler import FileConventionHandler
from helpers.utils import download_and_unzip_if_needed, writeReadGCP, make_archive

from pipelines.output_schemas import (
    FPLSchema,
    ProtectionLevelSchema,
    PAsSchema,
    HabitatsSchema,
    LocationSchema,
    ProtectedAreaExtentSchema,
    PAsSchemaChunk1,
    PAsSchemaChunk2,
)
from pipelines.processors import (
    add_envelope,
    add_location_iso,
    expand_multiple_locations,
    add_region_iso,
    calculate_eez_area,
    add_bbox,
    add_groups_and_members,
    add_location_name,
    output,
    clean_geometries,
    filter_by_exluding_propossed_mpas,
    spatial_join,
    process_mpa_data,
    assign_iso3,
    calculate_global_area,
    separate_parent_iso,
    calculate_stats_cov,
    coverage_stats,
    mpaatlas_filter_stablishment,
    process_mpaatlas_data,
    calculate_stats,
    fix_monaco,
    batch_export,
    calculate_area,
    define_is_child,
    set_child_id,
    add_child_parent_relationship,
    columns_to_lower,
    extract_wdpaid_mpaatlas,
    simplify_async,
    get_matches,
    repair_geometry, 
    arrange_dimensions,
    add_total_area, 
    change_ata_to_abnj,
    calculate_padef_percentages,
    calculate_coverage_percentage,
    calculate_coverage_percentage_mpatlas,
    calculate_global_contribution,
    add_is_last_year,
    add_environment,
    cumulative_pa_def_counts, 
    process_final_coverage,
    process_grid
    
)


In [3]:
# Project settings; the GCS bucket and key file used by the upload cells are read from here.
mysettings = get_settings()
# Step names handed to FileConventionHandler: inputs are read from the previous
# step ("preprocess") and most outputs of this notebook are written under "stats".
prev_step = "preprocess"
current_step = "stats"

In [3]:
# # Strapi setup
# strapi = Strapi(url=mysettings.STRAPI_URL)
# strapi.login(jwt=mysettings.STRAPI_JWT)

### Coverage stats - Marine Protected Areas

We are going to use the intermediate data from eez, in order to create a dataset that can be used as a land mask.
The steps are:
1. Load the EEZ data
2. Spatially inner-join the EEZ dataset with the MPAs dataset
3. Assign the location ISO code
4. Dissolve by location ISO code and cumulative year
5. Calculate the area for global regions and EEZ countries
6. Prepare the data to be ingested in Strapi
7. Upload the data to Strapi

In [None]:
# Pipeline identifier for the marine protected areas data
pipe = "mpa"
# Left empty: the Strapi upload cells of this section are commented out
strapi_collection = ""

# Path handlers for the EEZ and MPA pipelines
pipe_dir_eez = FileConventionHandler("eez")
pipe_dir_mpas = FileConventionHandler(pipe)
# Destination of the marine coverage table produced in this section
output_file = pipe_dir_mpas.get_processed_step_path(current_step).joinpath(
    "mpa_coverage.csv"
)

# Download the EEZ file && unzip it
download_and_unzip_if_needed(pipe_dir_eez, prev_step, mysettings)
# Download the mpas file && unzip it
download_and_unzip_if_needed(pipe_dir_mpas, prev_step, mysettings)

/home/sofia/dev/skytruth-30x30/data/data/eez/processed/eez_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/eez/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/mpa_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/preprocess


PosixPath('/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/preprocess')

In [None]:
# Load the data
eez = gpd.read_file(pipe_dir_eez.get_step_fmt_file_path(prev_step, "shp")).pipe(clean_geometries)
mpas = gpd.read_file(pipe_dir_mpas.get_step_fmt_file_path(prev_step, "shp")).pipe(clean_geometries)

# Join the eez data with the wdpa data
eez_mpas_data_join = await spatial_join(eez, mpas.pipe(filter_by_exluding_propossed_mpas), environment="marine")

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [08:27<00:00,  1.80s/it]


In [None]:
# Prepare the mpa data
final_data = await process_mpa_data(
    eez_mpas_data_join.pipe(add_location_iso).pipe(assign_iso3),
    range(2011, time.localtime().tm_year + 1),
    ["PA_DEF", "iso_3"],
    {"protectedAreasCount": "sum"},
)

# Save the results
final_data.to_csv(pipe_dir_mpas.get_processed_step_path(prev_step).joinpath("mpa_preprocessed.csv"), index=False)

In [None]:
# Load the results
final_data = pd.read_csv(pipe_dir_mpas.get_processed_step_path(prev_step).joinpath("mpa_preprocessed.csv"))
final_data

Unnamed: 0,PA_DEF,iso_3,protectedAreasCount,year,area
0,0,COL,2,2010,3295.358429
1,0,ESP;MAR,1,2010,0.641148
2,0,MAR,9,2010,205.459059
3,0,PHL,24,2010,31956.310702
4,1,ABNJ,29,2010,996236.123210
...,...,...,...,...,...
145,1,VNM,31,2010,3357.704625
146,1,VUT,5,2010,18.148840
147,1,WSM,2,2010,99.018821
148,1,YEM,5,2010,1410.245095


In [None]:
# Create coverage stats table
final_data2 = final_data.copy()

coverage = (
    final_data2.pipe(calculate_global_area, ["year", "PA_DEF"], "marine", {"area": "sum"}, "iso_3")
    .pipe(separate_parent_iso, "iso_3")
    .pipe(add_region_iso, "iso_3")
    .replace(
        {
            "iso_3": {
                "ATA": "ABNJ",
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
                "GIB": "GBR",
            }
        }
    )
    .pipe(calculate_stats_cov, ["year", "PA_DEF"], "iso_3", environment='marine').astype({"PA_DEF": int})
    .pipe(calculate_padef_percentages, 'marine')
    .pipe(add_total_area, 'marine')
    .pipe(coverage_stats)
    .pipe(calculate_coverage_percentage)
    .pipe(calculate_global_contribution)
    .pipe(add_is_last_year)
    .pipe(add_environment)
)

# Create the output and save it
ProtectedAreaExtentSchema(
    coverage.pipe(
        output,
        "iso_3",
        {},
        {},
        ["area", "iso_3", 'total_marine_area'],
    )
).to_csv(
    output_file,
    index=True,
)


coverage.head(2)

Unnamed: 0,year,iso_3,area,protected_areas_count,oecms,pas,total_marine_area,protected_area,coverage,global_contribution,is_last_year,environment
0,2010,ABNJ,996236.12321,29.0,0.0,100.0,212881389,996236.12,0.467977,0.275966,1,1
1,2010,AF,129790.939457,427.0,2.34192,97.65808,14878058,129790.94,0.872365,0.035953,1,1


In [None]:
# Upload the results to GCS
remote_path = 'vizzuality_processed_data/strapi_tables/mpa_coverage.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [None]:
# strapi_collection = "protection-coverage-stat"

In [None]:
# strapi.deleteCollectionData(strapi_collection, list(range(1, 2300)))

<helpers.strapi.Strapi at 0x7fda8ddb8860>

In [None]:
# strapi.importCollectionData(
#     strapi_collection,
#     output_file,
# )

### Coverage stats - Terrestrial Protected Areas

In [14]:
# Pipeline identifiers for the terrestrial protected areas data
pipe = "mpa-terrestrial"
step = "preprocess"
strapi_collection_mpas = "mpa-terrestrial"

# Path handlers for the terrestrial protected areas and GADM pipelines
pipe_dir = FileConventionHandler(pipe)
pipe_dir_gadm = FileConventionHandler("gadm")

# Second handler on the same pipeline, used for the raw and temporary paths
working_folder = FileConventionHandler(pipe)
input_path = working_folder.pipe_raw_path
temp_working_path = working_folder.get_temp_file_path(step)
# Outputs of this section: spatial join, dissolved areas and the final coverage table
output_file_sjoin = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_sjoin.shp")
output_file_dissolve = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_dissolve.csv")
output_file_tpas = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_coverage.csv")

# Download the terrestrial protected areas file && unzip it
download_and_unzip_if_needed(pipe_dir, prev_step, mysettings)
# Download the GADM file && unzip it
download_and_unzip_if_needed(pipe_dir_gadm, prev_step, mysettings)

/home/sofia/dev/skytruth-30x30/data/data/mpa-terrestrial/processed/mpa-terrestrial_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpa-terrestrial/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/gadm_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/preprocess


PosixPath('/home/sofia/dev/skytruth-30x30/data/data/gadm/processed/preprocess')

In [15]:
# Load the data
# Read the terrestrial protected areas (GeoPackage) and the GADM boundaries (shapefile),
# running each through clean_geometries
wdpa_gpkg = pipe_dir.get_step_fmt_file_path(prev_step, "gpkg")
wdpa = clean_geometries(gpd.read_file(wdpa_gpkg))
gadm_shp = pipe_dir_gadm.get_step_fmt_file_path(prev_step, "shp")
gadm = clean_geometries(gpd.read_file(gadm_shp))

# Touch .sindex on both layers up front (geopandas creates the spatial index on first access)
gadm.sindex
wdpa.sindex

<geopandas.sindex.SpatialIndex at 0x7f32a506ae70>

In [22]:
# Spatial join using overlay
wdpa_subset = wdpa[
    ~(
        (wdpa.bounds.minx < -181)
        | (wdpa.bounds.miny < -91)
        | (wdpa.bounds.maxx > 181)
        | (wdpa.bounds.maxy > 91)
    )
].reset_index(drop=True)

sjoin_gdf = await spatial_join(wdpa_subset, gadm, environment="terrestrial")
sjoin_gdf.rename(columns={"GID_0": "iso_3"}, inplace=True)

INFO:notebook:Processing 286305 elements
INFO:notebook:Grid created with 433 cells
INFO:notebook:Grid split into 392 chunks
100%|███████████████████████████████████████████████████████████████████████████████████████████| 392/392 [12:49<00:00,  1.96s/it]


In [23]:
# Test existence of duplicates
sjoin_gdf.loc[sjoin_gdf.duplicated(subset=["WDPA_PID", "iso_3"], keep=False)].sort_values(
    "WDPA_PID"
)

Unnamed: 0,WDPAID,WDPA_PID,PA_DEF,NAME,DESIG_ENG,IUCN_CAT,MARINE,GIS_AREA,STATUS,STATUS_YR,PARENT_ISO,COUNTRY,iso_3,area_km2,name_es,name_fr,geometry


In [24]:
# Exclude "proposed" protected areas
sjoin_gdf = filter_by_exluding_propossed_mpas(sjoin_gdf)

# Save the results of the spatial join
sjoin_gdf.to_file(output_file_sjoin, driver="ESRI Shapefile")

INFO:pyogrio._io:Created 289,352 records


In [25]:
# Load the data
sjoin_gdf = gpd.read_file(output_file_sjoin)
sjoin_gdf["STATUS_YR"] = sjoin_gdf["STATUS_YR"].astype("Int64")

In [26]:
# Calculate wdpa cumulative counts and pa and oecm percentages
cumulative_counts = cumulative_pa_def_counts(sjoin_gdf)
cumulative_counts

PA_DEF,iso_3,year,1,0,protected_areas_count
0,AFG,2010,10,0.0,10.0
1,AFG,2011,10,0.0,10.0
2,AFG,2012,10,0.0,10.0
3,AFG,2013,10,0.0,10.0
4,AFG,2014,10,0.0,10.0
...,...,...,...,...,...
2884,ZWE,2020,229,0.0,229.0
2885,ZWE,2021,229,0.0,229.0
2886,ZWE,2022,229,0.0,229.0
2887,ZWE,2023,229,0.0,229.0


In [None]:
# Dissolve geometries to calculate the coverage
data = await process_grid(sjoin_gdf, "terrestrial")

# Concatenate the data in a single dataframe
tpa = pd.concat(data, ignore_index=True).drop(columns=['index']).rename(columns={'area': 'protected_area'})

# Group by 'iso_3' and 'year' and sum the 'area'
tpa_grouped = tpa.groupby(['iso_3', 'year'], as_index=False)['protected_area'].sum()
tpa_grouped.reset_index(drop=True, inplace=True)

# save to csv
tpa_grouped.to_csv(output_file_dissolve, index=False)

In [43]:
# Load dissolved data
tpa_grouped = pd.read_csv(output_file_dissolve)
tpa_grouped.head(5)

Unnamed: 0,iso_3,year,protected_area
0,AFG,2010,1078.918622
1,AFG,2011,1078.918622
2,AFG,2012,1078.918622
3,AFG,2013,1078.918622
4,AFG,2014,1078.918622


In [44]:
# Create coverage stats table
coverage = (
    pd.merge(tpa_grouped, cumulative_counts, on=['iso_3', 'year'], how='left')
    .pipe(calculate_global_area, ["year"], environment='terrestrial')
    .pipe(add_region_iso, "iso_3")
    .pipe(calculate_stats_cov, ["year"], "iso_3", environment= "terrestrial")
    .pipe(calculate_padef_percentages, "terrestrial")
    .pipe(add_total_area, "terrestrial")
    .pipe(calculate_coverage_percentage)
    .pipe(calculate_global_contribution)
    .pipe(add_is_last_year)
    .pipe(add_environment)
)

ProtectedAreaExtentSchema(
    coverage.pipe(
        output,
        "iso_3",
        {},
        {},
        ["iso_3", 'total_terrestrial_area'],
    )
).to_csv(
    output_file_tpas,
    index=True,
)

coverage

Unnamed: 0,year,iso_3,protected_area,protected_areas_count,oecms,pas,total_terrestrial_area,coverage,global_contribution,is_last_year,environment
0,2010,AF,3.636311e+06,7272.0,0.0,100.0,29993095,12.123827,2.694465,0,2
1,2010,AS,2.051386e+06,24782.0,0.0,100.0,31625556,6.486481,1.520053,0,2
2,2010,AT,1.108333e+02,2.0,0.0,100.0,12088230,0.000917,0.000082,0,2
3,2010,EU,4.306080e+06,116128.0,0.0,100.0,30037571,14.335645,3.190756,0,2
4,2010,,2.044176e+06,52176.0,0.0,100.0,19371152,10.552683,1.514711,0,2
...,...,...,...,...,...,...,...,...,...,...,...
3004,2024,YEM,5.145397e+03,15.0,0.0,100.0,453741,1.133994,0.003813,1,2
3005,2024,ZAF,1.143850e+05,1631.0,0.0,100.0,1221328,9.365627,0.084758,1,2
3006,2024,ZMB,2.929805e+05,557.0,0.0,100.0,753990,38.857347,0.217095,1,2
3007,2024,ZNC,2.779983e+00,8.0,0.0,100.0,3314,0.083886,0.000002,1,2


In [41]:
# Save the results in GCS
remote_path = 'vizzuality_processed_data/strapi_tables/tpa_coverage.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file_tpas,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


### Coverage stats - Concatenate Marine & Terrestrial

In [3]:
# Pipeline identifiers: combined ("pa"), terrestrial and marine
pipe = "pa"
pipe_tpa = "mpa-terrestrial"
pipe_mpa = "mpa"
step = "preprocess"

# One path handler per pipeline
pipe_dir = FileConventionHandler(pipe)
pipe_dir_tpa = FileConventionHandler(pipe_tpa)
pipe_dir_mpa = FileConventionHandler(pipe_mpa)

# Inputs: the per-environment coverage tables written by the two sections above
input_path_tpas = pipe_dir_tpa.get_processed_step_path(current_step).joinpath("tpa_coverage.csv")
input_path_mpas = pipe_dir_mpa.get_processed_step_path(current_step).joinpath("mpa_coverage.csv")

# Output: the combined marine + terrestrial coverage table
output_file = pipe_dir.get_processed_step_path(current_step).joinpath("protection_coverage_stats.csv")

In [5]:
# Concatenate the marine and terrestrial data
final_data = process_final_coverage(input_path_tpas, input_path_mpas)

# Filter the DataFrame to get the row where 'id' is 1
final_data[final_data['id'] == 1]

Unnamed: 0,id,year,protected_area,protected_areas_count,oecms,pas,coverage,global_contribution,is_last_year,environment,location
1,1,2010,3636311.0,7272,0.0,100.0,12.123827,2.694465,0,2,3


In [6]:
# Pass the combined table through the ProtectedAreaExtentSchema and write the result to disk
validated_coverage = ProtectedAreaExtentSchema(final_data)
validated_coverage.to_csv(output_file, index=True)

In [128]:
# Save the results in GCS
remote_path = 'vizzuality_processed_data/strapi_tables/protection_coverage_stats.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


### MPAtlas - Marine Conservation Protection Levels

We are going to use the intermediate data from eez, in order to create a dataset that can be used as a land mask.
The steps are:
1. Load the EEZ data
2. Spatially inner-join the EEZ dataset with the MPAtlas one
3. Assign the ISO code using the sovereign one provided by MPAtlas
4. Dissolve by location
5. Calculate the area for global regions and EEZ countries using the Mollweide projection
6. Prepare the data to be ingested in Strapi
7. Upload the data to Strapi

In [16]:
# Pipeline identifier and target Strapi collection for the MPAtlas protection levels
pipe = "mpaatlas"
strapi_collection = "mpaa-protection-level-stat"

# Path handlers for the EEZ and MPAtlas pipelines
pipe_dir_eez = FileConventionHandler("eez")
pipe_dir_mpaatlas = FileConventionHandler(pipe)
# Destination of the protection level table produced in this section
output_file = pipe_dir_mpaatlas.get_processed_step_path(current_step).joinpath(
    "mpaatlas_protection_level.csv"
)

# Download the EEZ file && unzip it
download_and_unzip_if_needed(pipe_dir_eez, prev_step, mysettings)
# Download the MPAtlas file && unzip it
download_and_unzip_if_needed(pipe_dir_mpaatlas, prev_step, mysettings)

# Load the data (both layers go through clean_geometries)
eez = gpd.read_file(pipe_dir_eez.get_step_fmt_file_path(prev_step, "shp")).pipe(clean_geometries)
mpaatlas_intermediate = gpd.read_file(
    pipe_dir_mpaatlas.get_step_fmt_file_path(prev_step, "shp")
).pipe(clean_geometries)

/home/sofia/dev/skytruth-30x30/data/data/eez/processed/eez_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/eez/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/mpaatlas_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/preprocess


In [17]:
# Spatially join the EEZ polygons with the MPAtlas features kept by mpaatlas_filter_stablishment.
# NOTE(review): unlike the other spatial_join calls in this notebook, no `environment`
# argument is passed here, so the helper's default applies — confirm it is the marine one.
eez_mpaatlas_data_join = await spatial_join(
    eez, mpaatlas_intermediate.pipe(mpaatlas_filter_stablishment)
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [00:28<00:00,  9.89it/s]


In [None]:
# To get an idea of the spatial join results
# eez_mpaatlas_data_join.to_file(
#     pipe_dir_mpaatlas.get_processed_step_path(current_step).joinpath("mpaatlas_sjoin.shp"),
#     driver="ESRI Shapefile",
# )

In [18]:
# Dissolve the join by protection level and location, counting the features merged into each group
mpaatlas_dissolved = eez_mpaatlas_data_join.dissolve(
    by=["protecti_1", "location_i"], aggfunc={"name": "count"}
).reset_index()

# Export the dissolved layer as a shapefile for inspection
mpaatlas_dissolved_shp = pipe_dir_mpaatlas.get_processed_step_path(current_step).joinpath(
    "mpaatlas_sjoin_dissolved.shp"
)
mpaatlas_dissolved.to_file(mpaatlas_dissolved_shp, driver="ESRI Shapefile")

INFO:pyogrio._io:Created 55 records


In [25]:
# Work on a copy so the spatial join result is left untouched
eez_mpaatlas_data_join2 = eez_mpaatlas_data_join.copy()

# ISO3 codes that are folded into the code of another location
territory_to_parent_iso = {
    "iso_3": {
        "COK": "NZL",
        "IOT": "GBR",
        "NIU": "NZL",
        "SHN": "GBR",
        "SJM": "NOR",
        "UMI": "USA",
        "NCL": "FRA",
    }
}
# Arguments of the final `output` formatting step
protection_level_ids = {
    "protecti_1": {
        "fully or highly protected": 1,
    }
}
protection_level_renames = {"protecti_1": "mpaa_protection_level", "area_km2": "area"}

# Aggregate the protected area per protection level and location
protection_stats = (
    eez_mpaatlas_data_join2.rename(columns={"location_i": "iso_3"})
    .pipe(process_mpaatlas_data)
    .pipe(calculate_global_area, gby_col=["protecti_1"], iso_column="iso_3", environment="marine")
    .pipe(separate_parent_iso, iso_column="iso_3")
    .replace(territory_to_parent_iso)
    .pipe(add_region_iso, iso_column="iso_3")
    .pipe(calculate_stats, gby_col=["protecti_1"], iso_column="iso_3")
    .query('protecti_1 != "less protected or unknown"')
)

# Add totals and percentages, then format the table for export
result = (
    protection_stats.pipe(fix_monaco, iso_column="iso_3", area_column="area_km2")
    .pipe(add_total_area, "marine")
    .pipe(calculate_coverage_percentage_mpatlas)
    .pipe(
        output,
        iso_column="iso_3",
        rep_d=protection_level_ids,
        rename=protection_level_renames,
        drop_cols=["total_marine_area", "iso_3"],
    )
)

# Keep only rows with a non-null `location`, stamp the (hard-coded) year and save
located_result = result[~result.location.isna()].assign(year=2024)
ProtectionLevelSchema(located_result).to_csv(output_file, index=True)

In [82]:
# Save the results in GCS
remote_path = 'vizzuality_processed_data/strapi_tables/mpaatlas_protection_level.csv'

writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [None]:
# strapi_collection = "mpaa-protection-level-stat"

In [None]:
# strapi.deleteCollectionData(strapi_collection, list(range(1, 300)))

<helpers.strapi.Strapi at 0x7fda8ddb8860>

In [None]:
# strapi.importCollectionData(
#     strapi_collection,
#     output_file,
# )

### Protected Seas  - Level of Fishing Protection

In [158]:
# Pipeline identifier and target Strapi collection for the fishing protection levels
pipe = "protectedseas"
strapi_collection = "fishing-protection-level-stat"

# Local paths: the downloaded stats workbook (input) and the table produced here (output)
pipe_dir = FileConventionHandler(pipe)
input_file = pipe_dir.get_processed_step_path(prev_step).joinpath("protectedseas_stats.xlsx")
output_file = pipe_dir.get_processed_step_path(current_step).joinpath("lfp.csv")

# Download the Protected Seas stats workbook from GCS (a plain .xlsx, nothing to unzip)
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name="vizzuality_processed_data/protectedseas/preprocess/protectedseas_stats.xlsx",
    file=input_file,
    operation="r",
)

# Load the data
protectedseas_intermediate = pd.read_excel(input_file)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [159]:
# Sanity check on Spain: show the rows that the aggregation in the next cell will keep.
# A row is kept when it has no territory code and either
#   * it includes multi-jurisdictional areas, or
#   * it does not, and its sovereign has no multi-jurisdictional row at all.
no_territory = protectedseas_intermediate.iso_ter.isna()
multi_jurisdictional = protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(True)
single_jurisdictional = protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(False)
sovereigns_with_multi = protectedseas_intermediate[multi_jurisdictional].iso_sov.unique()

kept_rows = (no_territory & multi_jurisdictional) | (
    no_territory
    & single_jurisdictional
    & ~protectedseas_intermediate.iso_sov.isin(sovereigns_with_multi)
)

# Combine both conditions into one mask: indexing the already-filtered frame with a
# full-length boolean Series relied on implicit reindexing and raised a pandas warning.
protectedseas_intermediate.loc[kept_rows & protectedseas_intermediate.iso_sov.eq("ESP")]

Unnamed: 0,iso_ter,iso_sov,includes_multi_jurisdictional_areas,lfp,area_sqkm,total_area,pct_total
320,,ESP,True,5,142.97301,1011023.776,0.014141
321,,ESP,True,4,1639.682076,1011023.776,0.16218
322,,ESP,True,3,214532.8498,1011023.776,21.219367
323,,ESP,True,2,15064.13277,1011023.776,1.489988
324,,ESP,True,1,779644.1388,1011023.776,77.114323


In [160]:
# Row selection: no territory code and either multi-jurisdictional, or single-jurisdictional
# for a sovereign that has no multi-jurisdictional row at all
no_territory = protectedseas_intermediate.iso_ter.isna()
multi_jurisdictional = protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(True)
single_jurisdictional = protectedseas_intermediate.includes_multi_jurisdictional_areas.eq(False)
sovereigns_with_multi = protectedseas_intermediate[multi_jurisdictional].iso_sov.unique()
kept_rows = (no_territory & multi_jurisdictional) | (
    no_territory
    & single_jurisdictional
    & ~protectedseas_intermediate.iso_sov.isin(sovereigns_with_multi)
)

# Collapse the five numeric lfp scores into three protection classes
lfp_classes = {
    "lfp": {
        5: "highly",
        4: "highly",
        3: "moderately",
        2: "less",
        1: "less",
    },
}
# Numeric ids written to the output table for each class
lfp_class_ids = {
    "lfp": {
        "highly": 1,
        "moderately": 2,
        "less": 3,
    }
}

# Area per sovereign and protection class (total_area is repeated per row, hence "max")
area_by_sovereign = (
    protectedseas_intermediate[kept_rows]
    .replace(lfp_classes)
    .groupby(["iso_sov", "lfp"])
    .agg({"area_sqkm": "sum", "total_area": "max"})
    .reset_index()
)

final = (
    area_by_sovereign.pipe(
        calculate_global_area,
        gby_col=["lfp"],
        iso_column="iso_sov",
        agg_ops={"area_sqkm": "sum", "total_area": "sum"},
    )
    .pipe(add_region_iso, iso_column="iso_sov")
    .pipe(
        calculate_stats,
        gby_col=["lfp"],
        ops={"area_sqkm": "sum", "total_area": "sum"},
        iso_column="iso_sov",
    )
    # Share of each location's total area, as a percentage with two decimals
    .assign(pct=lambda stats: ((stats.area_sqkm / stats.total_area) * 100).round(2))
    .pipe(
        output,
        iso_column="iso_sov",
        rep_d=lfp_class_ids,
        rename={"lfp": "fishing_protection_level", "area_sqkm": "area"},
        drop_cols=["iso_sov", "total_area"],
    )
)

# Keep only rows with a non-null `location`, validate against the schema and save
located_final = final[final.location.notna()]
FPLSchema(located_final).to_csv(output_file, index=True)

In [161]:
# Save the results in GCS
# Blob name (path inside the bucket) for the fishing protection level table
remote_path = 'vizzuality_processed_data/strapi_tables/lfp.csv'

# operation="w" is the upload direction here (the download call in this notebook uses "r")
writeReadGCP(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [None]:
# strapi.deleteCollectionData(strapi_collection, list(range(1, 500)))

In [None]:
# strapi.importCollectionData(
#     strapi_collection,
#     output_file,
# )

1. Lower-case the column names
2. Separate locations whose regime is in dispute or under a joint regime
3. Calculate the area for the MPAtlas data
4. Rename columns for the merge
5. Merge the MPAtlas and MPA data, identifying the source
6. Identify child resources and set them as children
7. Calculate the bbox
8. Set child resources
9. Prepare the output for batch export
10. Upload the data to Strapi

### Country Detail Table Data
The country detail table is done for marine and terrestrial independently and the results are concatenated.

Methodology for marine:

1. Lower-case the column names
2. Separate locations whose regime is in dispute or under a joint regime
3. Remove ATA and ABNJ, because Protected Planet doesn't include stats for ATA and ABNJ is marine
4. Calculate the area for the MPAtlas data
5. Rename columns for the merge
6. Merge the MPAtlas and MPA data, identifying the source
7. Identify child resources and set them as children
8. Calculate the bbox
9. Set child resources
10. Add the coverage percentage
11. Add the marine environment


In [5]:
# Pipeline identifiers for the marine detail table
pipe = "mpa"
strapi_collection_mpas = "mpa"

# Path handlers for the MPA and MPAtlas pipelines, plus the marine detail output
pipe_dir = FileConventionHandler(pipe)
pipe_dir_mpaatlas = FileConventionHandler("mpaatlas")
output_file_mpas = pipe_dir.get_processed_step_path(current_step).joinpath("mpa_detail.csv")

# Download the MPA file && unzip it
download_and_unzip_if_needed(pipe_dir, prev_step, mysettings)
# Download the MPAtlas file && unzip it
download_and_unzip_if_needed(pipe_dir_mpaatlas, prev_step, mysettings)

# Load the data (both layers go through clean_geometries)
mpa_intermediate = gpd.read_file(pipe_dir.get_step_fmt_file_path(prev_step, "shp")).pipe(
    clean_geometries
)
mpaatlas_intermediate = gpd.read_file(
    pipe_dir_mpaatlas.get_step_fmt_file_path(prev_step, "shp")
).pipe(clean_geometries)

/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/mpa_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpa/processed/preprocess
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/mpaatlas_preprocess.zip
/home/sofia/dev/skytruth-30x30/data/data/mpaatlas/processed/preprocess


In [6]:
# Load iucn categories
# The commented-out lines below are how iucn_categories.csv was originally generated.
# NOTE: they depend on `init_table`, which is only defined in a later cell, so they
# cannot be re-enabled in place without running that cell first.
# iucn_cat = pd.DataFrame(
#     {"slug": init_table.iucn_cat.dropna().unique(), "name": init_table.iucn_cat.dropna().unique()},
#     index=pd.Index(np.arange(1, len(init_table.iucn_cat.dropna().unique()) + 1)),
# )
# iucn_cat.to_csv(pipe_dir.get_processed_step_path(current_step).joinpath("iucn_categories.csv"), index=True)
# Read the stored lookup; its first column is used as the index (the category id)
iucn_cat = pd.read_csv(
    pipe_dir.get_processed_step_path(current_step).joinpath("iucn_categories.csv"), index_col=0
)

In [7]:
# Preprocess marine tables (mpa and mpaatlas) and concatenate them
init_table = (
    pd.concat(
        [
            (
                mpa_intermediate.pipe(columns_to_lower)
                .pipe(separate_parent_iso, iso_column="parent_iso")
                .pipe(change_ata_to_abnj)
                .rename(
                    columns={
                        "parent_iso": "iso",
                        "status_yr": "year",
                        "gis_m_area": "area_km2",
                    }
                ).drop(columns=['status'])
            ).assign(source="protected_planet"),
            (
                mpaatlas_intermediate.pipe(calculate_area)
                .pipe(extract_wdpaid_mpaatlas)
                .pipe(separate_parent_iso, iso_column="location_i")
                .rename(
                    columns={
                        "location_i": "iso",
                        "wdpa_id": "wdpa_pid",
                        "designatio": "desig_eng",
                    }
                )
            ).assign(source="mpaatlas")
            .assign(pa_def=1)
            .astype({"mpa_zone_i": "Int64"}),
        ],
        ignore_index=True,
    )
    .reset_index(drop=True)
    .replace(
        {
            "iso": {
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
            }
        }
    )
    .sort_values(by=["wdpa_pid", "wdpa_pid", "source"], ascending=[True, True, False])
)

In [8]:
# Lookup from IUCN category slug to its id (the index of the iucn_cat table)
mpa_iucn_slug_to_id = dict(
    iucn_cat[["slug"]]
    .reset_index(drop=False)
    .iloc[:, [1, 0]]
    .to_dict(orient="tight")["data"]
)

# Value replacements applied by `output`: text labels become numeric ids
mpa_replacements = {
    "status": {
        "Adopted": 4,
        "implemented": 6,
        "Established": 6,
        "Designated": 5,
        "Proposed": 3,
        "Inscribed": 3,
        "unknown": 1,
    },
    "pa_def": {"0": 2, "1": 1},
    "year": {0: pd.NA},
    "iucn_cat": mpa_iucn_slug_to_id,
    "source": {"protected_planet": 3, "mpaatlas": 1},
    "protection": {
        "full": 3,
        "light": 4,
        "incompatible": 5,
        "high": 6,
        "minimal": 7,
        "unknown": 8,
        "unknown/to be determined": 8,
    },
    "establishm": {
        "actively managed": 4,
        "implemented": 6,
        "designated": 5,
        "Designated": 5,
        "proposed or committed": 3,
        "Proposed": 3,
        "Inscribed": 3,
        "Established": 5,
        "Adopted": 5,
        "unknown": 1,
    },
}

# Column names expected in the exported table
mpa_renames = {
    "pa_def": "protection_status",
    "protected_area": "area",
    "iucn_cat": "iucn_category",
    "desig_eng": "designation",
    "protection": "mpaa_protection_level",
    "establishm": "mpaa_establishment_stage",
    "source": "data_source",
}
mpa_drop_cols = ["geometry", "protecti_1","mpa_zone_i", "iso", "total_marine_area"]

# Nullable integer dtypes so missing values survive the cast
mpa_dtypes = {
    "year": "Int32",
    "iucn_category": "Int64",
    "protection_status": "Int64",
}

# Parent/child bookkeeping, ordered by WDPA id with parents first
mpa_with_children = (
    init_table.pipe(add_bbox, "bbox")
    .pipe(define_is_child)
    .pipe(set_child_id, "marine")
    .sort_values(by=["wdpaid", "is_child"], ascending=[True, True])
    .reset_index(drop=True)
)

# Coverage, environment and final formatting; rows whose coverage exceeds 100 are dropped
mpa_table = (
    mpa_with_children.pipe(add_total_area, "marine")
    .rename(columns={"area_km2": "protected_area"})
    .pipe(calculate_coverage_percentage)
    .pipe(add_environment)
    .pipe(
        output,
        iso_column="iso",
        rep_d=mpa_replacements,
        rename=mpa_renames,
        drop_cols=mpa_drop_cols,
    )
    .astype(mpa_dtypes)
    .query("coverage <= 100")
    .sort_index()
)

  return df.assign(child_id=df[columns].bfill(axis=1)[columns[0]])
  df.replace(rep_d)


Methodology for terrestrial:

1. Lower-case the column names
2. Separate locations whose regime is in dispute or under a joint regime
3. Remove ATA and ABNJ, because Protected Planet doesn't include stats for ATA and ABNJ is marine
4. Rename columns for the merge
5. Identify child resources and set them as children
6. Calculate the bbox
7. Set child resources
8. Add the coverage percentage
9. Add the terrestrial environment
10. Add the marine-only fields filled with NaN

In [9]:
# Pipeline identifiers for the terrestrial detail table
pipe = "mpa-terrestrial"
strapi_collection_mpas = "mpa-terrestrial"

# Path handlers for the terrestrial and GADM pipelines, plus the terrestrial detail output
pipe_dir = FileConventionHandler(pipe)
pipe_dir_gadm = FileConventionHandler("gadm")
output_file_tpas = pipe_dir.get_processed_step_path(current_step).joinpath("tpa_detail.csv")

# Downloads are disabled here: the same files are already downloaded by the
# terrestrial coverage section earlier in this notebook.
# # Download the terrestrial protected areas file && unzip it
# download_and_unzip_if_needed(pipe_dir, prev_step, mysettings)
# # Download the gadm file 
# download_and_unzip_if_needed(pipe_dir_gadm, prev_step, mysettings)

In [10]:
# Load the preprocessed terrestrial protected areas (GeoPackage).
# NOTE(review): unlike the marine layers, this one is not passed through
# clean_geometries — confirm that is intentional.
tpa_intermediate = gpd.read_file(pipe_dir.get_step_fmt_file_path(prev_step, "gpkg"))

In [11]:
# IUCN category lookup stored with this pipeline; the first column becomes the index
iucn_categories_csv = pipe_dir.get_processed_step_path(current_step).joinpath(
    "iucn_categories.csv"
)
iucn_cat = pd.read_csv(iucn_categories_csv, index_col=0)

In [12]:
# Preprocess the terrestrial table. Only one source (Protected Planet) feeds it, so the
# former single-element pd.concat(..., ignore_index=True) is replaced by the equivalent
# reset_index(drop=True), which renumbers the rows left after the query.
init_table = (
    tpa_intermediate.pipe(columns_to_lower)
    .pipe(separate_parent_iso, iso_column="parent_iso")
    # Drop Antarctica and areas beyond national jurisdiction
    .query("parent_iso != 'ATA' and parent_iso != 'ABNJ'")
    .rename(
        columns={
            "parent_iso": "iso",
            "status_yr": "year",
            "gis_area": "protected_area",
        }
    )
    .drop(columns=['status'])
    .assign(source="protected_planet")
    .reset_index(drop=True)
    # Fold these ISO3 codes into the code of another location
    .replace(
        {
            "iso": {
                "COK": "NZL",
                "IOT": "GBR",
                "NIU": "NZL",
                "SHN": "GBR",
                "SJM": "NOR",
                "UMI": "USA",
                "NCL": "FRA",
            }
        }
    )
)

In [13]:
# Lookup from IUCN category slug to its id (the index of the iucn_cat table)
tpa_iucn_slug_to_id = dict(
    iucn_cat[["slug"]]
    .reset_index(drop=False)
    .iloc[:, [1, 0]]
    .to_dict(orient="tight")["data"]
)

# Value replacements applied by `output`: text labels become numeric ids
tpa_replacements = {
    "pa_def": {"0": 2, "1": 1},
    "year": {0: pd.NA},
    "iucn_cat": tpa_iucn_slug_to_id,
    "source": {"protected_planet": 3},
}

# Column names expected in the exported table
tpa_renames = {
    "pa_def": "protection_status",
    "protected_area": "area",
    "iucn_cat": "iucn_category",
    "desig_eng": "designation",
    "source": "data_source",
}
tpa_drop_cols = ["geometry", "iso", "marine", "total_terrestrial_area"]

# Nullable integer dtypes so missing values survive the cast
tpa_dtypes = {
    "year": "Int32",
    "iucn_category": "Int64",
    "protection_status": "Int64",
}

# Parent/child bookkeeping, ordered by WDPA id with parents first
tpa_with_children = (
    init_table.pipe(add_bbox, "bbox")
    .pipe(define_is_child)
    .pipe(set_child_id, "terrestrial")
    .sort_values(by=["wdpaid", "is_child"], ascending=[True, True])
    .reset_index(drop=True)
)

# Coverage, environment and final formatting; rows whose coverage exceeds 100 are dropped
tpa_table = (
    tpa_with_children.pipe(add_total_area, "terrestrial")
    .pipe(calculate_coverage_percentage)
    .pipe(add_environment)
    .pipe(
        output,
        iso_column="iso",
        rep_d=tpa_replacements,
        rename=tpa_renames,
        drop_cols=tpa_drop_cols,
    )
    .astype(tpa_dtypes)
    .query("coverage <= 100")
    .sort_index()
)

  df.replace(rep_d)


In [14]:
# Add empty mpaa_protection_level and mpaa_establishment_stage columns so the
# terrestrial table can be validated
for mpaa_column in ("mpaa_protection_level", "mpaa_establishment_stage"):
    tpa_table[mpaa_column] = np.nan

Concatenate marine and terrestrial tables

1- Concatenate tables  
2- Add parent and children columns  
3- Sort by parent  
4- Create batch export for all columns by parent (to handle relations when uploading in Strapi)  
5- Create batch export only for column parent (to handle relations when uploading in Strapi)

In [15]:
# Pipeline name and Strapi collection name for the combined protected areas
pipe_pa = strapi_collection_pas = "pa"
step = "preprocess"

pipe_dir_pa = FileConventionHandler(pipe_pa)

# Destination CSV inside the processed folder of the current step
pa_processed_dir = pipe_dir_pa.get_processed_step_path(current_step)
output_file_pa = pa_processed_dir.joinpath("pa_detail.csv")

In [16]:
# Stack the marine and terrestrial tables and give the rows 1-based ids
final_table = pd.concat([mpa_table, tpa_table], ignore_index=True)
final_table.index = final_table.index + 1
final_table.index.name = "id"

# Resolve parent/child relations, drop the helper columns, sort by parent
relationship_helper_cols = ["wdpa_pid", "is_child", "child_id"]
final_table = (
    final_table.pipe(add_child_parent_relationship)
    .drop(columns=relationship_helper_cols)
    .sort_values(by=["parent"])
)

In [15]:
# Write only the rows that have a location, passing them through PAsSchema first
pas_with_location = final_table[final_table["location"].notna()]
PAsSchema(pas_with_location).to_csv(output_file_pa, index=True)

In [17]:
# Split the table in two: every column except `parent`, and `parent` on its own
final_table2 = final_table.loc[:, ["parent"]]
final_table1 = final_table.drop(columns=["parent"])

In [None]:
# Divide output in chunks to be uploaded to strapi.
# Each spec is (table, chunk size, validation schema, destination folder name).
chunks_root = pipe_dir_pa.get_processed_step_path(current_step)
chunk_exports = (
    (final_table1[final_table1.area.notna()], 4000, PAsSchemaChunk1, "chunks1"),
    (final_table2, 10000, PAsSchemaChunk2, "chunks2"),
)

for chunk_table, chunk_size, chunk_schema, chunk_folder in chunk_exports:
    batch_export(
        chunk_table,
        chunk_size,
        chunk_schema,
        chunks_root.joinpath(chunk_folder),
        "pa_detail",
        format="json",
        strapi_colection=strapi_collection_pas,
    )

In [40]:
# Zip each chunk folder next to itself
archive_root = pipe_dir_pa.get_processed_step_path(current_step)
for chunk_folder, chunk_zip in (("chunks1", "chunks1.zip"), ("chunks2", "chunks2.zip")):
    make_archive(archive_root.joinpath(chunk_folder), archive_root.joinpath(chunk_zip))

In [42]:
# Save the zipped chunk files in GCS: (destination blob, local zip name)
zip_root = pipe_dir_pa.get_processed_step_path(current_step)
chunk_uploads = (
    ("vizzuality_processed_data/strapi_tables/pa_chunks1.zip", "chunks1.zip"),
    ("vizzuality_processed_data/strapi_tables/pa_chunks2.zip", "chunks2.zip"),
)

for chunk_blob, chunk_zip in chunk_uploads:
    writeReadGCP(
        credentials=mysettings.GCS_KEYFILE_JSON,
        bucket_name=mysettings.GCS_BUCKET,
        blob_name=chunk_blob,
        file=zip_root.joinpath(chunk_zip),
        operation="w",
    )

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token
DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [None]:
# strapi.deleteCollectionData("pa", list(range(1, 20914)))

In [None]:
# for i in range(0, 4):
#     strapi.importCollectionData(
#         strapi_collection_mpas,
#         mpa_folder.joinpath(f"mpa_detail_{i}.csv"),
#     )

### Habitats

**Note:** The marine habitat data has already been processed in the habitats.ipynb notebook, and the terrestrial data in another notebook (TBD). This section imports the output of habitats.ipynb and the output generated for terrestrial habitats (stored in the bucket), and generates the final output table required for Strapi.

In [4]:
pipe = "terrestrial-habitats"
collection_name = "terrestrial_habitats"

pipe_dir = FileConventionHandler(pipe)

# Local paths: both inputs sit in the previous step folder, the result in the current one
habitats_input_dir = pipe_dir.get_processed_step_path(prev_step)
input_file_ter = habitats_input_dir.joinpath("master_data_protection_exact.csv")
input_file_mar = habitats_input_dir.joinpath("habitats6.csv")
output_file = pipe_dir.get_processed_step_path(current_step).joinpath("habitats_all.csv")

# Download the terrestrial and then the marine habitats table from the bucket
habitat_downloads = (
    (
        "vizzuality_processed_data/habitats/preprocess/master_data_protection_exact.csv",
        input_file_ter,
    ),
    (
        "vizzuality_processed_data/processed_statistic_tables/habitats6.csv",
        input_file_mar,
    ),
)

for source_blob, local_file in habitat_downloads:
    writeReadGCP(
        credentials=mysettings.GCS_KEYFILE_JSON,
        bucket_name=mysettings.GCS_BUCKET,
        blob_name=source_blob,
        file=local_file,
        operation="r",
    )

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token
DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token


In [5]:
# Load the marine habitats table, tag it as environment 1 and align its column
# names with the terrestrial table.
# NOTE(review): missing location ids are filled with the string 'NA' - presumably
# to restore an 'NA' location code that read_csv parsed as missing; confirm.
habitat_mar = (
    pd.read_csv(input_file_mar, na_values=["", "NaN", "NULL"])
    .assign(
        environment=1,
        location_id=lambda frame: frame["location_id"].fillna("NA"),
    )
    .rename(columns={"location_id": "location", "habitat_name": "habitat"})
    .assign(habitat=lambda frame: frame["habitat"].astype(str).str.lower())
)
habitat_mar

Unnamed: 0,location,protected_area,total_area,habitat,year,environment
0,ABNJ,427.048524,1893.871282,cold-water corals,2024,1
1,AGO,0.000000,3.395671,cold-water corals,2024,1
2,ALB,0.000000,5.986479,cold-water corals,2024,1
3,ARG,6.984226,61.826344,cold-water corals,2024,1
4,ATG,0.000000,0.997747,cold-water corals,2024,1
...,...,...,...,...,...,...
609,AS,21277.220000,74292.673146,mangroves,2020,1
610,EU,732.143750,1246.189677,mangroves,2020,1
611,,2097.740000,2415.418557,mangroves,2020,1
612,SA,27151.740000,39893.444608,mangroves,2020,1


In [6]:
# Load the terrestrial habitats table without the columns that are not used
# here, and normalise the habitat names to lower case
unused_ter_columns = ["frac", "perc_extent", "total_area"]
habitat_ter = (
    pd.read_csv(input_file_ter)
    .drop(columns=unused_ter_columns)
    .rename(columns={"habitats": "habitat"})
)
habitat_ter["habitat"] = habitat_ter["habitat"].astype(str).str.lower()
habitat_ter

Unnamed: 0,iso_3,habitat,total,protected
0,GLOB,artificial,2.814393e+09,1.709865e+08
1,GLOB,desert,1.103513e+10,7.445863e+08
2,GLOB,forest,4.894422e+09,8.934822e+08
3,GLOB,grassland,3.532339e+09,4.938337e+08
4,GLOB,other,3.129403e+07,1.039509e+07
...,...,...,...,...
1502,ZWE,grassland,3.384890e+05,2.174229e+04
1503,ZWE,rocky/mountains,5.100000e+01,
1504,ZWE,savanna,2.287906e+07,8.742367e+06
1505,ZWE,shrubland,5.252410e+06,2.875969e+05


In [7]:
# Remove all rows whose "habitat" is "other"
is_named_habitat = habitat_ter["habitat"].ne("other")
habitat_ter = habitat_ter.loc[is_named_habitat].copy()

# total_area_country: sum of "total" over all remaining rows sharing an iso_3
totals_per_iso = habitat_ter.groupby("iso_3")["total"]
habitat_ter["total_area_country"] = totals_per_iso.transform("sum")
habitat_ter

Unnamed: 0,iso_3,habitat,total,protected,total_area_country
0,GLOB,artificial,2.814393e+09,1.709865e+08,2.658415e+10
1,GLOB,desert,1.103513e+10,7.445863e+08,2.658415e+10
2,GLOB,forest,4.894422e+09,8.934822e+08,2.658415e+10
3,GLOB,grassland,3.532339e+09,4.938337e+08,2.658415e+10
5,GLOB,rocky/mountains,3.550780e+08,4.447316e+07,2.658415e+10
...,...,...,...,...,...
1502,ZWE,grassland,3.384890e+05,2.174229e+04,4.148695e+07
1503,ZWE,rocky/mountains,5.100000e+01,,4.148695e+07
1504,ZWE,savanna,2.287906e+07,8.742367e+06,4.148695e+07
1505,ZWE,shrubland,5.252410e+06,2.875969e+05,4.148695e+07


In [8]:
# Assign territories to their sovereign countries
dependency_file = scripts_dir.joinpath("data_commons/data/dependency_to_parent.json")
with dependency_file.open("r") as json_file:
    dependency_to_parent = json.load(json_file)

# Only the first entry of each dependency's value is used as its parent
mapping = {dependency: parents[0] for dependency, parents in dependency_to_parent.items()}

# Codes missing from the mapping keep their original value
original_iso = habitat_ter["iso_3"]
habitat_ter["iso_3"] = original_iso.map(mapping).fillna(original_iso)

In [9]:
# Group by country and habitat and sum the pixels
habitat_ter_grouped = habitat_ter.groupby(['iso_3', 'habitat']).sum().reset_index()

# Recompute the per-country denominator from the grouped totals.
# Summing a `total_area_country` that was computed per original ISO code only
# adds up the codes that actually have a row for a given habitat, so a country
# that absorbed territories would get a different denominator for each habitat
# and its habitat shares could add up to more than 100%.
habitat_ter_grouped['total_area_country'] = (
    habitat_ter_grouped.groupby('iso_3')['total'].transform('sum')
)
habitat_ter_grouped

Unnamed: 0,iso_3,habitat,total,protected,total_area_country
0,AFG,artificial,5.623284e+06,8.474413e+04,7.775693e+07
1,AFG,desert,2.726139e+07,3.217268e+05,7.775693e+07
2,AFG,forest,3.825968e+05,2.001767e+04,7.775693e+07
3,AFG,grassland,2.910245e+07,8.725795e+05,7.775693e+07
4,AFG,rocky/mountains,1.370481e+07,1.567462e+06,7.775693e+07
...,...,...,...,...,...
1332,ZWE,grassland,3.384890e+05,2.174229e+04,4.148695e+07
1333,ZWE,rocky/mountains,5.100000e+01,0.000000e+00,4.148695e+07
1334,ZWE,savanna,2.287906e+07,8.742367e+06,4.148695e+07
1335,ZWE,shrubland,5.252410e+06,2.875969e+05,4.148695e+07


In [10]:
# Percentage of each habitat's pixels that are protected, and percentage of the
# country's pixels that belong to each habitat
protected_pixels = habitat_ter_grouped["protected"]
habitat_pixels = habitat_ter_grouped["total"]
country_pixels = habitat_ter_grouped["total_area_country"]

habitat_ter_grouped["protected%"] = protected_pixels / habitat_pixels * 100
habitat_ter_grouped["habitat%"] = habitat_pixels / country_pixels * 100
habitat_ter_grouped

Unnamed: 0,iso_3,habitat,total,protected,total_area_country,protected%,habitat%
0,AFG,artificial,5.623284e+06,8.474413e+04,7.775693e+07,1.507022,7.231875
1,AFG,desert,2.726139e+07,3.217268e+05,7.775693e+07,1.180156,35.059751
2,AFG,forest,3.825968e+05,2.001767e+04,7.775693e+07,5.232055,0.492042
3,AFG,grassland,2.910245e+07,8.725795e+05,7.775693e+07,2.998303,37.427462
4,AFG,rocky/mountains,1.370481e+07,1.567462e+06,7.775693e+07,11.437317,17.625187
...,...,...,...,...,...,...,...
1332,ZWE,grassland,3.384890e+05,2.174229e+04,4.148695e+07,6.423338,0.815893
1333,ZWE,rocky/mountains,5.100000e+01,0.000000e+00,4.148695e+07,0.000000,0.000123
1334,ZWE,savanna,2.287906e+07,8.742367e+06,4.148695e+07,38.211218,55.147600
1335,ZWE,shrubland,5.252410e+06,2.875969e+05,4.148695e+07,5.475522,12.660390


In [11]:
# Add country's terrestrial area.
# Keep the frame returned by `add_total_area`: the following cells read the
# `total_terrestrial_area` column, which must not depend on the helper
# modifying its argument in place.
habitat_ter_grouped = add_total_area(habitat_ter_grouped, 'terrestrial')
habitat_ter_grouped

Unnamed: 0,iso_3,habitat,total,protected,total_area_country,protected%,habitat%,total_terrestrial_area
0,AFG,artificial,5.623284e+06,8.474413e+04,7.775693e+07,1.507022,7.231875,644050.0
1,AFG,desert,2.726139e+07,3.217268e+05,7.775693e+07,1.180156,35.059751,644050.0
2,AFG,forest,3.825968e+05,2.001767e+04,7.775693e+07,5.232055,0.492042,644050.0
3,AFG,grassland,2.910245e+07,8.725795e+05,7.775693e+07,2.998303,37.427462,644050.0
4,AFG,rocky/mountains,1.370481e+07,1.567462e+06,7.775693e+07,11.437317,17.625187,644050.0
...,...,...,...,...,...,...,...,...
1332,ZWE,grassland,3.384890e+05,2.174229e+04,4.148695e+07,6.423338,0.815893,391235.0
1333,ZWE,rocky/mountains,5.100000e+01,0.000000e+00,4.148695e+07,0.000000,0.000123,391235.0
1334,ZWE,savanna,2.287906e+07,8.742367e+06,4.148695e+07,38.211218,55.147600,391235.0
1335,ZWE,shrubland,5.252410e+06,2.875969e+05,4.148695e+07,5.475522,12.660390,391235.0


In [12]:
# Scale the country's terrestrial area by the habitat share to estimate the
# habitat area, then by the protected share to estimate its protected part
country_land_area = habitat_ter_grouped["total_terrestrial_area"]
habitat_share = habitat_ter_grouped["habitat%"]
habitat_ter_grouped["total_area"] = country_land_area * habitat_share / 100

protected_share = habitat_ter_grouped["protected%"]
habitat_ter_grouped["protected_area"] = habitat_ter_grouped["total_area"] * protected_share / 100

In [13]:
# Add regions
habitat_ter_grouped = add_region_iso(habitat_ter_grouped, "iso_3")

# Regional rows: areas summed per region and habitat, keyed by `location`
regions = (
    habitat_ter_grouped.groupby(["region", "habitat"])
    .agg({"total_area": "sum", "protected_area": "sum"})
    .reset_index()
    .rename(columns={"region": "location"})
)

# Country rows: drop the intermediate columns and key by `location` as well
intermediate_columns = [
    "total",
    "protected",
    "total_area_country",
    "protected%",
    "habitat%",
    "total_terrestrial_area",
    "region",
]
habitat_ter_grouped = habitat_ter_grouped.drop(columns=intermediate_columns).rename(
    columns={"iso_3": "location"}
)

In [14]:
# Stack the regional and country rows; missing areas become 0
habitats_terrestrial = pd.concat(
    [regions, habitat_ter_grouped], ignore_index=True
).fillna({"protected_area": 0, "total_area": 0})

In [15]:
# Tag every terrestrial row with its year and environment id
habitats_terrestrial = habitats_terrestrial.assign(year=2024, environment=2)
habitats_terrestrial

Unnamed: 0,location,habitat,total_area,protected_area,year,environment
0,AF,artificial,2.925993e+06,190150.539425,2024,2
1,AF,desert,9.875738e+06,658679.485018,2024,2
2,AF,forest,4.461370e+06,919656.960007,2024,2
3,AF,grassland,2.039020e+06,192683.234520,2024,2
4,AF,rocky/mountains,2.384026e+05,46963.725543,2024,2
...,...,...,...,...,...,...
1381,ZWE,grassland,3.192058e+03,205.036642,2024,2
1382,ZWE,rocky/mountains,4.809460e-01,0.000000,2024,2
1383,ZWE,savanna,2.157567e+05,82443.266468,2024,2
1384,ZWE,shrubland,4.953188e+04,2712.128821,2024,2


In [16]:
# Terrestrial rows first, marine rows after, under a fresh 0..n-1 index
habitat_frames = [habitats_terrestrial, habitat_mar]
habitats_all = pd.concat(habitat_frames, ignore_index=True)
habitats_all

Unnamed: 0,location,habitat,total_area,protected_area,year,environment
0,AF,artificial,2.925993e+06,190150.539425,2024,2
1,AF,desert,9.875738e+06,658679.485018,2024,2
2,AF,forest,4.461370e+06,919656.960007,2024,2
3,AF,grassland,2.039020e+06,192683.234520,2024,2
4,AF,rocky/mountains,2.384026e+05,46963.725543,2024,2
...,...,...,...,...,...,...
1995,AS,mangroves,7.429267e+04,21277.220000,2020,1
1996,EU,mangroves,1.246190e+03,732.143750,2020,1
1997,,mangroves,2.415419e+03,2097.740000,2020,1
1998,SA,mangroves,3.989344e+04,27151.740000,2020,1


In [17]:
# Replace each habitat name with the id of the habitat
habitat_dict = {
    'mangroves': 5,
    'seamounts': 6,
    'artificial': 43,
    'forest': 45,
    'grassland': 46,
    'wetlands/open water': 50,
    'seagrasses': 2,
    'cold-water corals': 4,
    'desert': 44,
    'rocky/mountains': 47,
    'savanna': 48,
    'shrubland': 49,
    'saltmarshes': 1,
    'warm-water corals': 3
}

# `map` instead of `replace`: no downcasting FutureWarning, and a name without
# an id shows up as NaN instead of silently staying a string in an id column.
habitat_ids = habitats_all['habitat'].map(habitat_dict)
unmapped_habitats = habitats_all.loc[habitat_ids.isna(), 'habitat'].unique()
if len(unmapped_habitats) > 0:
    raise ValueError(
        f"No habitat id defined for: {sorted(map(str, unmapped_habitats))}"
    )

habitats_all['habitat'] = habitat_ids.astype('int64')
habitats_all = habitats_all.rename(columns={'location': 'location_id'})
habitats_all.head(10)

  habitats_all['habitat'] = habitats_all['habitat'].replace(habitat_dict)


Unnamed: 0,location_id,habitat,total_area,protected_area,year,environment
0,AF,43,2925993.0,190150.5,2024,2
1,AF,44,9875738.0,658679.5,2024,2
2,AF,45,4461370.0,919657.0,2024,2
3,AF,46,2039020.0,192683.2,2024,2
4,AF,47,238402.6,46963.73,2024,2
5,AF,48,8384999.0,1697340.0,2024,2
6,AF,49,1765385.0,159875.0,2024,2
7,AF,50,302189.5,50236.31,2024,2
8,AS,43,8063010.0,169383.2,2024,2
9,AS,44,3536380.0,299890.1,2024,2


In [18]:
# Run the shared `output` step on the habitats table and store the result as CSV
habitats_output = output(habitats_all, 'location_id', {}, {}, ['location_id'])
habitats_output.to_csv(output_file, index=True)

In [20]:
# Upload the habitats CSV to the bucket
remote_path = "vizzuality_processed_data/strapi_tables/habitats_exact.csv"

habitats_upload = dict(
    credentials=mysettings.GCS_KEYFILE_JSON,
    bucket_name=mysettings.GCS_BUCKET,
    blob_name=remote_path,
    file=output_file,
    operation="w",
)
writeReadGCP(**habitats_upload)

DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token
