### Set up

In [1]:
!pip install openpyxl

[0mDefaulting to user installation because normal site-packages is not writeable


In [4]:
import geopandas as gpd
import pandas as pd
from datetime import datetime

In [5]:
path_in = "../data/raw/"
path_out = "../data/processed/"

### Process habitats from [Ocean+](https://habitats.oceanplus.org/) (except mangroves)

**Country stats**

In [6]:
cold = pd.read_csv(path_in + "Ocean+HabitatsDownload_Global/coldwatercorals.csv")
salt = pd.read_csv(path_in + "Ocean+HabitatsDownload_Global/saltmarshes.csv")
sea = pd.read_csv(path_in + "Ocean+HabitatsDownload_Global/seagrasses.csv")
warm = pd.read_csv(path_in + "Ocean+HabitatsDownload_Global/warmwatercorals.csv")
glob = pd.read_excel(path_in + "Ocean+HabitatsDownload_Global/global-stats.xlsx")

In [7]:
cold2 = cold[['ISO3', 'protected_area', 'total_area']].rename(columns={'ISO3': 'location_id'})
salt2 = salt[['ISO3', 'protected_area', 'total_area']].rename(columns={'ISO3': 'location_id'})
sea2 = sea[['ISO3', 'protected_area', 'total_area']].rename(columns={'ISO3': 'location_id'})
warm2 = warm[['ISO3', 'protected_area', 'total_area']].rename(columns={'ISO3': 'location_id'})

In [8]:
# Remove rows with '-' in 'protected_area' or 'total_area'
cold2 = cold2[~cold2['protected_area'].str.contains('-') & ~cold2['total_area'].str.contains('-')]
salt2 = salt2[~salt2['protected_area'].str.contains('-') & ~salt2['total_area'].str.contains('-')]
sea2 = sea2[~sea2['protected_area'].str.contains('-') & ~sea2['total_area'].str.contains('-')]
warm2 = warm2[~warm2['protected_area'].str.contains('-') & ~warm2['total_area'].str.contains('-')]

In [9]:
# Replace 'ATA' with 'ABNJ' in 'location_id'
cold2 = cold2.replace('ATA', 'ABNJ')
salt2 = salt2.replace('ATA', 'ABNJ')
sea2 = sea2.replace('ATA', 'ABNJ')
warm2 = warm2.replace('ATA', 'ABNJ')

In [10]:
# Bring the eez file to get iso3 and parent_iso equivalences
eez = gpd.read_file(path_in + "World_EEZ_v11_20191118/eez_v11.shp")
eez.columns

Index(['MRGID', 'GEONAME', 'MRGID_TER1', 'POL_TYPE', 'MRGID_SOV1',
       'TERRITORY1', 'ISO_TER1', 'SOVEREIGN1', 'MRGID_TER2', 'MRGID_SOV2',
       'TERRITORY2', 'ISO_TER2', 'SOVEREIGN2', 'MRGID_TER3', 'MRGID_SOV3',
       'TERRITORY3', 'ISO_TER3', 'SOVEREIGN3', 'X_1', 'Y_1', 'MRGID_EEZ',
       'AREA_KM2', 'ISO_SOV1', 'ISO_SOV2', 'ISO_SOV3', 'UN_SOV1', 'UN_SOV2',
       'UN_SOV3', 'UN_TER1', 'UN_TER2', 'UN_TER3', 'geometry'],
      dtype='object')

In [11]:
# Initialize an empty dictionary
territory_iso = {}

# Iterate over each row in the DataFrame
for index, row in eez.iterrows():
    # Use TERRITORY1 as the key and ISO_SOV1 as the value
    territory = row['ISO_TER1']
    iso_sov = row['ISO_SOV1']
    
    # Check if the territory is not null or NaN
    if pd.notnull(territory) and pd.notnull(iso_sov):
        territory_iso[territory] = iso_sov

# Add ABNJ
territory_iso['ABNJ'] = 'ABNJ'  # Replace 'ABNJ_DEFAULT' with the desired default value


In [12]:
cold2[cold2['location_id']=='UMI']

Unnamed: 0,location_id,protected_area,total_area
120,UMI,9.38785685362166,9.38785685362166


In [13]:
# Create a mapping dictionary for ISO3-PARENT_ISO pairs and modify the 'location_id' column in the habitats dataframes
cold2['location_id'] = cold2['location_id'].map(territory_iso)
salt2['location_id'] = salt2['location_id'].map(territory_iso)
sea2['location_id'] = sea2['location_id'].map(territory_iso)
warm2['location_id'] = warm2['location_id'].map(territory_iso)

In [14]:
cold2[cold2['location_id']=='UMI']

Unnamed: 0,location_id,protected_area,total_area


In [15]:
# Convert the 'protected_area' and 'total_area' columns to numeric
dataframes = [cold2, salt2, sea2, warm2]

for df in dataframes:
    df['protected_area'] = pd.to_numeric(df['protected_area'])
    df['total_area'] = pd.to_numeric(df['total_area'])

In [16]:
# Group by 'location_id' and calculate the sum of 'protected_area' and 'total_area'
cold2_grouped = cold2.groupby('location_id').sum().reset_index()
salt2_grouped = salt2.groupby('location_id').sum().reset_index()
sea2_grouped = sea2.groupby('location_id').sum().reset_index()
warm2_grouped = warm2.groupby('location_id').sum().reset_index()

In [17]:
# Add the 'habitat_name' column
cold2_grouped['habitat_name'] = 'cold-water corals'
salt2_grouped['habitat_name'] = 'saltmarshes'
sea2_grouped['habitat_name'] = 'seagrasses'
warm2_grouped['habitat_name'] = 'warm-water corals'

In [18]:
# Concatenate the dataframes
habitats = pd.concat([cold2_grouped, salt2_grouped, sea2_grouped, warm2_grouped])
habitats['year'] = datetime.now().year
habitats.head(2)

Unnamed: 0,location_id,protected_area,total_area,habitat_name,year
0,ABNJ,427.048524,1893.871282,cold-water corals,2024
1,AGO,0.0,3.395671,cold-water corals,2024


**Global stats**

In [19]:
# Calculate global stats for habitats
habitats_global = glob[['habitat','protected_area', 'total_area']].rename(columns={'habitat': 'habitat_name'})
habitats_global['location_id'] = 'GLOB'
habitats_global['year'] = datetime.now().year
habitats_global = habitats_global[habitats_global['habitat_name'] != 'mangroves'] # remove mangroves
habitats_global

Unnamed: 0,habitat_name,protected_area,total_area,location_id,year
0,saltmarsh,111638.252564,224435.075094,GLOB,2024
1,seagrass,74787.44996,314001.9406,GLOB,2024
2,warmwater-corals,63259.49913,149886.974126,GLOB,2024
4,coldwater-corals,4400.140842,15336.97528,GLOB,2024


In [20]:
# Change the name of the habitats to match the ones in the habitats dataframe
habitat_name_mapping = {
    'saltmarsh': 'saltmarshes',
    'seagrass': 'seagrasses',
    'warmwater-corals': 'warm-water corals',
    'coldwater-corals': 'cold-water corals'
}
habitats_global['habitat_name'] = habitats_global['habitat_name'].replace(habitat_name_mapping)
habitats_global

Unnamed: 0,habitat_name,protected_area,total_area,location_id,year
0,saltmarshes,111638.252564,224435.075094,GLOB,2024
1,seagrasses,74787.44996,314001.9406,GLOB,2024
2,warm-water corals,63259.49913,149886.974126,GLOB,2024
4,cold-water corals,4400.140842,15336.97528,GLOB,2024


In [21]:
# Concatenate the global stats to the habitats dataframe
habitats = pd.concat([habitats, habitats_global])
habitats['habitat_name'].unique()

array(['cold-water corals', 'saltmarshes', 'seagrasses',
       'warm-water corals'], dtype=object)

**Regions stats**

In [23]:
# List of dictionaries for data in Region_ISO3_PP.txt (list of regions used in the Protected Planet database)
regions_data = [
    {
        'region_iso': 'AS',
        'region_name': 'Asia & Pacific',
        'country_iso_3s': [
            "AFG", "ASM", "AUS", "BGD", "BRN", "BTN", "CCK", "CHN", "COK", "CXR", "FJI", "FSM", "GUM", "HKG", "IDN",
            "IND", "IOT", "IRN", "JPN", "KHM", "KIR", "KOR", "LAO", "LKA", "MAC", "MDV", "MHL", "MMR", "MNG", "MNP",
            "MYS", "NCL", "NFK", "NIU", "NPL", "NRU", "NZL", "PAK", "PCN", "PHL", "PLW", "PNG", "PRK", "PYF", "SGP",
            "SLB", "THA", "TKL", "TLS", "TON", "TUV", "TWN", "VNM", "VUT", "WLF", "WSM"
        ]
    },
    {
        'region_iso': 'AF',
        'region_name': 'Africa',
        'country_iso_3s': [
            "AGO", "BDI", "BEN", "BFA", "BWA", "CAF", "CIV", "CMR", "COD", "COG", "COM", "CPV", "DJI", "DZA", "EGY",
            "ERI", "ESH", "ETH", "GAB", "GHA", "GIN", "GMB", "GNB", "GNQ", "KEN", "LBR", "LBY", "LSO", "MAR", "MDG",
            "MLI", "MOZ", "MRT", "MUS", "MWI", "MYT", "NAM", "NER", "NGA", "REU", "RWA", "SDN", "SEN", "SHN", "SLE",
            "SOM", "SSD", "STP", "SWZ", "SYC", "TCD", "TGO", "TUN", "TZA", "UGA", "ZAF", "ZMB", "ZWE"
        ]
    },
    {
        'region_iso': 'EU',
        'region_name': 'Europe',
        'country_iso_3s': [
            "ALA", "ALB", "AND", "ARM", "AUT", "AZE", "BEL", "BGR", "BIH", "BLR", "CHE", "CYP", "CZE", "DEU", "DNK",
            "ESP", "EST", "FIN", "FRA", "FRO", "GBR", "GEO", "GGY", "GIB", "GRC", "HRV", "HUN", "IMN", "IRL", "ISL",
            "ISR", "ITA", "JEY", "KAZ", "KGZ", "LIE", "LTU", "LUX", "LVA", "MCO", "MDA", "MKD", "MLT", "MNE", "NLD",
            "NOR", "POL", "PRT", "ROU", "RUS", "SJM", "SMR", "SRB", "SVK", "SVN", "SWE", "TJK", "TKM", "TUR", "UKR",
            "UZB", "VAT"
        ]
    },
    {
        'region_iso': 'SA',
        'region_name': 'Latin America & Caribbean',
        'country_iso_3s': [
            "ABW", "AIA", "ARG", "ATG", "BES", "BHS", "BLM", "BLZ", "BMU", "BOL", "BRA", "BRB", "CHL", "COL", "CRI",
            "CUB", "CUW", "CYM", "DMA", "DOM", "ECU", "FLK", "GLP", "GRD", "GTM", "GUF", "GUY", "HND", "HTI", "JAM",
            "KNA", "LCA", "MAF", "MEX", "MSR", "MTQ", "NIC", "PAN", "PER", "PRI", "PRY", "SLV", "SUR", "SXM", "TCA",
            "TTO", "UMI", "URY", "VCT", "VEN", "VGB", "VIR"
        ]
    },
    {
        'region_iso': 'PO',
        'region_name': 'Polar',
        'country_iso_3s': [
            "ATF", "BVT", "GRL", "HMD", "SGS"
        ]
    },
    {
        'region_iso': 'NA',
        'region_name': 'North America',
        'country_iso_3s': [
            "CAN", "SPM", "USA"
        ]
    },
    
    {
        'region_iso': 'WA',
        'region_name': 'West Asia',
        'country_iso_3s': [
            "ARE", "BHR", "IRQ", "JOR", "KWT", "LBN", "OMN", "PSE", "QAT", "SAU", "SYR", "YEM"
        ]
    }
]

# Convert the region data to a dictionary that maps each country to its region name
country_to_region = {}
for region in regions_data:
    for country in region['country_iso_3s']:
        country_to_region[country] = region['region_iso']

In [24]:
# Add regions field
habitats_regions = habitats.copy()
habitats_regions['region'] = habitats['location_id'].map(country_to_region)

# Calculate stats for each region
habitats_regions = habitats_regions.groupby(['region', 'habitat_name']).agg({'protected_area': 'sum', 'total_area': 'sum'}).reset_index()
habitats_regions['year'] = datetime.now().year
habitats_regions.rename(columns={'region': 'location_id'}, inplace=True)
habitats_regions

Unnamed: 0,location_id,habitat_name,protected_area,total_area,year
0,AF,cold-water corals,29.477984,377.605959,2024
1,AF,saltmarshes,6688.702879,19847.757498,2024
2,AF,seagrasses,6319.099491,63472.068792,2024
3,AF,warm-water corals,6591.340083,15615.193629,2024
4,AS,cold-water corals,428.357948,1714.559384,2024
5,AS,saltmarshes,11965.69391,44702.805187,2024
6,AS,seagrasses,29091.313202,123320.727798,2024
7,AS,warm-water corals,41328.384526,100117.415792,2024
8,EU,cold-water corals,2665.929517,7307.501117,2024
9,EU,saltmarshes,11399.882318,18450.550092,2024


In [25]:
# Concatenate region statistics to the habitats dataframe
habitats = pd.concat([habitats, habitats_regions])

In [26]:
# Add environment
habitats['environment'] = 'marine'
habitats.head(2)

Unnamed: 0,location_id,protected_area,total_area,habitat_name,year,environment
0,ABNJ,427.048524,1893.871282,cold-water corals,2024,marine
1,AGO,0.0,3.395671,cold-water corals,2024,marine


In [27]:
habitats.to_csv(path_out + "habitats/ocean+_processed.csv", index=False)

### Process seamounts from [UN WCMC](https://data.unep-wcmc.org/datasets/41)

In [28]:
# Read required data
seamounts = gpd.read_file(path_in + "Seamounts/seamounts.shp")
eez = gpd.read_file(path_out + "/administrative/eez_area_mollweide.shp")
hs = gpd.read_file(path_in + "/high_seas/high_seas.shp")
protected_areas = gpd.read_file(path_out + "wdpa/timeseries/protected_dissolved_2023.shp").to_crs("EPSG:4326")

In [29]:
# Keep relevant fields in eez and hs and merge them in one dataframe
eez = eez[['SOVEREIGN1', 'SOVEREIGN2', 'SOVEREIGN3','ISO_SOV1', 'ISO_SOV2', 'ISO_SOV3', 'geometry']]
hs = hs[['geometry']]
hs['SOVEREIGN1'] = 'High Seas'
hs['ISO_SOV1'] = 'ABNJ'
eez_hs = eez.merge(hs, how='outer')

In [30]:
# Join eez/highseas info to seamounts falling within eez polygons and drop those not associated with any of them
seamounts_eez = gpd.sjoin(seamounts, eez_hs, how="left", predicate="within")
seamounts_eez = seamounts_eez.dropna(subset=['ISO_SOV1'])

In [31]:
# Create new column "iso" with the iso_sov codes from eez/hs data
def concatenate_iso(row):
    iso_list = [row['ISO_SOV1']]
    if not pd.isna(row['ISO_SOV2']):
        iso_list.append(row['ISO_SOV2'])
    if not pd.isna(row['ISO_SOV3']):
        iso_list.append(row['ISO_SOV3'])
    return ';'.join(iso_list)

seamounts_eez['iso'] = seamounts_eez.apply(concatenate_iso, axis=1)

In [32]:
# Check which seamounts are protected
seamounts_wdpa = gpd.sjoin(seamounts, protected_areas, how="left", predicate="within")
seamounts_wdpa['protection'] = "no"  
seamounts_wdpa.loc[~seamounts_wdpa['index_right'].isna(), 'protection'] = "yes"

# Keep relevant fields
seamounts_wdpa = seamounts_wdpa[['PEAKID', 'protection']]

In [33]:
# Merge information about protection with seamounts_eez and keep only relevant fields
seamounts_eez_protection = seamounts_eez.merge(seamounts_wdpa, how="left", on='PEAKID')[['PEAKID', 'iso', 'AREA2D', 'protection', 'geometry']]
seamounts_eez_protection.head()

Unnamed: 0,PEAKID,iso,AREA2D,protection,geometry
0,26000,DNK,982.028337,no,POINT (2.76250 84.97974)
1,26157,ABNJ,348.473055,no,POINT (9.14306 84.93529)
2,26158,ABNJ,367.54038,no,POINT (9.18333 84.93807)
3,26228,ABNJ,299.443636,no,POINT (8.74861 84.90751)
4,26229,ABNJ,309.588492,no,POINT (8.88750 84.91307)


In [34]:
# All seamounts that have iso "ATA" should have iso "ABNJ"
seamounts_eez_protection.loc[seamounts_eez_protection['iso']=='ATA', 'iso'] = 'ABNJ'
seamounts_eez_protection[seamounts_eez_protection['iso']=='ATA']

Unnamed: 0,PEAKID,iso,AREA2D,protection,geometry


**Global stats**

In [35]:
# Calculate the total_area (sum of AREA2D for all seamounts)
total_area = seamounts_eez_protection['AREA2D'].sum()

# Calculate the protected_area (sum of AREA2D for seamounts where 'protection' is 'yes')
protected_area = seamounts_eez_protection.loc[seamounts_eez_protection['protection'] == 'yes', 'AREA2D'].sum()

# Create a DataFrame with the results
seamounts_global = pd.DataFrame({
    'habitat_name': ['seamounts'],
    'total_area': [total_area],
    'protected_area': [protected_area],
    'location_id': ['GLOB'],
    'year': [2011]
})

seamounts_global


Unnamed: 0,habitat_name,total_area,protected_area,location_id,year
0,seamounts,26908100.0,3426630.0,GLOB,2011


**Country stats**

In [36]:
# Split the 'iso_code' values and create separate rows only for those with multiple values
mask = seamounts_eez_protection['iso'].str.contains(';', na=False)
split_rows = seamounts_eez_protection[mask].copy()
split_rows['iso'] = split_rows['iso'].str.split(';')
split_rows = split_rows.explode('iso')

# Keep rows with single values in 'iso_code'
single_value_rows = seamounts_eez_protection[~mask]

# Concatenate the exploded rows with the single value rows
seamounts_eez_iso = pd.concat([single_value_rows, split_rows], ignore_index=True)

In [37]:
# Calculate the total_area (sum of AREA2D for all seamounts)
total_area_iso = seamounts_eez_iso.groupby(['iso']).agg({'AREA2D': 'sum'}).reset_index().rename(columns={'AREA2D': 'total_area'})

# Calculate the protected_area_iso (sum of AREA2D for seamounts where 'protection' is 'yes' grouped by 'iso')
protected_area_iso = seamounts_eez_iso.loc[seamounts_eez_iso['protection'] == 'yes'].groupby('iso')['AREA2D'].sum().reset_index().rename(columns={'AREA2D': 'protected_area'})

In [38]:
# Merge total_area_iso and protected_area_iso DataFrames on 'iso'
seamounts_iso = total_area_iso.merge(protected_area_iso, how='left', on='iso').rename(columns={'iso': 'location_id'})
seamounts_iso['habitat_name'] = 'seamounts'
seamounts_iso['year'] = 2011
seamounts_iso

Unnamed: 0,location_id,total_area,protected_area,habitat_name,year
0,ABNJ,1.518615e+07,308819.904730,seamounts,2011
1,AGO,9.556242e+03,,seamounts,2011
2,ARG,3.110730e+05,303902.727433,seamounts,2011
3,ATG,6.215895e+03,,seamounts,2011
4,AUS,4.772977e+05,250507.827932,seamounts,2011
...,...,...,...,...,...
87,VNM,4.421338e+04,,seamounts,2011
88,VUT,1.199475e+05,43501.694036,seamounts,2011
89,WSM,4.117997e+04,,seamounts,2011
90,YEM,6.294974e+04,2487.428050,seamounts,2011


**Regions stats**

In [39]:
seamounts_regions = seamounts_iso.copy()
seamounts_regions['region'] = seamounts_regions['location_id'].map(country_to_region)

# Calculate stats for each region
seamounts_regions = seamounts_regions.groupby(['region', 'habitat_name']).agg({'protected_area': 'sum', 'total_area': 'sum'}).reset_index()
seamounts_regions['year'] = 2011
seamounts_regions.rename(columns={'region': 'location_id'}, inplace=True)
seamounts_regions

Unnamed: 0,location_id,habitat_name,protected_area,total_area,year
0,AF,seamounts,173357.6,616235.1,2011
1,AS,seamounts,1114013.0,5433433.0,2011
2,EU,seamounts,747244.1,2641119.0,2011
3,,seamounts,554491.0,1664794.0,2011
4,SA,seamounts,847448.8,1655552.0,2011
5,WA,seamounts,2487.428,93847.65,2011


In [40]:
# Concatenate region and global stats to seamounts_iso2
seamounts_all = pd.concat([seamounts_iso, seamounts_regions, seamounts_global])

In [41]:
# Add environment
seamounts_all['environment'] = 'marine'

In [42]:
seamounts_all.to_csv(path_out + "habitats/seamounts_processed.csv", index=False)

### Process mangroves from GMW

In [43]:
mangroves = pd.read_csv(path_in + "mangroves/mangroves.csv")

In [44]:
# Change location_id to match parent_iso from eez
mangroves['location_id'] = mangroves['location_id'].map(territory_iso)

In [45]:
mangroves_iso = mangroves.groupby('location_id').agg({
    'habitat_name': 'first',  
    'year': 'first',          
    'protected_area': 'sum',  
    'total_area': 'sum'       
}).reset_index()


In [46]:
# Calculate global stats for mangroves
mangroves_global = mangroves_iso.groupby(['habitat_name']).agg({'protected_area': 'sum', 'total_area': 'sum'}).reset_index()
mangroves_global['location_id'] = 'GLOB'
mangroves_global['year'] = 2020
mangroves_global

Unnamed: 0,habitat_name,protected_area,total_area,location_id,year
0,mangroves,61287.20375,147358.990971,GLOB,2020


In [47]:
# Concatenate the global stats to the mangroves dataframe
mangroves_all = pd.concat([mangroves_iso, mangroves_global])

In [48]:
mangroves_regions = mangroves_iso.copy()
mangroves_regions['region'] = mangroves_iso['location_id'].map(country_to_region)

# Calculate stats for each region
mangroves_regions = mangroves_regions.groupby(['region', 'habitat_name']).agg({'protected_area': 'sum', 'total_area': 'sum'}).reset_index()
mangroves_regions['year'] = 2020
mangroves_regions.rename(columns={'region': 'location_id'}, inplace=True)
mangroves_regions

Unnamed: 0,location_id,habitat_name,protected_area,total_area,year
0,AF,mangroves,10000.53,29337.644045,2020
1,AS,mangroves,21277.22,74292.673146,2020
2,EU,mangroves,732.14375,1246.189677,2020
3,,mangroves,2097.74,2415.418557,2020
4,SA,mangroves,27151.74,39893.444608,2020
5,WA,mangroves,27.83,173.620938,2020


In [49]:
# Concatenate stats for regions with mangroves
mangroves_all = pd.concat([mangroves_all, mangroves_regions])

In [50]:
# Add environment
mangroves_all['environment'] = 'marine'

In [51]:
# Save file
mangroves_all.to_csv(path_out + "habitats/mangroves_processed.csv", index=False)

### Concatenate all habitats

In [52]:
# Concatenate the dataframes
habitats_all = pd.concat([habitats, seamounts_all, mangroves_all])
habitats_all

Unnamed: 0,location_id,protected_area,total_area,habitat_name,year,environment
0,ABNJ,427.048524,1893.871282,cold-water corals,2024,marine
1,AGO,0.000000,3.395671,cold-water corals,2024,marine
2,ALB,0.000000,5.986479,cold-water corals,2024,marine
3,ARG,6.984226,61.826344,cold-water corals,2024,marine
4,ATG,0.000000,0.997747,cold-water corals,2024,marine
...,...,...,...,...,...,...
1,AS,21277.220000,74292.673146,mangroves,2020,marine
2,EU,732.143750,1246.189677,mangroves,2020,marine
3,,2097.740000,2415.418557,mangroves,2020,marine
4,SA,27151.740000,39893.444608,mangroves,2020,marine


In [55]:
habitats_all[habitats_all['location_id'] == 'ABNJ']

Unnamed: 0,location_id,protected_area,total_area,habitat_name,year,environment
0,ABNJ,427.048524,1893.871,cold-water corals,2024,marine
0,ABNJ,0.0,6335.727,seagrasses,2024,marine
0,ABNJ,308819.90473,15186150.0,seamounts,2011,marine


In [56]:
habitats_all.to_csv(path_out + "tables/habitats6.csv", index=False, na_rep='NaN', encoding='utf-8', sep=',', decimal='.')