In [1]:
import pandas as pd
import os
import psycopg2 as pg
from psycopg2 import sql
from typing import Union

# Getting PAs from cep

In [2]:
valid_answer = True
while (valid_answer):
    answer = input('Use Cloud DB? (y/n):').lower()
    if answer.lower() == 'n' or answer.lower() == 'no':
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT')
        db_user = os.environ.get('DB_USER')
        db_password = os.environ.get('DB_PASSWORD')
        db_name = os.environ.get('DB_NAME')
        valid_answer = False
        print('Using local DB')
    elif answer.lower() == 'y' or answer.lower() =='yes':
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT')
        db_user = os.environ.get('DB_USER')
        db_password = os.environ.get('DB_PASSWORD')
        db_name = os.environ.get('DB_NAME')
        valid_answer = False
        print('Using Cloud DB')
    else:
        print('Invalid input. Please enter y or n.')

db_url = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}'

Using local DB


In [5]:
def connect_and_run_query(query: Union[str, sql.Composed], to_pandas: bool = False):
    conn = pg.connect(
        database=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        if to_pandas:
            results = pd.DataFrame(cursor.fetchall(), columns=[desc[0] for desc in cursor.description])
        else:
            results = cursor.fetchall()
        cursor.close()
    except Exception as e:
        print(e)
        conn.rollback()
    finally:
        conn.close()
    return results

In [5]:
# get all PAs. explode the list into a set datatype then print the length of the set
query = """
SELECT pa 
FROM cep_grouped
"""
all_pas = connect_and_run_query(query, to_pandas=False)
# for each pa explode the list into individual elements and add to a set
all_pas = [x[0].split(',') for x in all_pas]


In [13]:
#flattens the list of lists all_pas into a single set of unique elements.
pa_set = {int(x) for sublist in all_pas for x in sublist}
print("length of set: " + str(len(pa_set))) # len should be 275887
# make into a dataframe 
pa_df = pd.DataFrame(pa_set, columns=['pa'])
pa_df.set_index('pa', inplace=True)

# check for any duplicate index values
if len(pa_df[pa_df.index.duplicated()]) != 0:
    print('Duplicates found in index')
    pa_df = pa_df[~pa_df.index.duplicated(keep='first')]

# save to csv
pa_df.to_csv('../data/pa_list.csv')

length of set: 275887


0
1
2


# Comparing with WDPA


In [2]:
db_host = 'localhost'
db_port = 5432
db_name = 'db'
db_user = 'admin'
db_password = 'admin'

db_url = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}'

In [3]:
# read ./data/pa_list.csv into a pandas dataframe
pa_list = pd.read_csv('../data/pa_list.csv')
print(len(pa_list))
pa_list.head()

275887


Unnamed: 0,pa
0,555518370
1,103430
2,555517487
3,188243
4,555725436


In [None]:
#insert pa_list as a temporary table in the database (to only be used for the next query)
conn = pg.connect(
    database=db_name,
    user=db_user,
    password=db_password,
    host=db_host,
    port=5432
)

query = """
CREATE TABLE temp_pa_list (
    pa serial PRIMARY KEY
);
"""
cursor = conn.cursor()
cursor.execute(query)

# insert data into the temp_pa_list table
for index, row in pa_list.iterrows():
    query = f"""
    INSERT INTO temp_pa_list
    VALUES ({row['pa']});
    """
    cursor.execute(query)
conn.commit()
cursor.close()
conn.close()

In [39]:
query = """
SELECT wdpa."WDPAID", wdpa."REP_AREA", wdpa."NAME",wdpa."STATUS", wdpa."DESIG_ENG"
FROM wdpa_distinct_wdpaid AS wdpa
INNER JOIN temp_pa_list AS tpl
ON wdpa."WDPAID" = tpl.pa
WHERE wdpa."STATUS" IN ('Not Reported', 'Proposed')
OR wdpa."DESIG_ENG" LIKE '%UNESCO%'
OR wdpa."REP_AREA" = 0;
"""

wdpa_invalid_pa_polygons = connect_and_run_query(query, to_pandas=True)
wdpa_invalid_pa_polygons.set_index('WDPAID', inplace=True)
wdpa_invalid_pa_polygons.index = wdpa_invalid_pa_polygons.index.astype(int)


# now do points
query = """
SELECT wdpa."WDPAID", wdpa."REP_AREA", wdpa."NAME",wdpa."STATUS", wdpa."DESIG_ENG"
FROM wdpa_points AS wdpa
INNER JOIN temp_pa_list AS tpl
ON wdpa."WDPAID" = tpl.pa
WHERE wdpa."STATUS" IN ('Not Reported', 'Proposed')
OR wdpa."DESIG_ENG" LIKE '%UNESCO%'
OR wdpa."REP_AREA" = 0;
"""

wdpa_invalid_pa_points = connect_and_run_query(query, to_pandas=True)
wdpa_invalid_pa_points.set_index('WDPAID', inplace=True)
wdpa_invalid_pa_points.index = wdpa_invalid_pa_points.index.astype(int)

# concatenate the two dataframes
df = pd.concat([wdpa_invalid_pa_polygons, wdpa_invalid_pa_points])
df

  df = pd.concat([wdpa_invalid_pa_polygons, wdpa_invalid_pa_points])


Unnamed: 0_level_0,REP_AREA,NAME,STATUS,DESIG_ENG
WDPAID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
36564,0.0,Kagurna,Designated,Forest Reserve
300852,0.0,Classified Forest Name Unknown (CIV) No.46,Designated,Classified Forest
555707347,0.0,Peshhera Geograficheskogo Obshhestva,Designated,Natural Monument
36572,0.0,Puissa,Designated,Forest Reserve
300896,0.0,Forest Reserve Name Unknown (NGA) No.69,Designated,Forest Reserve
...,...,...,...,...
555592561,0.0,Monavale Wetland,Designated,"Ramsar Site, Wetland of International Importance"
36725,0.0,Kurmi Agori,Designated,Forest Reserve
36862,0.0,Ibaji-Ojok,Designated,Forest Reserve
36375,0.0,Dogan Dawa,Designated,Forest Reserve


In [42]:
df.loc[df['DESIG_ENG'].str.contains('UNESCO', case=False, na=False)]

Unnamed: 0_level_0,REP_AREA,NAME,STATUS,DESIG_ENG
WDPAID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
555596201,1149.801498,Troodos UNESCO GLOBAL Geopark,Designated,UNESCO Global Geopark


In [45]:
# drop temp_pa_list table
conn = pg.connect(
    database=db_name,
    user=db_user,
    password=db_password,
    host=db_host,
    port=5432
)

query = """
DROP TABLE temp_pa_list;
"""
cursor = conn.cursor()
cursor.execute(query)
conn.commit()
cursor.close()
conn.close()

# Verify the PAs exist in the cep layer

In [12]:
class FixingCEPAttributes():
    def __init__(self, cep_attributes_path):
        self.cep_df = pd.read_csv(cep_attributes_path, index_col='cid')

    def fix_NaN_country(self):
        """
        Fix the missing country values in the CEP attributes
        """
        # get rows where NaN or null country values
        missing_country = self.cep_df[self.cep_df['country_name'].isnull()].index.values

        # There should only be one missing country which is cid 1 and we can manually update it
        if len(missing_country) == 1:
            # update cid 1 with country (code): 171, country name: Lithuania, iso3: LTU
            self.cep_df.loc[1, 'country'] = 171
            self.cep_df.loc[1, 'country_name'] = 'Lithuania'
            self.cep_df.loc[1, 'iso3'] = 'LTU'

    def fix_missing_ids(self):
        """
        In the cid_index.csv there is one missing id when compared to the raster tiff files from 1 to 463710 ids (inclusive) which is 295147
        after investing the data in QGIS we found that the missing id is a single pixel in the conifer forests of Russia (should be the same as cid 339212)
        """
        # in cep_df check if there are any missing ids from 1 to 463710 ids (inclusive)
        missing_ids = set(range(1, 463711)) - set(self.cep_df.index)
        missing_ids # {cid: 295147}

        if missing_ids == {295147}:
            # maunally insert missing record for id 295147 (copy from 339212 with different id)
            self.cep_df.loc[295147] = self.cep_df.loc[339212].copy()

    def concatenate_PAs(self):
        """
        Concatenate the PAs of the same cid, country and ecoregion into a single row
        """
        dupes = self.cep_df.reset_index().groupby(['cid','country', 'eco']).filter(lambda x: len(x) > 1)
        dupes = dupes.groupby(['cid','country', 'eco']).agg(
            {
                'country_name':'first',
                'iso3':'first',
                'eco_name':'first',
                'is_marine':'first',
                # concatenate pa ids and pa names into strings NOT lists
                'pa': lambda x: ','.join(map(str, x)),
                'pa_name': lambda x: ','.join(map(str, x)),
                'is_protected': 'first'
            })
        dupes = dupes.reset_index()

        non_dupes = self.cep_df.reset_index().groupby(['cid','country', 'eco']).filter(lambda x: len(x) == 1)

        # combine dupes and non_dupes together
        self.cep_df = pd.concat([non_dupes, dupes])
        self.cep_df = self.cep_df.set_index('cid')
        self.cep_df

    def verify_fixes(self):

        missing_countries = self.cep_df[self.cep_df['country_name'].isnull()].index.values
        if len(missing_countries) != 0:
            print(f'Missing countries: {missing_countries}')

        # get max index
        max_index = self.cep_df.index.max()
        if len(self.cep_df) != max_index: 
            print(f'Index does not match cep_ids, max index: {max_index}, number of rows: {len(self.cep_df)}')
            missing_ids = set(range(1, max_index)) - set(self.cep_df.index) # should be empty
            print(f'Missing ids: {missing_ids}')

        print('All fixes verified')

    def fix_all(self):
        self.fix_NaN_country()
        self.fix_missing_ids()
        self.concatenate_PAs()
        self.verify_fixes()
        return self.cep_df

In [44]:
# estimated time to run: 27 seconds
cep_df = FixingCEPAttributes("C:\\Users\\riyad\\Documents\\Subjects\\WaterQualityContract\\JRC\\cep_oecm202302\\cep_oecm202302\\cid_index_202302.csv").fix_all()
# cep_df.head()
# check if ids are in cep_df
cep_df.loc[
    cep_df['pa'].isin(df.index.values)
]

Unnamed: 0_level_0,country,country_name,iso3,eco,eco_name,is_marine,pa,pa_name,is_protected
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
68,1,Area Beyond National Jurisdiction,ABNJ,23,North Atlantic Current,True,555557112,North West Rockall Bank,True
118,1,Area Beyond National Jurisdiction,ABNJ,36,Subarctic Atlantic,True,555557017,Hatton Bank,True
162,3,Afghanistan,AFG,81004,Ghorat-Hazarajat alpine meadow,False,555705307,Bamyan Plateau,True
181,3,Afghanistan,AFG,81301,Afghan Mountains semi-desert,False,555705307,Bamyan Plateau,True
197,3,Afghanistan,AFG,81322,Paropamisus xeric woodlands,False,555705307,Bamyan Plateau,True
...,...,...,...,...,...,...,...,...,...
464364,303,Zimbabwe,ZWE,30719,Southern Miombo woodlands,False,555592561,Monavale Wetland,True
464579,303,Zimbabwe,ZWE,30725,Zambezian and Mopane woodlands,False,312892,Unknown 16,True
464584,303,Zimbabwe,ZWE,30725,Zambezian and Mopane woodlands,False,555592562,Victoria Falls National Park,True
464644,303,Zimbabwe,ZWE,30726,Zambezian Baikiaea woodlands,False,301723,Forest Reserve Name Unknown (ZWE) No.1,True


In [46]:
cep_df.loc[cep_df['pa'] == 555596201]

Unnamed: 0_level_0,country,country_name,iso3,eco,eco_name,is_marine,pa,pa_name,is_protected
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
74082,72,Cyprus,CYP,81206,Cyprus Mediterranean forests,False,555596201,Troodos UNESCO GLOBAL Geopark,True


As in other global PA assessments (see e.g. [19–21]), we excluded from analysis those PAs with a “proposed” or “not reported” status, sites reported as points without an associated reported area, and UNESCO Man and the Biosphere Reserves

In [57]:
# calculate the area of the PA with id 555596201
query = """
SELECT ST_Area(wdpa.geom::geography) / 1000000 as area, wdpa."REP_AREA"
FROM wdpa_merged AS wdpa
WHERE wdpa."WDPAID" = 555596201;
"""
area = connect_and_run_query(query, to_pandas=False)
print(area)
# convert to square kilometers
area[0][0] / 1000000



[(1149.8014994103235, 1149.8014979)]


0.0011498014994103234