In [88]:
import pandas as pd
import os
import psycopg2 as pg
from psycopg2 import sql
import plotly.express as px
from typing import Union
from sqlalchemy import create_engine

In [78]:
valid_answer = True
while (valid_answer):
    answer = input('Use Cloud DB? (y/n):').lower()
    if answer.lower() == 'n' or answer.lower() == 'no':
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT')
        db_user = os.environ.get('DB_USER')
        db_password = os.environ.get('DB_PASSWORD')
        db_name = os.environ.get('DB_NAME')
        valid_answer = False
        print('Using local DB')
    elif answer.lower() == 'y' or answer.lower() =='yes':
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT')
        db_user = os.environ.get('DB_USER')
        db_password = os.environ.get('DB_PASSWORD')
        db_name = os.environ.get('DB_NAME')
        valid_answer = False
        print('Using Cloud DB')
    else:
        print('Invalid input. Please enter y or n.')

db_url = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}'

Using local DB


In [12]:
def connect_and_run_query(query: Union[str, sql.Composed], to_pandas: bool = False):
    results = None
    conn = pg.connect(
        database=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        if to_pandas:
            results = pd.DataFrame(cursor.fetchall(), columns=[desc[0] for desc in cursor.description])
        else:
            results = cursor.fetchall()
        cursor.close()
    except Exception as e:
        print(e)
        conn.rollback()
    finally:
        conn.close()
        return results 

In [36]:
conn = pg.connect(
    database=db_name,
    user=db_user,
    password=db_password,
    host=db_host,
    port=5432
)

# IF EXISTS THEN REPLACE cep_grouped "TABLE (extra"ct cid, country, eco, pa)
query = """
CREATE TABLE IF NOT EXISTS cep_water_grouped AS
SELECT cep_id, country, eco, pa, is_protected, cep_grouped."transition_0", cep_grouped."transition_1", cep_grouped."transition_2", cep_grouped."transition_3", cep_grouped."transition_4", cep_grouped."transition_5", cep_grouped."transition_6", cep_grouped."transition_7", cep_grouped."transition_8", cep_grouped."transition_9", cep_grouped."transition_10"
FROM cep_grouped
"""

cursor = conn.cursor()
cursor.execute(query)
conn.commit()


# Ecoregions lookup table

In [40]:
# make a new ecoregions table and relate it to the cep_water_grouped table
query = """
CREATE TABLE IF NOT EXISTS ecoregions AS (
SELECT DISTINCT eco, eco_name, is_marine
FROM cep_grouped
)
"""

cursor = conn.cursor()
cursor.execute(query)
conn.commit()

query = """
ALTER TABLE ecoregions
ADD CONSTRAINT pk_eco PRIMARY KEY (eco);
ALTER TABLE cep_water_grouped
ADD CONSTRAINT fk_eco FOREIGN KEY (eco) REFERENCES ecoregions(eco);
"""

cursor = conn.cursor()
cursor.execute(query)
conn.commit()
    

# Country lookup table

In [41]:
#make a new countries table and relate it to the cep_water_grouped table
query = """
CREATE TABLE IF NOT EXISTS countries AS (
SELECT 
    country,
    country_name,
    iso3,
    SUM(
    transition_0 + 
    + transition_1 
    + transition_2 
    + transition_3
    + transition_4
    + transition_5
    + transition_6
    + transition_7
    + transition_8
    + transition_9
    + transition_10
    ) / 1000000 
    AS calculated_area_km2
    FROM cep_grouped
    GROUP BY country_name,country,iso3
    ORDER BY country_name
)
"""

cursor = conn.cursor()
cursor.execute(query)
conn.commit()

# relate the countries table to the cep_water_grouped table
query = """
ALTER TABLE countries
ADD CONSTRAINT pk_country PRIMARY KEY (country);
ALTER TABLE cep_water_grouped
ADD CONSTRAINT fk_country FOREIGN KEY (country) REFERENCES countries(country);
"""

cursor = conn.cursor()
cursor.execute(query)
conn.commit()

# Getting all PA in cep table

In [None]:
# get all PAs. explode the list into a set datatype then print the length of the set
query = """
SELECT pa 
FROM cep_grouped
"""
all_pas = connect_and_run_query(query, to_pandas=False)
# for each pa explode the list into individual elements and add to a set
all_pas = [x[0].split(',') for x in all_pas]

#flattens the list of lists all_pas into a single set of unique elements.
all_pas = {int(x) for sublist in all_pas for x in sublist}
print("length of set: " + str(len(all_pas))) # len should be 275887
# make into a dataframe 
all_pas = pd.DataFrame(all_pas, columns=['pa'])
all_pas.set_index('pa', inplace=True)

# check for any duplicate index values
if len(all_pas[all_pas.index.duplicated()]) != 0:
    print('Duplicates found in index')
    all_pas = all_pas[~all_pas.index.duplicated(keep='first')]

# save to csv
all_pas.to_csv('../data/pa_list.csv')

# PA database

In [42]:
# read ./data/pa_list.csv into a pandas dataframe
pa_list = pd.read_csv('./data/pa_list.csv')
print(len(pa_list))
pa_list.head()

275887


Unnamed: 0,pa
0,555518370
1,103430
2,555517487
3,188243
4,555725436


In [43]:
db_host = 'localhost'
db_port = 5432
db_name = 'db'
db_user = 'admin'
db_password = 'admin'
db_url = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}'

In [44]:
#insert pa_list as a temporary table in the database (to only be used for the next query)
conn = pg.connect(
    database=db_name,
    user=db_user,
    password=db_password,
    host=db_host,
    port=5432
)

query = """
CREATE TABLE temp_pa_list (
    pa serial PRIMARY KEY
);
"""
cursor = conn.cursor()
cursor.execute(query)

# insert data into the temp_pa_list table
for index, row in pa_list.iterrows():
    query = f"""
    INSERT INTO temp_pa_list
    VALUES ({row['pa']});
    """
    cursor.execute(query)
conn.commit()
cursor.close()

In [70]:
# inner join the temp_pa_list table with the wdpa_merged table to get all fields of the wdpa_merged table for the PAs in the pa_list
query = """
SELECT "WDPAID", "STATUS", "STATUS_YR", "REP_AREA", "DESIG_ENG", "DESIG_TYPE", "METADATAID"
FROM wdpa_distinct_wdpaid
INNER JOIN temp_pa_list
ON wdpa_distinct_wdpaid."WDPAID" = temp_pa_list.pa
"""
pa_polygons = connect_and_run_query(query, to_pandas=True)

# inner join the temp_pa_list table with the wdpa_merged table to get all fields of the wdpa_merged table for the PAs in the pa_list
query = """
SELECT "WDPAID", "STATUS", "STATUS_YR", "REP_AREA", "DESIG_ENG", "DESIG_TYPE", "METADATAID"
FROM wdpa_points
INNER JOIN temp_pa_list
ON wdpa_points."WDPAID" = temp_pa_list.pa
"""
pa_points = connect_and_run_query(query, to_pandas=True)

# delete the temp_pa_list table
query = """
DROP TABLE temp_pa_list;
"""
cursor = conn.cursor()
cursor.execute(query)
conn.commit()
cursor.close()
conn.close()

pa_data = pd.concat([pa_polygons, pa_points], axis=0)
print(len(pa_polygons), len(pa_points), len(pa_data), len(pa_list))

Unnamed: 0,WDPAID,STATUS,STATUS_YR,REP_AREA,DESIG_ENG,DESIG_TYPE,METADATAID
0,188243.0,Designated,2003,0.0032,Woodland key habitat,National,2013
1,309943.0,Designated,2019,799.351852,National Park,National,1804
2,555587942.0,Designated,2015,2.563,Managed conservation zone of nature reserve,National,2013
3,148982.0,Designated,1991,0.046934,Federal Inventory of Raised and Transitional M...,National,2013
4,63661.0,Designated,1976,13.456533,Landscape Park,National,2013


In [77]:
#export the pa_data to a csv file
pa_data.to_csv('./data/cep_pa_metadata.csv')

# PA lookup table

In [86]:
# if pd_data does not exist then read it from the csv file
if 'pa_data' not in locals():
    print('Reading pa_data from csv file')

pa_data.set_index('WDPAID', inplace=True)
pa_data.index = pa_data.index.astype(int)
pa_data.head()

In [89]:
# df to postgis
engine = create_engine(db_url)
pa_data.to_sql('pa', engine, if_exists='replace')

810

In [None]:
# create a view that inner joins the temp_pa_list with wdpa_merged table and get all the columns 
# then export the view to a csv file 
# then bring it back into the cep database as a new table
# query status, rep_area designation type status_yr