# Connecting to db

In [None]:
import pandas as pd
import os
import psycopg2 as pg
from psycopg2 import sql
import plotly.express as px

In [None]:
# Ask which database to use, then load its connection settings from the
# environment. The prompt repeats until a recognised answer is given.
# NOTE(review): both answers read the same DB_* variables, so the choice
# currently only changes the printed message - confirm whether the cloud
# database should use its own set of variables.
valid_answer = True
while valid_answer:
    answer = input('Use Cloud DB? (y/n):').lower()
    if answer in ('n', 'no'):
        chosen_message = 'Using local DB'
    elif answer in ('y', 'yes'):
        chosen_message = 'Using Cloud DB'
    else:
        print('Invalid input. Please enter y or n.')
        continue

    db_host = os.environ.get('DB_HOST')
    db_port = os.environ.get('DB_PORT')
    db_user = os.environ.get('DB_USER')
    db_password = os.environ.get('DB_PASSWORD')
    db_name = os.environ.get('DB_NAME')
    valid_answer = False
    print(chosen_message)

# Connection URL assembled from the settings loaded above.
db_url = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}'

In [None]:
# Open the connection used by the data-quality checks that follow.
connection_settings = {
    'database': db_name,
    'user': db_user,
    'password': db_password,
    'host': db_host,
    'port': db_port,
}
conn = pg.connect(**connection_settings)

# Data Completeness & Consistency checks
- missing values (done)
- duplicates (done)
- data types (done)
- data ranges
- is marine (all permanent water bodies are marine)
### case studies https://www.nature.org/content/dam/tnc/nature/en/documents/Pathway_for_Inland_Waters_Nov_2022.pdf

In [None]:
# Count the rows in cep_water; db_length is the denominator for the
# percentage calculations further down.
# The context manager closes the cursor even if the query raises.
with conn.cursor() as cursor:
    cursor.execute('SELECT COUNT(*) FROM cep_water')
    db_length = cursor.fetchone()[0]

In [None]:
# Print every row whose country is missing (stored as country = 0).
# The context manager closes the cursor even if the query raises.
with conn.cursor() as cursor:
    cursor.execute('SELECT * FROM cep_water WHERE country = 0')
    result = cursor.fetchall()
for row in result:
    print(row)

In [None]:
# Print every (cep_id, pa, eco) combination that occurs more than once,
# together with how many times it occurs.
# The context manager closes the cursor even if the query raises.
with conn.cursor() as cursor:
    cursor.execute('SELECT cep_id, pa, eco, COUNT(*) FROM cep_water GROUP BY cep_id, pa, eco HAVING COUNT(*) > 1')
    result = cursor.fetchall()
for row in result:
    print(row)

### Checking if Marine only areas have no permanent water bodies 
#### Findings: some marine only areas only 

In [None]:
# Marine rows whose transition_1 value is below 1, i.e. marine areas this
# notebook counts as having no permanent water.
# The context manager closes the cursor even if the query raises.
with conn.cursor() as cursor:
    cursor.execute('SELECT * FROM cep_water WHERE is_marine = TRUE AND "transition_1" < 1')
    # Column names come from the cursor metadata so the frame mirrors the table.
    column_names = [desc[0] for desc in cursor.description]
    df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.set_index('cep_id', inplace=True)
# Share of ALL cep_water rows (not only marine ones) that match the filter.
percentage_of_marine_without_permanent_water = len(df) / db_length * 100
df.head(1)

In [None]:
# Marine rows whose transition_1 value is at least 1, i.e. marine areas this
# notebook counts as having permanent water. The bound is inclusive so that
# this set is the exact complement of the "transition_1 < 1" check; with a
# strict "> 1" a row at exactly 1 would be counted in neither group.
# The context manager closes the cursor even if the query raises.
with conn.cursor() as cursor:
    cursor.execute('SELECT * FROM cep_water WHERE is_marine = TRUE AND "transition_1" >= 1')
    # Column names come from the cursor metadata so the frame mirrors the table.
    column_names = [desc[0] for desc in cursor.description]
    df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.set_index('cep_id', inplace=True)
# Share of ALL cep_water rows (not only marine ones) that match the filter.
percentage_of_marine_with_permanent_water = len(df) / db_length * 100
df.head(1)

In [None]:
# Pie chart of the two marine shares; the remainder up to 100 % is shown as
# the non-marine slice.
marine_dry_share = percentage_of_marine_without_permanent_water
marine_wet_share = percentage_of_marine_with_permanent_water
slice_values = [
    marine_dry_share,
    marine_wet_share,
    100 - marine_dry_share - marine_wet_share,
]
slice_names = [
    'Marine with no permanent water',
    'Marine with permanent water',
    'Non-marine areas',
]
fig = px.pie(
    values=slice_values,
    names=slice_names,
    title='Percentage of marine areas with and without permanent water',
)
# Dark theme on a square canvas.
fig.update_layout(template='plotly_dark', width=600, height=600)
fig.show()

In [None]:
# Close the shared connection opened for the data-quality checks above.
conn.close()

# Analysis

In [58]:
# import union
from typing import Union

In [59]:
def connect_and_run_query(query: Union[str, sql.Composed]):
    """Run ``query`` on a fresh connection and return every fetched row.

    The connection is built from the module-level ``db_name``, ``db_user``,
    ``db_password``, ``db_host`` and ``db_port`` values and is always closed
    before the function returns or raises.

    Args:
        query: SQL text, or a composed ``psycopg2.sql`` object, to execute.

    Returns:
        The rows returned by ``cursor.fetchall()``.

    Raises:
        Exception: any error raised while executing or fetching. The error is
            printed, the transaction is rolled back, and the original
            exception is re-raised so the caller sees the real failure.
    """
    conn = pg.connect(
        database=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    try:
        # The context manager closes the cursor on success and on error.
        with conn.cursor() as cursor:
            cursor.execute(query)
            return cursor.fetchall()
    except Exception as e:
        print(e)
        conn.rollback()
        # Re-raise: there is no result to return, and falling through would
        # only surface as an unrelated unbound-variable error.
        raise
    finally:
        conn.close()

In [60]:
# Join the first five cep_water rows with their quantile name from cep_qid
# and print each resulting row on its own line.
query = """SELECT 
        cw.*, 
        c_qid.quantile_name
    FROM 
        (SELECT * FROM cep_water LIMIT 5) cw
    JOIN 
        cep_qid c_qid
    ON 
        cw.qid = c_qid.qid;"""
result = connect_and_run_query(query)
if result:
    print('\n'.join(map(str, result)))


(1, 0, 895.792133, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 171, 'Lithuania', 'LTU', 80412, 'Central European mixed forests', False, 0, None, False, '20E_60N')
(2, 1, 63082050577.5139, 221373738181.0224, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1, 'Area Beyond National Jurisdiction', 'ABNJ', 1, 'Antarctic', True, 0, None, False, '0E_50S')
(2, 2, 63082050577.5139, 394363393841.4701, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1, 'Area Beyond National Jurisdiction', 'ABNJ', 1, 'Antarctic', True, 0, None, False, '100E_50S')
(2, 3, 0.0, 99875969.231487, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1, 'Area Beyond National Jurisdiction', 'ABNJ', 1, 'Antarctic', True, 0, None, False, '10E_40S')
(2, 4, 63105174929.44815, 633201708877.544, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1, 'Area Beyond National Jurisdiction', 'ABNJ', 1, 'Antarctic', True, 0, None, False, '10E_50S')


In [121]:
def get_summed_bands_by_col(group_by_col="country_name", is_pa=False, is_marine=False, include_unassigned_land=False, convert_to_meters=False):
    """Sum transition_0..transition_10 from ``cep_grouped`` per group.

    Rows are filtered on ``is_protected`` and ``is_marine`` and grouped by
    ``group_by_col``. A new connection is opened from the module-level
    ``db_*`` settings for each call and closed before returning.

    Args:
        group_by_col: Column name to group by, or a list/tuple of column
            names. Names are quoted as SQL identifiers.
        is_pa: Value compared against the ``is_protected`` column.
        is_marine: Value compared against the ``is_marine`` column.
        include_unassigned_land: When False, rows with ``eco = 10001`` are
            excluded from the sums.
        convert_to_meters: When True, every summed value is divided by
            1,000,000. NOTE(review): despite the name this looks like a
            square-metre to square-kilometre conversion - confirm the units.

    Returns:
        A DataFrame indexed by the grouping column(s) with one column per
        transition band, or ``None`` if anything failed (the error is
        printed, not raised).
    """
    # Normalise the grouping argument: a single name or several names.
    if isinstance(group_by_col, str):
        group_cols = [group_by_col]
        index_key = group_by_col
    else:
        group_cols = list(group_by_col)
        index_key = group_cols
    band_names = [f'transition_{i}' for i in range(11)]

    df = None
    # Bound before the try so the cleanup below is safe when connecting fails.
    conn = None
    try:
        conn = pg.connect(
            database=db_name,
            user=db_user,
            password=db_password,
            host=db_host,
            port=db_port
        )
        # The SUM expressions are built from fixed band names only, so plain
        # SQL fragments are safe; caller-supplied names go through Identifier.
        bands_SUM = [f'SUM({name}) as "{name}"' for name in band_names]
        unassigned_filter = sql.SQL('') if include_unassigned_land else sql.SQL('AND eco != 10001')
        query = sql.SQL('SELECT {group_by_column}, {bands} FROM cep_grouped WHERE is_protected = {is_pa} {unassigned_land} AND is_marine = {is_marine} GROUP BY {group_by_column}').format(
            bands=sql.SQL(', ').join(map(sql.SQL, bands_SUM)),
            is_pa=sql.Literal(is_pa),
            is_marine=sql.Literal(is_marine),
            group_by_column=sql.SQL(', ').join(map(sql.Identifier, group_cols)),
            unassigned_land=unassigned_filter,
        )
        # The context manager closes the cursor on success and on error.
        with conn.cursor() as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()

        frame = pd.DataFrame(rows, columns=group_cols + band_names)
        frame.set_index(index_key, inplace=True)
        if convert_to_meters:
            frame = frame / 1000000
        # Publish the frame only once it is complete, so a failure part-way
        # through returns None rather than a half-built result.
        df = frame
    except Exception as e:
        print("error: ", e)
    finally:
        if conn is not None:
            # The query is read-only; rollback just ends the open transaction.
            conn.rollback()
            conn.close()

    return df


In [122]:
# Per-country totals for protected and unprotected non-marine areas. Both
# frames are read by the preview cells that follow, so they must be defined
# here for the notebook to run from top to bottom.
df_protected = get_summed_bands_by_col(group_by_col="country_name", is_pa=True, is_marine=False, convert_to_meters=True)
df_unprotected = get_summed_bands_by_col(group_by_col="country_name", is_pa=False, is_marine=False, convert_to_meters=True)
# Totals per eco value for unprotected non-marine areas.
df_terrestrial_eco = get_summed_bands_by_col(group_by_col="eco", is_pa=False, is_marine=False, convert_to_meters=True)

error:  "None of ['eco'] are in the columns"


In [68]:
# Preview the first row of the protected-area totals.
df_protected.head(1)

Unnamed: 0_level_0,transition_0,transition_1,transition_2,transition_3,transition_4,transition_5,transition_6,transition_7,transition_8,transition_9,transition_10
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Bangladesh,11144.104112,1276.686258,289.684649,31.614954,171.221675,128.951815,62.985299,54.733566,62.610513,1.768153,45.913789


In [69]:
# Preview the first row of the unprotected-area totals.
df_unprotected.head(1)

Unnamed: 0_level_0,transition_0,transition_1,transition_2,transition_3,transition_4,transition_5,transition_6,transition_7,transition_8,transition_9,transition_10
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Afghanistan,611004.227111,194.299419,174.080562,756.569597,243.086282,538.600089,654.62639,12.474034,136.500569,2005.066034,2363.970579


In [115]:
# Preview the first row of the totals grouped by eco.
df_terrestrial_eco.head(1)

Unnamed: 0_level_0,transition_0,transition_1,transition_2,transition_3,transition_4,transition_5,transition_6,transition_7,transition_8,transition_9,transition_10
eco,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9998,16329.370486,798938.09879,7224.348308,31084.582217,470.584544,1060.467029,3797.589527,725.706714,1207.040411,331.338682,991.485527
