In [1]:
import pandas as pd
import os
import psycopg2 as pg
from psycopg2 import sql
import plotly.express as px
from typing import Union

In [2]:
valid_answer = True
while (valid_answer):
    answer = input('Use Cloud DB? (y/n):').lower()
    if answer.lower() == 'n' or answer.lower() == 'no':
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT')
        db_user = os.environ.get('DB_USER')
        db_password = os.environ.get('DB_PASSWORD')
        db_name = os.environ.get('DB_NAME')
        valid_answer = False
        print('Using local DB')
    elif answer.lower() == 'y' or answer.lower() =='yes':
        db_host = os.environ.get('DB_HOST')
        db_port = os.environ.get('DB_PORT')
        db_user = os.environ.get('DB_USER')
        db_password = os.environ.get('DB_PASSWORD')
        db_name = os.environ.get('DB_NAME')
        valid_answer = False
        print('Using Cloud DB')
    else:
        print('Invalid input. Please enter y or n.')

db_url = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}'

Using local DB


In [7]:
def connect_and_run_query(query: Union[str, sql.Composed], to_pandas: bool = False):
    conn = pg.connect(
        database=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        if to_pandas:
            results = pd.DataFrame(cursor.fetchall(), columns=[desc[0] for desc in cursor.description])
        else:
            results = cursor.fetchall()
        cursor.close()
    except Exception as e:
        print(e)
        conn.rollback()
    finally:
        conn.close()
    return results

In [26]:
query = """WITH groupings AS (
    SELECT 
		cep_grouped.cep_id,
        -- SUM and convert FROM M2 to KM2 
        SUM(transition_1 + transition_3 + transition_8) / 1000000 AS seasonal_2015, 
        SUM(transition_1 + transition_2 + transition_7) / 1000000 AS permanent_2015
	FROM 
		cep_grouped
    WHERE 
        is_marine = FALSE 
    AND 
        eco != 10001
	GROUP BY 
		cep_grouped.cep_id
),
grouped_cep_summaries AS (
    -- join temp groupings table with cep_water table, but don't select transition columns
    SELECT 
        cep_grouped.cep_id,
        country_name,
        is_protected,
        groupings.seasonal_2015,
        groupings.permanent_2015
    FROM
        cep_grouped
    JOIN
        groupings
    ON
        cep_grouped.cep_id = groupings.cep_id
),
--group by protected and unprotected 
is_protected_groupings AS (
    SELECT 
        country_name,
        is_protected,
        SUM(seasonal_2015) AS seasonal_2015,
        SUM(permanent_2015) AS permanent_2015
    FROM
        grouped_cep_summaries
    GROUP BY
        country_name,
        is_protected
),

protected_groupings AS(
    SELECT
        is_protected_groupings.country_name,
        is_protected_groupings.is_protected,
        is_protected_groupings.seasonal_2015,
        is_protected_groupings.permanent_2015,
        SUM(is_protected_groupings.seasonal_2015 + is_protected_groupings.permanent_2015) AS total_inland_water_km2
    FROM
        is_protected_groupings
    WHERE
        is_protected_groupings.is_protected = TRUE
    GROUP BY
        is_protected_groupings.country_name,
        is_protected_groupings.is_protected,
        is_protected_groupings.seasonal_2015,
        is_protected_groupings.permanent_2015
),

unprotected_groupings AS(
    SELECT
        is_protected_groupings.country_name,
        is_protected_groupings.is_protected,
        is_protected_groupings.seasonal_2015,
        is_protected_groupings.permanent_2015,
        SUM(is_protected_groupings.seasonal_2015 + is_protected_groupings.permanent_2015) AS total_inland_water_km2
    FROM
        is_protected_groupings
    WHERE
        is_protected_groupings.is_protected = FALSE
    GROUP BY
        is_protected_groupings.country_name,
        is_protected_groupings.is_protected,
        is_protected_groupings.seasonal_2015,
        is_protected_groupings.permanent_2015
),
countries_with_areas AS(
    SELECT name0, km2_tot as rep_area_km2
    FROM gaul0 
),
inland_water_groupings AS(
    SELECT
        protected_groupings.country_name,
        --calculate as percentages of land cover
        (SUM(protected_groupings.total_inland_water_km2 + unprotected_groupings.total_inland_water_km2) / countries_with_areas.rep_area_km2) * 100 AS "% of country's land area that is IW",
        (SUM(protected_groupings.permanent_2015 + unprotected_groupings.permanent_2015) / countries_with_areas.rep_area_km2) * 100 AS "% of country's land area that is IPW",
        (SUM(protected_groupings.seasonal_2015 + unprotected_groupings.seasonal_2015) / countries_with_areas.rep_area_km2) * 100 AS "% of country's land area that is ISW",
        (protected_groupings.permanent_2015 / SUM(protected_groupings.permanent_2015 + unprotected_groupings.permanent_2015)) * 100 AS "% of IPW that is protected",
        (protected_groupings.seasonal_2015 / SUM(protected_groupings.seasonal_2015 + unprotected_groupings.seasonal_2015))* 100  AS "% of ISW that is protected",
        (SUM(protected_groupings.permanent_2015 + unprotected_groupings.permanent_2015) / SUM(protected_groupings.total_inland_water_km2 + unprotected_groupings.total_inland_water_km2)) * 100 AS "% of IW that is protected"

        -- protected_groupings.seasonal_2015 AS protected_seasonal_2015,
        -- protected_groupings.permanent_2015 AS protected_permanent_2015,
        -- protected_groupings.total_inland_water_km2 AS protected_total_inland_water_km2,
        -- unprotected_groupings.seasonal_2015 AS unprotected_seasonal_2015,
        -- unprotected_groupings.permanent_2015 AS unprotected_permanent_2015,
        -- unprotected_groupings.total_inland_water_km2 AS unprotected_total_inland_water_km2,
        -- countries_with_areas.rep_area_km2
    FROM
        protected_groupings
    JOIN
        unprotected_groupings
    ON
        protected_groupings.country_name = unprotected_groupings.country_name
    JOIN 
        countries_with_areas
    ON
        protected_groupings.country_name = countries_with_areas.name0
    GROUP BY
        protected_groupings.country_name,
        countries_with_areas.rep_area_km2,
        protected_groupings.total_inland_water_km2,
        protected_groupings.permanent_2015,
        protected_groupings.seasonal_2015,
        protected_groupings.is_protected,
        unprotected_groupings.total_inland_water_km2,
        unprotected_groupings.permanent_2015,
        unprotected_groupings.seasonal_2015
)

SELECT *
FROM inland_water_groupings
ORDER BY country_name"""
results = connect_and_run_query(query, to_pandas=True)

In [27]:
results = results.set_index('country_name')#
# change datatypes of results columns to float
results = results.astype(float)
results = results.round(2)
results.head()

Unnamed: 0_level_0,% of country's land area that is IW,% of country's land area that is IPW,% of country's land area that is ISW,% of IPW that is protected,% of ISW that is protected,% of IW that is protected
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,0.29,0.09,0.2,34.89,13.45,31.77
Albania,3.61,1.89,1.72,72.03,76.62,52.29
Algeria,0.03,0.02,0.01,11.73,16.46,62.86
Andorra,0.25,0.11,0.14,35.16,35.21,45.06
Angola,0.2,0.11,0.1,1.03,1.26,53.35


In [28]:
# import 2016 results
filepath = 'C:\\Users\\riyad\\Downloads\\Work\\2016\\plos_One_2016_results.csv'

#read csv and skip 1st row
df = pd.read_csv(filepath, skiprows=1, index_col=0)
df.head()

Unnamed: 0_level_0,% of country's land area that is IW,% of country's land area that is IPW,% of country's land area that is ISW,% of IPW that is protected,% of ISW that is protected,% of IW that is protected,% of IPW protected (including point buffers),% of ISW protected (including point buffers),% of IW protected (including point buffers),Additional estimated percentage from buffers
Country name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,0.33,0.09,0.24,11.23,5.11,6.78,11.23,5.11,6.78,0.0
Akrotiri and Dhekelia,5.48,0.93,4.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aland,10.11,8.84,1.28,3.58,3.28,3.54,3.58,3.28,3.54,0.0
Albania,2.39,1.99,0.41,76.2,31.23,68.56,77.95,32.35,70.21,1.64
Algeria,0.13,0.02,0.11,5.63,3.4,3.82,6.97,65.5,54.34,50.51


In [29]:
# check which indexes are different between the two dataframes
print(results.index.difference(df.index))

Index(['Brunei Darussalam', 'Congo', 'Côte d'Ivoire',
       'Falkland Islands (Malvinas)', 'Faroe Islands', 'Guinea-Bissau',
       'Iran (Islamic Republic of)', 'Lao People's Democratic Republic',
       'Micronesia (Federated States of)', 'Niue', 'Russian Federation',
       'Réunion', 'Sao Tome and Principe',
       'South Georgia and the South Sandwich Islands', 'Syrian Arab Republic',
       'Timor-Leste', 'United States of America', 'Viet Nam'],
      dtype='object')


In [48]:
df.loc['Iran']

% of country's land area that is IW               0.70
% of country's land area that is IPW              0.19
% of country's land area that is ISW              0.51
% of IPW that is protected                      100.00
% of ISW that is protected                       51.70
% of IW that is protected                        64.99
% of IPW protected (including point buffers)    100.00
% of ISW protected (including point buffers)     52.61
% of IW protected (including point buffers)      65.65
Additional estimated percentage from buffers      0.66
Name: Iran, dtype: object

In [None]:
# TODO fix country names so they match (e.g. United States vs United States of America or Iran vs Iran, Islamic Republic of)

In [40]:
#merge the two dataframes first on the index and subtract the matching columns
merged = pd.merge(results, df, left_index=True, right_index=True)
merged = merged.astype(float)
merged['% of country\'s land area that is IW'] = merged['% of country\'s land area that is IW_x'] - merged['% of country\'s land area that is IW_y']
merged['% of country\'s land area that is IPW'] = merged['% of country\'s land area that is IPW_x'] - merged['% of country\'s land area that is IPW_y']
merged['% of country\'s land area that is ISW'] = merged['% of country\'s land area that is ISW_x'] - merged['% of country\'s land area that is ISW_y']
merged['% of IPW that is protected'] = merged['% of IPW that is protected_x'] - merged['% of IPW that is protected_y']
merged['% of ISW that is protected'] = merged['% of ISW that is protected_x'] - merged['% of ISW that is protected_y']
merged['% of IW that is protected'] = merged['% of IW that is protected_x'] - merged['% of IW that is protected_y']

# drop all but the last 6 columns
merged = merged.iloc[:,-6:]
merged.head()


Unnamed: 0_level_0,% of country's land area that is IW,% of country's land area that is IPW,% of country's land area that is ISW,% of IPW that is protected,% of ISW that is protected,% of IW that is protected
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,-0.04,0.0,-0.04,23.66,8.34,24.99
Albania,1.22,-0.1,1.31,-4.17,45.39,-16.27
Algeria,-0.1,0.0,-0.1,6.1,13.06,59.04
Andorra,0.09,-0.03,0.12,23.87,25.21,33.95
Angola,-0.08,0.02,-0.09,-0.01,-26.0,34.74


In [41]:
# in merged dataframe, find any country with zero values in all columns
zero_values = merged[(merged == 0).all(axis=1)]
zero_values.head()

Unnamed: 0_level_0,% of country's land area that is IW,% of country's land area that is IPW,% of country's land area that is ISW,% of IPW that is protected,% of ISW that is protected,% of IW that is protected
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
