In [19]:
import pandas as pd
import os
import psycopg2 as pg
from psycopg2 import sql
import plotly.express as px

In [20]:
# Every accepted reply, mapped to the confirmation message printed for it.
confirmations = {
    'n': 'Using local DB',
    'no': 'Using local DB',
    'y': 'Using Cloud DB',
    'yes': 'Using Cloud DB',
}

# Keep prompting until the (case-insensitive) reply is one of the accepted answers.
while True:
    answer = input('Use Cloud DB? (y/n):').lower()
    if answer in confirmations:
        break
    print('Invalid input. Please enter y or n.')

# NOTE(review): the local and the cloud choice both read the same DB_*
# environment variables, so the answer only affects the message printed
# below -- confirm whether the cloud choice should read different settings.
# os.environ.get returns None for any variable that is not set.
db_host = os.environ.get('DB_HOST')
db_port = os.environ.get('DB_PORT')
db_user = os.environ.get('DB_USER')
db_password = os.environ.get('DB_PASSWORD')
db_name = os.environ.get('DB_NAME')
print(confirmations[answer])

# Connection URL assembled from the same settings.
db_url = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}'

Using local DB


In [21]:
# Load the reference country areas from CSV, using the first column as the
# index and ordering the rows by 'name'. The reported area is discarded and
# the calculated area is exposed under the shorter column name 'area'.
cdf = (
    pd.read_csv('data/country_areas.csv', index_col=0)
    .sort_values(by='name')
    .drop(columns=['reported_area'])
    .rename(columns={'calculated_area': 'area'})
)
cdf.head()

Unnamed: 0_level_0,area
name,Unnamed: 1_level_1
Abyei,9978.891
Afghanistan,642049.5
Aksai Chin,30667.55
Albania,28643.9
Algeria,2312184.0


In [22]:
# Open the psycopg2 connection from the db_* settings read from the environment.
connection_settings = {
    'host': db_host,
    'port': db_port,
    'database': db_name,
    'user': db_user,
    'password': db_password,
}
conn = pg.connect(**connection_settings)

In [23]:
# Per-country area from the cep_water table, with no filter on `eco`
# (the result column is labelled "with unassigned land").
# All eleven transition_* columns are added per row, summed per country and
# divided by 1,000,000; the alias labels the result as km2
# (presumably the columns hold m2 -- TODO confirm). Marine rows are excluded.
#
# The query lives in `area_query_unassigned` rather than `sql` so that it does
# not shadow the `sql` module imported from psycopg2 at the top of the notebook.
area_query_unassigned = '''
SELECT
    country_name,
    SUM(
        transition_0
        + transition_1
        + transition_2
        + transition_3
        + transition_4
        + transition_5
        + transition_6
        + transition_7
        + transition_8
        + transition_9
        + transition_10
    ) / 1000000
    AS calculated_area_km2_with_unassigned_land
FROM cep_water
WHERE is_marine = FALSE
GROUP BY country_name
ORDER BY country_name
'''
uacepdf = pd.read_sql_query(area_query_unassigned, conn).set_index('country_name')
uacepdf.head()

  uacepdf = pd.read_sql_query(sql, conn).set_index('country_name')


Unnamed: 0_level_0,calculated_area_km2_with_unassigned_land
country_name,Unnamed: 1_level_1
Afghanistan,641837.7
Albania,28692.14
Algeria,2308858.0
American Samoa,209.2911
Andorra,474.6219


In [24]:
# Per-country area from the cep_water table, excluding rows with eco = 100001
# (presumably the code for unassigned land, judging by the
# "with_unassigned_land" label on the unfiltered query -- TODO confirm).
# All eleven transition_* columns are added per row, summed per country and
# divided by 1,000,000; the alias labels the result as km2
# (presumably the columns hold m2 -- TODO confirm). Marine rows are excluded.
#
# The query lives in `area_query_assigned` rather than `sql` so that it does
# not shadow the `sql` module imported from psycopg2 at the top of the notebook.
area_query_assigned = '''
SELECT
    country_name,
    SUM(
        transition_0
        + transition_1
        + transition_2
        + transition_3
        + transition_4
        + transition_5
        + transition_6
        + transition_7
        + transition_8
        + transition_9
        + transition_10
    ) / 1000000
    AS calculated_area_km2
FROM cep_water
WHERE is_marine = FALSE
AND eco != 100001
GROUP BY country_name
ORDER BY country_name
'''
lcepdf = pd.read_sql_query(area_query_assigned, conn).set_index('country_name')
lcepdf.head()

  lcepdf = pd.read_sql_query(sql, conn).set_index('country_name')


Unnamed: 0_level_0,calculated_area_km2
country_name,Unnamed: 1_level_1
Afghanistan,641837.7
Albania,28424.27
Algeria,2308794.0
American Samoa,131.3471
Andorra,474.6219


In [25]:
# Place both CEP area columns side by side, matching rows on the country_name
# index and keeping only the countries present in both query results.
df = uacepdf.merge(lcepdf, left_index=True, right_index=True, how='inner')
df.head()

Unnamed: 0_level_0,calculated_area_km2_with_unassigned_land,calculated_area_km2
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,641837.7,641837.7
Albania,28692.14,28424.27
Algeria,2308858.0,2308794.0
American Samoa,209.2911,131.3471
Andorra,474.6219,474.6219


In [26]:
# Compare the reference areas (cdf) with both CEP area columns and express
# each CEP area as a percentage deviation from the reference area.
#
# The CEP frames are joined here instead of reading the previous value of
# `df`, so this cell yields the same result however many times it is run
# (re-merging an already merged `df` would add columns and break the rename).
cep_areas = uacepdf.join(lcepdf, how='inner')

df = (
    # inner merge on the index: only countries named identically in both sources survive
    cdf.merge(cep_areas, left_index=True, right_index=True, how='inner')
    # rename by column name rather than by position, so a change in column
    # order can never silently mislabel the data
    .rename(columns={
        'area': 'GEE_area',
        'calculated_area_km2_with_unassigned_land': 'CEP_area_unassigned',
        'calculated_area_km2': 'CEP_area',
    })
    # percentage deviation of each CEP area relative to the reference area
    .assign(
        diff_unassigned_percent=lambda d: (d['CEP_area_unassigned'] - d['GEE_area']) / d['GEE_area'] * 100,
        diff_percent=lambda d: (d['CEP_area'] - d['GEE_area']) / d['GEE_area'] * 100,
    )
)
df.head()

Unnamed: 0,GEE_area,CEP_area_unassigned,CEP_area,diff_unassigned_percent,diff_percent
Afghanistan,642049.5,641837.7,641837.7,-0.032982,-0.032982
Albania,28643.9,28692.14,28424.27,0.16844,-0.766746
Algeria,2312184.0,2308858.0,2308794.0,-0.143845,-0.146586
American Samoa,200.7186,209.2911,131.3471,4.270926,-34.561587
Andorra,473.8402,474.6219,474.6219,0.164972,0.164972


In [27]:
# Show the row with the largest diff_percent value.
largest_diff_label = df['diff_percent'].idxmax()
df.loc[largest_diff_label]

GEE_area                    8.417049
CEP_area_unassigned        10.563452
CEP_area                   10.563452
diff_unassigned_percent    25.500671
diff_percent               25.500671
Name: Monaco, dtype: float64

In [28]:
# Spot-check a single country: show every column of the Slovenia row.
df.loc['Slovenia']

GEE_area                   20269.978613
CEP_area_unassigned        20327.427286
CEP_area                   20313.114365
diff_unassigned_percent        0.283418
diff_percent                   0.212806
Name: Slovenia, dtype: float64

In [29]:
# Show the row with the smallest (most negative) diff_percent value.
smallest_diff_label = df['diff_percent'].idxmin()
df.loc[smallest_diff_label]

GEE_area                   224.203857
CEP_area_unassigned        301.889311
CEP_area                    14.389841
diff_unassigned_percent     34.649473
diff_percent               -93.581805
Name: Maldives, dtype: float64

In [30]:
# List the rows whose diff_percent lies more than 50 points away from zero,
# in either direction.
df[df['diff_percent'].abs() > 50]

Unnamed: 0,GEE_area,CEP_area_unassigned,CEP_area,diff_unassigned_percent,diff_percent
Bermuda,63.236821,72.25517,27.115404,14.26123,-57.120862
British Indian Ocean Territory,49.706153,26.456033,16.626793,-46.775135,-66.549829
Christmas Island,136.34108,141.520392,36.158384,3.798791,-73.479465
Cocos (Keeling) Islands,15.832597,17.806543,3.973882,12.46761,-74.900633
Kiribati,1024.711471,1025.835391,178.625771,0.109682,-82.568189
Maldives,224.203857,301.889311,14.389841,34.649473,-93.581805
Marshall Islands,269.104485,277.9851,43.183604,3.300062,-83.952849
Seychelles,501.426576,483.185715,207.727583,-3.637793,-58.572682
Tuvalu,47.722799,41.387317,5.204732,-13.275587,-89.093825


In [31]:
# Gap between the summed GEE and CEP areas, expressed as a percentage of the
# mean of the two totals and rounded to two decimals.
gee_total = df['GEE_area'].sum()
cep_total = df['CEP_area'].sum()
mean_total = (gee_total + cep_total) / 2
total_diff_percent = ((gee_total - cep_total) / mean_total * 100).round(2)
print(f'Total area difference: {total_diff_percent}%')

Total area difference: 0.31%


In [32]:
# Row counts, in order: merged comparison, reference areas, CEP query without
# the eco filter, CEP query with the eco filter.
frames = (df, cdf, uacepdf, lcepdf)
print(*map(len, frames))

216 276 276 265


In [33]:
# Countries returned by the CEP query whose name has no exact match in the
# index of the reference areas.
in_reference = uacepdf.index.isin(cdf.index)
missing = uacepdf[~in_reference]
missing

Unnamed: 0_level_0,calculated_area_km2_with_unassigned_land
country_name,Unnamed: 1_level_1
Argentina|Falkland Islands (Malvinas),569.0371
Argentina|South Georgia and the South Sandwich Islands,271.1528
Argentina|Uruguay,0.1917254
Australia|Papua New Guinea,20.88737
Bolivia (Plurinational State of),1084467.0
"Bonaire, Sint Eustatius and Saba",320.984
British Indian Ocean Territory|Mauritius,41.96682
Brunei Darussalam|China|Malaysia|Philippines|Viet Nam,0.3970889
Cabo Verde,4125.213
China|India,102156.8


In [36]:
# Print the Turkmenistan row on its own, so its values can be read as plain
# decimals rather than in exponential notation.
turkmenistan_row = df.loc['Turkmenistan']
print(turkmenistan_row)

GEE_area                   554569.004490
CEP_area_unassigned        555052.515853
CEP_area                   555052.515853
diff_unassigned_percent         0.087187
diff_percent                    0.087187
Name: Turkmenistan, dtype: float64


In [None]:
# Write the comparison table to CSV (to_csv includes the index by default).
output_csv = 'data/area_diff.csv'
df.to_csv(output_csv)

In [None]:
# Close the psycopg2 connection that was opened earlier in the notebook.
conn.close()