## Load Packages

In [1]:
# Link to Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Connect to Earth Engine
#import ee
#ee.Authenticate()
#ee.Initialize()

In [3]:
!pip install geopandas
!pip install geojson

import os
from glob import glob
import numpy as np
import pandas as pd
import geopandas as gpd
import geojson


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 13.4 MB/s 
[?25hCollecting fiona>=1.8
  Downloading Fiona-1.8.21-cp37-cp37m-manylinux2014_x86_64.whl (16.7 MB)
[K     |████████████████████████████████| 16.7 MB 58.0 MB/s 
Collecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 57.7 MB/s 
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: munch, cligj, click-plugins, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.8.21 geopandas-0.10.2 munch-2.5.0 pyproj-3.2.1
Lo

## Tidy up Datasets before Mapping/Plotting

### v0.Group zonalStats dataframes by country

In [None]:
id_KHM = [541, 670]
id_VTM = [530,531,532,533,534,535,536,537,538,540,541]
id_THA = [655,656,657,665,666,667,668,669,670,671,672,695]
id_MYS = [651,658,659,660,662,663,664,618,619,626,627,628,629,630,631]
id_MMR = [655,693,694,695,696,697,698,699,700,701,702]

In [None]:
def filterRegion(list_id, country):
  """ Merge GeoJSONs and Subset to one country

  Parameters
  ----------
  list_id: list
    List of parcel ids
  country: string
    Country Name
  
  Returns
  ----------
  Exported GeoJSON of one country
  """
  os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/0.Initial/")
  files = [glob(os.path.join(os.getcwd(), '*'+str(i)+'*')) for i in list_id]
  files_flat = [item for sublist in files for item in sublist]
  gdfs = [gpd.read_file(i) for i in files_flat]
  gdfs_filtered = [i[i['NAME_0']==country] for i in gdfs]
  gdf = pd.concat([i for i in gdfs_filtered])

  # Data type of zonStats columns to "uint8"
  #keys = [i for i in gdf.columns if 'majority' in i]
  #values = [np.uint8]*len(keys)
  #dictCol = dict(zip(keys, values))
  #gdf = gdf.astype(dictCol)

  outPath = '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/1.by_country/'
  return gdf.to_file(os.path.join(outPath, "zonStats_"+str(country)+".geojson"), driver = 'GeoJSON')

In [None]:
# Write file
#filterRegion(id_KHM, "Cambodia")
filterRegion(id_VTM, "Vietnam")
filterRegion(id_THA, "Thailand")
filterRegion(id_MYS, "Malaysia")
filterRegion(id_MMR, "Myanmar")

In [None]:
# Draft
#os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/0.Initial/")
#files = [glob(os.path.join(os.getcwd(), '*'+str(i)+'*')) for i in id_cambodia]
#files_flat = [item for sublist in files for item in sublist]
#gdfs = [gpd.read_file(i) for i in files_flat]
#gdfs_filtered = [i[i['NAME_0']=="Cambodia"] for i in gdfs]
#gdf_KHM = pd.concat([i for i in gdfs_filtered])

#keys = [i for i in gdf_KHM.columns if 'majority' in i]
#values = [np.uint8]*len(keys)
#dictCol = dict(zip(keys, values))
#gdf_KHM = gdf_KHM.astype(dictCol)

### v0. Tidy up ZonStats using GEE
  * Calculate Pond Area, adding as Attributes
  * Transfer Geometry Type from polygon to point

#### Flow

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/1.by_country/")
with open('zonStats_Cambodia.geojson') as f:
  geojson_ini = geojson.load(f)

In [None]:
# Subset big GeoJSON into processible parts (2500 items/part)
geojson_tiled = [tuple(geojson_ini[i:i+2500]) for i in range(0, len(geojson_ini['features']), 2500)]
len(geojson_tiled[-1])

158

In [None]:
# Load GeoJSON as feature collection
fc = [ee.FeatureCollection(i) for i in geojson_tiled]
type(fc)

list

In [None]:
# Pond Area in square meters
fc_area = [i.map(lambda feature: feature.set({'Area': feature.area()})) for i in fc]
type(fc_area)

list

In [None]:
# Get Centroid of Pond Polygons, thus transfer geometry type from polygon to point
fc_point= [i.map(lambda feature: feature.centroid()) for i in fc_area]
type(fc_point)

list

In [None]:
# Export Feature Collection as GeoJSON
for i in range(len(fc_point)):
  task_config = {
          'description': 'zonStats_KHM_tidy',
          'fileNamePrefix': 'zonStats_KHM_tidy_'+str(i),
          'folder': '2.area_centroid',
          'fileFormat': 'GeoJSON'}

  task = ee.batch.Export.table.toDrive(fc_point[i], **task_config)
  task.start()

In [None]:
task.status()

#### Function

In [None]:
def tidy_up(geojson_path):

  # Load GeoJSON
  with open(geojson_path) as f:
    geojson_ini = geojson.load(f)
  
  # Subset big GeoJSON into processible parts (2500 items/part)
  geojson_tiled = [tuple(geojson_ini[i:i+2500]) for i in range(0, len(geojson_ini['features']), 2500)]

  # Load GeoJSON as feature collection
  fc = [ee.FeatureCollection(i) for i in geojson_tiled]

  # Pond Area in square meters
  fc_area = [i.map(lambda feature: feature.set({'Area': feature.area()})) for i in fc]

  # Get Centroid of Pond Polygons, thus transfer geometry type from polygon to point
  fc_point= [i.map(lambda feature: feature.centroid()) for i in fc_area]

  return fc_point

In [None]:
#countries = ['Malaysia', 'Thailand', 'Vietnam', 'Myanmar']
countries = ['Myanmar']

def genFun(n): # len(countries)

  os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/1.by_country/")
  files = [glob(os.path.join(os.getcwd(), '*'+i+'*.geojson')) for i in countries]

  i = 0
  while i < n:
    input = files[i][0]
    output = tidy_up(input) # output is a list

    for j in range(len(output)):
      task_config = {
              'description': 'zonStats_tidy',
              'fileNamePrefix': 'zonStats_'+countries[i]+'_tidy_'+str(j),
              'folder': '2.area_centroid',
              'fileFormat': 'GeoJSON'}
      task = ee.batch.Export.table.toDrive(output[j], **task_config)
      yield task.start()
      
    i += 1

gen = genFun(len(countries))

In [None]:
# Exhaust Generator
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

### v1.Revise: Group zonStats dataframes by country

#### Flow

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/0.Initial/")
files = glob(os.path.join(os.getcwd(), '*540*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/0.Initial/ponds_pa_540.geojson']

In [None]:
parcelID = files[0].split("/")[-1].split('.')[0].split('_')[-1]
parcelID

'540'

In [None]:
gdf = gpd.read_file(files[0])

In [None]:
# Drop rows where its value in column "GID_0" is None
gdf = gdf.dropna(axis=0, subset=['GID_0'])

print("parcel-", parcelID, " droped ", len(gdf[gdf['GID_0'].isna()].index), " nan records.")

parcel- 540  droped  0  nan records.


In [None]:
gids = list(set(gdf['GID_0']))
gids

['KHM', 'VNM']

In [None]:
# Split gdf by country
gdfs = [gdf[gdf['GID_0']==i] for i in gids]

In [None]:
outpath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/1.0.split_by_country/"

for i in range(len(gids)):
  gdfs[i].to_file(os.path.join(outpath, str(gids[i])+'_'+parcelID+".geojson"), driver = 'GeoJSON')

#### Function

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/0.Initial/")
files = glob(os.path.join(os.getcwd(), '*'))
files = sorted(files)[9:]
#files

In [None]:
outpath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/1.0.split_by_country/"

def genFun(n): # n = len(files)

  i = 0
  while i < n:
    # get parcel id
    parcelID = files[i].split("/")[-1].split('.')[0].split('_')[-1]

    gdf = gpd.read_file(files[i])

    # Drop rows where its value in column "GID_0" is None
    gdf = gdf.dropna(axis=0, subset=['GID_0'])
    #print("parcel-", parcelID, " droped ", len(gdf[gdf['GID_0'].isna()].index), " nan records.")

    # get gids
    gids = list(set(gdf['GID_0']))

    # Split gdf by country
    gdfs = [gdf[gdf['GID_0']==k] for k in gids]

    j = 0
    while j < len(gids):
      yield gdfs[j].to_file(os.path.join(outpath, str(gids[j])+'_'+parcelID+".geojson"), driver = 'GeoJSON')
      j += 1

    i += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

### v1. Tidy up using GEE

#### Function

In [None]:
def tidy_up(geojson_path):

  # Load GeoJSON
  with open(geojson_path) as f:
    geojson_ini = geojson.load(f)
  
  # Subset big GeoJSON into processible parts (2500 items/part)
  geojson_tiled = [tuple(geojson_ini[i:i+2500]) for i in range(0, len(geojson_ini['features']), 2500)]

  # Load GeoJSON as feature collection
  fc = [ee.FeatureCollection(i) for i in geojson_tiled]

  # Pond Area in square meters
  fc_area = [i.map(lambda feature: feature.set({'pondSize_m2': feature.area()})) for i in fc]

  # Get Centroid of Pond Polygons, thus transfer geometry type from polygon to point
  fc_point= [i.map(lambda feature: feature.centroid()) for i in fc_area]

  return fc_point

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/1.v1.split_by_country/")
files = sorted(glob(os.path.join(os.getcwd(), '*')))

def genFun(n): # len(files)

  i = 0
  while i < n:
    input = files[i]
    output = tidy_up(input) # output is a list

    outfile_namePrefix = files[i].split('/')[-1].split('.')[0]

    for j in range(len(output)):
      task_config = {
              'description': 'zonStats_tidy',
              'fileNamePrefix': outfile_namePrefix+'_'+str(j),
              'folder': '2.v1.area_centroid',
              'fileFormat': 'GeoJSON'}
      task = ee.batch.Export.table.toDrive(output[j], **task_config)
      yield task.start()
      
    i += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

### Merge Dataframes by Country

In [None]:
#countries = ['Malaysia', 'Thailand', 'Vietnam', 'Myanmar', 'Cambodia']
countries = ['KHM', 'MMR', 'MYS', 'THA', 'VNM']

def genFun(n):
  inPath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/2.v1.area_centroid/"
  outPath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/"

  i = 0
  while i < n:
    files = glob(os.path.join(inPath, '*'+countries[i]+'*.geojson'))
    dfs = [gpd.read_file(i) for i in files]
    df = pd.concat([i for i in dfs])
    yield df.to_file(os.path.join(outPath, countries[i]+"_merged.geojson"))
    i += 1

gen = genFun(len(countries))

In [None]:
# Exhaust Generator
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

### De-geometry

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/")
files = glob(os.path.join(os.getcwd(), '*merged*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/KHM_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/MMR_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/MYS_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/THA_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/VNM_merged.geojson']

In [None]:
#gdf = gpd.read_file(files[0])

In [None]:
#df = gdf.drop('geometry', 1)

  """Entry point for launching an IPython kernel.


In [None]:
def genFun(n): # n=len(files)
  outpath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/"
  j = 0
  while j < n:
    gdf = gpd.read_file(files[j])
    df = gdf.drop('geometry', 1)
    country = files[j].split('/')[-1].split('_')[0]
    yield df.to_csv(os.path.join(outpath, country+'_degeom.csv'), index=False)
    j += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

  
  
  
  
  


### Calculate admin region Area using GEE

#### Flow

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfB_admin/lev.1/")
files = glob(os.path.join(os.getcwd(), '*')) # Admin level-1 \ -2
countries = [i.split('/')[-1].split('_')[0] for i in files]
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfB_admin/lev.1/Cambodia_lev1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfB_admin/lev.1/Thailand_lev1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfB_admin/lev.1/Vietnam_lev1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfB_admin/lev.1/Malaysia_lev1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfB_admin/lev.1/Myanmar_lev1.geojson']

In [None]:
with open(files[1]) as f:
  geojson_ini = geojson.load(f)

In [None]:
len(geojson_ini['features'])

77

In [None]:
# Subset big GeoJSON into processible parts (2500 items/part)
geojson_tiled = [tuple(geojson_ini[i:i+20]) for i in range(0, len(geojson_ini['features']), 20)]
len(geojson_tiled[-1])

17

In [None]:
# Load GeoJSON as feature collection
fc = [ee.FeatureCollection(i) for i in geojson_tiled]
type(fc)

list

In [None]:
# Pond Area in square meters
fc_area = [i.map(lambda feature: feature.set({'Area_m2': feature.area()})) for i in fc]
type(fc_area)

list

In [None]:
# Get Centroid of Pond Polygons, thus transfer geometry type from polygon to point
#fc_point= [i.map(lambda feature: feature.centroid()) for i in fc_area]
#type(fc_point)

list

In [None]:
# Export Feature Collection as GeoJSON
for i in range(len(fc_area)):
  task_config = {
              'description': 'add_area',
              'fileNamePrefix': countries[1]+'_lev1_areaAd_'+str(i),
              'folder': 'add_area',
              'fileFormat': 'GeoJSON'}

  task = ee.batch.Export.table.toDrive(fc_area[i], **task_config)
  task.start()

In [None]:
task.status()

#### Function

In [None]:
def add_area(geojson_path):

  # Load GeoJSON
  with open(geojson_path) as f:
    geojson_ini = geojson.load(f)

  # Subset big GeoJSON into processible parts (20 items/part)
  geojson_tiled = [tuple(geojson_ini[i:i+10]) for i in range(0, len(geojson_ini['features']), 10)]
  
  # Load GeoJSON as feature collection
  fc = [ee.FeatureCollection(i) for i in geojson_tiled]

  # Pond Area in square meters
  fc_area = [i.map(lambda feature: feature.set({'Area_m2': feature.area()})) for i in fc]

  return fc_area

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfB_admin/lev.1/")

#files = glob(os.path.join(os.getcwd(), '*')) # Admin level-1 \ -2
#countries = [i.split('/')[-1].split('_')[0] for i in files]

countries = ['Malaysia', 'Vietnam', 'Myanmar']
files = [glob(os.path.join(os.getcwd(), '*'+i+'*.geojson')) for i in countries]
from itertools import chain
files = list(chain(*files))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfB_admin/lev.1/Malaysia_lev1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfB_admin/lev.1/Vietnam_lev1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfB_admin/lev.1/Myanmar_lev1.geojson']

In [None]:
def genFun(n): # len(countries)

  i = 0
  while i < n:
    input = files[i]
    output = add_area(input) # output is a list

    for j in range(len(output)):
      task_config = {
              'description': 'add_area',
              'fileNamePrefix': countries[i]+'_lev1_areaAd_'+str(j),
              'folder': 'add_area',
              'fileFormat': 'GeoJSON'}
      task = ee.batch.Export.table.toDrive(output[j], **task_config)
      yield task.start()
      
    i += 1

gen = genFun(len(countries))

In [None]:
# Exhaust Generator
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

## Plot: time series of active pond numbers & pond area

In [22]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/KHM_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/MMR_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/MYS_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/THA_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/VNM_degeom.csv']

**Note**: There might be ponds whose time series start in 1987, the others in 1988, so the "1987" column might be `NA` for those whose time series start in 1988.

### Flow

#### By Country

In [23]:
df = pd.read_csv(files[1])
# Fill NA of first column
#df[[df.columns[0]]] = df[[df.columns[0]]].fillna(value=2)
# Convert first column to integer
#df = df.astype({df.columns[0]:int})

In [25]:
tsCol = [i for i in df.columns if 'majority' in i]

In [None]:
active_ponds = [len(df[df[i]==1]) for i in tsCol]

In [None]:
active_area_km2 = [round(sum(df[df[i]==1]['Area'])/1e6, 2) for i in tsCol]

In [28]:
years = [i.split('_')[0] for i in tsCol]

In [33]:
country = [files[1].split('/')[-1].split('_')[0]]*len(years)
len(country)

33

In [37]:
country2 = list(set(list(df['NAME_0'])))*len(years)
len(country2)

33

In [None]:
data = {'country': country, 
        'year': years,
        'active_count': active_ponds,
        'active_area_km2': active_area_km2}

df_plot = pd.DataFrame(data)

#### By Province / State

In [None]:
countries = [i.split("/")[-1].split(".")[0].split("_")[0] for i in files]
countries

['Malaysia', 'Thailand', 'Vietnam', 'Myanmar', 'Cambodia']

In [None]:
df = pd.read_csv(files[0])

In [None]:
col_ini = [i for i in df.columns if 'majority' in i]

In [None]:
# Get unique provinces
provinces = list(set(df['NAME_1']))

In [None]:
# Subset df by province
df_grouped = [df[df['NAME_1']==i] for i in provinces]

In [None]:
# Column "count" of active ponds sorted by lev-1 province, lev-2 years
col_count = [[len(i[i[j]==1].index) for j in col_ini] for i in df_grouped]

In [None]:
# Column "area" of active ponds sorted by lev-1 province, lev-2 years 
col_area = [[round(i[i[j]==1]['Area'].sum()/1e4, 2) for j in col_ini] for i in df_grouped]

In [None]:
col_province = [[i]*len(col_ini) for i in provinces] # Column "province"
col_yr = [[i.split('_')[0] for i in col_ini]]*len(col_province) # Column "year"

In [None]:
from itertools import chain
# Unnest nested lists
col_count = list(chain(*col_count))
col_area = list(chain(*col_area))
col_yr = list(chain(*col_yr))
col_province = list(chain(*col_province))

col_country = [countries[0]]*len(col_province)

In [None]:
print(len(col_count), len(col_area), len(col_yr), len(col_province), len(col_country))

495 495 495 495 495


In [None]:
df_output = pd.DataFrame({"country": col_country,
                          "province": col_province,
                          "year": col_yr,
                          "ponds_count": col_count,
                          "ponds_area_ha": col_area})

In [None]:
df_output

Unnamed: 0,country,province,year,ponds_count,ponds_area_ha
0,Malaysia,Pahang,1987,53,15.39
1,Malaysia,Pahang,1988,555,253.35
2,Malaysia,Pahang,1989,582,285.75
3,Malaysia,Pahang,1990,659,365.79
4,Malaysia,Pahang,1991,742,400.02
...,...,...,...,...,...
490,Malaysia,Kuala Lumpur,2015,93,151.72
491,Malaysia,Kuala Lumpur,2016,89,148.51
492,Malaysia,Kuala Lumpur,2017,93,151.70
493,Malaysia,Kuala Lumpur,2018,95,151.76


### Function

#### By Country

In [38]:
def genFun(n): # n=len(files)
  outpath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/byCountry_km2/v1"
  j = 0
  while j < n:
    df = pd.read_csv(files[j])
    # Fill NA of first column
    #df[[df.columns[0]]] = df[[df.columns[0]]].fillna(value=2)
    # Convert first column to integer
    #df = df.astype({df.columns[0]:int})

    tsCol = [i for i in df.columns if 'majority' in i]
    active_ponds = [len(df[df[i]==1]) for i in tsCol]
    active_area = [round(sum(df[df[i]==1]['pondSize_m2'])/1e6, 2) for i in tsCol]
    years = [i.split('_')[0] for i in tsCol]
    country = list(set(list(df['NAME_0'])))*len(years)

    data = {'country': country, 
            'year': years,
            'pond_count': active_ponds,
            'pond_area_km2': active_area}
    df_plot = pd.DataFrame(data)

    yield df_plot.to_csv(os.path.join(outpath, country[0]+'_plot_count_area.csv'), index=False)
    j += 1

gen = genFun(len(files))

In [39]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

#### By Province

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/")
files = glob(os.path.join(os.getcwd(), '*'))
countries = [i.split("/")[-1].split(".")[0].split("_")[0] for i in files]

In [None]:
outpath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/byProvince_ha"

In [None]:
def genFun(n): # n=len(files)
  
  k = 0
  while k < n:
    df = pd.read_csv(files[k])
    
    col_ini = [i for i in df.columns if 'majority' in i]

    # Get unique provinces
    provinces = list(set(df['NAME_1']))

    # Subset df by province
    df_grouped = [df[df['NAME_1']==i] for i in provinces]

    # Column "count" of active ponds sorted by lev-1 province, lev-2 years
    col_count = [[len(i[i[j]==1].index) for j in col_ini] for i in df_grouped]
    # Column "area" of active ponds sorted by lev-1 province, lev-2 years 
    col_area = [[round(i[i[j]==1]['Area'].sum()/1e4, 2) for j in col_ini] for i in df_grouped]
    # Column "province"
    col_province = [[i]*len(col_ini) for i in provinces] 
    # Column "year"
    col_yr = [[i.split('_')[0] for i in col_ini]]*len(col_province) 

    from itertools import chain
    # Unnest nested lists
    col_count = list(chain(*col_count))
    col_area = list(chain(*col_area))
    col_yr = list(chain(*col_yr))
    col_province = list(chain(*col_province))

    col_country = [countries[k]]*len(col_province)

    df_output = pd.DataFrame({"country": col_country,
                          "province": col_province,
                          "year": col_yr,
                          "ponds_count": col_count,
                          "ponds_area_ha": col_area})

    yield df_output.to_csv(os.path.join(outpath, countries[k]+'_count_area_byProvince.csv'), index=False)
    k += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

### Merge

In [43]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/byCountry_km2/v1")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/byCountry_km2/v1/Cambodia_plot_count_area.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/byCountry_km2/v1/Myanmar_plot_count_area.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/byCountry_km2/v1/Malaysia_plot_count_area.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/byCountry_km2/v1/Thailand_plot_count_area.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/byCountry_km2/v1/Vietnam_plot_count_area.csv']

In [44]:
dfs = [pd.read_csv(i) for i in files]

In [45]:
# Add new column: OO percent area of area in 2019
dfs = [i.assign(area_percent_of2019 = lambda x: round(x['pond_area_km2'] / float(x['pond_area_km2'][x['year']==2019]) * 100, 2)) for i in dfs]
# Add new column: OO percent count of count in 2019
dfs = [i.assign(count_percent_of2019 = lambda x: round(x['pond_count'] / float(x['pond_count'][x['year']==2019]) * 100, 2)) for i in dfs]

In [46]:
#dfs = [pd.read_csv(i) for i in files]
df = pd.concat([i for i in dfs])

In [47]:
df.to_csv(os.path.join(os.getcwd(), 'all_count_area_byCountry.csv'), index=False)

## Plot: Statistics of NA in ponds
*(Do it again using de-geom csv file after filling NA with 2 for first time stamp)*

In [48]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/KHM_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/MMR_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/MYS_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/THA_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/VNM_degeom.csv']

### Flow

In [None]:
df = gpd.read_file(files[0])

In [None]:
files[-1].split('/')[-1].split('_')[0]

'Cambodia'

In [None]:
# Count of Rows
len(df.index)

17658

In [None]:
col_stats = [i for i in df.columns if 'majority' in i]
years = [i.split('_')[0] for i in col_stats]
na_rate = [len(df[df[i]==2].index)/len(df.index)*100 for i in col_stats]
country = ['Cambodia']*len(years)

In [None]:
data = {'country': country, 
        'year': years,
        'na_rate': na_rate}

df_NA = pd.DataFrame(data)

### Function

In [50]:
def genFun(n): # n=len(files)
  j = 0
  #lst = []
  while j < n:
    df = pd.read_csv(files[j])
    col_stats = [i for i in df.columns if 'majority' in i]
    years = [i.split('_')[0] for i in col_stats]
    na_rate = [len(df[df[i]==2].index)/len(df.index)*100 for i in col_stats]
    country = list(set(list(df['NAME_0'])))*len(years)
    data = {'country': country, 
            'year': years,
            'na_rate': na_rate}
    df_NA = pd.DataFrame(data)
    yield df_NA.to_csv(os.path.join("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/5.NA_Stats/v1/", country[0]+'_naStats.csv'), index=False)
    j += 1

gen = genFun(len(files))

In [51]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

### Explore

In [52]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/5.NA_Stats/v1/")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/5.NA_Stats/v1/Cambodia_naStats.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/5.NA_Stats/v1/Myanmar_naStats.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/5.NA_Stats/v1/Malaysia_naStats.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/5.NA_Stats/v1/Thailand_naStats.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/5.NA_Stats/v1/Vietnam_naStats.csv']

In [55]:
dfs = [pd.read_csv(i) for i in files]
df = pd.concat([i for i in dfs])
#df = df.drop('Unnamed: 0', 1)

In [56]:
df.to_csv("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/5.NA_Stats/v1/naStats_merged_v1.csv", index=False)

## Map: aquaculture pond development by admin region

### Flows

#### 1.1: Total Area of Ponds by Provinces / Districts

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

admin_level = 'admin_lev1'
gid = 'GID_1'

filesA = glob(os.path.join(os.getcwd(), '*dfA*', '*')) # Pond zonStats
filesB = glob(os.path.join(os.getcwd(), '*dfB*', '*1*', 'v1.1', "*")) # Admin level-1 \ -2

files = [[i for i in filesA+filesB if j in i] for j in countries]

In [None]:
dfA = pd.read_csv(files[1][0]) # dfA is df of pond zonStats
dfB = gpd.read_file(files[1][1]) # dfB is gdf of admin region

In [None]:
# Subset admin gdf to regions owning ponds
dfB_filtered = dfB[dfB[gid].isin(dfA[gid])]
GIDs = dfB_filtered[gid].tolist()

In [None]:
colnames = [col for col in dfA.columns if 'majority' in col]

In [None]:
# Remove years with data gap > 5%
for i in range(len(colnames)):
  if len(dfA[dfA[colnames[i]]==2]) / len(dfA[colnames[i]]) > 0.05:
    dfA = dfA.drop(colnames[i], axis=1)

In [None]:
colnames_updated = [col for col in dfA.columns if 'majority' in col]
years = [i.split('_')[0] for i in colnames_updated]

In [None]:
# Sum pond area by admin_lev for each year, add this as attribute of admin region.
dfB_joined = dfB_filtered
for y in years:
  dfA_1Yactive = dfA[dfA[y+'_majority']==1]

  df_area_1Y = dfA_1Yactive.groupby([gid], as_index=False)['Area'].sum()
  df_area_1Y['Area'] = round(df_area_1Y['Area']/10e4, 2)
  df_area_1Y = df_area_1Y.rename(columns={'Area':'Area_ha_'+y})

  dfB_joined = pd.merge(dfB_joined, df_area_1Y, on=gid, how='left')

In [None]:
# Fill NA with 0
colArea = [col for col in dfB_joined.columns if 'Area_ha' in col]
dfB_joined[colArea] = dfB_joined[colArea].fillna(0)

#### 1.2: Density of Pond Area by Provinces / Districts

In [66]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

admin_level = 'admin_lev1'
gid = 'GID_1'

filesA = glob(os.path.join(os.getcwd(), '*dfA*', '*')) # Pond zonStats
filesB = glob(os.path.join(os.getcwd(), '*dfB*', '*1*', 'v1.1', "*")) # Admin level-1 \ -2

files = [[i for i in filesA+filesB if j in i] for j in countries]

In [68]:
dfA = pd.read_csv(files[0][0]) # dfA is df of pond zonStats
dfB = gpd.read_file(files[0][1]) # dfB is gdf of admin region

In [69]:
# Subset admin gdf to regions owning ponds
dfB_filtered = dfB[dfB[gid].isin(dfA[gid])]
GIDs = dfB_filtered[gid].tolist()

In [70]:
colnames = [col for col in dfA.columns if 'majority' in col]

In [71]:
# Remove years with data gap > 5%
for i in range(len(colnames)):
  if len(dfA[dfA[colnames[i]]==2]) / len(dfA[colnames[i]]) > 0.05:
    dfA = dfA.drop(colnames[i], axis=1)

In [72]:
colnames_updated = [col for col in dfA.columns if 'majority' in col]
years = [i.split('_')[0] for i in colnames_updated]

In [81]:
# Sum pond area by admin_lev for each year, add this as attribute of admin region.
dfB_joined = dfB_filtered
for y in years:
  dfA_1Yactive = dfA[dfA[y+'_majority']==1]

  df_area_1Y = dfA_1Yactive.groupby([gid], as_index=False)['pondSize_m2'].sum()
  df_area_1Y['pondSize_m2'] = round(df_area_1Y['pondSize_m2']/10e4, 2)
  df_area_1Y = df_area_1Y.rename(columns={'pondSize_m2':'pondArea_ha_'+y})

  dfB_joined = pd.merge(dfB_joined, df_area_1Y, on=gid, how='left')

In [83]:
# Fill NA with 0
colArea = [col for col in dfB_joined.columns if 'pondArea_ha' in col]
dfB_joined[colArea] = dfB_joined[colArea].fillna(0)

In [84]:
# Calculate new column: pond area in ha/km2
#colname = [i for i in dfB_joined.columns if "pond_area_ha" in i]

for i in colArea:
   new_colname = "density_(m2/ha)_" + i.split("_")[-1]
   dfB_joined[new_colname] = (dfB_joined[i] * 10e4) / (dfB_joined['Area_m2'] / 10e4)

#### 2: Annual Rate of Increase in Pond Area by Provinces / Districts
$ R = \frac{A_{Y} - A_{Y-1}}{A_{Y-1}} \times 100 $

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
#countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

files = glob(os.path.join(os.getcwd(), '*dfC*', '*2*', '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_admin_zonStats_aggregated/admin_lev2/Cambodia_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_admin_zonStats_aggregated/admin_lev2/Vietnam_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_admin_zonStats_aggregated/admin_lev2/Thailand_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_admin_zonStats_aggregated/admin_lev2/Myanmar_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_admin_zonStats_aggregated/admin_lev2/Malaysia_zonStats_admin2.geojson']

In [None]:
df = gpd.read_file(files[1])

In [None]:
colnames = [col for col in df.columns if 'Area_ha' in col]
years = [i.split('_')[-1] for i in colnames]

In [None]:
# Annual Increase Rate
df_increase = df
for i in range(len(colnames)-1):
  increase_rate = round((df[colnames[i+1]] - df[colnames[i]]) / df[colnames[i]]*100, 1)
  df_increase['increase_rate(%)_'+years[i+1]] = increase_rate  

In [None]:
# Replace infinity with nan
df_increase.replace([np.inf, -np.inf], np.nan, inplace=True)

#### 3.1 Five-year Average Rate of Increase in Pond Area by Provinces / Districts:

$ avg.R = \frac{A_{T.tail} - A_{T.head}}{A_{T.head}} \times \frac{1}{T_{length}} \times 100 $

* Reference: https://sciencing.com/calculate-average-percent-change-5485263.html

* Output `None` if, for a time period, either "Area" is 0 for all years or only one year has "Area"!=0

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
#countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

files = glob(os.path.join(os.getcwd(), '*dfC*', '*2*', '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Cambodia_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Vietnam_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Thailand_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Myanmar_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Malaysia_zonStats_admin2.geojson']

In [None]:
df = gpd.read_file(files[2])

In [None]:
x = range(1985, 2020)
y = [x[n:n+6] for n in range(0, len(x), 5)]
category_yrs = [['Area_ha_'+str(i) for i in j] for j in y]

In [None]:
colnames = [col for col in df.columns if 'Area_ha' in col]
new_col = ['5y_avgGr_'+i[-1].split("_")[-1] for i in category_yrs]

period = [[i for i in j if i in colnames] for j in category_yrs]

In [None]:
def my_function(row):
  """For one row (admin region in the case), For every time period, 
  calculate the growth rate between the last value and the first non-zero value.
  
  This function is to be used with apply() function for on a DF"""

  new_attr = []

  for i in range(len(period)):
    cols = list(row[period[i]]) 
    # Select columns of a time period
    nzero_index = next((i for i, x in enumerate(cols) if x != 0), None) 
    # Find the index of the first non-zero element
  
    if nzero_index != None and len(cols[nzero_index:]) > 1:
      avgIn = (cols[-1] - cols[nzero_index]) / cols[nzero_index] / (len(cols[nzero_index:])-1) *100
      avgIn = round(avgIn, 1)
    else: 
      avgIn = None

    new_attr = new_attr + [avgIn]
  return new_attr

In [None]:
new_attrs_byrow = df.apply(my_function, axis=1) # axis=1 determines row-wise operation
# Each item are attributes for one row

In [None]:
new_attrs_bycol = [[i[j] for i in new_attrs_byrow] for j in range(len(new_attrs_byrow[0]))]
# Each item are the attributes for one column

In [None]:
df[new_col] = pd.Series(new_attrs_bycol)

In [None]:
df.head()

Unnamed: 0,OBJECTID,GID_0,NAME_0,GID_1,NAME_1,ENGTYPE_1,GID_2,NAME_2,ENGTYPE_2,Shape_Leng,...,Area_ha_2018,Area_ha_2019,geometry,5y_avgGr_1990,5y_avgGr_1995,5y_avgGr_2000,5y_avgGr_2005,5y_avgGr_2010,5y_avgGr_2015,5y_avgGr_2019
0,603,THA,Thailand,THA.13_1,Chumphon,Province,THA.13.1_1,Lamae,District,1.010119,...,4.46,4.7,"MULTIPOLYGON (((99.13953 9.82591, 99.13917 9.8...",7.8,91.9,7.9,2.0,6.1,-0.2,3.7
1,604,THA,Thailand,THA.13_1,Chumphon,Province,THA.13.2_1,Lang Suan,District,1.424844,...,8.21,8.64,"MULTIPOLYGON (((99.18861 10.05694, 99.18916 10...",-2.4,0.1,23.9,-4.0,4.0,0.2,9.0
2,605,THA,Thailand,THA.13_1,Chumphon,Province,THA.13.3_1,Muang Chumphon,District,1.878261,...,45.82,46.23,"MULTIPOLYGON (((99.30019 10.32064, 99.30055 10...",30.3,9.7,4.9,-2.4,3.1,1.0,1.8
3,606,THA,Thailand,THA.13_1,Chumphon,Province,THA.13.5_1,Phato,District,1.437504,...,0.0,0.0,"MULTIPOLYGON (((98.88230 10.00423, 98.88304 10...",,,,,,,
4,607,THA,Thailand,THA.13_1,Chumphon,Province,THA.13.6_1,Sawi,District,1.750744,...,14.29,14.46,"MULTIPOLYGON (((99.24962 10.19962, 99.24889 10...",37.2,33.4,6.0,1.1,1.2,0.1,2.3


#### 3.2 Average Annual Growth Rate
* $ avg.R = \frac{R_{T.head}+...+R_{T.tail}}{T_{length}} $
* Reference: https://www.investopedia.com/terms/a/aagr.asp

In [None]:
# Source: https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

years_category_5y = list(chunks(range(1986, 2020), 5))
years_category_10y = list(chunks(range(1981, 2020), 10))
colnames_category_5y = [['increase_rate(%)_'+str(i) for i in j] for j in years_category_5y]
colnames_category_10y = [['increase_rate(%)_'+str(i) for i in j] for j in years_category_10y]
#colnames_category

In [None]:
colnames_2 = [col for col in df_increase.columns if 'increase' in col]
colnames_5y_chunked = [[i for i in j if i in colnames_2] for j in colnames_category_5y]
colnames_10y_chunked = [[i for i in j if i in colnames_2] for j in colnames_category_10y]

In [None]:
df_avgIncrease = df_increase

for i in range(len(colnames_5y_chunked)):
  new_colname = '5YavgIncrease(%)_period_'+str(years_category_5y[i][-1])
  df_avgIncrease[new_colname] = round(df_avgIncrease[colnames_5y_chunked[i]].mean(axis=1), 1)

for i in range(len(colnames_10y_chunked)):
  new_colname = '10YavgIncrease(%)_period_' + str(years_category_10y[i][-1])
  df_avgIncrease[new_colname] = round(df_avgIncrease[colnames_10y_chunked[i]].mean(axis=1), 1)

#### 4. Ten-year Average Rate of Increase in Pond Area by Provinces / Districts

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
#countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

files = glob(os.path.join(os.getcwd(), '*dfC*', '*2*', '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Cambodia_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Vietnam_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Thailand_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Myanmar_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Malaysia_zonStats_admin2.geojson']

In [None]:
df = gpd.read_file(files[1])

In [None]:
x = range(1985, 2020)
x = [str(i) for i in x]

from more_itertools import split_after
y = list(split_after(x, lambda x: x.endswith('0')))
for i in range(1, len(y)):
  y[i] = [y[i-1][-1]] + y[i]

category_yrs = [['Area_ha_'+str(i) for i in j] for j in y]

In [None]:
colnames = [col for col in df.columns if 'Area_ha' in col]
new_col = ['10y_avgGr_'+i[-1].split("_")[-1] for i in category_yrs]

period = [[i for i in j if i in colnames] for j in category_yrs]

In [None]:
def my_function(row):
  """For one row (admin region in the case), For every time period, 
  calculate the growth rate between the last value and the first non-zero value.
  
  This function is to be used with apply() function for on a DF"""

  new_attr = []

  for i in range(len(period)):
    cols = list(row[period[i]]) 
    # Select columns of a time period
    nzero_index = next((i for i, x in enumerate(cols) if x != 0), None) 
    # Find the index of the first non-zero element
  
    if nzero_index != None and len(cols[nzero_index:]) > 1:
      avgIn = (cols[-1] - cols[nzero_index]) / cols[nzero_index] / (len(cols[nzero_index:])-1) *100
      avgIn = round(avgIn, 1)
    else: 
      avgIn = None

    new_attr = new_attr + [avgIn]
  return new_attr

In [None]:
new_attrs_byrow = df.apply(my_function, axis=1) # axis=1 determines row-wise operation
# Each item are attributes for one row

In [None]:
new_attrs_bycol = [[i[j] for i in new_attrs_byrow] for j in range(len(new_attrs_byrow[0]))]
# Each item are the attributes for one column

In [None]:
df[new_col] = pd.Series(new_attrs_bycol)

In [None]:
df.head()

Unnamed: 0,OBJECTID,GID_0,NAME_0,GID_1,NAME_1,ENGTYPE_1,GID_2,NAME_2,ENGTYPE_2,Shape_Leng,...,Area_ha_2015,Area_ha_2016,Area_ha_2017,Area_ha_2018,Area_ha_2019,geometry,10y_avgGr_1990,10y_avgGr_2000,10y_avgGr_2010,10y_avgGr_2019
0,4033,VNM,Vietnam,VNM.1_1,An Giang,Province,VNM.1.1_1,An PhÃº,District,0.703803,...,11.18,11.19,11.26,11.35,11.44,"MULTIPOLYGON (((105.13100 10.92130, 105.13191 ...",-0.1,-0.5,0.2,0.4
1,4034,VNM,Vietnam,VNM.1_1,An Giang,Province,VNM.1.10_1,Thoáº¡i SÆ¡n,District,0.990901,...,5.08,4.3,6.15,6.06,7.27,"MULTIPOLYGON (((105.12011 10.37594, 105.12595 ...",-4.8,15.1,-4.1,8.5
2,4035,VNM,Vietnam,VNM.1_1,An Giang,Province,VNM.1.11_1,Tri TÃ´n,District,1.054701,...,1.49,3.83,4.38,4.38,4.6,"MULTIPOLYGON (((105.11389 10.43545, 105.11304 ...",90.9,10.4,-7.9,148.6
3,4036,VNM,Vietnam,VNM.1_1,An Giang,Province,VNM.1.2_1,Chá»£ Má»›i,District,0.838008,...,10.32,10.45,10.5,10.85,11.18,"MULTIPOLYGON (((105.55319 10.51669, 105.55775 ...",0.5,3.3,-0.2,2.4
4,4037,VNM,Vietnam,VNM.1_1,An Giang,Province,VNM.1.3_1,ChÃ¢u Äá»‘c,City,0.454041,...,0.97,0.96,0.97,0.98,0.99,"MULTIPOLYGON (((105.12457 10.70979, 105.12977 ...",-13.0,0.6,-3.5,7.2


### Functions

#### 1.1. Total Area of Ponds in Provinces / Districts

In [17]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

filesA = glob(os.path.join(os.getcwd(), '*dfA*', '*')) # Pond zonStats
#filesB = glob(os.path.join(os.getcwd(), '*dfB*', '*1*', '*')) # Admin level-1 \ -2
filesB = glob(os.path.join(os.getcwd(), '*dfB*', '*1*', '*1.1*', '*'))

files = [[i for i in filesA+filesB if j in i] for j in countries]

gidLevl = 'GID_1'

In [19]:
def genFun(n): # n=len(files)
  j = 0
  while j < n:
    dfA = pd.read_csv(files[j][0]) # dfA is df of pond zonStats
    dfB = gpd.read_file(files[j][1]) # dfB is gdf of admin region

    dfB_filtered = dfB[dfB[gidLevl].isin(dfA[gidLevl])] # Keep only admin regions containing ponds
    GIDs = dfB_filtered[gidLevl].tolist()

    colnames = [col for col in dfA.columns if 'majority' in col]
    
    # Remove years with data gap > 5%
    for i in range(len(colnames)):
      if len(dfA[dfA[colnames[i]]==2]) / len(dfA[colnames[i]]) > 0.05:
        dfA = dfA.drop(colnames[i], axis=1)

    colnames_updated = [col for col in dfA.columns if 'majority' in col]
    years = [i.split('_')[0] for i in colnames_updated]

    dfB_joined = dfB_filtered
    for y in years:
      dfA_1Yactive = dfA[dfA[y+'_majority']==1] # For year Y, select ponds that are active 

      df_area_1Y = dfA_1Yactive.groupby([gidLevl], as_index=False)['pondSize_m2'].sum() # Group active ponds by GID and calculate sum of pond area for each GID
      df_area_1Y['pondSize_m2'] = round(df_area_1Y['pondSize_m2']/10e4, 2) # Transfer pond area from m2 to ha
      df_area_1Y = df_area_1Y.rename(columns={'pondSize_m2':'pondArea_ha_'+y}) # Rename column

      dfB_joined = pd.merge(dfB_joined, df_area_1Y, on=gidLevl, how='left') # Join "pond area by GID" of year Y into admin df

      colArea = [col for col in dfB_joined.columns if 'pondArea_ha' in col]   
      dfB_joined[colArea] = dfB_joined[colArea].fillna(0)
      # NA filled with 0

    yield dfB_joined.to_file(os.path.join(os.getcwd(), 'dfC_pondArea_byRegion', 'admin_lev1', countries[j]+'_zonStats_admin1.geojson'), driver='GeoJSON')
    j += 1

gen = genFun(len(files))

In [20]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

#### 1.2. Density of Pond Area by Province / District

In [86]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

filesA = glob(os.path.join(os.getcwd(), '*dfA*', '*')) # Pond zonStats
filesB = glob(os.path.join(os.getcwd(), '*dfB*', '*1*', 'v1.1', '*')) # Admin level-1 \ -2

files = [[i for i in filesA+filesB if j in i] for j in countries]

gidLevl = 'GID_1'

In [87]:
def genFun(n): # n=len(files)
  j = 0
  while j < n:
    dfA = pd.read_csv(files[j][0]) # dfA is df of pond zonStats
    dfB = gpd.read_file(files[j][1]) # dfB is gdf of admin region

    dfB_filtered = dfB[dfB[gidLevl].isin(dfA[gidLevl])] # Keep only admin regions containing ponds
    GIDs = dfB_filtered[gidLevl].tolist()

    colnames = [col for col in dfA.columns if 'majority' in col]
    
    # Remove years with data gap > 5%
    for i in range(len(colnames)):
      if len(dfA[dfA[colnames[i]]==2]) / len(dfA[colnames[i]]) > 0.05:
        dfA = dfA.drop(colnames[i], axis=1)

    colnames_updated = [col for col in dfA.columns if 'majority' in col]
    years = [i.split('_')[0] for i in colnames_updated]

    dfB_joined = dfB_filtered
    for y in years:
      dfA_1Yactive = dfA[dfA[y+'_majority']==1] # For year Y, select ponds that are active 

      df_area_1Y = dfA_1Yactive.groupby([gid], as_index=False)['pondSize_m2'].sum() # Group active ponds by GID and calculate sum of pond area for each GID
      df_area_1Y['pondSize_m2'] = round(df_area_1Y['pondSize_m2']/10e4, 2) # Transfer pond area from m2 to ha
      df_area_1Y = df_area_1Y.rename(columns={'pondSize_m2':'pondArea_ha_'+y}) # Rename column

      dfB_joined = pd.merge(dfB_joined, df_area_1Y, on=gidLevl, how='left') # Join "pond area by GID" of year Y into admin df

      colArea = [col for col in dfB_joined.columns if 'pondArea_ha' in col]   
      dfB_joined[colArea] = dfB_joined[colArea].fillna(0)
      # NA filled with 0

      # Calculate new column: pond area in m2/ha
      for i in colArea:
        new_colname = "density_(m2/ha)_" + i.split("_")[-1]
        dfB_joined[new_colname] = (dfB_joined[i] * 10e4) / (dfB_joined['Area_m2'] / 10e4)

    yield dfB_joined.to_file(os.path.join(os.getcwd(), 'dfF_pondDensity_byRegion', 'lev1', countries[j]+'_pondDensity_lev1.geojson'), driver='GeoJSON')
    j += 1

gen = genFun(len(files))

In [88]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

#### 2. Annual Growth Rate of Pond Area by Provinces / Districts

In [22]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
admin_level = "admin_lev2"

files = glob(os.path.join(os.getcwd(), '*dfC*', admin_level, '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Cambodia_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Vietnam_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Thailand_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Myanmar_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Malaysia_zonStats_admin2.geojson']

In [23]:
countries = [i.split('/')[-1].split('_')[0] for i in files]

In [24]:
def genFun(n): # n=len(files)
  j = 0
  while j < n:
    df = gpd.read_file(files[j])

    # Annual Increase Rate
    colnames = [col for col in df.columns if 'pondArea_ha' in col]
    years = [i.split('_')[-1] for i in colnames]

    df_increase = df
    for i in range(len(colnames)-1):
      increase_rate = round((df[colnames[i+1]] - df[colnames[i]]) / df[colnames[i]]*100, 1)
      df_increase['1yGr_'+years[i+1]] = increase_rate
    # Replace infinity with nan
    df_increase.replace([np.inf, -np.inf], np.nan, inplace=True)

    yield df_increase.to_file(os.path.join(os.getcwd(), 'dfD_GrR_byRegion', admin_level, "1y_Gr", countries[j]+'_1yGr.geojson'), driver='GeoJSON')
    j += 1

gen = genFun(len(files))

In [25]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

#### 3. Five-year avg. Rate of Increase in Pond Area by Provinces / Districts

$ \frac{A_{T.tail} - A_{T.head}}{A_{T.head}} \times \frac{1}{T} \times 100 $

In [31]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")

admin_level = 'admin_lev2'
files = glob(os.path.join(os.getcwd(), '*dfC*', admin_level, '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Cambodia_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Vietnam_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Thailand_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Myanmar_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Malaysia_zonStats_admin2.geojson']

In [35]:
countries = [i.split('/')[-1].split('_')[0] for i in files]
output_path = os.path.join(os.getcwd(), 'dfD_GrR_byRegion', admin_level, '5y_Gr')

In [37]:
def genFun(n): # n=len(files)

  j = 0
  while j < n:
    df = gpd.read_file(files[j])

    x = range(1985, 2020)
    y = [x[n:n+6] for n in range(0, len(x), 5)]
    category_yrs = [['pondArea_ha_'+str(i) for i in j] for j in y]

    colnames = [col for col in df.columns if 'pondArea_ha' in col]
    new_col = ['5y_avgGr_'+i[-1].split("_")[-1] for i in category_yrs]

    period = [[i for i in j if i in colnames] for j in category_yrs]

    #---------------------------------------------------
    def my_function(row):
      """For one row (admin region in the case), For every time period, 
      calculate the growth rate between the last value and the first non-zero value.
      
      This function is to be used with apply() function for on a DF"""

      new_attr = []

      for i in range(len(period)):
        cols = list(row[period[i]]) 
        # Select columns of a time period
        nzero_index = next((i for i, x in enumerate(cols) if x != 0), None) 
        # Find the index of the first non-zero element
      
        if nzero_index != None and len(cols[nzero_index:]) > 1:
          avgIn = (cols[-1] - cols[nzero_index]) / cols[nzero_index] / (len(cols[nzero_index:])-1) *100
          avgIn = round(avgIn, 1)
        else: 
          avgIn = None

        new_attr = new_attr + [avgIn]
      return new_attr
    #---------------------------------------------------

    new_attrs_byrow = df.apply(my_function, axis=1)

    new_attrs_bycol = [[i[j] for i in new_attrs_byrow] for j in range(len(new_attrs_byrow[0]))]

    df[new_col] = pd.Series(new_attrs_bycol)

    yield df.to_file(os.path.join(output_path, countries[j]+'_5yGr.geojson'), driver='GeoJSON')
    j += 1

gen = genFun(len(files))

In [38]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

#### 4. Ten-year avg. Rate of Increase in Pond Area by Provinces / Districts

$ \frac{A_{T.tail} - A_{T.head}}{A_{T.head}} \times \frac{1}{T} \times 100 $

In [47]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")

admin_level = 'admin_lev1'
files = glob(os.path.join(os.getcwd(), '*dfC*', admin_level, '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev1/Cambodia_zonStats_admin1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev1/Vietnam_zonStats_admin1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev1/Thailand_zonStats_admin1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev1/Myanmar_zonStats_admin1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev1/Malaysia_zonStats_admin1.geojson']

In [48]:
output_path = os.path.join(os.getcwd(), 'dfD_GrR_byRegion', admin_level, '10y_Gr')
countries = [i.split('/')[-1].split('_')[0] for i in files]

In [49]:
def genFun(n): # n=len(files)

  j = 0
  while j < n:
    df = gpd.read_file(files[j])

    x = range(1985, 2020)
    x = [str(i) for i in x]

    from more_itertools import split_after
    y = list(split_after(x, lambda x: x.endswith('0')))
    for i in range(1, len(y)):
      y[i] = [y[i-1][-1]] + y[i]

    category_yrs = [['pondArea_ha_'+str(i) for i in j] for j in y]

    colnames = [col for col in df.columns if 'pondArea_ha' in col]
    new_col = ['10y_avgGr_'+i[-1].split("_")[-1] for i in category_yrs]

    period = [[i for i in j if i in colnames] for j in category_yrs]

    #---------------------------------------------------
    def my_function(row):
      """For one row (admin region in the case), For every time period, 
      calculate the growth rate between the last value and the first non-zero value.
      
      This function is to be used with apply() function for on a DF"""

      new_attr = []

      for i in range(len(period)):
        cols = list(row[period[i]]) 
        # Select columns of a time period
        nzero_index = next((i for i, x in enumerate(cols) if x != 0), None) 
        # Find the index of the first non-zero element
      
        if nzero_index != None and len(cols[nzero_index:]) > 1:
          avgIn = (cols[-1] - cols[nzero_index]) / cols[nzero_index] / (len(cols[nzero_index:])-1) *100
          avgIn = round(avgIn, 1)
        else: 
          avgIn = None

        new_attr = new_attr + [avgIn]
      return new_attr
    #---------------------------------------------------

    new_attrs_byrow = df.apply(my_function, axis=1)

    new_attrs_bycol = [[i[j] for i in new_attrs_byrow] for j in range(len(new_attrs_byrow[0]))]

    df[new_col] = pd.Series(new_attrs_bycol)

    yield df.to_file(os.path.join(output_path, countries[j]+'_10yGr.geojson'), driver='GeoJSON')
    j += 1

gen = genFun(len(files))

In [50]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

## Plot: Violin + Boxplot of Growth Rate

In [22]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
files = glob(os.path.join(os.getcwd(), "*dfD*", '*lev2*', '*5y*', '*all*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev2/5y_Gr/all_5yGr_lev2_v1.geojson']

In [23]:
gdf = gpd.read_file(files[0])

In [24]:
colnames = [i for i in gdf.columns if '5y' in i]

In [25]:
ls = []
for i in range(len(colnames)):
  df_new = gdf[['GID_0', 'NAME_0', 'GID_1', 'NAME_1', 'GID_2', 'NAME_2']]
  df_new['stats_item'] = [colnames[i]]*len(gdf.index)
  df_new['year'] = [i.split('_')[-1] for i in df_new['stats_item']]
  df_new['stats_value'] = gdf[colnames[i]]
  ls.append(df_new)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

In [26]:
df_reshaped = pd.concat([i for i in ls])

In [27]:
print(gdf.columns)
print(df_reshaped.columns)

Index(['OBJECTID', 'GID_0', 'NAME_0', 'GID_1', 'NAME_1', 'ENGTYPE_1', 'GID_2',
       'NAME_2', 'ENGTYPE_2', 'Shape_Leng', 'Shape_Area', 'pondArea_ha_1988',
       'pondArea_ha_1989', 'pondArea_ha_1990', 'pondArea_ha_1991',
       'pondArea_ha_1992', 'pondArea_ha_1993', 'pondArea_ha_1994',
       'pondArea_ha_1995', 'pondArea_ha_1996', 'pondArea_ha_1997',
       'pondArea_ha_1998', 'pondArea_ha_1999', 'pondArea_ha_2000',
       'pondArea_ha_2001', 'pondArea_ha_2002', 'pondArea_ha_2003',
       'pondArea_ha_2004', 'pondArea_ha_2005', 'pondArea_ha_2006',
       'pondArea_ha_2007', 'pondArea_ha_2008', 'pondArea_ha_2009',
       'pondArea_ha_2010', 'pondArea_ha_2011', 'pondArea_ha_2012',
       'pondArea_ha_2013', 'pondArea_ha_2014', 'pondArea_ha_2015',
       'pondArea_ha_2016', 'pondArea_ha_2017', 'pondArea_ha_2018',
       'pondArea_ha_2019', '5y_avgGr_1990', '5y_avgGr_1995', '5y_avgGr_2000',
       '5y_avgGr_2005', '5y_avgGr_2010', '5y_avgGr_2015', '5y_avgGr_2019',
       'pondArea_ha_

In [28]:
df_reshaped.to_csv(os.path.join(os.getcwd(), 'dfE_tidy_for_plot', 'lev2_all_5yGr_v1.csv'), index=False)

## Plot: Pond developments by distance to shoreline

### Flow

In [30]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/KHM_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/MMR_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/MYS_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/THA_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/VNM_merged.geojson']

In [31]:
gdf = gpd.read_file(files[0])

In [36]:
gdf.columns

Index(['1987_majority', '1988_majority', '1989_majority', '1990_majority',
       '1991_majority', '1992_majority', '1993_majority', '1994_majority',
       '1995_majority', '1996_majority', '1997_majority', '1998_majority',
       '1999_majority', '2000_majority', '2001_majority', '2002_majority',
       '2003_majority', '2004_majority', '2005_majority', '2006_majority',
       '2007_majority', '2008_majority', '2009_majority', '2010_majority',
       '2011_majority', '2012_majority', '2013_majority', '2014_majority',
       '2015_majority', '2016_majority', '2017_majority', '2018_majority',
       '2019_majority', 'ENGTYPE_1', 'GID_0', 'GID_1', 'GID_2', 'NAME_0',
       'NAME_1', 'NAME_2', 'distance', 'elevation', 'id', 'parcel_number',
       'pondSize_m2', 'geometry'],
      dtype='object')

In [None]:
countries = [i.split("/")[-1].split(".")[0].split("_")[0] for i in files]
countries

['Malaysia', 'Thailand', 'Vietnam', 'Myanmar', 'Cambodia']

In [None]:
cols_ini = [i for i in gdf.columns if 'majority' in i]
cols_yr = [i.split('_')[0] for i in cols_ini]

In [None]:
class_distance = {"[0,5)":range(0,5000), "[5,20)":range(5000,20000), "[20,50)":range(20000,50000), "[50,100)":range(50000,100000), "[100,200)":range(100000,200000)}

In [None]:
# Subset gdf by distance in predefined category
gdf_grouped = [gdf[gdf['distance'].isin(i)] for i in class_distance.values()]

In [None]:
# Column "year"
year = cols_yr * len(gdf_grouped)

# Column "distance class"
distance_class = [[i]*len(cols_yr) for i in list(class_distance.keys())]
# Column "number of ponds"
N_ponds = [[len(i[i[j]==1].index) for j in cols_ini] for i in gdf_grouped]
# Column "Area sum"
Area_ha = [[round(i[i[j] == 1]['Area'].sum()/10000, 2) for j in cols_ini] for i in gdf_grouped]

# Unnest nested lists
from itertools import chain
distance_class = list(chain(*distance_class))
N_ponds = list(chain(*N_ponds))
Area_ha = list(chain(*Area_ha))

In [None]:
# Column "country"
country = [countries[1]]*len(year)

In [None]:
df = pd.DataFrame({'country': country, 'year':year, 'distance(km)':distance_class, 'pond_number':N_ponds, 'pond_area(ha)':Area_ha})

In [None]:
df

Unnamed: 0,country,year,distance(km),pond_number,pond_area(ha)
0,Thailand,1987,"[0,5)",36621,28291.31
1,Thailand,1988,"[0,5)",39331,29471.68
2,Thailand,1989,"[0,5)",39735,29176.44
3,Thailand,1990,"[0,5)",44606,31419.07
4,Thailand,1991,"[0,5)",52287,33283.81
...,...,...,...,...,...
160,Thailand,2015,"[100,200)",6290,5921.24
161,Thailand,2016,"[100,200)",6352,6056.15
162,Thailand,2017,"[100,200)",7632,6799.91
163,Thailand,2018,"[100,200)",8363,7017.17


In [None]:
#ls=[]
#for i in range(len(gdf_grouped)):
#  year = cols_yr
#  km_to_shore = [list(class_distance.keys())[i]]*len(year)
#  N_ponds = [len(gdf_grouped[i][gdf_grouped[i][j]==1].index) for j in cols_ini]
#  Area_ha = [round(gdf_grouped[i][gdf_grouped[i][j] == 1]['Area'].sum()/10000, 2) for j in cols_ini]
#  df = pd.DataFrame({'year':year, 'distance(km)':km_to_shore, 'pond_number':N_ponds, 'pond_area_ha':Area_ha})

### Function

In [37]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1")
files = glob(os.path.join(os.getcwd(), '*'))
files
#countries = [i.split("/")[-1].split(".")[0].split("_")[0] for i in files]

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/KHM_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/MMR_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/MYS_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/THA_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/v1/VNM_merged.geojson']

In [38]:
output_path = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/8.plot_distance/"

In [39]:
def genFun(n): # n=len(files)

  j = 0
  while j < n:
    gdf = gpd.read_file(files[j])
    #country = files[j].split("/")[-1].split(".")[0].split("_")[0]

    # Get needed column names
    cols_ini = [i for i in gdf.columns if 'majority' in i]
    cols_yr = [i.split('_')[0] for i in cols_ini]
    # Define Distance Category
    class_distance = {"[0,5)":range(0,5000), "[5,20)":range(5000,20000), "[20,50)":range(20000,50000), "[50,100)":range(50000,100000), "[100,200)":range(100000,200000)}

    # Subset gdf by distance in predefined category
    gdf_grouped = [gdf[gdf['distance'].isin(i)] for i in class_distance.values()]

    # Column "year"
    year = cols_yr * len(gdf_grouped)
    # Column "country"
    country = list(set(gdf['NAME_0']))*len(year)

    # Column "distance class"
    distance_class = [[i]*len(cols_yr) for i in list(class_distance.keys())]
    # Column "number of ponds"
    N_ponds = [[len(i[i[j]==1].index) for j in cols_ini] for i in gdf_grouped]
    # Column "Area sum"
    Area_ha = [[round(i[i[j] == 1]['pondSize_m2'].sum()/10000, 2) for j in cols_ini] for i in gdf_grouped]

    # Unnest nested lists
    from itertools import chain
    distance_class = list(chain(*distance_class))
    N_ponds = list(chain(*N_ponds))
    Area_ha = list(chain(*Area_ha))
    
    df = pd.DataFrame({'country': country, 'year':year, 'distance(km)':distance_class, 'pond_number':N_ponds, 'pond_area(ha)':Area_ha})

    yield df.to_csv(os.path.join(output_path, country[0]+'_distance_v1.csv'), index=False)
    j += 1

gen = genFun(len(files))

In [40]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

## Accuracy Assessment

### Overall Accuracy

In [54]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/KHM_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/MMR_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/MYS_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/THA_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/VNM_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/all_deGeom.csv']

In [55]:
dfs = [pd.read_csv(i) for i in files]
dfs = [i[i['2019_majority']!=2] for i in dfs]

In [56]:
oa = [round(len(i[i['2019_majority']==1]) / len(i.index) * 100, 1) for i in dfs]
oa

[87.6, 93.0, 81.3, 91.9, 96.5, 94.4]

### Regional Accuracy

#### Flow

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

filesA = glob(os.path.join(os.getcwd(), '*dfA*', '*')) # Pond zonStats
filesB = glob(os.path.join(os.getcwd(), '*dfB*', '*1*', 'v1.1', '*')) # Admins

files = [[i for i in filesA+filesB if j in i] for j in countries]

gidLevl = 'GID_1'

In [None]:
dfA = pd.read_csv(files[0][0]) # dfA is df of pond zonStats
dfB = gpd.read_file(files[0][1]) # dfB is gdf of admin region

dfB_filtered = dfB[dfB[gidLevl].isin(dfA[gidLevl])] # Keep only admin regions containing ponds
GIDs = dfB_filtered[gidLevl].tolist()

In [None]:
# Count ponds by GID_1 (province)
dfA_cp = dfA.groupby([gidLevl], as_index=False).size()
dfA_cp = dfA_cp.rename({"size":"count_ponds_all"}, axis=1)

In [None]:
# Count Active Ponds in 2019 by GID_1
dfA_act2019 = dfA[dfA['2019_majority']==1]
dfA_cp_act2019 = dfA_act2019.groupby([gidLevl], as_index=False).size()
dfA_cp_act2019 = dfA_cp_act2019.rename({'size':'count_ponds_act2019'}, axis=1)

In [None]:
# Join two dfs of counts
dfA_join = pd.merge(dfA_cp, dfA_cp_act2019, on=gidLevl, how='left')

# Join df of pond counts to df of admin
dfB_join = pd.merge(dfB_filtered, dfA_join, on = gidLevl, how='left')

In [None]:
# Classification Accuracy for 2019
dfB_join['accuracy_2019_%'] = round(dfB_join['count_ponds_act2019'] / dfB_join['count_ponds_all'] * 100, 1)

#### Function

In [60]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

filesA = glob(os.path.join(os.getcwd(), '*dfA*', '*')) # Pond zonStats
filesB = glob(os.path.join(os.getcwd(), '*dfB*', '*1*', 'v1.1', '*')) # Admins

files = [[i for i in filesA+filesB if j in i] for j in countries]

gidLevl = 'GID_1'

outpath = ("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/9.discussion_Stats/regionalAccuracy/v1/")

In [61]:
def genFun(n): # n=len(files)
  j = 0
  while j < n:
    dfA = pd.read_csv(files[j][0]) # dfA is df of pond zonStats
    dfB = gpd.read_file(files[j][1]) # dfB is gdf of admin region

    dfB_filtered = dfB[dfB[gidLevl].isin(dfA[gidLevl])] # Keep only admin regions containing ponds
    GIDs = dfB_filtered[gidLevl].tolist()

    # Count ponds by GID_1 (province)
    dfA_cp = dfA.groupby([gidLevl], as_index=False).size()
    dfA_cp = dfA_cp.rename({"size":"count_ponds_all"}, axis=1)

    # Count Active Ponds in 2019 by GID_1
    dfA_act2019 = dfA[dfA['2019_majority']==1]
    dfA_cp_act2019 = dfA_act2019.groupby([gidLevl], as_index=False).size()
    dfA_cp_act2019 = dfA_cp_act2019.rename({'size':'count_ponds_act2019'}, axis=1)

    # Join two dfs of counts
    dfA_join = pd.merge(dfA_cp, dfA_cp_act2019, on=gidLevl, how='left')
    # Join df of pond counts to df of admin
    dfB_join = pd.merge(dfB_filtered, dfA_join, on = gidLevl, how='left')

    # Classification Accuracy for 2019
    dfB_join['accuracy_2019_%'] = round(dfB_join['count_ponds_act2019'] / dfB_join['count_ponds_all'] * 100, 1)

    yield dfB_join.to_file(os.path.join(outpath, countries[j]+'_regionAccuracy_adm1_v1.geojson'), driver='GeoJSON')
    j += 1

gen = genFun(len(files))

In [62]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

## Compare Pond Area

### Plot: classfication accuracy ~ pond size

In [71]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/")
files = glob(os.path.join(os.getcwd(), '*'))[:-1]
files
#countries = [i.split("/")[-1].split(".")[0].split("_")[0] for i in files]
#countries

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/KHM_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/MMR_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/MYS_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/THA_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/v1/VNM_degeom.csv']

In [72]:
dfs = [pd.read_csv(i) for i in files]
# Leave out no-data ponds
dfs = [i[i['2019_majority']!=2] for i in dfs]

In [73]:
df_ls = []
for i in range(len(dfs)):
  df = dfs[i]

  area_tp = df[df['2019_majority']==1]['pondSize_m2']
  area_fn = df[df['2019_majority']==0]['pondSize_m2']

  note_tp = ['tp']*len(area_tp)
  note_fn = ['fn']*len(area_fn)

  area = area_tp.append(area_fn)
  note = note_tp + note_fn
  #country = [countries[i]]*len(df.index)
  country = list(set(df['NAME_0']))*len(df.index)

  df_area = pd.DataFrame({'country':country, 'area':area, 'note':note})

  df_ls = df_ls + [df_area]

In [74]:
df_merged = pd.concat([i for i in df_ls])

In [75]:
outpath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/9.discussion_Stats/"
df_merged.to_csv(os.path.join(outpath, 'compare_area_v1.csv'), index=False)

### [temp. discarded] Map: Small Pond Density

how many small-area ponds (<900m2) are existing within unit area of land (1km2)?

#### Flow

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

#admin_level = 'admin_lev1'
gid = 'GID_1'

filesA = glob(os.path.join(os.getcwd(), '*dfA*', '*')) # Pond zonStats
filesB = glob(os.path.join(os.getcwd(), '*dfB*', '*1*', 'v1.1', "*")) # Admin level-1 \ -2

files = [[i for i in filesA+filesB if j in i] for j in countries]

In [None]:
dfA = pd.read_csv(files[1][0]) # dfA is df of pond zonStats
dfB = gpd.read_file(files[1][1]) # dfB is gdf of admin region

In [None]:
# Subset admin gdf to regions owning ponds
dfB_filtered = dfB[dfB[gid].isin(dfA[gid])]
GIDs = dfB_filtered[gid].tolist()

In [None]:
# Count small-area ponds in 2019 by GID_1
dfA_small = dfA[dfA['Area']<900]
dfA_count_small = dfA_small.groupby([gid], as_index=False).size()
dfA_count_small = dfA_count_small.rename({'size':'count_small_ponds'}, axis=1)

In [None]:
# Join df of small pond count to df of admin region
dfB_join = pd.merge(dfB_filtered, dfA_count_small, on = gid, how = 'left')

In [None]:
dfB_join['cpSmall_per_km2'] = round(dfB_join['count_small_ponds'] / (dfB_join['Area_m2'] / 10e6), 0)

#### Function

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

filesA = glob(os.path.join(os.getcwd(), '*dfA*', '*')) # Pond zonStats
filesB = glob(os.path.join(os.getcwd(), '*dfB*', '*1*', 'v1.1', '*')) # Admins

files = [[i for i in filesA+filesB if j in i] for j in countries]

gidLevl = 'GID_1'

outpath = ("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/9.discussion_Stats/smallPond_density/")

In [None]:
def genFun(n): # n=len(files)
  j = 0
  while j < n:
    dfA = pd.read_csv(files[j][0]) # dfA is df of pond zonStats
    dfB = gpd.read_file(files[j][1]) # dfB is gdf of admin region

    dfB_filtered = dfB[dfB[gidLevl].isin(dfA[gidLevl])] # Keep only admin regions containing ponds
    GIDs = dfB_filtered[gidLevl].tolist()

    # Count small-area ponds in 2019 by GID_1
    dfA_small = dfA[dfA['Area']<900]
    dfA_count_small = dfA_small.groupby([gidLevl], as_index=False).size()
    dfA_count_small = dfA_count_small.rename({'size':'count_small_ponds'}, axis=1)

    # Join df of small pond count to df of admin region
    dfB_join = pd.merge(dfB_filtered, dfA_count_small, on = gid, how = 'left')

    # Calc how many small-ponds are within 1 km2
    dfB_join['cpSmall_per_km2'] = round(dfB_join['count_small_ponds'] / (dfB_join['Area_m2'] / 10e6), 0)

    yield dfB_join.to_file(os.path.join(outpath, countries[j]+'_smallPondDensity_adm1.geojson'), driver='GeoJSON')
    j += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

## FAO Statistics

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/10.FAO_stats/")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/10.FAO_stats/aquaculture_production_quantity_1950_2020.csv']

In [None]:
df = pd.read_csv(files[0])

years = [str([i]) for i in range(1988,2020)]
# Select necessary columns
cols = [i for i in df.columns if i in years]
cols = [df.columns[0]] + cols

# Remove unneeded rows
df = df[cols][:-2]

In [None]:
countries = list(set(list(df['Country (Name)'])))
# Subset by country
dfs = [df[df['Country (Name)'] == i] for i in countries]
# Sum all productions by year
dfs_aggr = [i.sum() for i in dfs]

In [None]:
df_ls = []
for i in range(len(dfs_aggr)):

  tlw = list(dfs_aggr[i])[1:]
  years = list(range(1988, 2020))
  country = countries[i]

  df_tidy = pd.DataFrame({'country':country, 'year':years, 'tonnes_live_weight': tlw})

  # Calculate percentage difference between production of year_y and production of 2019
  tlw2019 = float(df_tidy[df_tidy['year']==2019]['tonnes_live_weight'])
  df_tidy['percDif_to_2019'] = round(df_tidy['tonnes_live_weight'] / tlw2019 * 100, 1)

  df_ls = df_ls + [df_tidy]


In [None]:
df_merged = pd.concat([i for i in df_ls])

# Rewrite country name for Vietnam
df_merged['country'] = df_merged['country'].replace({'Viet Nam':'Vietnam'})

In [None]:
df_merged.to_csv(os.path.join(os.getcwd(), 'fao_stats_processed.csv'), index=False)

## Merge Outputs

In [63]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/9.discussion_Stats/regionalAccuracy/v1/")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/9.discussion_Stats/regionalAccuracy/v1/Cambodia_regionAccuracy_adm1_v1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/9.discussion_Stats/regionalAccuracy/v1/Vietnam_regionAccuracy_adm1_v1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/9.discussion_Stats/regionalAccuracy/v1/Thailand_regionAccuracy_adm1_v1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/9.discussion_Stats/regionalAccuracy/v1/Myanmar_regionAccuracy_adm1_v1.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/9.discussion_Stats/regionalAccuracy/v1/Malaysia_regionAccuracy_adm1_v1.geojson']

In [52]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
admin_level = "*lev1*"
files = glob(os.path.join(os.getcwd(), '*dfD*', admin_level, '*10y*', '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev1/10y_Gr/Cambodia_10yGr.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev1/10y_Gr/Vietnam_10yGr.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev1/10y_Gr/Myanmar_10yGr.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev1/10y_Gr/Thailand_10yGr.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev1/10y_Gr/Malaysia_10yGr.geojson']

In [64]:
dfs = [gpd.read_file(i) for i in files]
#dfs = [pd.read_csv(i) for i in files]

In [65]:
df_concat = pd.concat([i for i in dfs])

In [66]:
df_concat.to_file(os.path.join(os.getcwd(), 'all_regionalAccuracy_adm1_v1.geojson'), driver='GeoJSON')
#df_concat.to_csv(os.path.join(os.getcwd(), 'all_deGeom.csv'), index=False)

## Hexagon Map under test

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/")
hexgrid_ini = gpd.read_file(os.path.join(os.getcwd(), 'hexGrid_KHM.geojson'))
ponds = gpd.read_file(os.path.join(os.getcwd(), 'Cambodia_merged.geojson'))

In [None]:
hexgrid_ini['hexagonID'] = range(len(hexgrid_ini))

* https://gis.stackexchange.com/questions/358735/how-to-obtain-mean-maximum-and-mininum-of-all-points-located-within-polygons-u

In [None]:
ponds_sj = gpd.sjoin(ponds, hexgrid_ini, how='left', predicate='within')

In [None]:
pondsOn_1987 = ponds_sj[ponds_sj['1987_majority']==1]

In [None]:
col_to_aggregate = [i for i in ponds_sj.columns if 'majority' in i]
new_colnames = ['Area_'+i.split('_')[0] for i in col_to_aggregate]
pondsOn = [ponds_sj[ponds_sj[i]==1] for i in col_to_aggregate]

In [None]:
d = {'a':1, 'b':2}
for key,val in d.items():
  exec(key + '=val')

In [None]:
ponds_stats = [i.groupby('hexagonID')['Area'].agg(['sum']) for i in pondsOn]
#ponds_stats = [i.rename({'sum':'Area'}, axis=1, inplace=True) for i,j in zip(ponds_stats, new_colnames)]
len(ponds_stats)

33

In [None]:
ponds_stats[3].info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 662 entries, 32.0 to 3204.0
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sum     662 non-null    float64
dtypes: float64(1)
memory usage: 10.3 KB


In [None]:
ponds_stats[32]

In [None]:

ponds_stats = pondsOn_1987.groupby('hexagonID').agg(Area_1987 = ('Area','sum'))
ponds_stats

Unnamed: 0_level_0,Area_1987
hexagonID,Unnamed: 1_level_1
32.0,7.204881e+05
33.0,1.019850e+06
34.0,8.708514e+04
35.0,5.076738e+03
78.0,4.843661e+04
...,...
2996.0,2.247707e+03
3050.0,2.838944e+03
3051.0,4.004482e+03
3101.0,6.823182e+02


## Explore: Average Rate of Increase

In [None]:
v = [10, 10, 20, 10, 2, 50]

In [None]:
# Annual Growth Rate
ar = [(v[i+1]-v[i])/v[i]*100 for i in range(len(v)-1)]
ar

[0.0, 100.0, -50.0, -80.0, 2400.0]

In [None]:
# Average Annual Growth Rate
# https://www.investopedia.com/terms/a/aagr.asp
aagr = np.mean(ar)
print("Average Annual Growth Rate is "+str(aagr)+" %")

Average Annual Growth Rate is 474.0 %


In [None]:
# Compounded Annual Growth Rate
# https://www.investopedia.com/terms/c/cagr.asp
cagr = ((v[-1]/v[0])**(1/(len(v)-1))-1)*100
print("Compounded Annual Growth Rate is "+str(round(cagr,2))+" %")

Compounded Annual Growth Rate is 37.97 %


In [None]:
# Compounded Annual Growth Rate v2?
((v[-1]-v[0])/v[0]) / (len(v)-1) * 100

80.0