## Load Packages

In [1]:
# Link to Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Connect to Earth Engine
#import ee
#ee.Authenticate()
#ee.Initialize()

In [3]:
!pip install geopandas
!pip install geojson

import os
from glob import glob
import numpy as np
import pandas as pd
import geopandas as gpd
import geojson


Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 7.7 MB/s 
[?25hCollecting fiona>=1.8
  Downloading Fiona-1.8.21-cp37-cp37m-manylinux2014_x86_64.whl (16.7 MB)
[K     |████████████████████████████████| 16.7 MB 381 kB/s 
Collecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 27.7 MB/s 
[?25hCollecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: munch, cligj, click-plugins, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.8.21 geopandas-0.10.2 munch-2.5.0 pyproj-3.2.1
Collecting geojson
  Downloading geojson-2.5.0-py2.py3-none-any.whl (14 kB)
Installing collected 

## Tidy up Datasets before Mapping/Plotting

### Group zonalStats dataframes by country

In [None]:
id_KHM = [541, 670]
id_VTM = [530,531,532,533,534,535,536,537,538,540,541]
id_THA = [655,656,657,665,666,667,668,669,670,671,672,695]
id_MYS = [651,658,659,660,662,663,664,618,619,626,627,628,629,630,631]
id_MMR = [655,693,694,695,696,697,698,699,700,701,702]

In [None]:
def filterRegion(list_id, country):
  """ Merge GeoJSONs and Subset to one country

  Parameters
  ----------
  list_id: list
    List of parcel ids
  country: string
    Country Name
  
  Returns
  ----------
  Exported GeoJSON of one country
  """
  os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/0.Initial/")
  files = [glob(os.path.join(os.getcwd(), '*'+str(i)+'*')) for i in list_id]
  files_flat = [item for sublist in files for item in sublist]
  gdfs = [gpd.read_file(i) for i in files_flat]
  gdfs_filtered = [i[i['NAME_0']==country] for i in gdfs]
  gdf = pd.concat([i for i in gdfs_filtered])

  # Data type of zonStats columns to "uint8"
  #keys = [i for i in gdf.columns if 'majority' in i]
  #values = [np.uint8]*len(keys)
  #dictCol = dict(zip(keys, values))
  #gdf = gdf.astype(dictCol)

  outPath = '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/1.by_country/'
  return gdf.to_file(os.path.join(outPath, "zonStats_"+str(country)+".geojson"), driver = 'GeoJSON')

In [None]:
# Write file
#filterRegion(id_KHM, "Cambodia")
filterRegion(id_VTM, "Vietnam")
filterRegion(id_THA, "Thailand")
filterRegion(id_MYS, "Malaysia")
filterRegion(id_MMR, "Myanmar")

In [None]:
# Draft
#os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/0.Initial/")
#files = [glob(os.path.join(os.getcwd(), '*'+str(i)+'*')) for i in id_cambodia]
#files_flat = [item for sublist in files for item in sublist]
#gdfs = [gpd.read_file(i) for i in files_flat]
#gdfs_filtered = [i[i['NAME_0']=="Cambodia"] for i in gdfs]
#gdf_KHM = pd.concat([i for i in gdfs_filtered])

#keys = [i for i in gdf_KHM.columns if 'majority' in i]
#values = [np.uint8]*len(keys)
#dictCol = dict(zip(keys, values))
#gdf_KHM = gdf_KHM.astype(dictCol)

### Tidy up using GEE
  * Calculate Pond Area, adding as Attributes
  * Transfer Geometry Type from polygon to point

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/1.by_country/")
with open('zonStats_Cambodia.geojson') as f:
  geojson_ini = geojson.load(f)

In [None]:
# Subset big GeoJSON into processible parts (2500 items/part)
geojson_tiled = [tuple(geojson_ini[i:i+2500]) for i in range(0, len(geojson_ini['features']), 2500)]
len(geojson_tiled[-1])

158

In [None]:
# Load GeoJSON as feature collection
fc = [ee.FeatureCollection(i) for i in geojson_tiled]
type(fc)

list

In [None]:
# Pond Area in square meters
fc_area = [i.map(lambda feature: feature.set({'Area': feature.area()})) for i in fc]
type(fc_area)

list

In [None]:
# Get Centroid of Pond Polygons, thus transfer geometry type from polygon to point
fc_point= [i.map(lambda feature: feature.centroid()) for i in fc_area]
type(fc_point)

list

In [None]:
# Export Feature Collection as GeoJSON
for i in range(len(fc_point)):
  task_config = {
          'description': 'zonStats_KHM_tidy',
          'fileNamePrefix': 'zonStats_KHM_tidy_'+str(i),
          'folder': '2.area_centroid',
          'fileFormat': 'GeoJSON'}

  task = ee.batch.Export.table.toDrive(fc_point[i], **task_config)
  task.start()

In [None]:
task.status()

### Tidy up using GEE: Function

In [None]:
def tidy_up(geojson_path):

  # Load GeoJSON
  with open(geojson_path) as f:
    geojson_ini = geojson.load(f)
  
  # Subset big GeoJSON into processible parts (2500 items/part)
  geojson_tiled = [tuple(geojson_ini[i:i+2500]) for i in range(0, len(geojson_ini['features']), 2500)]

  # Load GeoJSON as feature collection
  fc = [ee.FeatureCollection(i) for i in geojson_tiled]

  # Pond Area in square meters
  fc_area = [i.map(lambda feature: feature.set({'Area': feature.area()})) for i in fc]

  # Get Centroid of Pond Polygons, thus transfer geometry type from polygon to point
  fc_point= [i.map(lambda feature: feature.centroid()) for i in fc_area]

  return fc_point

In [None]:
#countries = ['Malaysia', 'Thailand', 'Vietnam', 'Myanmar']
countries = ['Myanmar']

def genFun(n): # len(countries)

  os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/1.by_country/")
  files = [glob(os.path.join(os.getcwd(), '*'+i+'*.geojson')) for i in countries]

  i = 0
  while i < n:
    input = files[i][0]
    output = tidy_up(input) # output is a list

    for j in range(len(output)):
      task_config = {
              'description': 'zonStats_tidy',
              'fileNamePrefix': 'zonStats_'+countries[i]+'_tidy_'+str(j),
              'folder': '2.area_centroid',
              'fileFormat': 'GeoJSON'}
      task = ee.batch.Export.table.toDrive(output[j], **task_config)
      yield task.start()
      
    i += 1

gen = genFun(len(countries))

In [None]:
# Exhaust Generator
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

### Merge Dataframes by Country

In [None]:
countries = ['Malaysia', 'Thailand', 'Vietnam', 'Myanmar', 'KHM']

def genFun(n):
  inPath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/2.area_centroid/"
  outPath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/"

  i = 0
  while i < n:
    files = glob(os.path.join(inPath, '*'+countries[i]+'*.geojson'))
    dfs = [gpd.read_file(i) for i in files]
    df = pd.concat([i for i in dfs])
    yield df.to_file(os.path.join(outPath, countries[i]+"_merged.geojson"))
    i += 1

gen = genFun(len(countries))

In [None]:
# Exhaust Generator
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

### De-geometry

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/")
files = glob(os.path.join(os.getcwd(), '*merged*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Malaysia_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Thailand_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Vietnam_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Myanmar_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Cambodia_merged.geojson']

In [None]:
gdf = gpd.read_file(files[0])

In [None]:
df = gdf.drop('geometry', 1)

  """Entry point for launching an IPython kernel.


In [None]:
def genFun(n): # n=len(files)
  outpath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/"
  j = 0
  while j < n:
    gdf = gpd.read_file(files[j])
    df = gdf.drop('geometry', 1)
    country = files[j].split('/')[-1].split('_')[0]
    yield df.to_csv(os.path.join(outpath, country+'_degeom.csv'), index=False)
    j += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

  
  
  
  
  


## Plot: time series of active pond numbers & pond area

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/Malaysia_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/Thailand_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/Vietnam_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/Myanmar_degeom.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.deGeom/Cambodia_degeom.csv']

**Note**: There might be ponds whose time series start in 1987, the others in 1988, so the "1987" column might be `NA` for those whose time series start in 1988.

### Flow

In [None]:
df = pd.read_csv(files[0])
# Fill NA of first column
df[[df.columns[0]]] = df[[df.columns[0]]].fillna(value=2)
# Convert first column to integer
df = df.astype({df.columns[0]:int})

In [None]:
tsCol = [i for i in df.columns if 'majority' in i]

In [None]:
active_ponds = [len(df[df[i]==1]) for i in tsCol]

In [None]:
active_area_km2 = [round(sum(df[df[i]==1]['Area'])/1e6, 2) for i in tsCol]

In [None]:
years = [i.split('_')[0] for i in tsCol]
country = [files[0].split('/')[-1].split('_')[0]]*len(years)

In [None]:
data = {'country': country, 
        'year': years,
        'active_count': active_ponds,
        'active_area_km2': active_area_km2}

df_plot = pd.DataFrame(data)

### Function

In [None]:
def genFun(n): # n=len(files)
  outpath = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/"
  j = 0
  while j < n:
    df = pd.read_csv(files[j])
    # Fill NA of first column
    df[[df.columns[0]]] = df[[df.columns[0]]].fillna(value=2)
    # Convert first column to integer
    df = df.astype({df.columns[0]:int})

    tsCol = [i for i in df.columns if 'majority' in i]
    active_ponds = [len(df[df[i]==1]) for i in tsCol]
    active_area_km2 = [round(sum(df[df[i]==1]['Area'])/1e6, 2) for i in tsCol]
    years = [i.split('_')[0] for i in tsCol]
    country = [files[j].split('/')[-1].split('_')[0]]*len(years)

    data = {'country': country, 
            'year': years,
            'active_count': active_ponds,
            'active_area_km2': active_area_km2}
    df_plot = pd.DataFrame(data)

    yield df_plot.to_csv(os.path.join(outpath, country[0]+'_plot_count_area.csv'), index=False)
    j += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

### Merge

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/Malaysia_plot_count_area.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/Thailand_plot_count_area.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/Vietnam_plot_count_area.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/Myanmar_plot_count_area.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/6.plot_count_area/Cambodia_plot_count_area.csv']

In [None]:
dfs = [pd.read_csv(i) for i in files]
df = pd.concat([i for i in dfs])

In [None]:
#df.to_csv(os.path.join(os.getcwd(), 'plot_count_area_all.csv'), index=False)

## Plot: Statistics of NA in ponds
*(Do it again using de-geom csv file after filling NA with 2 for first time stamp)*

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.geGeom/")
files = glob(os.path.join(os.getcwd(), '*merged*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Malaysia_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Thailand_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Vietnam_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Myanmar_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Cambodia_merged.geojson']

### Flow

In [None]:
df = gpd.read_file(files[0])

In [None]:
files[-1].split('/')[-1].split('_')[0]

'Cambodia'

In [None]:
# Count of Rows
len(df.index)

17658

In [None]:
col_stats = [i for i in df.columns if 'majority' in i]
years = [i.split('_')[0] for i in col_stats]
na_rate = [len(df[df[i]==2].index)/len(df.index)*100 for i in col_stats]
country = ['Cambodia']*len(years)

In [None]:
data = {'country': country, 
        'year': years,
        'na_rate': na_rate}

df_NA = pd.DataFrame(data)

### Function

In [None]:
def genFun(n): # n=len(files)
  j = 0
  #lst = []
  while j < n:
    df = gpd.read_file(files[j])
    col_stats = [i for i in df.columns if 'majority' in i]
    years = [i.split('_')[0] for i in col_stats]
    na_rate = [len(df[df[i]==2].index)/len(df.index)*100 for i in col_stats]
    country = [files[j].split('/')[-1].split('_')[0]]*len(years)
    data = {'country': country, 
            'year': years,
            'na_rate': na_rate}
    df_NA = pd.DataFrame(data)
    yield df_NA.to_csv(os.path.join("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/5.NA_Stats/", country[0]+'_naStats.csv'), index=False)
    j += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

### Explore

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.NA_Stats/")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.NA_Stats/Malaysia_naStats.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.NA_Stats/Thailand_naStats.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.NA_Stats/Vietnam_naStats.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.NA_Stats/Myanmar_naStats.csv',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.NA_Stats/Cambodia_naStats.csv']

In [None]:
dfs = [pd.read_csv(i) for i in files]
df = pd.concat([i for i in dfs])
df = df.drop('Unnamed: 0', 1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
#df.to_csv("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/4.NA_Stats/naStats_merged.csv", index=False)

## Map: Pond developments by distance to shoreline

### Flow

In [4]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/")
files = glob(os.path.join(os.getcwd(), '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Malaysia_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Thailand_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Vietnam_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Myanmar_merged.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/Cambodia_merged.geojson']

In [5]:
gdf = gpd.read_file(files[1])

In [71]:
countries = [i.split("/")[-1].split(".")[0].split("_")[0] for i in files]
countries

['Malaysia', 'Thailand', 'Vietnam', 'Myanmar', 'Cambodia']

In [20]:
cols_ini = [i for i in gdf.columns if 'majority' in i]
cols_yr = [i.split('_')[0] for i in cols_ini]

In [35]:
class_distance = {"[0,5)":range(0,5000), "[5,20)":range(5000,20000), "[20,50)":range(20000,50000), "[50,100)":range(50000,100000), "[100,200)":range(100000,200000)}

In [15]:
# Subset gdf by distance in predefined category
gdf_grouped = [gdf[gdf['distance'].isin(i)] for i in class_distance.values()]

In [68]:
# Column "year"
year = cols_yr * len(gdf_grouped)

# Column "distance class"
distance_class = [[i]*len(cols_yr) for i in list(class_distance.keys())]
# Column "number of ponds"
N_ponds = [[len(i[i[j]==1].index) for j in cols_ini] for i in gdf_grouped]
# Column "Area sum"
Area_ha = [[round(i[i[j] == 1]['Area'].sum()/10000, 2) for j in cols_ini] for i in gdf_grouped]

# Unnest nested lists
from itertools import chain
distance_class = list(chain(*distance_class))
N_ponds = list(chain(*N_ponds))
Area_ha = list(chain(*Area_ha))

In [72]:
# Column "country"
country = [countries[1]]*len(year)

In [75]:
df = pd.DataFrame({'country': country, 'year':year, 'distance(km)':distance_class, 'pond_number':N_ponds, 'pond_area(ha)':Area_ha})

In [76]:
df

Unnamed: 0,country,year,distance(km),pond_number,pond_area(ha)
0,Thailand,1987,"[0,5)",36621,28291.31
1,Thailand,1988,"[0,5)",39331,29471.68
2,Thailand,1989,"[0,5)",39735,29176.44
3,Thailand,1990,"[0,5)",44606,31419.07
4,Thailand,1991,"[0,5)",52287,33283.81
...,...,...,...,...,...
160,Thailand,2015,"[100,200)",6290,5921.24
161,Thailand,2016,"[100,200)",6352,6056.15
162,Thailand,2017,"[100,200)",7632,6799.91
163,Thailand,2018,"[100,200)",8363,7017.17


In [46]:
#ls=[]
#for i in range(len(gdf_grouped)):
#  year = cols_yr
#  km_to_shore = [list(class_distance.keys())[i]]*len(year)
#  N_ponds = [len(gdf_grouped[i][gdf_grouped[i][j]==1].index) for j in cols_ini]
#  Area_ha = [round(gdf_grouped[i][gdf_grouped[i][j] == 1]['Area'].sum()/10000, 2) for j in cols_ini]
#  df = pd.DataFrame({'year':year, 'distance(km)':km_to_shore, 'pond_number':N_ponds, 'pond_area_ha':Area_ha})

### Function

In [77]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/")
files = glob(os.path.join(os.getcwd(), '*'))
countries = [i.split("/")[-1].split(".")[0].split("_")[0] for i in files]

In [78]:
output_path = "/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/8.plot_distance/"

In [79]:
def genFun(n): # n=len(files)

  j = 0
  while j < n:
    gdf = gpd.read_file(files[j])
    country = files[j].split("/")[-1].split(".")[0].split("_")[0]

    # Get needed column names
    cols_ini = [i for i in gdf.columns if 'majority' in i]
    cols_yr = [i.split('_')[0] for i in cols_ini]
    # Define Distance Category
    class_distance = {"[0,5)":range(0,5000), "[5,20)":range(5000,20000), "[20,50)":range(20000,50000), "[50,100)":range(50000,100000), "[100,200)":range(100000,200000)}

    # Subset gdf by distance in predefined category
    gdf_grouped = [gdf[gdf['distance'].isin(i)] for i in class_distance.values()]

    # Column "year"
    year = cols_yr * len(gdf_grouped)
    # Column "country"
    country = [country]*len(year)

    # Column "distance class"
    distance_class = [[i]*len(cols_yr) for i in list(class_distance.keys())]
    # Column "number of ponds"
    N_ponds = [[len(i[i[j]==1].index) for j in cols_ini] for i in gdf_grouped]
    # Column "Area sum"
    Area_ha = [[round(i[i[j] == 1]['Area'].sum()/10000, 2) for j in cols_ini] for i in gdf_grouped]

    # Unnest nested lists
    from itertools import chain
    distance_class = list(chain(*distance_class))
    N_ponds = list(chain(*N_ponds))
    Area_ha = list(chain(*Area_ha))
    
    df = pd.DataFrame({'country': country, 'year':year, 'distance(km)':distance_class, 'pond_number':N_ponds, 'pond_area(ha)':Area_ha})

    yield df.to_csv(os.path.join(output_path, country[0]+'_distance.csv'), index=False)
    j += 1

gen = genFun(len(files))

In [80]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

## Map: aquaculture pond development by admin region

### Flows

#### 1: Total Area of Ponds by Provinces / Districts

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

filesA = glob(os.path.join(os.getcwd(), '*dfA*', '*')) # Pond zonStats
filesB = glob(os.path.join(os.getcwd(), '*dfB*', '*2*', '*')) # Admin level-1 \ -2

files = [[i for i in filesA+filesB if j in i] for j in countries]

In [None]:
dfA = pd.read_csv(files[0][0]) # dfA is df of pond zonStats
dfB = gpd.read_file(files[0][1]) # dfB is gdf of admin region

In [None]:
dfB_filtered = dfB[dfB['GID_2'].isin(dfA['GID_2'])]
GIDs = dfB_filtered['GID_2'].tolist()

colnames = [col for col in dfA.columns if 'majority' in col]

In [None]:
# Remove years with data gap > 5%
for i in range(len(colnames)):
  if len(dfA[dfA[colnames[i]]==2]) / len(dfA[colnames[i]]) > 0.05:
    dfA = dfA.drop(colnames[i], axis=1)

In [None]:
colnames_updated = [col for col in dfA.columns if 'majority' in col]
years = [i.split('_')[0] for i in colnames_updated]

In [None]:
dfB_joined = dfB_filtered
for y in years:
  dfA_1Yactive = dfA[dfA[y+'_majority']==1]

  df_area_1Y = dfA_1Yactive.groupby(['GID_2'], as_index=False)['Area'].sum()
  df_area_1Y['Area'] = round(df_area_1Y['Area']/10e4, 2)
  df_area_1Y = df_area_1Y.rename(columns={'Area':'Area_ha_'+y})

  dfB_joined = pd.merge(dfB_joined, df_area_1Y, on='GID_2', how='left')

In [None]:
colArea = [col for col in dfB_joined.columns if 'Area_ha' in col]
dfB_joined[colArea] = dfB_joined[colArea].fillna(0)

#### 2: Annual Rate of Increase in Pond Area by Provinces / Districts
$ R = \frac{A_{Y} - A_{Y-1}}{A_{Y-1}} \times 100 $

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
#countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

files = glob(os.path.join(os.getcwd(), '*dfC*', '*2*', '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_admin_zonStats_aggregated/admin_lev2/Cambodia_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_admin_zonStats_aggregated/admin_lev2/Vietnam_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_admin_zonStats_aggregated/admin_lev2/Thailand_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_admin_zonStats_aggregated/admin_lev2/Myanmar_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_admin_zonStats_aggregated/admin_lev2/Malaysia_zonStats_admin2.geojson']

In [None]:
df = gpd.read_file(files[1])

In [None]:
colnames = [col for col in df.columns if 'Area_ha' in col]
years = [i.split('_')[-1] for i in colnames]

In [None]:
# Annual Increase Rate
df_increase = df
for i in range(len(colnames)-1):
  increase_rate = round((df[colnames[i+1]] - df[colnames[i]]) / df[colnames[i]]*100, 1)
  df_increase['increase_rate(%)_'+years[i+1]] = increase_rate  

In [None]:
# Replace infinity with nan
df_increase.replace([np.inf, -np.inf], np.nan, inplace=True)

#### 3.1 Five-year Average Rate of Increase in Pond Area by Provinces / Districts:

$ avg.R = \frac{A_{T.tail} - A_{T.head}}{A_{T.head}} \times \frac{1}{T_{length}} \times 100 $

* Reference: https://sciencing.com/calculate-average-percent-change-5485263.html

* Output `None` if, for a time period, either "Area" is 0 for all years or only one year has "Area"!=0

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
#countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

files = glob(os.path.join(os.getcwd(), '*dfC*', '*2*', '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Cambodia_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Vietnam_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Thailand_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Myanmar_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Malaysia_zonStats_admin2.geojson']

In [None]:
df = gpd.read_file(files[2])

In [None]:
x = range(1985, 2020)
y = [x[n:n+6] for n in range(0, len(x), 5)]
category_yrs = [['Area_ha_'+str(i) for i in j] for j in y]

In [None]:
colnames = [col for col in df.columns if 'Area_ha' in col]
new_col = ['5y_avgGr_'+i[-1].split("_")[-1] for i in category_yrs]

period = [[i for i in j if i in colnames] for j in category_yrs]

In [None]:
def my_function(row):
  """For one row (admin region in the case), For every time period, 
  calculate the growth rate between the last value and the first non-zero value.
  
  This function is to be used with apply() function for on a DF"""

  new_attr = []

  for i in range(len(period)):
    cols = list(row[period[i]]) 
    # Select columns of a time period
    nzero_index = next((i for i, x in enumerate(cols) if x != 0), None) 
    # Find the index of the first non-zero element
  
    if nzero_index != None and len(cols[nzero_index:]) > 1:
      avgIn = (cols[-1] - cols[nzero_index]) / cols[nzero_index] / (len(cols[nzero_index:])-1) *100
      avgIn = round(avgIn, 1)
    else: 
      avgIn = None

    new_attr = new_attr + [avgIn]
  return new_attr

In [None]:
new_attrs_byrow = df.apply(my_function, axis=1) # axis=1 determines row-wise operation
# Each item are attributes for one row

In [None]:
new_attrs_bycol = [[i[j] for i in new_attrs_byrow] for j in range(len(new_attrs_byrow[0]))]
# Each item are the attributes for one column

In [None]:
df[new_col] = pd.Series(new_attrs_bycol)

In [None]:
df.head()

Unnamed: 0,OBJECTID,GID_0,NAME_0,GID_1,NAME_1,ENGTYPE_1,GID_2,NAME_2,ENGTYPE_2,Shape_Leng,...,Area_ha_2018,Area_ha_2019,geometry,5y_avgGr_1990,5y_avgGr_1995,5y_avgGr_2000,5y_avgGr_2005,5y_avgGr_2010,5y_avgGr_2015,5y_avgGr_2019
0,603,THA,Thailand,THA.13_1,Chumphon,Province,THA.13.1_1,Lamae,District,1.010119,...,4.46,4.7,"MULTIPOLYGON (((99.13953 9.82591, 99.13917 9.8...",7.8,91.9,7.9,2.0,6.1,-0.2,3.7
1,604,THA,Thailand,THA.13_1,Chumphon,Province,THA.13.2_1,Lang Suan,District,1.424844,...,8.21,8.64,"MULTIPOLYGON (((99.18861 10.05694, 99.18916 10...",-2.4,0.1,23.9,-4.0,4.0,0.2,9.0
2,605,THA,Thailand,THA.13_1,Chumphon,Province,THA.13.3_1,Muang Chumphon,District,1.878261,...,45.82,46.23,"MULTIPOLYGON (((99.30019 10.32064, 99.30055 10...",30.3,9.7,4.9,-2.4,3.1,1.0,1.8
3,606,THA,Thailand,THA.13_1,Chumphon,Province,THA.13.5_1,Phato,District,1.437504,...,0.0,0.0,"MULTIPOLYGON (((98.88230 10.00423, 98.88304 10...",,,,,,,
4,607,THA,Thailand,THA.13_1,Chumphon,Province,THA.13.6_1,Sawi,District,1.750744,...,14.29,14.46,"MULTIPOLYGON (((99.24962 10.19962, 99.24889 10...",37.2,33.4,6.0,1.1,1.2,0.1,2.3


#### 3.2 Average Annual Growth Rate
* $ avg.R = \frac{R_{T.head}+...+R_{T.tail}}{T_{length}} $
* Reference: https://www.investopedia.com/terms/a/aagr.asp

In [None]:
# Source: https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

years_category_5y = list(chunks(range(1986, 2020), 5))
years_category_10y = list(chunks(range(1981, 2020), 10))
colnames_category_5y = [['increase_rate(%)_'+str(i) for i in j] for j in years_category_5y]
colnames_category_10y = [['increase_rate(%)_'+str(i) for i in j] for j in years_category_10y]
#colnames_category

In [None]:
colnames_2 = [col for col in df_increase.columns if 'increase' in col]
colnames_5y_chunked = [[i for i in j if i in colnames_2] for j in colnames_category_5y]
colnames_10y_chunked = [[i for i in j if i in colnames_2] for j in colnames_category_10y]

In [None]:
df_avgIncrease = df_increase

for i in range(len(colnames_5y_chunked)):
  new_colname = '5YavgIncrease(%)_period_'+str(years_category_5y[i][-1])
  df_avgIncrease[new_colname] = round(df_avgIncrease[colnames_5y_chunked[i]].mean(axis=1), 1)

for i in range(len(colnames_10y_chunked)):
  new_colname = '10YavgIncrease(%)_period_' + str(years_category_10y[i][-1])
  df_avgIncrease[new_colname] = round(df_avgIncrease[colnames_10y_chunked[i]].mean(axis=1), 1)

#### 4. Ten-year Average Rate of Increase in Pond Area by Provinces / Districts

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
#countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

files = glob(os.path.join(os.getcwd(), '*dfC*', '*2*', '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Cambodia_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Vietnam_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Thailand_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Myanmar_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Malaysia_zonStats_admin2.geojson']

In [None]:
df = gpd.read_file(files[1])

In [None]:
x = range(1985, 2020)
x = [str(i) for i in x]

from more_itertools import split_after
y = list(split_after(x, lambda x: x.endswith('0')))
for i in range(1, len(y)):
  y[i] = [y[i-1][-1]] + y[i]

category_yrs = [['Area_ha_'+str(i) for i in j] for j in y]

In [None]:
colnames = [col for col in df.columns if 'Area_ha' in col]
new_col = ['10y_avgGr_'+i[-1].split("_")[-1] for i in category_yrs]

period = [[i for i in j if i in colnames] for j in category_yrs]

In [None]:
def my_function(row):
  """For one row (admin region in the case), For every time period, 
  calculate the growth rate between the last value and the first non-zero value.
  
  This function is to be used with apply() function for on a DF"""

  new_attr = []

  for i in range(len(period)):
    cols = list(row[period[i]]) 
    # Select columns of a time period
    nzero_index = next((i for i, x in enumerate(cols) if x != 0), None) 
    # Find the index of the first non-zero element
  
    if nzero_index != None and len(cols[nzero_index:]) > 1:
      avgIn = (cols[-1] - cols[nzero_index]) / cols[nzero_index] / (len(cols[nzero_index:])-1) *100
      avgIn = round(avgIn, 1)
    else: 
      avgIn = None

    new_attr = new_attr + [avgIn]
  return new_attr

In [None]:
new_attrs_byrow = df.apply(my_function, axis=1) # axis=1 determines row-wise operation
# Each item are attributes for one row

In [None]:
new_attrs_bycol = [[i[j] for i in new_attrs_byrow] for j in range(len(new_attrs_byrow[0]))]
# Each item are the attributes for one column

In [None]:
df[new_col] = pd.Series(new_attrs_bycol)

In [None]:
df.head()

Unnamed: 0,OBJECTID,GID_0,NAME_0,GID_1,NAME_1,ENGTYPE_1,GID_2,NAME_2,ENGTYPE_2,Shape_Leng,...,Area_ha_2015,Area_ha_2016,Area_ha_2017,Area_ha_2018,Area_ha_2019,geometry,10y_avgGr_1990,10y_avgGr_2000,10y_avgGr_2010,10y_avgGr_2019
0,4033,VNM,Vietnam,VNM.1_1,An Giang,Province,VNM.1.1_1,An PhÃº,District,0.703803,...,11.18,11.19,11.26,11.35,11.44,"MULTIPOLYGON (((105.13100 10.92130, 105.13191 ...",-0.1,-0.5,0.2,0.4
1,4034,VNM,Vietnam,VNM.1_1,An Giang,Province,VNM.1.10_1,Thoáº¡i SÆ¡n,District,0.990901,...,5.08,4.3,6.15,6.06,7.27,"MULTIPOLYGON (((105.12011 10.37594, 105.12595 ...",-4.8,15.1,-4.1,8.5
2,4035,VNM,Vietnam,VNM.1_1,An Giang,Province,VNM.1.11_1,Tri TÃ´n,District,1.054701,...,1.49,3.83,4.38,4.38,4.6,"MULTIPOLYGON (((105.11389 10.43545, 105.11304 ...",90.9,10.4,-7.9,148.6
3,4036,VNM,Vietnam,VNM.1_1,An Giang,Province,VNM.1.2_1,Chá»£ Má»›i,District,0.838008,...,10.32,10.45,10.5,10.85,11.18,"MULTIPOLYGON (((105.55319 10.51669, 105.55775 ...",0.5,3.3,-0.2,2.4
4,4037,VNM,Vietnam,VNM.1_1,An Giang,Province,VNM.1.3_1,ChÃ¢u Äá»‘c,City,0.454041,...,0.97,0.96,0.97,0.98,0.99,"MULTIPOLYGON (((105.12457 10.70979, 105.12977 ...",-13.0,0.6,-3.5,7.2


### Functions

#### 1. Total Area of Ponds in Provinces / Districts

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
countries = ['Cambodia', 'Vietnam', 'Thailand', 'Myanmar', 'Malaysia']

filesA = glob(os.path.join(os.getcwd(), '*dfA*', '*')) # Pond zonStats
filesB = glob(os.path.join(os.getcwd(), '*dfB*', '*2*', '*')) # Admin level-1 \ -2

files = [[i for i in filesA+filesB if j in i] for j in countries]

gidLevl = 'GID_2'

In [None]:
def genFun(n): # n=len(files)
  j = 0
  while j < n:
    dfA = pd.read_csv(files[j][0]) # dfA is df of pond zonStats
    dfB = gpd.read_file(files[j][1]) # dfB is gdf of admin region

    dfB_filtered = dfB[dfB[gidLevl].isin(dfA[gidLevl])] # Keep only admin regions containing ponds
    GIDs = dfB_filtered[gidLevl].tolist()

    colnames = [col for col in dfA.columns if 'majority' in col]
    
    # Remove years with data gap > 5%
    for i in range(len(colnames)):
      if len(dfA[dfA[colnames[i]]==2]) / len(dfA[colnames[i]]) > 0.05:
        dfA = dfA.drop(colnames[i], axis=1)

    colnames_updated = [col for col in dfA.columns if 'majority' in col]
    years = [i.split('_')[0] for i in colnames_updated]

    dfB_joined = dfB_filtered
    for y in years:
      dfA_1Yactive = dfA[dfA[y+'_majority']==1] # For year Y, select ponds that are active 

      df_area_1Y = dfA_1Yactive.groupby([gidLevl], as_index=False)['Area'].sum() # Group active ponds by GID and calculate sum of pond area for each GID
      df_area_1Y['Area'] = round(df_area_1Y['Area']/10e4, 2) # Transfer pond area from m2 to ha
      df_area_1Y = df_area_1Y.rename(columns={'Area':'Area_ha_'+y}) # Rename column

      dfB_joined = pd.merge(dfB_joined, df_area_1Y, on=gidLevl, how='left') # Join "pond area by GID" of year Y into admin df

      colArea = [col for col in dfB_joined.columns if 'Area_ha' in col]   
      dfB_joined[colArea] = dfB_joined[colArea].fillna(0)
      # NA filled with 0

    yield dfB_joined.to_file(os.path.join(os.getcwd(), 'dfC_admin_zonStats_aggregated', 'admin_lev2', countries[j]+'_zonStats_admin2.geojson'), driver='GeoJSON')
    j += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

#### 2. Annual Growth Rate of Pond Area by Provinces / Districts

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
admin_level = "admin_lev2"

files = glob(os.path.join(os.getcwd(), '*dfC*', admin_level, '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Cambodia_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Vietnam_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Thailand_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Myanmar_zonStats_admin2.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfC_pondArea_byRegion/admin_lev2/Malaysia_zonStats_admin2.geojson']

In [None]:
countries = [i.split('/')[-1].split('_')[0] for i in files]

In [None]:
def genFun(n): # n=len(files)
  j = 0
  while j < n:
    df = gpd.read_file(files[j])

    # Annual Increase Rate
    colnames = [col for col in df.columns if 'Area_ha' in col]
    years = [i.split('_')[-1] for i in colnames]

    df_increase = df
    for i in range(len(colnames)-1):
      increase_rate = round((df[colnames[i+1]] - df[colnames[i]]) / df[colnames[i]]*100, 1)
      df_increase['1yGr_'+years[i+1]] = increase_rate
    # Replace infinity with nan
    df_increase.replace([np.inf, -np.inf], np.nan, inplace=True)

    yield df_increase.to_file(os.path.join(os.getcwd(), 'dfD_GrR_byRegion', admin_level, "1y_Gr", countries[j]+'_1yGr.geojson'), driver='GeoJSON')
    j += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

#### 3. Five-year avg. Rate of Increase in Pond Area by Provinces / Districts

$ \frac{A_{T.tail} - A_{T.head}}{A_{T.head}} \times \frac{1}{T} \times 100 $

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")

admin_level = 'admin_lev1'
files = glob(os.path.join(os.getcwd(), '*dfC*', admin_level, '*'))

countries = [i.split('/')[-1].split('_')[0] for i in files]

In [None]:
output_path = os.path.join(os.getcwd(), 'dfD_GrR_byRegion', admin_level)

In [None]:
def genFun(n): # n=len(files)

  j = 0
  while j < n:
    df = gpd.read_file(files[j])

    x = range(1985, 2020)
    y = [x[n:n+6] for n in range(0, len(x), 5)]
    category_yrs = [['Area_ha_'+str(i) for i in j] for j in y]

    colnames = [col for col in df.columns if 'Area_ha' in col]
    new_col = ['5y_avgGr_'+i[-1].split("_")[-1] for i in category_yrs]

    period = [[i for i in j if i in colnames] for j in category_yrs]

    #---------------------------------------------------
    def my_function(row):
      """For one row (admin region in the case), For every time period, 
      calculate the growth rate between the last value and the first non-zero value.
      
      This function is to be used with apply() function for on a DF"""

      new_attr = []

      for i in range(len(period)):
        cols = list(row[period[i]]) 
        # Select columns of a time period
        nzero_index = next((i for i, x in enumerate(cols) if x != 0), None) 
        # Find the index of the first non-zero element
      
        if nzero_index != None and len(cols[nzero_index:]) > 1:
          avgIn = (cols[-1] - cols[nzero_index]) / cols[nzero_index] / (len(cols[nzero_index:])-1) *100
          avgIn = round(avgIn, 1)
        else: 
          avgIn = None

        new_attr = new_attr + [avgIn]
      return new_attr
    #---------------------------------------------------

    new_attrs_byrow = df.apply(my_function, axis=1)

    new_attrs_bycol = [[i[j] for i in new_attrs_byrow] for j in range(len(new_attrs_byrow[0]))]

    df[new_col] = pd.Series(new_attrs_bycol)

    yield df.to_file(os.path.join(output_path, countries[j]+'_5yGr.geojson'), driver='GeoJSON')
    j += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

#### 4. Ten-year avg. Rate of Increase in Pond Area by Provinces / Districts

$ \frac{A_{T.tail} - A_{T.head}}{A_{T.head}} \times \frac{1}{T} \times 100 $

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")

admin_level = 'admin_lev1'
files = glob(os.path.join(os.getcwd(), '*dfC*', admin_level, '*'))

countries = [i.split('/')[-1].split('_')[0] for i in files]

In [None]:
output_path = os.path.join(os.getcwd(), 'dfD_GrR_byRegion', admin_level)

In [None]:
def genFun(n): # n=len(files)

  j = 0
  while j < n:
    df = gpd.read_file(files[j])

    x = range(1985, 2020)
    x = [str(i) for i in x]

    from more_itertools import split_after
    y = list(split_after(x, lambda x: x.endswith('0')))
    for i in range(1, len(y)):
      y[i] = [y[i-1][-1]] + y[i]

    category_yrs = [['Area_ha_'+str(i) for i in j] for j in y]

    colnames = [col for col in df.columns if 'Area_ha' in col]
    new_col = ['10y_avgGr_'+i[-1].split("_")[-1] for i in category_yrs]

    period = [[i for i in j if i in colnames] for j in category_yrs]

    #---------------------------------------------------
    def my_function(row):
      """For one row (admin region in the case), For every time period, 
      calculate the growth rate between the last value and the first non-zero value.
      
      This function is to be used with apply() function for on a DF"""

      new_attr = []

      for i in range(len(period)):
        cols = list(row[period[i]]) 
        # Select columns of a time period
        nzero_index = next((i for i, x in enumerate(cols) if x != 0), None) 
        # Find the index of the first non-zero element
      
        if nzero_index != None and len(cols[nzero_index:]) > 1:
          avgIn = (cols[-1] - cols[nzero_index]) / cols[nzero_index] / (len(cols[nzero_index:])-1) *100
          avgIn = round(avgIn, 1)
        else: 
          avgIn = None

        new_attr = new_attr + [avgIn]
      return new_attr
    #---------------------------------------------------

    new_attrs_byrow = df.apply(my_function, axis=1)

    new_attrs_bycol = [[i[j] for i in new_attrs_byrow] for j in range(len(new_attrs_byrow[0]))]

    df[new_col] = pd.Series(new_attrs_bycol)

    yield df.to_file(os.path.join(output_path, countries[j]+'_10yGr.geojson'), driver='GeoJSON')
    j += 1

gen = genFun(len(files))

In [None]:
# Exhaust Generator
# Reference: https://stackoverflow.com/questions/47456631/simpler-way-to-run-a-generator-function-without-caring-about-items
from collections import deque

def exhaust(generator):
    deque(generator, maxlen=0)

exhaust(gen)

### Merge Outputs

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
admin_level = "admin_lev2"
files = glob(os.path.join(os.getcwd(), '*dfD*', admin_level, '*1y*', '*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev2/1y_Gr/Cambodia_1yGr.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev2/1y_Gr/Vietnam_1yGr.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev2/1y_Gr/Thailand_1yGr.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev2/1y_Gr/Myanmar_1yGr.geojson',
 '/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev2/1y_Gr/Malaysia_1yGr.geojson']

In [None]:
dfs = [gpd.read_file(i) for i in files]

In [None]:
df_concat = pd.concat([i for i in dfs])

In [None]:
df_concat.to_file(os.path.join(os.getcwd(), 'dfD_GrR_byRegion', admin_level, 'all_1yGr_lev2.geojson'), driver='GeoJSON')

## Plot: Violin + Boxplot of Growth Rate

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/")
files = glob(os.path.join(os.getcwd(), "*dfD*", '*lev2*', '*1y*', '*all*'))
files

['/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/7.map_development/dfD_GrR_byRegion/admin_lev2/1y_Gr/all_1yGr_lev2.geojson']

In [None]:
gdf = gpd.read_file(files[0])

In [None]:
colnames = [i for i in gdf.columns if '1y' in i]

In [None]:
ls = []
for i in range(len(colnames)):
  df_new = gdf[['GID_0', 'NAME_0', 'GID_1', 'NAME_1', 'GID_2', 'NAME_2']]
  df_new['stats_item'] = [colnames[i]]*len(gdf.index)
  df_new['year'] = [i.split('_')[-1] for i in df_new['stats_item']]
  df_new['stats_value'] = gdf[colnames[i]]
  ls.append(df_new)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

In [None]:
df_reshaped = pd.concat([i for i in ls])

In [None]:
print(gdf.columns)
print(df_reshaped.columns)

Index(['OBJECTID', 'GID_0', 'NAME_0', 'GID_1', 'NAME_1', 'ENGTYPE_1', 'GID_2',
       'NAME_2', 'ENGTYPE_2', 'Shape_Leng', 'Shape_Area', 'Area_ha_1988',
       'Area_ha_1989', 'Area_ha_1990', 'Area_ha_1991', 'Area_ha_1992',
       'Area_ha_1993', 'Area_ha_1994', 'Area_ha_1995', 'Area_ha_1996',
       'Area_ha_1997', 'Area_ha_1998', 'Area_ha_1999', 'Area_ha_2000',
       'Area_ha_2001', 'Area_ha_2002', 'Area_ha_2003', 'Area_ha_2004',
       'Area_ha_2005', 'Area_ha_2006', 'Area_ha_2007', 'Area_ha_2008',
       'Area_ha_2009', 'Area_ha_2010', 'Area_ha_2011', 'Area_ha_2012',
       'Area_ha_2013', 'Area_ha_2014', 'Area_ha_2015', 'Area_ha_2016',
       'Area_ha_2017', 'Area_ha_2018', 'Area_ha_2019', '1yGr_1989',
       '1yGr_1990', '1yGr_1991', '1yGr_1992', '1yGr_1993', '1yGr_1994',
       '1yGr_1995', '1yGr_1996', '1yGr_1997', '1yGr_1998', '1yGr_1999',
       '1yGr_2000', '1yGr_2001', '1yGr_2002', '1yGr_2003', '1yGr_2004',
       '1yGr_2005', '1yGr_2006', '1yGr_2007', '1yGr_2008', '1yGr_2

In [None]:
df_reshaped.to_csv(os.path.join(os.getcwd(), 'dfE_tidy_for_plot', 'lev2_all_1yGr.csv'), index=False)

## Hexagon Map under test

In [None]:
os.chdir("/content/drive/MyDrive/THESIS_AQUAPONDS/gdf_zonstats/3.merged_by_country/")
hexgrid_ini = gpd.read_file(os.path.join(os.getcwd(), 'hexGrid_KHM.geojson'))
ponds = gpd.read_file(os.path.join(os.getcwd(), 'Cambodia_merged.geojson'))

In [None]:
hexgrid_ini['hexagonID'] = range(len(hexgrid_ini))

* https://gis.stackexchange.com/questions/358735/how-to-obtain-mean-maximum-and-mininum-of-all-points-located-within-polygons-u

In [None]:
ponds_sj = gpd.sjoin(ponds, hexgrid_ini, how='left', predicate='within')

In [None]:
pondsOn_1987 = ponds_sj[ponds_sj['1987_majority']==1]

In [None]:
col_to_aggregate = [i for i in ponds_sj.columns if 'majority' in i]
new_colnames = ['Area_'+i.split('_')[0] for i in col_to_aggregate]
pondsOn = [ponds_sj[ponds_sj[i]==1] for i in col_to_aggregate]

In [None]:
d = {'a':1, 'b':2}
for key,val in d.items():
  exec(key + '=val')

In [None]:
ponds_stats = [i.groupby('hexagonID')['Area'].agg(['sum']) for i in pondsOn]
#ponds_stats = [i.rename({'sum':'Area'}, axis=1, inplace=True) for i,j in zip(ponds_stats, new_colnames)]
len(ponds_stats)

33

In [None]:
ponds_stats[3].info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 662 entries, 32.0 to 3204.0
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sum     662 non-null    float64
dtypes: float64(1)
memory usage: 10.3 KB


In [None]:
ponds_stats[32]

In [None]:

ponds_stats = pondsOn_1987.groupby('hexagonID').agg(Area_1987 = ('Area','sum'))
ponds_stats

Unnamed: 0_level_0,Area_1987
hexagonID,Unnamed: 1_level_1
32.0,7.204881e+05
33.0,1.019850e+06
34.0,8.708514e+04
35.0,5.076738e+03
78.0,4.843661e+04
...,...
2996.0,2.247707e+03
3050.0,2.838944e+03
3051.0,4.004482e+03
3101.0,6.823182e+02


## Explore: Average Rate of Increase

In [None]:
v = [10, 10, 20, 10, 2, 50]

In [None]:
# Annual Growth Rate
ar = [(v[i+1]-v[i])/v[i]*100 for i in range(len(v)-1)]
ar

[0.0, 100.0, -50.0, -80.0, 2400.0]

In [None]:
# Average Annual Growth Rate
# https://www.investopedia.com/terms/a/aagr.asp
aagr = np.mean(ar)
print("Average Annual Growth Rate is "+str(aagr)+" %")

Average Annual Growth Rate is 474.0 %


In [None]:
# Compounded Annual Growth Rate
# https://www.investopedia.com/terms/c/cagr.asp
cagr = ((v[-1]/v[0])**(1/(len(v)-1))-1)*100
print("Compounded Annual Growth Rate is "+str(round(cagr,2))+" %")

Compounded Annual Growth Rate is 37.97 %


In [None]:
# Compounded Annual Growth Rate v2?
((v[-1]-v[0])/v[0]) / (len(v)-1) * 100

80.0