# Lago de Chapala analysis

In [1]:
from pystac_client import Client
from pystac.extensions.eo import EOExtension as eo
import planetary_computer as pc
import geopandas as gpd
import h3
import pandas as pd

import rasterio
from rasterio import windows
from rasterio import features
from rasterio import warp
import rasterio.mask
from rasterio.enums import Resampling
from rasterio.merge import merge

import numpy as np
from PIL import Image

import matplotlib.pyplot as plt

from shapely.geometry import Point

from tqdm import tqdm

import os
import sys
module_path = os.path.abspath(os.path.join('../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)
    import aup

  ox.config(


# Notebook analysis

In [2]:
# Analysis name
index_analysis = 'Chapala_ndwi'

# Analysed raster data from local directory:
tmp_dir = f'../../../data/processed/{index_analysis}/'

In [15]:
#Define the variable 
save_diskspace=True

#Load data CSV
city = 'Chapala'
r = 10
chapala_complete_dataset = pd.read_csv(tmp_dir+f'{index_analysis}_HexRes{r}_v0.csv')

#Show
print(chapala_complete_dataset.shape)

#Load data Geojson
city = 'Chapala'
r = 10
chapala_gdf = gpd.read_file(tmp_dir + f'{index_analysis}_HexRes{r}_v0.geojson')

#Show
print(chapala_complete_dataset.shape)
print(chapala_gdf.shape)
#------------------------------------------------------------------------------------------------------------

#Gdfs to concatenate
complete_gdfs = [chapala_complete_dataset]

#Concatenate
complete_dataset = pd.concat(complete_gdfs)

#Save space in disk deleting:
if save_diskspace:
    del chapala_complete_dataset

#Format
complete_dataset = complete_dataset.drop(columns=['Unnamed: 0'])
    
#Show
print(complete_dataset.shape)
complete_dataset.head(2)

(633276, 7)
(633276, 7)
(70364, 11)
(633276, 6)


Unnamed: 0,hex_id,res,ndwi,month,year,city
0,8a49aa226d17fff,10,0.250447,1,2024,Chapala
1,8a49aa751917fff,10,-0.250447,1,2024,Chapala


In [16]:
gdf1 = chapala_gdf.copy()[['hex_id','res','city','geometry']]
df2 = complete_dataset.copy()[['hex_id','ndwi','month', 'year']]

complete_dataset_dams = pd.merge(gdf1, df2, how='inner', on='hex_id')

#Save space in disk deleting:
if save_diskspace:
    del gdf1
    del df2
    del complete_dataset

#Show
print(complete_dataset_dams.shape)
complete_dataset_dams.head(2)

(633276, 7)


Unnamed: 0,hex_id,res,city,geometry,ndwi,month,year
0,8a49aa226d17fff,10,Chapala,"POLYGON ((-103.34413 20.27176, -103.34348 20.2...",0.250447,1,2024
1,8a49aa226d17fff,10,Chapala,"POLYGON ((-103.34413 20.27176, -103.34348 20.2...",0.231652,2,2024


# Create Dataframe


In [17]:
#Creates datetime data by adding month (string) + / + year (string)
complete_dataset_dams['date'] = pd.to_datetime(complete_dataset_dams['month'].astype(str) + '/' + complete_dataset_dams['year'].astype(str))

#Strips the day, leaving month and year
complete_dataset_dams['date'] = complete_dataset_dams['date'].apply(lambda x: x.strftime('%Y-%m'))

#Show
print(complete_dataset_dams.shape)
complete_dataset_dams.head(2)

  complete_dataset_dams['date'] = pd.to_datetime(complete_dataset_dams['month'].astype(str) + '/' + complete_dataset_dams['year'].astype(str))


(633276, 8)


Unnamed: 0,hex_id,res,city,geometry,ndwi,month,year,date
0,8a49aa226d17fff,10,Chapala,"POLYGON ((-103.34413 20.27176, -103.34348 20.2...",0.250447,1,2024,2024-01
1,8a49aa226d17fff,10,Chapala,"POLYGON ((-103.34413 20.27176, -103.34348 20.2...",0.231652,2,2024,2024-02


# Create Categories

In [18]:
# Creating vegetation categories
complete_dataset_dams['ndwi_category'] = np.nan

complete_dataset_dams.loc[complete_dataset_dams.ndwi>=0.2 , 'ndwi_category'] = 'Water surface'        
complete_dataset_dams.loc[(complete_dataset_dams.ndwi>=0.0)&
             (complete_dataset_dams.ndwi<0.2), 'ndwi_category'] = 'Flooding, humidity'
complete_dataset_dams.loc[(complete_dataset_dams.ndwi>=-0.3)&
             (complete_dataset_dams.ndwi<0.0), 'ndwi_category'] = 'Moderate drought, non-aqueous surfaces'
complete_dataset_dams.loc[(complete_dataset_dams.ndwi<-0.3), 'ndwi_category'] = 'Drought, non-aqueous surfaces'

# Ordering data
categories = ['Drought, non-aqueous surfaces', 'Moderate drought, non-aqueous surfaces', 'Flooding, humidity', 'Water surface']
complete_dataset_dams['ndwi_category'] = pd.Categorical(complete_dataset_dams['ndwi_category'], categories=categories, ordered=True)

#Show
print(complete_dataset_dams.shape)
complete_dataset_dams.head(2)

(633276, 9)


  complete_dataset_dams.loc[complete_dataset_dams.ndwi>=0.2 , 'ndwi_category'] = 'Water surface'


Unnamed: 0,hex_id,res,city,geometry,ndwi,month,year,date,ndwi_category
0,8a49aa226d17fff,10,Chapala,"POLYGON ((-103.34413 20.27176, -103.34348 20.2...",0.250447,1,2024,2024-01,Water surface
1,8a49aa226d17fff,10,Chapala,"POLYGON ((-103.34413 20.27176, -103.34348 20.2...",0.231652,2,2024,2024-02,Water surface


# Create Hex with and without water

In [19]:
#Select surfaces with water
water_list = ['Flooding, humidity', 'Water surface']

#Differenciate
complete_dataset_dams.loc[:,'hex_water'] = complete_dataset_dams['ndwi_category'].apply(lambda x: 1 if x in water_list else 0)

#Show
print(complete_dataset_dams.shape)
complete_dataset_dams.head(2)

(633276, 10)


Unnamed: 0,hex_id,res,city,geometry,ndwi,month,year,date,ndwi_category,hex_water
0,8a49aa226d17fff,10,Chapala,"POLYGON ((-103.34413 20.27176, -103.34348 20.2...",0.250447,1,2024,2024-01,Water surface,1
1,8a49aa226d17fff,10,Chapala,"POLYGON ((-103.34413 20.27176, -103.34348 20.2...",0.231652,2,2024,2024-02,Water surface,1


# Create Data Summary

In [21]:
#Calculate average hex area according to resolution and location
available_dates = len(complete_dataset_dams.date.unique())
data_rows = available_dates

#Empty historical behaviour dataset
behaviour_bydate = pd.DataFrame()

#Summarize data
#Loop for each city (dam)
for city in complete_dataset_dams.city.unique():

    #Create empty temporary dataFrame
    cols = ['date', 'mean_ndwi']
    temp_df = pd.DataFrame(columns=cols, index=range(data_rows)) # rows = available dates * available categories
    temp_df['city'] = city 

    #Loop for each row (date)
    count = 0

    #PRIMERA SUBDIVISIÓN: DATASET DE LA CIUDAD (dam)
    complete_bycity = complete_dataset_dams.loc[(complete_dataset_dams.city == city)]

    #Loop for each date in city (Dam)
    for date in complete_bycity.date.unique():
        #SEGUNDA SUBDIVISIÓN: DATASET DE LA CIUDAD (dam) Y LA FECHA.
        complete_bydate = complete_bycity.loc[(complete_bycity.date == date)]
        
        #Find data
        ndwi_value = complete_bydate.ndwi.mean()
        #Register data in DataFrame
        temp_df.loc[count,'date'] = date
        temp_df.loc[count,'mean_ndwi'] = ndwi_value

        #Loop for each raster category in the date in the city (dam)
        categories = ['Drought, non-aqueous surfaces', 'Moderate drought, non-aqueous surfaces', 'Flooding, humidity', 'Water surface']
        
        for cat in categories:
            #TERCERA SUBDIVISIÓN: DATASET DE LA CIUDAD, LA FECHA Y LA CATEGORÍA.
            complete_bydate_cat = complete_bydate.loc[(complete_bydate.ndwi_category == cat)]

            #Find data
            hexcount = complete_bydate_cat.shape[0]
            #Register data
            temp_df.loc[count, cat] = hexcount

        #Next row (date) in city (dam) dataFrame
        count = count + 1

    #When all dates of city are over, concat and re-start count and data for following city (dam)
    behaviour_bydate = pd.concat([behaviour_bydate,temp_df])
    
    #Save space in disk deleting:
    if save_diskspace:
        del complete_bycity
        del complete_bydate
        del temp_df

#Save date data
behaviour_bydate['month'] = behaviour_bydate['date'].apply(lambda x: pd.Timestamp(x).month)
behaviour_bydate['year'] = behaviour_bydate['date'].apply(lambda x: pd.Timestamp(x).year)

#Final format
behaviour_bydate = behaviour_bydate[['date','year','month','mean_ndwi',
                                     'Drought, non-aqueous surfaces', 'Moderate drought, non-aqueous surfaces', 
                                     'Flooding, humidity', 'Water surface','city']]

#Show
print(behaviour_bydate.shape)
behaviour_bydate.head(9)

(9, 9)


Unnamed: 0,date,year,month,mean_ndwi,"Drought, non-aqueous surfaces","Moderate drought, non-aqueous surfaces","Flooding, humidity",Water surface,city
0,2024-01,2024,1,0.130227,3541.0,8779.0,13349.0,44695.0,Chapala
1,2024-02,2024,2,0.128092,2562.0,9848.0,15038.0,42916.0,Chapala
2,2024-03,2024,3,0.110746,3273.0,9351.0,20050.0,37690.0,Chapala
3,2024-04,2024,4,0.071346,2725.0,11218.0,54370.0,2051.0,Chapala
4,2024-05,2024,5,0.067982,2865.0,15259.0,33790.0,18450.0,Chapala
5,2024-06,2024,6,0.032996,2687.0,15286.0,45319.0,7072.0,Chapala
6,2024-07,2024,7,0.066342,5179.0,10966.0,36246.0,17973.0,Chapala
7,2024-08,2024,8,0.028972,10782.0,8707.0,44250.0,6625.0,Chapala
8,2024-09,2024,9,0.053484,7760.0,11195.0,39224.0,12185.0,Chapala


In [22]:
#sum 
behaviour_bydate['suma agua']=behaviour_bydate['Flooding, humidity']+behaviour_bydate['Water surface']
behaviour_bydate

Unnamed: 0,date,year,month,mean_ndwi,"Drought, non-aqueous surfaces","Moderate drought, non-aqueous surfaces","Flooding, humidity",Water surface,city,suma agua
0,2024-01,2024,1,0.130227,3541.0,8779.0,13349.0,44695.0,Chapala,58044.0
1,2024-02,2024,2,0.128092,2562.0,9848.0,15038.0,42916.0,Chapala,57954.0
2,2024-03,2024,3,0.110746,3273.0,9351.0,20050.0,37690.0,Chapala,57740.0
3,2024-04,2024,4,0.071346,2725.0,11218.0,54370.0,2051.0,Chapala,56421.0
4,2024-05,2024,5,0.067982,2865.0,15259.0,33790.0,18450.0,Chapala,52240.0
5,2024-06,2024,6,0.032996,2687.0,15286.0,45319.0,7072.0,Chapala,52391.0
6,2024-07,2024,7,0.066342,5179.0,10966.0,36246.0,17973.0,Chapala,54219.0
7,2024-08,2024,8,0.028972,10782.0,8707.0,44250.0,6625.0,Chapala,50875.0
8,2024-09,2024,9,0.053484,7760.0,11195.0,39224.0,12185.0,Chapala,51409.0
