In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import hashlib

import dask
import xarray as xr
import xrspatial
from dask.diagnostics import ProgressBar
from geocube.api.core import make_geocube

import matplotlib.pyplot as plt
import seaborn as sns

# Project root and data folders.
# PATH is a raw string so that the Windows backslashes are never parsed as
# escape sequences (a plain literal with "\L", "\W", "\C" emits a
# SyntaxWarning on recent Python versions). The resulting value is unchanged.
PATH = r"Z:\Laboral\World Bank\CLIENT v2"
DATA_RAW = rf"{PATH}\Data\Data_raw"
DATA_PROC = rf"{PATH}\Data\Data_proc"
DATA_OUT = rf"{PATH}\Data\Data_out"



In [2]:
# floods = pd.read_csv(rf"{DATA_RAW}\Floods\GloFAS_floods.csv")

def load_population_data(bounds=None):
    """Load every GPW GeoTIFF in the GPW folder into one dataset stacked along "year".

    Parameters
    ----------
    bounds : sequence of 4 floats, optional
        (minx, miny, maxx, maxy) window used to crop the data. When omitted
        the full extent is returned.

    Returns
    -------
    xarray.Dataset
        Lazily-loaded (100 x 100 chunks) dataset with a "year" dimension in
        ascending order and the "band" dimension removed.
    """

    def _year_of(filename):
        # The year is the 6th "_"-separated token of the file name.
        return int(filename.split("_")[5])

    # Select all .tif files in the GPW folder.
    gpw_path = r"Z:\WB Data\Gridded Population of The World (GPWv4)"
    # os.listdir returns entries in arbitrary order; sort by year so the
    # concatenated "year" coordinate is monotonic (needed for nearest-year
    # lookups such as .sel(year=..., method="nearest")).
    files = sorted(
        (f for f in os.listdir(gpw_path) if f.endswith(".tif")),
        key=_year_of,
    )

    # Compile into a single dataset
    dss = []
    for f in files:
        ds = xr.open_dataset(os.path.join(gpw_path, f), chunks={"x": 100, "y": 100})
        dss.append(ds.assign_coords(year=_year_of(f)))
    population = xr.concat(dss, dim="year")

    # Filter if bounds are provided
    if bounds is not None:
        # NOTE(review): the y slice runs from maxy to miny, which presumably
        # relies on the raster's y coordinate being stored north-to-south.
        population = population.sel(
            x=slice(bounds[0], bounds[2]), y=slice(bounds[3], bounds[1])
        )

    # Clean band dimension
    population = population.sel(band=1).drop_vars(["band"])

    return population

def load_precipitation_data():
    """Open the ERA5 monthly SPI/SPEI file lazily and return it.

    The "latitude"/"longitude" dimensions are renamed to "y"/"x" so the
    dataset uses the same axis names as the other grids in this notebook.

    Returns
    -------
    xarray.Dataset
        The renamed dataset, chunked 100 x 100 along the spatial dimensions.
    """
    era5 = xr.open_dataset(
        rf"{DATA_OUT}\ERA5_monthly_1970-2021_SPI-SPEI.nc",
        chunks={"latitude": 100, "longitude": 100},
    )
    era5 = era5.rename({"latitude": "y", "longitude": "x"})
    # The original ended with a bare `return`, so callers always got None.
    return era5

def load_WB_country_data(drop_adm2_na=True):
    """Read the World Bank ADM2 boundaries and add helper columns.

    Parameters
    ----------
    drop_adm2_na : bool, default True
        When True, rows without an ADM2 unit are dropped and an "ID" column
        (md5 of the concatenated ADM2/ADM1/ADM0 codes) is added. When False,
        all rows are kept and no "ID" column is created.

    Returns
    -------
    geopandas.GeoDataFrame
        The boundaries with "ADM2_CODE" set to NaN where the ADM2 unit is not
        available, plus an "ADM_LAST" column (and "ID" when drop_adm2_na).
    """
    WB_country = gpd.read_file(rf"{DATA_RAW}\world_bank_adm2.zip")

    # Rows flagged with this placeholder name have no real ADM2 unit.
    adm2_placeholder = WB_country["ADM2_NAME"] == "Administrative unit not available"

    # Assign nan when ADM2 is not available
    WB_country.loc[adm2_placeholder, "ADM2_CODE"] = np.nan

    # Create ADM_LAST variable: ADM2_NAME if available, else ADM1_NAME.
    # The placeholder name counts as "not available" too; previously only
    # null names fell back to ADM1_NAME, so placeholder rows kept the
    # placeholder text as ADM_LAST.
    adm2_name_missing = WB_country["ADM2_NAME"].isnull() | adm2_placeholder
    WB_country["ADM_LAST"] = WB_country["ADM2_NAME"].where(
        ~adm2_name_missing, WB_country["ADM1_NAME"]
    )

    if drop_adm2_na:
        # Drop rows with missing ADM2_CODE; copy so the column assignment
        # below never writes into a view of the original frame.
        WB_country = WB_country.dropna(subset=["ADM2_CODE"]).copy()

        # Create ID
        str_to_hash = (
            WB_country["ADM2_CODE"].astype(str)
            + WB_country["ADM1_CODE"].astype(str)
            + WB_country["ADM0_CODE"].astype(str)
        )
        WB_country["ID"] = str_to_hash.apply(lambda x: hashlib.md5(x.encode()).hexdigest())

    return WB_country


def rasterize_shape_like_dataset(shape, dataset):
    """Burn the columns of `shape` onto the grid of `dataset`.

    Parameters
    ----------
    shape : vector data passed to ``make_geocube`` as ``vector_data``.
    dataset : gridded dataset passed to ``make_geocube`` as ``like``; its
        "x" and "y" coordinates are copied onto the output.

    Returns
    -------
    The rasterized dataset, without the "spatial_ref" variable and chunked
    100 x 100 along "x" and "y".

    Raises
    ------
    AssertionError
        If the rasterized "x" or "y" coordinate has a different shape than
        the corresponding coordinate of `dataset`.
    """
    print("Rasterizing shape...")
    grid = make_geocube(vector_data=shape, like=dataset)

    # For some reason the `like` option does not reproduce the target
    # coordinates, so they are copied over manually once the sizes are
    # confirmed to match.
    spatial_axes = ("x", "y")
    for axis in spatial_axes:
        assert (grid[axis].shape == dataset[axis].shape)
    for axis in spatial_axes:
        grid[axis] = dataset[axis]

    grid = grid.drop_vars(["spatial_ref"]).chunk({"x": 100, "y": 100})
    print("Done!")
    return grid

def compute_zonal_stats(dataset, shape, value_var, groupby_var, gridded_groups=None, stats_funcs=["sum"], delayed=True):
    """Compute zonal statistics of `dataset[value_var]` per zone.

    Parameters
    ----------
    dataset : chunked gridded dataset holding `value_var`.
    shape : table with `groupby_var` and "geometry" columns. Used to
        rasterize the zones when `gridded_groups` is None, and to attach
        geometries to the result when `delayed` is False.
    value_var : str
        Name of the variable to summarise.
    groupby_var : str
        Name of the zone identifier (column in `shape`, variable in
        `gridded_groups`).
    gridded_groups : chunked gridded dataset of zone ids, optional.
        Rasterized from `shape` when omitted.
    stats_funcs : list of str
        Statistics forwarded to ``xrspatial.zonal.stats``. The default list
        is never mutated here.
    delayed : bool, default True
        When True, return the ``xrspatial.zonal.stats`` result as-is (not
        computed, not renamed, not merged).

    Returns
    -------
    The raw ``xrspatial.zonal.stats`` result when `delayed`, otherwise
    `shape[[groupby_var, "geometry"]]` merged with the computed stats, where
    "sum" is renamed to `value_var`, "mean" to f"{value_var}_mean" and
    "zone" to `groupby_var`.

    Raises
    ------
    AssertionError
        If `gridded_groups` or `dataset` is not chunked.
    """
    chunk_msg = "Please, chunk the dataset before computing zonal stats! (e.g. dataset.chunk({'x': 100, 'y': 100})). Otherwise, you will get a MemoryError."

    # Rasterize shape
    if gridded_groups is None:
        gridded_groups = rasterize_shape_like_dataset(shape[[groupby_var, "geometry"]], dataset)

    # Compute zonal stats
    # Truthiness check instead of `is not None`: an unchunked xarray Dataset
    # reports an empty mapping for `.chunks`, which the previous comparison
    # accepted, so the guard never triggered.
    assert gridded_groups.chunks, chunk_msg
    assert dataset.chunks, chunk_msg

    print("Setting up zonal stats...")
    pop_by_adm = xrspatial.zonal.stats(gridded_groups[groupby_var], dataset[value_var], stats_funcs=stats_funcs)
    print("Done! Computing zonal stats...")
    if delayed:
        return pop_by_adm

    with ProgressBar():
        pop_by_adm = pop_by_adm.compute()

    # Format zonal_stats dataframe
    pop_by_adm = pop_by_adm.rename(columns={
        "sum": value_var,
        "mean": f"{value_var}_mean",
        "zone": groupby_var,
    })

    result = (
        shape[[groupby_var, "geometry"]]
        .merge(pop_by_adm, on=groupby_var)
    )
    return result

def compute_zonal_stats_over_time(dataset, shape, value_var, groupby_var, population_data=None, gridded_groups=None, stats_funcs=["mean"], delayed=True):
    """Build one zonal-stats task per year of `dataset`.

    Parameters
    ----------
    dataset : chunked gridded dataset with a "year" dimension holding
        `value_var`.
    shape : table with `groupby_var` and "geometry" columns; only used to
        rasterize the zones when `gridded_groups` is None.
    value_var : str
        Name of the variable to summarise.
    groupby_var : str
        Name of the zone-id variable in `gridded_groups`.
    population_data : gridded data with a "year" coordinate, optional.
        When given, each yearly slice is multiplied by the population of the
        nearest available year before computing the stats.
    gridded_groups : chunked gridded dataset of zone ids, optional.
    stats_funcs : list of str
        Statistics forwarded to ``xrspatial.zonal.stats``. The default list
        is never mutated here.
    delayed : bool, default True
        When True, return the list of per-year tasks without computing them.

    Returns
    -------
    list of tasks (one per year, in `dataset["year"]` order) when `delayed`,
    otherwise the tuple returned by ``dask.compute(*tasks)``.

    Raises
    ------
    AssertionError
        If the inputs are not chunked or `dataset` has no "year" dimension.

    Notes
    -----
    RuntimeWarnings are silenced process-wide as a side effect of calling
    this function.
    """
    import warnings
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    chunk_msg = "Please, chunk the dataset before computing zonal stats! (e.g. dataset.chunk({'x': 100, 'y': 100})). Otherwise, you will get a MemoryError."

    # Rasterize shape
    if gridded_groups is None:
        gridded_groups = rasterize_shape_like_dataset(shape[[groupby_var, "geometry"]], dataset)

    # Compute zonal stats
    # Truthiness check instead of `is not None`: an unchunked xarray Dataset
    # reports an empty mapping for `.chunks`, which the previous comparison
    # accepted, so the guard never triggered.
    assert gridded_groups.chunks, chunk_msg
    assert dataset.chunks, chunk_msg
    assert "year" in dataset.dims, "Please, add a 'year' dimension to the dataset before computing zonal stats! (e.g. dataset = dataset.assign_coords(year=dataset.time.dt.year))."

    print("Setting up zonal stats...")
    tasks = []
    for year in tqdm(dataset["year"].values):
        dataset_year = dataset.sel(year=year).drop_vars("year")
        if population_data is not None:
            # Weight the yearly slice by the population of the closest year.
            dataset_year = dataset_year * population_data.sel(year=year, method="nearest").drop_vars("year")
        else:
            dataset_year[value_var] = dataset_year[value_var].astype("float32")

        tasks.append(
            xrspatial.zonal.stats(zones=gridded_groups[groupby_var], values=dataset_year[value_var], stats_funcs=stats_funcs)
        )
    if delayed:
        return tasks

    print("Done! Computing zonal stats...")
    with ProgressBar():
        result = dask.compute(*tasks)

    return result

def compile_zonal_stats_over_time(tasks_results, shape, groupby_var, value_var, years=None):
    """Stack per-year zonal-stats tables and attach the zone geometries.

    Parameters
    ----------
    tasks_results : sequence of pandas.DataFrame
        One table per year, each with a "zone" column plus the statistic
        column(s) ("sum" and/or "mean").
    shape : table with `groupby_var` and "geometry" columns.
    groupby_var : str
        Name given to the "zone" column; also the merge key with `shape`.
    value_var : str
        Name given to the statistic column.
    years : iterable, optional
        Year label of each entry of `tasks_results`, in the same order.
        When omitted, labels 1970, 1971, ... (up to 2020) are assigned
        positionally, which is only correct for results that start in 1970.

    Returns
    -------
    `shape[[groupby_var, "geometry"]]` merged with the stacked table, which
    has the columns "year", `groupby_var` and the statistic(s). If both
    "sum" and "mean" are present, "sum" becomes `value_var` and "mean"
    becomes f"{value_var}_mean" so the column names stay unique; otherwise
    whichever one is present becomes `value_var`.

    Raises
    ------
    ValueError
        If `years` is given and its length differs from `tasks_results`.
    """
    tasks_results = list(tasks_results)
    if years is None:
        # Legacy labelling kept for backward compatibility.
        years = range(1970, 2021)
    else:
        years = list(years)
        if len(years) != len(tasks_results):
            raise ValueError(
                f"Got {len(years)} year labels for {len(tasks_results)} result tables."
            )

    # Compile results into a single df
    out_dict = {year: data.set_index("zone") for year, data in zip(years, tasks_results)}
    df = pd.concat(out_dict)
    df = df.reset_index()
    df = df.rename(columns={"level_0": "year"})

    # Format zonal_stats dataframe
    has_both_stats = {"sum", "mean"} <= set(df.columns)
    mean_name = f"{value_var}_mean" if has_both_stats else value_var
    df = df.rename(columns={
        "sum": value_var,
        "mean": mean_name,
        "zone": groupby_var,
    })

    result = (
        shape[[groupby_var, "geometry"]]
        .merge(df, on=groupby_var)
    )
    return result

def process_era5_data():
    """Placeholder for the ERA5 processing pipeline (not implemented yet).

    Planned steps:
      1. Load ERA5 data
      2. Create droughts dummies
      3. Annualize series

    Returns
    -------
    None
    """
    return None

In [3]:
# Administrative boundaries; rows without an ADM2_CODE are already dropped
# inside load_WB_country_data (drop_adm2_na defaults to True).
WB_country = load_WB_country_data()

# Population rasters cropped to the extent of the boundaries.
population = load_population_data(bounds=WB_country.total_bounds)

# Rasterize WB_country onto the population grid.
WB_country_grid = rasterize_shape_like_dataset(WB_country[["ID", "geometry"]], population)

# Persist the rasterized zones once; an existing file is never overwritten.
WB_country_path = rf"{DATA_PROC}\WB_country_grid.nc"
grid_already_saved = os.path.exists(WB_country_path)
if not grid_already_saved:
    print("Saving WB_country_grid...")
    with ProgressBar():
        WB_country_grid.to_netcdf(WB_country_path)

Cannot find the ecCodes library


Rasterizing shape...
Done!


In [4]:
# Reload the rasterized zones from disk as a lazily-loaded, chunked dataset.
WB_country_path = rf"{DATA_PROC}\WB_country_grid.nc"
WB_country_grid = xr.open_dataset(
    WB_country_path,
    chunks={"x": 100, "y": 100},
)

In [12]:
# Aggregate population data to match era5 grid resolution
# NOTE(review): `era5` is only defined in a cell that appears further down the
# notebook, so this cell depends on out-of-order execution.
# NOTE(review): "sum" does not appear among the interpolation methods that
# xarray documents for `interp` — confirm this call runs; summing population
# onto a coarser grid presumably needs an aggregation (e.g. coarsen/groupby)
# rather than an interpolation.
population_regridded = population.interp(x=era5.x, y=era5.y, method="sum")
population_regridded

In [10]:
# Display the lazily-loaded population dataset (dimensions, chunking, dtype).
population

Unnamed: 0,Array,Chunk
Bytes,13.07 GiB,39.06 kiB
Shape,"(5, 16242, 43200)","(1, 100, 100)"
Dask graph,352080 chunks in 18 graph layers,352080 chunks in 18 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 13.07 GiB 39.06 kiB Shape (5, 16242, 43200) (1, 100, 100) Dask graph 352080 chunks in 18 graph layers Data type float32 numpy.ndarray",43200  16242  5,

Unnamed: 0,Array,Chunk
Bytes,13.07 GiB,39.06 kiB
Shape,"(5, 16242, 43200)","(1, 100, 100)"
Dask graph,352080 chunks in 18 graph layers,352080 chunks in 18 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [9]:
# Display the rasterized administrative-unit grid (dimensions, chunking, dtype).
WB_country_grid

Unnamed: 0,Array,Chunk
Bytes,5.53 GiB,78.12 kiB
Shape,"(17173, 43200)","(100, 100)"
Dask graph,74304 chunks in 2 graph layers,74304 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 5.53 GiB 78.12 kiB Shape (17173, 43200) (100, 100) Dask graph 74304 chunks in 2 graph layers Data type float64 numpy.ndarray",43200  17173,

Unnamed: 0,Array,Chunk
Bytes,5.53 GiB,78.12 kiB
Shape,"(17173, 43200)","(100, 100)"
Dask graph,74304 chunks in 2 graph layers,74304 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.53 GiB,78.12 kiB
Shape,"(17173, 43200)","(100, 100)"
Dask graph,74304 chunks in 2 graph layers,74304 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 5.53 GiB 78.12 kiB Shape (17173, 43200) (100, 100) Dask graph 74304 chunks in 2 graph layers Data type float64 numpy.ndarray",43200  17173,

Unnamed: 0,Array,Chunk
Bytes,5.53 GiB,78.12 kiB
Shape,"(17173, 43200)","(100, 100)"
Dask graph,74304 chunks in 2 graph layers,74304 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [5]:
droughts_path = rf"{DATA_OUT}\ERA5_droughts_1970-2021.nc"
# if not os.path.exists(droughts_path):
print("Preparing droughts dataset...")
# Build the droughts dataset
era5 = xr.open_dataset(rf"{DATA_OUT}\ERA5_monthly_1970-2021_SPI-SPEI.nc", chunks={'latitude': 100, 'longitude': 100, 'time': 5})
era5 = era5.rename({'latitude': 'y', 'longitude': 'x'})
# Fix the x dimension, which runs from 0 to 360: wrap every longitude into
# [-180, 180) and re-sort so each value stays attached to its own cell.
# (Overwriting the labels with np.linspace(-180, 180, n) only renamed the
# columns, leaving the data shifted by 180 degrees, and also altered the grid
# spacing.)
era5 = era5.assign_coords(x=((era5["x"].values + 180) % 360) - 180).sortby("x")

# Compute the yearly droughts
# NOTE(review): with the yearly *max*, a cell is flagged only when every month
# of the year is below the threshold — confirm that this (rather than the
# yearly min, i.e. "at least one month") is the intended definition.
droughts_yearly = era5.groupby("time.year").max()
spi_spei_vars = [var for var in droughts_yearly.data_vars if "-" in var]
for var in spi_spei_vars:
    for threshold_str in ["1_0", "1_5", "2_0", "2_5"]:
        threshold = float(threshold_str.replace("_", "."))
        droughts_yearly[f"drought_{var}_{threshold_str}"] = (droughts_yearly[var] < -threshold).astype("bool")

# Keep only the drought dummies.
droughts_yearly = droughts_yearly[[var for var in droughts_yearly.data_vars if "drought" in var]]
# droughts_resamp = droughts_yearly.reindex(y=WB_country_grid.y, x=WB_country_grid.x, method="nearest")
# Bring the dummies onto the administrative-unit grid.
droughts_resamp = droughts_yearly.interp_like(WB_country_grid, method="nearest")
# droughts_resamp = droughts_resamp.chunk({'x': 100, 'y': 100, 'year': 5})
#     print("Saving droughts...")
#     with ProgressBar():
#         droughts_resamp.to_netcdf(droughts_path)
#     print("Done!")
# else:
#     droughts_resamp = xr.open_dataset(droughts_path)

Preparing droughts dataset...


In [6]:
# Write the resampled drought dummies to droughts_path, showing dask progress.
# NOTE(review): the recorded run was interrupted after ~100 minutes at 0%,
# so this write presumably needs a cheaper strategy before it is usable.
with ProgressBar():
    droughts_resamp.to_netcdf(droughts_path)

[                                        ] | 0% Completed | 99m 54sss


KeyboardInterrupt: 

In [8]:
# Combine the population and country grid
shocks = xr.combine_by_coords(
    [
        droughts_resamp,
        # floods_yearly,
        # hurricanes_yearly,
        # heatwaves_yearly,
        # coldwaves_yearly,
    ],
    combine_attrs="override",
)

# # List of shocks
# shocks = shocks_and_population.drop_vars("population").data_vars
# for var in shocks:  
#     shocks_and_population[f"{var}_pop_affected"] = shocks_and_population[var] * shocks_and_population["population"]
    
# with ProgressBar():
#     shocks_and_population.to_netcdf(rf"{DATA_PROC}\shocks_and_population.nc")

In [10]:
# Collects the delayed zonal-stats work; each entry appended below is the list
# of per-year tasks returned by compute_zonal_stats_over_time(delayed=True).
tasks = []
### Affected area == number of affected rasters / total number of rasters == average of SPI-1 over all the rasters contained in each ADM2 
### Affected population == (affected area * population in each raster) / total population in each ADM2

# Compute population in each adm2 (for affected population)
# population_2020 = compute_zonal_stats(population, "asd", "band_data", "ADM2_CODE", gridded_groups=WB_country_grid, stats_funcs=["sum"], delayed=True)
# tasks += [population_2020]

for var in shocks.data_vars:
    print(var)
    # 1) affected area: zonal mean of the drought dummy per ADM2 and year.
    # `shape` is passed as None, which works only because gridded_groups is
    # supplied (the shape is used solely to rasterize missing zones).
    affected_area_tasks = compute_zonal_stats_over_time(
        shocks.sel(year=slice(2015, 2020)), 
        None, 
        var, 
        "ADM2_CODE", 
        gridded_groups=WB_country_grid, 
        stats_funcs=["mean"], 
        delayed=True
    )
    tasks += [affected_area_tasks]
    # NOTE(review): this `break` stops after the first variable, so everything
    # below it inside the loop is currently unreachable.
    break
    # NOTE(review): affected_area_tasks is a plain list here, so `.to_csv`
    # would presumably fail if this line were ever reached (see FIXME).
    affected_area_tasks = affected_area_tasks.to_csv(rf"{DATA_OUT}\{var}_affected_area.csv", compute=False) # FIXME: affected_area_tasks is a list, that will later become a dd.DataFrame... ¿should I compile it here?

    # 2) compute affected population
    affected_population_tasks = compute_zonal_stats_over_time(
        shocks.sel(year=slice(2015, 2020)), 
        None, 
        var, 
        "ADM2_CODE", 
        population_data=population["band_data"], 
        gridded_groups=WB_country_grid, 
        stats_funcs=["sum"], 
        delayed=True
    )
    affected_population_tasks = affected_population_tasks.to_csv(rf"{DATA_OUT}\{var}_affected_pop.csv", compute=False)
    tasks += [affected_population_tasks]

# # 3) Compute share of population affected
# df = affected_population.merge(population_2020, on="ADM2_CODE")
# df["share_affected"] = df[f"{var}_sum"] / df["population_sum"]


drought_SPI-1_1_0
Setting up zonal stats...


100%|██████████| 6/6 [16:05<00:00, 160.90s/it]


In [11]:
# Compute the first two yearly tasks of the first shock variable.
# The tasks are unpacked so that `results` is a tuple with one computed table
# per task (results[0], results[1], ...). Passing the list itself makes
# dask.compute return a 1-tuple wrapping the whole list instead.
with ProgressBar():
    results = dask.compute(*tasks[0][:2])

KeyboardInterrupt: 

In [None]:
# Join the zonal sums onto the ADM2 polygons and map the "sum" column.
# NOTE(review): `test` is not defined anywhere in the visible notebook —
# presumably a leftover from an earlier session; confirm or replace it.
WB_country.merge(test.rename(columns={"zone":"ADM2_CODE"}), on="ADM2_CODE")[["geometry", "sum"]].explore(column="sum")

In [None]:
# Join the first computed result onto the ADM2 polygons and map its "sum" column.
# NOTE(review): this expects results[0] to be a table with "zone" and "sum"
# columns — confirm against how `results` is produced.
WB_country.merge(results[0].rename(columns={"zone":"ADM2_CODE"}), on="ADM2_CODE")[["geometry", "sum"]].explore(column="sum")

In [None]:
# Stack the yearly tables into one frame with geometries attached.
# NOTE(review): compile_zonal_stats_over_time iterates over its first argument
# as a sequence of per-year tables, so results[1] presumably needs to be such
# a sequence here — confirm. Year labels are assigned starting at 1970.
df = compile_zonal_stats_over_time(results[1], WB_country, "ADM2_CODE", "drought")

In [None]:
# Inspect the rows labelled 1971 (the map view is left commented out).
df[df["year"]==1971]#.explore(column="drought")

Unnamed: 0,ADM2_CODE,geometry,year,drought
1,18334.0,"POLYGON ((11.21749 44.80560, 11.22535 44.80249...",1971,279.943863
52,18337.0,"POLYGON ((11.09546 44.95912, 11.09914 44.95882...",1971,278.988067
103,18338.0,"POLYGON ((10.18832 45.02916, 10.19214 45.02906...",1971,278.83502
154,18339.0,"MULTIPOLYGON (((9.29325 44.70210, 9.29091 44.7...",1971,
205,18340.0,"POLYGON ((12.27724 44.61994, 12.27997 44.61304...",1971,279.939815
256,18341.0,"POLYGON ((10.74917 44.96363, 10.74381 44.95487...",1971,278.831182
307,18351.0,"MULTIPOLYGON (((12.48575 42.39365, 12.47495 42...",1971,282.686486
358,18352.0,"POLYGON ((8.97122 44.66545, 8.97356 44.66431, ...",1971,280.856755
409,18354.0,"MULTIPOLYGON (((9.85584 44.04656, 9.84677 44.0...",1971,281.238029
460,18373.0,"POLYGON ((8.40586 45.20602, 8.41857 45.19941, ...",1971,


In [None]:
# Map the "drought" column for the rows labelled 1971.
df[df["year"]==1971].explore(column="drought")

In [None]:
# Display the first computed zonal-stats table.
results[0]

Unnamed: 0,zone,sum
0,18334.0,766374.2
1,18337.0,279714.4
2,18338.0,28837.91
3,18339.0,68.77915
4,18340.0,426.4735
5,18341.0,86338.23
6,18351.0,9087.664
7,18352.0,146359.1
8,18354.0,211868.1
9,18373.0,0.7920562
