In [2]:
### ESTO VA ANTES DE 02c!!!!

In [3]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import hashlib

import dask
import xarray as xr
import xrspatial
from dask.diagnostics import ProgressBar
from geocube.api.core import make_geocube

import matplotlib.pyplot as plt
import seaborn as sns

# Project root and its data folders (Windows paths).
# Raw strings keep every backslash literal; a plain string would treat "\L",
# "\W" and "\C" as (invalid) escape sequences and emit a warning.
PATH = r"Z:\Laboral\World Bank\CLIENT v2"
DATA_RAW = rf"{PATH}\Data\Data_raw"
DATA_PROC = rf"{PATH}\Data\Data_proc"
DATA_OUT = rf"{PATH}\Data\Data_out"



In [59]:
# floods = pd.read_csv(rf"{DATA_RAW}\Floods\GloFAS_floods.csv")

def load_population_data(bounds=None, generate=False):
    """Load every GPWv4 GeoTIFF into a single dataset stacked along ``year``.

    Parameters
    ----------
    bounds : sequence of 4 floats, optional
        Indexed as ``(min_x, min_y, max_x, max_y)``. When given, each raster is
        cropped to this box. ``y`` is sliced from ``bounds[3]`` down to
        ``bounds[1]``, i.e. the rasters are expected to have descending ``y``.
    generate : bool, default False
        When True, band 1 of every (cropped) raster is also written to
        ``E:\\client_v2_data`` as ``<name>_proc.tif``.

    Returns
    -------
    xarray.Dataset
        ``band_data`` as ``uint32`` with a ``year`` dimension, band 1 only.

    Notes
    -----
    The year is parsed from the sixth ``_``-separated token of the file name;
    a file name without that token raises ``IndexError``/``ValueError``.
    """
    print("Processing Population data...")

    def _year_of(filename):
        # The year is the sixth "_"-separated token of the GPW file name.
        return int(filename.split("_")[5])

    # Select all files in GPW folder. os.listdir returns entries in arbitrary
    # order, so sort by year to get a monotonically increasing "year" axis.
    gpw_path = r"Z:\WB Data\Gridded Population of The World (GPWv4)"
    files = sorted(
        (f for f in os.listdir(gpw_path) if f.endswith(".tif")),
        key=lambda f: (_year_of(f), f),
    )

    # Compile into a single dataset
    dss = []
    for f in tqdm(files):
        ds = xr.open_dataset(os.path.join(gpw_path, f), chunks={"x": 10000, "y": 10000})
        # Missing cells are NaN; casting NaN to an unsigned integer is undefined
        # (numpy warns "invalid value encountered in cast"), so count them as 0.
        # NOTE(review): the integer cast also truncates fractional per-cell
        # counts -- confirm that is acceptable for the population totals.
        ds["band_data"] = ds["band_data"].fillna(0).astype(np.uint32)
        if bounds is not None:
            ds = ds.sel(
                x=slice(bounds[0], bounds[2]), y=slice(bounds[3], bounds[1])
            )
        if generate:
            out_name = f.replace('.tif', '_proc.tif')
            with ProgressBar():
                ds.sel(band=1).drop_vars("band").band_data.rio.to_raster(rf"E:\client_v2_data\{out_name}")
                print(f"Saved {out_name}")

        # Label each raster with its year so the concat below gets a coordinate
        ds["year"] = _year_of(f)
        ds = ds.set_coords('year')
        dss.append(ds)

    # Every raster was already cropped inside the loop, so no second bounds
    # filter is needed on the stacked result.
    population = xr.concat(dss, dim="year")

    # Clean band dimension
    population = population.sel(band=1).drop_vars(["band"])

    print("Done!")
    return population

def load_precipitation_data():
    """Open the monthly ERA5 SPI/SPEI file lazily and return it.

    The file is read from ``DATA_OUT`` in 100x100 latitude/longitude chunks and
    its ``latitude``/``longitude`` dimensions are renamed to ``y``/``x``.

    Returns
    -------
    xarray.Dataset
        The renamed, chunked dataset.
    """
    era5 = xr.open_dataset(
        rf"{DATA_OUT}\ERA5_monthly_1970-2021_SPI-SPEI.nc",
        chunks={"latitude": 100, "longitude": 100},
    )
    era5 = era5.rename({"latitude": "y", "longitude": "x"})
    # Hand the dataset back to the caller (a bare `return` would yield None).
    return era5

def load_WB_country_data(drop_adm2_na=False):
    """Load the World Bank ADM2 layer and build one row per lowest-level unit.

    The "last" administrative level of a row is its ADM2 unit when one exists
    and its ADM1 unit otherwise. Rows are dissolved by that code within each
    country and every resulting row receives a unique integer ``ID``.

    Parameters
    ----------
    drop_adm2_na : bool, default False
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    geopandas.GeoDataFrame
        Dissolved layer with ``ADMLAST_CODE``, ``ADMLAST_NAME`` and ``ID``.

    Raises
    ------
    AssertionError
        If ``ID`` is not unique across the dissolved rows.
    """
    print("Loading World Bank country data...")
    WB_country = gpd.read_file(rf"{DATA_RAW}\world_bank_adm2.zip")

    # Assign nan when ADM2 is not available
    WB_country.loc[WB_country.ADM2_NAME == "Administrative unit not available", "ADM2_CODE"] = (
        np.nan
    )

    # Create ADMLAST variables: ADM2 values if available, else ADM1 values.
    # The same mask drives both columns so code and name always come from the
    # same administrative level.
    adm2_missing = WB_country.ADM2_CODE.isnull()
    WB_country["ADMLAST_CODE"] = WB_country.ADM2_CODE
    WB_country["ADMLAST_NAME"] = WB_country.ADM2_NAME
    WB_country.loc[adm2_missing, "ADMLAST_CODE"] = WB_country.ADM1_CODE
    WB_country.loc[adm2_missing, "ADMLAST_NAME"] = WB_country.ADM1_NAME

    # Dissolve by the last-level code and country code. Codes are used rather
    # than names because different units can share a name.
    WB_country = WB_country.dissolve(by=["ADMLAST_CODE", "ADM0_CODE"]).reset_index()

    # Create ID
    WB_country["ID"] = WB_country.groupby(["ADMLAST_CODE", "ADMLAST_NAME", "ADM0_CODE", "ADM0_NAME"]).ngroup()
    assert WB_country.ID.nunique() == WB_country.shape[0], "ID is not unique!, there's some bug in the code..."
    print("Data loaded!")
    return WB_country


def rasterize_shape_like_dataset(shape, dataset):
    """Burn the vector ``shape`` onto the grid of ``dataset``.

    The raster comes from ``make_geocube(..., like=dataset)``. Its ``x``/``y``
    coordinates are then overwritten with those of ``dataset``, the
    ``spatial_ref`` variable is dropped and the result is chunked 100x100.

    Raises
    ------
    AssertionError
        If the raster's ``x`` or ``y`` length differs from ``dataset``'s.
    """
    print("Rasterizing shape...")
    gridded = make_geocube(vector_data=shape, like=dataset)

    # `like=` does not hand back the exact coordinates of `dataset`, so they
    # are copied over by hand -- but only once both axes are known to match.
    for axis in ("x", "y"):
        assert (gridded[axis].shape == dataset[axis].shape)
    for axis in ("x", "y"):
        gridded[axis] = dataset[axis]

    gridded = gridded.drop_vars(["spatial_ref"]).chunk({"x": 100, "y": 100})
    print("Done!")
    return gridded

def compute_zonal_stats(dataset, shape, value_var, groupby_var, gridded_groups=None, stats_funcs=["sum"], delayed=True):
    """Zonal statistics of ``dataset[value_var]`` over the zones in ``shape``.

    Parameters
    ----------
    dataset : gridded dataset holding ``value_var``.
    shape : table with ``groupby_var`` and ``geometry`` columns.
    value_var : name of the variable to aggregate.
    groupby_var : name of the zone identifier.
    gridded_groups : optional pre-rasterized zones; rasterized from ``shape``
        onto ``dataset``'s grid when omitted.
    stats_funcs : statistics forwarded to ``xrspatial.zonal.stats``.
    delayed : when True (default) the un-computed ``xrspatial`` result is
        returned as is.

    Returns
    -------
    With ``delayed=True``: whatever ``xrspatial.zonal.stats`` returned.
    Otherwise: ``shape[[groupby_var, "geometry"]]`` merged with the computed
    statistics, where "sum" is renamed to ``value_var``, "mean" to
    ``f"{value_var}_mean"`` and "zone" to ``groupby_var``.
    """
    chunk_msg = "Please, chunk the dataset before computing zonal stats! (e.g. dataset.chunk({'x': 100, 'y': 100})). Otherwise, you will get a MemoryError."

    # Rasterize shape only when the caller did not supply the zones
    if gridded_groups is None:
        gridded_groups = rasterize_shape_like_dataset(shape[[groupby_var, "geometry"]], dataset)

    # NOTE(review): for an xarray Dataset, `.chunks` looks like it is a mapping
    # and never None, so these guards may never trigger -- confirm.
    assert gridded_groups.chunks is not None, chunk_msg
    assert dataset.chunks is not None, chunk_msg

    print("Setting up zonal stats...")
    zonal = xrspatial.zonal.stats(gridded_groups[groupby_var], dataset[value_var], stats_funcs=stats_funcs)
    print("Done! Computing zonal stats...")

    if not delayed:
        with ProgressBar():
            computed = zonal.compute()

        # Format zonal_stats dataframe
        column_names = {
            "sum": value_var,
            "mean": f"{value_var}_mean",
            "zone": groupby_var,
        }
        computed = computed.rename(columns=column_names)
        return shape[[groupby_var, "geometry"]].merge(computed, on=groupby_var)

    return zonal

def compute_zonal_stats_over_time(dataset, shape, value_var, groupby_var, population_data=None, gridded_groups=None, stats_funcs=["mean"], delayed=True):
    """Build one zonal-statistics task per value of ``dataset["year"]``.

    Parameters
    ----------
    dataset : gridded dataset with a ``year`` dimension holding ``value_var``.
    shape : table with ``groupby_var`` and ``geometry`` columns.
    value_var : name of the variable to aggregate.
    groupby_var : name of the zone identifier.
    population_data : optional weights; when given, each yearly slice is
        multiplied by the nearest-year slice of ``population_data``.
        NOTE(review): the product must still expose ``value_var`` -- confirm
        what kind of object callers pass here.
    gridded_groups : optional pre-rasterized zones; rasterized from ``shape``
        when omitted.
    stats_funcs : statistics forwarded to ``xrspatial.zonal.stats``.
    delayed : when True (default) the list of un-computed tasks is returned.

    Returns
    -------
    The list of tasks when ``delayed`` is True, otherwise the tuple produced
    by ``dask.compute(*tasks)``, in ``dataset["year"]`` order.

    Side effects
    ------------
    Installs a process-wide filter that silences ``RuntimeWarning``.
    """
    import warnings
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    # Rasterize shape only when the caller did not supply the zones
    if gridded_groups is None:
        gridded_groups = rasterize_shape_like_dataset(shape[[groupby_var, "geometry"]], dataset)

    chunk_msg = "Please, chunk the dataset before computing zonal stats! (e.g. dataset.chunk({'x': 100, 'y': 100})). Otherwise, you will get a MemoryError."
    assert gridded_groups.chunks is not None, chunk_msg
    assert dataset.chunks is not None, chunk_msg
    assert "year" in dataset.dims, "Please, add a 'year' dimension to the dataset before computing zonal stats! (e.g. dataset = dataset.assign_coords(year=dataset.time.dt.year))."

    def _task_for(year):
        # One year's slice, weighted by population or cast to float32.
        yearly = dataset.sel(year=year).drop_vars("year")
        if population_data is None:
            yearly[value_var] = yearly[value_var].astype("float32")
        else:
            yearly = yearly * population_data.sel(year=year, method="nearest").drop_vars("year")
        return xrspatial.zonal.stats(zones=gridded_groups[groupby_var], values=yearly[value_var], stats_funcs=stats_funcs)

    print("Setting up zonal stats...")
    tasks = [_task_for(year) for year in tqdm(dataset["year"].values)]
    if delayed:
        return tasks

    print("Done! Computing zonal stats...")
    with ProgressBar():
        result = dask.compute(*tasks)

    return result

def compile_zonal_stats_over_time(tasks_results, shape, groupby_var, value_var, years=None):
    """Stack per-year zonal statistics and attach the zone geometries.

    Parameters
    ----------
    tasks_results : iterable of DataFrame
        One computed zonal-stats table per year, each with a ``zone`` column.
    shape : DataFrame / GeoDataFrame
        Must contain ``groupby_var`` and ``geometry`` columns.
    groupby_var : str
        Name given to the ``zone`` column; also the merge key with ``shape``.
    value_var : str
        Name given to the statistic column ("sum" or "mean").
    years : iterable, optional
        Year label of each element of ``tasks_results``. Defaults to
        consecutive years starting at 1970, one per result, so no result is
        silently dropped when the series is longer than expected.

    Returns
    -------
    DataFrame
        ``shape[[groupby_var, "geometry"]]`` merged with the stacked
        statistics, with a ``year`` column.

    Raises
    ------
    ValueError
        If ``years`` is given and its length differs from ``tasks_results``.
    """
    tasks_results = list(tasks_results)
    if years is None:
        years = range(1970, 1970 + len(tasks_results))
    else:
        years = list(years)
        if len(years) != len(tasks_results):
            raise ValueError(
                f"Got {len(years)} years for {len(tasks_results)} results; they must match."
            )

    # Compile results into a single df, keyed by year
    out_dict = {year: data.set_index("zone") for year, data in zip(years, tasks_results)}
    df = pd.concat(out_dict, names=["year", "zone"]).reset_index()

    # Format zonal_stats dataframe. A lone "sum" or "mean" becomes `value_var`;
    # when both are present "mean" gets its own column instead of producing two
    # columns with the same name.
    new_names = {"sum": value_var, "mean": value_var, "zone": groupby_var}
    if {"sum", "mean"} <= set(df.columns):
        new_names["mean"] = f"{value_var}_mean"
    df = df.rename(columns=new_names)

    result = (
        shape[[groupby_var, "geometry"]]
        .merge(df, on=groupby_var)
    )
    return result

def process_era5_data():
    """Placeholder for the ERA5 pipeline; nothing is implemented yet.

    Planned steps:
      1. Load ERA5 data
      2. Create droughts dummies
      3. Annualize series

    Returns
    -------
    None
    """
    return None

# Procesa WB country Data (administrative boundaries)

In [85]:
# Administrative units, plus the population grid cropped to their extent
WB_country = load_WB_country_data()
population = load_population_data(bounds=WB_country.total_bounds)

# Rasterize WB_country: burn the unit IDs onto the population grid
WB_country_grid = rasterize_shape_like_dataset(WB_country[["ID", "geometry"]], population)

# Persist the ID raster
print("Saving WB_country_grid...")
WB_country_path = rf"E:\WB_country_grid.nc"
with ProgressBar():
    WB_country_grid.to_netcdf(WB_country_path)

# Persist the lookup between IDs and the administrative codes/names
WB_country[
    [
        "ID",
        "OBJECTID",
        "ADM2_CODE",
        "ADM2_NAME",
        "ADM1_CODE",
        "ADM1_NAME",
        "ADM0_CODE",
        "ADM0_NAME",
        "geometry",
    ]
].to_feather(rf"E:\client_v2_data\WB_country_IDs.feather")

Loading World Bank country data...
Data loaded!
Processing Population data...




[####                                    ] | 10% Completed | 8.45 sms

  return x.astype(astype_dtype, **kwargs)


[################################        ] | 80% Completed | 67.41 s

In [None]:
# Read the ADM2 layer again, as stored on disk (no dissolve, no ID), so that
# individual features can be inspected.
WB_country_raw = gpd.read_file(rf"{DATA_RAW}\world_bank_adm2.zip")

In [83]:
# WB_country_raw.cx[-91.412948:-88.813990,29.172487:30.972958].explore()
# Show the raw feature whose OBJECTID is 11777.
WB_country_raw[WB_country_raw["OBJECTID"]==11777]

Unnamed: 0,OBJECTID,ADM2_CODE,ADM2_NAME,STR2_YEAR,EXP2_YEAR,ADM1_CODE,ADM1_NAME,STATUS,DISP_AREA,ADM0_CODE,ADM0_NAME,Shape_Leng,Shape_Le_1,Shape_Area,geometry
11776,11777.0,29816,Jefferson,1000,3000,3232,Louisiana,Member State,NO,259,United States of America,12.162589,12.162589,0.09744,"MULTIPOLYGON (((-90.04783 29.21537, -90.04876 ..."


In [84]:
# Show the raw feature whose OBJECTID is 10694.
WB_country_raw[WB_country_raw["OBJECTID"]==10694]#.explore()

Unnamed: 0,OBJECTID,ADM2_CODE,ADM2_NAME,STR2_YEAR,EXP2_YEAR,ADM1_CODE,ADM1_NAME,STATUS,DISP_AREA,ADM0_CODE,ADM0_NAME,Shape_Leng,Shape_Le_1,Shape_Area,geometry
10693,10694.0,28727,Jefferson,1000,3000,3214,Alabama,Member State,NO,259,United States of America,2.902567,2.902567,0.28259,"POLYGON ((-86.87909 33.84317, -86.87527 33.843..."


In [76]:
# Open an interactive map of the processed unit whose ID is 13736.
WB_country[WB_country["ID"]==13736].explore()

# Procesa Population data (Gridded Population of the World)

In [None]:
# Inline variant of the loop in load_population_data: crops every GPW raster
# to the WB_country extent, always writes a "_proc.tif" copy and stacks the
# rasters. Unlike the function it does not cast to uint32 and does not attach
# a year label to each raster before concatenating.
# NOTE(review): os.listdir order is arbitrary, so the order along "year" is
# not guaranteed -- confirm before relying on positions.
# Select all files in GPW folder
gpw_path = r"Z:\WB Data\Gridded Population of The World (GPWv4)"
files = os.listdir(gpw_path)
files = [f for f in files if f.endswith(".tif")]
bounds=WB_country.total_bounds
# Compile into a single dataset
dss = []
for f in tqdm(files):
    ds = xr.open_dataset(os.path.join(gpw_path, f), chunks={"x": 10000, "y": 10000})
    # Crop to (min_x, min_y, max_x, max_y); y is sliced from max to min
    if bounds is not None:
        ds = ds.sel(
            x=slice(bounds[0], bounds[2]), y=slice(bounds[3], bounds[1])
        )
    # Write the cropped raster next to the other processed outputs
    ds.rio.to_raster(rf"E:\client_v2_data\{f.replace('.tif','_proc.tif')}")
    print("Se creó el archivo: ", f.replace('.tif','_proc.tif'))
    dss += [ds]
population = xr.concat(dss, dim="year")    

# Filter if bounds are provided (each raster was already cropped above)
if bounds is not None:
    population = population.sel(
        x=slice(bounds[0], bounds[2]), y=slice(bounds[3], bounds[1])
    )
    
# Clean band dimension
population = population.sel(band=1).drop_vars(["band"])



# GENERA Base de shocks, sin interpolar

In [17]:
# if not os.path.exists(droughts_path):
print("Preparing droughts dataset...")

# Target file for the droughts dataset
droughts_path = rf"{DATA_OUT}\ERA5_droughts_1970-2021.nc"

# Build the droughts dataset: open the monthly SPI/SPEI file lazily
era5 = xr.open_dataset(
    rf"{DATA_OUT}\ERA5_monthly_1970-2021_SPI-SPEI.nc",
    chunks={"latitude": 500, "longitude": 500},
)

Preparing droughts dataset...


In [20]:
# Target file for the droughts dataset (local E: drive)
droughts_path = rf"E:\client_v2_data\ERA5_droughts_1970-2021.nc"
# if not os.path.exists(droughts_path):
print("Preparing droughts dataset...")
# Build the droughts dataset: open the monthly SPI/SPEI file lazily.
# `chunks` needs a value for the call to parse; the spatial chunking below is
# the one used elsewhere in this notebook for the same file.
era5 = xr.open_dataset(
    rf"{DATA_OUT}\ERA5_monthly_1970-2021_SPI-SPEI.nc",
    chunks={"latitude": 500, "longitude": 500},
)
# Fix the x dimension, which runs from 0 to 360
era5 = era5.rename({'latitude': 'y', 'longitude': 'x'})
# NOTE(review): this only relabels the x positions in their existing order; it
# does not move any data. If the stored longitudes really start at 0 degrees,
# the values would also need to be rolled -- confirm against a known map.
era5['x'] = np.linspace(-180, 180, era5.x.size)

# era5 = era5.chunk({'time': 5})

# Compute the yearly series: minimum of every variable within each year
spi_yearly = era5.groupby("time.year").min()
with ProgressBar():
    spi_yearly.to_netcdf(rf"E:\client_v2_data\ERA5_yearly_1970-2021_SPI-SPEI.nc")

# Disabled: resampling onto the WB_country_grid and caching the result.
# droughts_resamp = droughts_yearly.reindex(y=WB_country_grid.y, x=WB_country_grid.x, method="nearest")

# droughts_resamp = droughts_yearly.interp_like(WB_country_grid, method="nearest")
# droughts_resamp = droughts_resamp.chunk({'x': 100, 'y': 100, 'year': 5})
#     print("Saving droughts...")
#     with ProgressBar():
#         droughts_resamp.to_netcdf(droughts_path)
#     print("Done!")
# else:
#     droughts_resamp = xr.open_dataset(droughts_path)

Preparing droughts dataset...
[########################################] | 100% Completed | 2hr 1ms


In [28]:
# Reopen the yearly SPI/SPEI file and display its summary.
spi_yearly = xr.open_dataset(rf"E:\client_v2_data\ERA5_yearly_1970-2021_SPI-SPEI.nc")
spi_yearly

In [29]:
# Yearly SPI/SPEI series, opened lazily in spatial chunks
spi_yearly = xr.open_dataset(
    rf"E:\client_v2_data\ERA5_yearly_1970-2021_SPI-SPEI.nc",
    chunks={"x": 900, "y": 1800},
)

# The index variables are the ones with a "-" in their name
spi_spei_vars = [name for name in spi_yearly.data_vars if "-" in name]

# One boolean flag per index and threshold: True where the yearly value lies
# below minus the threshold ("1_5" stands for 1.5). A NaN compares as False.
drought_flags = {
    f"drought_{var}_{threshold_str}": (
        spi_yearly[var] < -float(threshold_str.replace("_", "."))
    ).astype("bool")
    for var in spi_spei_vars
    for threshold_str in ["1_0", "1_5", "2_0", "2_5"]
}
spi_yearly = spi_yearly.assign(drought_flags)

# Keep only the drought flags and write them out
drought_names = [name for name in spi_yearly.data_vars if "drought" in name]
spi_yearly = spi_yearly[drought_names]
with ProgressBar():
    spi_yearly.to_netcdf(droughts_path)

[########################################] | 100% Completed | 313.82 s


In [2]:
# Self-contained reload of the droughts file (the execution count restarts
# here, so presumably this ran on a fresh kernel -- hence the local import).
import xarray as xr
droughts_path = rf"E:\client_v2_data\ERA5_droughts_1970-2021.nc"
droughts_yearly = xr.open_dataset(droughts_path)

In [3]:
# Display the summary of the reloaded droughts dataset.
droughts_yearly