In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import hashlib
import matplotlib.pyplot as plt
import warnings

import dask
import xarray as xr
import xrspatial
from dask.diagnostics import ProgressBar
from geocube.api.core import make_geocube

import matplotlib.pyplot as plt
import seaborn as sns

import procesa_bases 

# Root folders for the project data.
# Raw strings keep the Windows backslashes literal: in a regular string
# literal "\W" and "\C" are invalid escape sequences (a warning today, an
# error in future Python versions).
PATH = r"D:\World Bank\CLIENT v2"
DATA_RAW = rf"{PATH}\Data\Data_raw"
DATA_PROC = rf"{PATH}\Data\Data_proc"
DATA_OUT = rf"{PATH}\Data\Data_out"
GPW_PATH = r"D:\Datasets\Gridded Population of the World"



# Genera y Carga bases

In [None]:
# Load the World Bank shapes first: their bounding box is the extent used to
# clip the IPUMS shapes and to request the population data.
WB_country = procesa_bases.load_WB_country_data()
wb_bounds = WB_country.total_bounds

# IPUMS shapes are loaded by a helper that receives the WB layer, then
# clipped to the WB bounding box.
IPUMS_country = procesa_bases.load_IPUMS_country_data(WB_country).clip(wb_bounds)

# Population data restricted to the same bounding box.
population = procesa_bases.load_population_data(bounds=wb_bounds)

# Procesa WB/IPUMS shapes (administrative boundaries) y GPW (population data)

In [4]:
### World Bank shapes: rasterize the IDs like the population dataset, then
### store the grid (NetCDF) and the ID lookup table (feather).
wb_grid_path = rf"{DATA_PROC}\WB_country_grid.nc"
wb_ids_path = rf"{DATA_PROC}\WB_country_IDs.feather"
wb_id_columns = ["ID", "OBJECTID", "ADM2_CODE", "ADM1_CODE", "ADM0_CODE", "geometry"]

WB_country_grid = procesa_bases.rasterize_shape_like_dataset(
    WB_country[["ID", "geometry"]],
    population,
)
print("Saving WB_country_grid...")
WB_country_grid.to_netcdf(wb_grid_path)
WB_country[wb_id_columns].to_feather(wb_ids_path)


### IPUMS shapes: same procedure, but the full table is stored as lookup.
IPUMS_country_path = rf"{DATA_PROC}\IPUMS_country_grid.nc"
ipums_ids_path = rf"{DATA_PROC}\IPUMS_country_IDs.feather"

IPUMS_country_grid = procesa_bases.rasterize_shape_like_dataset(
    IPUMS_country[["ID", "geometry"]],
    population,
)
print("Saving IPUMS_country_grid...")
IPUMS_country_grid.to_netcdf(IPUMS_country_path)
IPUMS_country.to_feather(ipums_ids_path)

### Export the IPUMS (adm0, adm2) pairs to Stata; every pair must be unique.
stata_names = {"CNTRY_CODE": "adm0", "GEOLEVEL2": "adm2"}
dta = IPUMS_country.rename(columns=stata_names)[["adm0", "adm2"]]
assert not dta.duplicated().any()
dta.to_stata(rf"{DATA_PROC}\IPUMS_country_IDs.dta", write_index=False)

Rasterizing shape...
Done!
Saving WB_country_grid...
Rasterizing shape...
Done!
Saving IPUMS_country_grid...


In [14]:
# Export the IPUMS (adm0, adm2) pairs to Stata.
# The assertion guards against repeated pairs before the file is written.
stata_names = {"CNTRY_CODE": "adm0", "GEOLEVEL2": "adm2"}
dta = IPUMS_country.rename(columns=stata_names)[list(stata_names.values())]
assert not dta.duplicated().any()
dta.to_stata(rf"{DATA_PROC}\IPUMS_country_IDs.dta", write_index=False)

# Droughts, con resolución original

In [None]:
import utils
# Destination of the yearly drought-indicator dataset written at the end of this section.
droughts_path = rf"{DATA_OUT}\ERA5_droughts_yearly.nc"

In [None]:
# NOTE: a cache guard (`if not os.path.exists(droughts_path):`) is disabled
# here, so the yearly file is rebuilt every time this cell runs.
print("Preparing droughts dataset...")

# Open the monthly SPI/SPEI dataset lazily, in 1000x1000 spatial chunks.
monthly_spi_path = rf"{DATA_OUT}\ERA5_monthly_1970-2021_SPI-SPEI.nc"
era5 = xr.open_dataset(monthly_spi_path, chunks={'latitude': 1000, 'longitude': 1000})

# Use x/y as dimension names, then shift x, which runs from 0 to 360.
# FIXME (original author): unsure whether the conversion helper behaves
# correctly -- to be verified.
era5 = utils.coordinates_from_0_360_to_180_180(
    era5.rename({'latitude': 'y', 'longitude': 'x'})
)

# Yearly droughts: the minimum of each index within every calendar year.
spi_yearly = era5.groupby("time.year").min()
yearly_spi_path = rf"{DATA_PROC}\ERA5_yearly_1970-2021_SPI-SPEI.nc"
with ProgressBar():
    spi_yearly.to_netcdf(yearly_spi_path)

In [None]:
spi_yearly = xr.open_dataset(
    rf"{DATA_PROC}\ERA5_yearly_1970-2021_SPI-SPEI.nc",
    chunks={"x": 900, "y": 1800},
)

# The index variables are the ones with a hyphen in their name.
spi_spei_vars = [v for v in spi_yearly.data_vars if "-" in v]

# Name suffix -> threshold in standard deviations.
sd_thresholds = {"10": 1.0, "15": 1.5, "20": 2.0, "25": 2.5}

# One boolean variable per (index, threshold): True where the yearly minimum
# lies below minus the threshold.
for var in spi_spei_vars:
    for label, threshold in sd_thresholds.items():
        below_threshold = spi_yearly[var] < -threshold
        spi_yearly[f"drought_{var}_{label}sd"] = below_threshold.astype("bool")

# Keep only the indicators and shorten their names
# (drop the "drought_" prefix and the hyphens).
drought_vars = [v for v in spi_yearly.data_vars if "drought" in v]
short_names = {v: v.replace("drought_", "").replace("-", "") for v in drought_vars}
spi_yearly = spi_yearly[drought_vars].rename(short_names)

with ProgressBar():
    spi_yearly.drop_duplicates(dim="x").to_netcdf(droughts_path)

# Validación de IDS

In [None]:
# Reload the IPUMS shapes table that was saved earlier as a feather file.
id_IPUMS_shp = gpd.read_feather(rf"{DATA_PROC}\IPUMS_country_IDs.feather")

In [None]:
# Rename the shapefile key columns to the names used by the IPUMS data files
# and keep a single row per (country, geolev1, geolev2) combination.
ipums_key_names = {"CNTRY_CODE": "country", "GEOLEVEL1": "geolev1", "GEOLEVEL2": "geolev2"}
id_IPUMS_shp = id_IPUMS_shp.rename(columns=ipums_key_names)
id_IPUMS_shp = id_IPUMS_shp.drop_duplicates(subset=list(ipums_key_names.values()))
id_IPUMS_shp

In [None]:
# Read every Stata file in the IPUMS_ids folder (the loop variable suggests
# one file per continent) and stack them into a single table.
path = r"D:\World Bank\CLIENT v2\Data\Data_proc\IPUMS_ids"

continentes = os.listdir(path)
dfs = [
    pd.read_stata(os.path.join(path, continente), convert_categoricals=False)
    for continente in continentes
]
id_IPUMS_data = pd.concat(dfs)

# Show the fully populated rows whose geolev2 code appears more than once.
repeated_geolev2 = id_IPUMS_data.duplicated(subset="geolev2", keep=False)
id_IPUMS_data[repeated_geolev2].dropna()
# id_IPUMS_data = id_IPUMS_data.fillna(0).drop_duplicates()


# id_IPUMS_data.loc[id_IPUMS_data["geolev2"]==0, "geolev2"] = id_IPUMS_data.loc[id_IPUMS_data["geolev2"]==0, "geolev1"]
# id_IPUMS_data = id_IPUMS_data.fillna(0).drop_duplicates()

# id_IPUMS_data.loc[id_IPUMS_data["geolev2"]!=0, "geolev1"] = id_IPUMS_data.loc[id_IPUMS_data["geolev2"]!=0, "geolev2"].astype(str).str.zfill(9).str[:6].astype(float) 
# id_IPUMS_data = id_IPUMS_data.fillna(0).drop_duplicates()

# id_IPUMS_data = id_IPUMS_data[id_IPUMS_data.geolev1 != 0] # Drop unavailable data
# id_IPUMS_data = id_IPUMS_data[id_IPUMS_data.geolev2 != 888888888] # Drop unavailable data


In [None]:
# Ad-hoc check: open a single shapefile (geo2_mu1990_2011) and look up one GEOLEVEL2 code.
# NOTE(review): the path points into a user's Downloads folder, so this cell only runs on that machine.
test = gpd.read_file(r"C:\Users\ofici\Downloads\geo2_mu1990_2011\geo2_mu1990_2011.shp")
# GEOLEVEL2 is compared against a string here.
test[test["GEOLEVEL2"]=="480014002"]

In [None]:
# Show the rows whose geolev2 code appears more than once (rows with any missing value are left out).
id_IPUMS_data[id_IPUMS_data.duplicated(subset="geolev2", keep=False)].dropna()#.to_html()

In [None]:
# Where the shape has geolev2 == 0, use its geolev1 code as geolev2.
# NOTE(review): the comparison is numeric; confirm the dtype of geolev2 in
# the shapes table.
no_geolev2 = id_IPUMS_shp["geolev2"] == 0
id_IPUMS_shp.loc[no_geolev2, "geolev2"] = id_IPUMS_shp.loc[no_geolev2, "geolev1"]


In [None]:

# Outer-join shapes and data on (country, geolev2); the `_merge` indicator
# tells whether each key was found on the left, the right, or both sides.
m = id_IPUMS_shp.merge(
    id_IPUMS_data,
    how="outer",
    on=["country", "geolev2"],
    indicator=True,
)
m["_merge"].value_counts()
# m[m._merge == "both"].explore()

In [None]:
# Keys present on only one side of the merge, counted per country and side.
unmatched = m["_merge"] != "both"
errores = m[unmatched]
pd.crosstab(errores["country"], errores["_merge"])

In [None]:
# Unmatched keys for country code 170, ordered by administrative level.
errores.loc[errores["country"] == 170].sort_values(by=["geolev1", "geolev2"])

In [None]:
# Look up a single geolev2 code among the unmatched keys.
errores.loc[errores["geolev2"] == 170005002]

# Huracanes, con resolución completa

In [None]:
## Fixed parameters

# Saffir-Simpson Hurricane Wind Scale: minimum wind, in knots, of each
# category (https://www.nhc.noaa.gov/aboutsshws.php).
# Listed from the strongest category down: convert_wind_to_sshws returns the
# first category whose minimum is reached, so this order matters.
sshws_min_wind = {5: 137, 4: 113, 3: 96, 2: 83, 1: 64}

# Measurement code of each agency's wind field (1, 2, 3 or 10).
# NOTE(review): presumably the averaging period in minutes -- confirm against
# the IBTrACS documentation.
agency_measurements = {
    "USA": 1,
    "TOK": 3,
    "CMA": 2,
    "HKO": 10,
    "KMA": 10,
    "NEW": 3,
    "REU": 10,
    "BOM": 10,
    "NAD": 10,
    "WEL": 10,
    "DS8": 1,
    "TD6": 1,
    "TD5": 1,
    "NEU": 1,
    "MLC": 1,
}

# Multiplier that brings each measurement code to the 1-m reference;
# every non-trivial factor is the mean of two ratios.
conversion_factor_to_1m = {
    1: 1,
    2: (1.22/1.15+1.17/1.11)/2,
    3: (1.22/1.12+1.17/1.09)/2,
    10: (1.22/1.06+1.17/1.05)/2,
}

# Agency -> multiplier, resolved through the agency's measurement code.
agency_1m_conversion_factor = {
    agency: conversion_factor_to_1m[code]
    for agency, code in agency_measurements.items()
}


## Functions
def convert_wind_to_1m(wind, agency):
    """Scale `wind` by the 1-m conversion factor of `agency`.

    Agencies without a known factor leave `wind` untouched.
    """
    factor = agency_1m_conversion_factor.get(agency)
    if factor is None:
        return wind
    return wind * factor


def convert_wind_to_sshws(wind):
    """Return the Saffir-Simpson category (1-5) reached by `wind` in knots.

    Returns 0 when no category minimum is reached; NaN also yields 0 because
    every comparison with NaN is False.
    """
    return next(
        (cat for cat, min_wind in sshws_min_wind.items() if wind >= min_wind),
        0,
    )



In [None]:
import geopandas as gpd

# Load the IBTrACS best-track line segments.
gdf = gpd.read_file(r"D:\Datasets\International Best Track Archive for Climate Stewardship (IBTrACS)\IBTrACS.ALL.list.v04r01.lines.shp")

# Fill the gaps in WMO_WIND with the previous value.
# NOTE(review): the fill runs over the whole table, so it can carry a value
# from one storm into the next; WMO_WIND is not used below in this cell.
gdf["WMO_WIND"] = gdf["WMO_WIND"].ffill()

# Keep data from 1970 onwards. The explicit copy makes `gdf` an independent
# table, so the column assignments below do not write into a slice of the
# original frame (SettingWithCopyWarning).
gdf["year"] = gdf["ISO_TIME"].str.split("-").str[0].astype(int)
gdf = gdf[gdf["year"] >= 1970].copy()

# Convert each agency's wind to the common 1-m reference.
wind_cols = [col for col in gdf.columns if "_WIND" in col and "WMO_WIND" not in col]
agencies = [col.replace("_WIND", "") for col in wind_cols]

for col in tqdm(wind_cols):
    agency = col.split("_")[0]
    factor = agency_1m_conversion_factor.get(agency)
    if factor is not None:
        # Whole-column multiplication; agencies without a factor stay as-is,
        # exactly like convert_wind_to_1m does element by element.
        gdf[col] = gdf[col] * factor

# Strongest wind reported by any agency for each segment.
gdf["wind_speed"] = gdf[wind_cols].max(axis=1)

# Convert wind to SSHWS category
gdf["category"] = gdf["wind_speed"].apply(convert_wind_to_sshws)

In [None]:
import warnings
# Silence UserWarnings for the rest of the session.
# NOTE(review): presumably aimed at the warning raised when buffering
# geometries in a geographic CRS -- confirm.
warnings.filterwarnings("ignore", category=UserWarning)

buffers = [0.1, 0.25, 0.50, 1.00]  # buffer widths, in the units of the track CRS
categories = [3, 4, 5]             # SSHWS categories exported as shocks
years = gdf["year"].unique()

shocks_dir = rf"{DATA_PROC}\shocks_by_grid"
os.makedirs(shocks_dir, exist_ok=True)  # to_netcdf does not create folders

paths_by_year = {}
for year in tqdm(years, leave=False):
    paths_by_year[year] = []

    # Tracks of the year; this does not depend on the buffer, so it is done
    # once per year. Sorted by ascending wind so that the strongest track is
    # burned last and is the one kept where footprints overlap.
    tracks_year = (
        gdf.loc[gdf["year"] == year, ["wind_speed", "geometry"]]
        .fillna(0)
        .sort_values("wind_speed", ascending=True)
    )

    for buffer in buffers:
        ### Create a raster map based on the buffered best-track of the hurricane
        gdf_year = tracks_year.copy()
        gdf_year["geometry"] = gdf_year.geometry.buffer(buffer)

        raster = make_geocube(
            vector_data=gdf_year,
            like=population,
        )
        raster = raster.assign_coords({"year": year})

        # The original loop had a stray `break` at the end of its body, so
        # only category 3 was ever written; every listed category is exported.
        for category in categories:
            ### Boolean raster: True where the rasterized wind speed reaches
            ###     the minimum of the category
            varname = f"category_{category}_b{int(buffer*100)}"
            raster_b = xr.where(
                raster.rename({"wind_speed": varname})[varname] >= sshws_min_wind[category],
                True,
                False
            )
            raster_path = rf"{shocks_dir}\hurricanes_{year}_{varname}.nc"
            raster_b.to_netcdf(raster_path, encoding={varname: {"zlib": True, "complevel": 7}})
            paths_by_year[year].append(raster_path)


In [None]:
from dask.distributed import Client
# Start a dask.distributed client with a 7GB memory limit
# (a per-worker limit according to the dask documentation -- confirm), then display it.
client = Client(memory_limit='7GB')
client

In [None]:
# Display the dask client again.
client

In [None]:
# Inspect the on-disk chunk sizes recorded for one hurricane raster.
# NOTE(review): `files_year` is only assigned in the following cell, so on a fresh kernel this line raises NameError.
xr.open_dataset(files_year[0])["category_3_b10"].encoding["chunksizes"]

In [None]:
### Compile all the yearly hurricane rasters into a single dataset
path = rf"{DATA_PROC}\shocks_by_grid"
files = [f for f in os.listdir(path) if "hurricanes_" in f and f.endswith(".nc")]

dss = []
for year in range(1970, 2021):
    # Match the exact "hurricanes_<year>_" token rather than any occurrence of
    # the digits, and sort: os.listdir returns files in arbitrary order.
    year_token = f"hurricanes_{year}_"
    files_year = sorted(os.path.join(path, f) for f in files if year_token in f)
    if not files_year:
        # open_mfdataset would fail here with a much less specific message.
        raise FileNotFoundError(f"No hurricane rasters found for {year} in {path}")

    ds = xr.open_mfdataset(files_year, parallel=True, chunks="auto")
    ds = ds.assign_coords({"year": year})
    dss.append(ds)

ds = xr.concat(dss, dim="year")
# Same target file as before, now built from the DATA_OUT constant.
ds.to_netcdf(rf"{DATA_OUT}\IBTrACS_hurricanes_yearly.nc")

In [None]:
# NOTE(review): duplicates the export at the end of the previous cell (same target file).
ds.to_netcdf(r"D:\World Bank\CLIENT v2\Data\Data_out\IBTrACS_hurricanes_yearly.nc")

In [None]:
# Visualizer: overlay one storm's best track on the rasterized shock footprint
year = 2020
storm_name = "BELNA"
hurr_by_name = gdf[(gdf.NAME == storm_name) & (gdf.year == year)]

# `hurr_proc` was never defined in this notebook (NameError on a fresh
# kernel); load the compiled yearly dataset exported above.
hurr_proc = xr.open_dataset(rf"{DATA_OUT}\IBTrACS_hurricanes_yearly.nc")

# NOTE(review): the original plotted "category_1_b10", but only categories
# 3-5 are rasterized in this notebook, so that variable is never written.
# Category 3 with the 0.1 buffer is used instead -- adjust if needed.
shock_var = "category_3_b10"

fig, ax = plt.subplots()
hurr_by_name.plot(column="category", legend=True, ax=ax)

# y is sliced from max to min, i.e. the grid is expected to store y in
# descending order.
xmin, ymin, xmax, ymax = hurr_by_name.total_bounds
hurr_proc.sel(year=year, x=slice(xmin, xmax), y=slice(ymax, ymin))[shock_var].plot(ax=ax, cmap="Greys", alpha=0.5)
ax.set_title(f"{storm_name} ({year}) - {shock_var}");

# Heatwaves y Coldwaves, resolución original (ERA5)

In [None]:
# Months that are zeroed out later, when the day counts are masked by
# hemisphere: "poslat" applies where lat > 0 and "neglat" where lat < 0.
_ALL_BUT_DEC_JAN_FEB = [3, 4, 5, 6, 7, 8, 9, 10, 11]
_ALL_BUT_JUN_JUL_AUG = [1, 2, 3, 4, 5, 9, 10, 11, 12]


def _cckp_shock(code, name, cold):
    """Return the (file name, parameters) pair of one CCKP monthly indicator.

    `code` is the indicator code used in the file and variable names, `name`
    the short name used for the output files, and `cold` selects the cold
    configuration (csdi spell index) instead of the warm one (wsdi).
    """
    variable = f"timeseries-{code}-monthly-mean"
    file = f"{variable}_era_monthly_era5-0.5x0.5-timeseries_mean_1950-2020.nc"
    params = {
        "name": name,
        "variable": variable,
        "poslat": list(_ALL_BUT_DEC_JAN_FEB if cold else _ALL_BUT_JUN_JUL_AUG),
        "neglat": list(_ALL_BUT_JUN_JUL_AUG if cold else _ALL_BUT_DEC_JAN_FEB),
        "spell_index": "csdi" if cold else "wsdi",
    }
    return file, params


# Source file -> parameters, for every indicator processed below.
shocks = dict([
    _cckp_shock("fd", "frostdays", cold=True),
    _cckp_shock("id", "icedays", cold=True),
    _cckp_shock("hd35", "heatdays35", cold=False),
    _cckp_shock("hd40", "heatdays40", cold=False),
])

waves_dir = rf"{DATA_PROC}\heatwaves and coldwaves"
os.makedirs(waves_dir, exist_ok=True)  # to_netcdf does not create folders

tasks = []
for file, params in shocks.items():

    ds = xr.open_dataset(rf"{DATA_RAW}\ERA5_CCKP\{file}", chunks="auto")
    name = params["name"]
    var = params["variable"]

    ds = ds.sel(bnds=0) # both bands are the same
    ds = ds.drop_vars(["lon_bnds", "lat_bnds", "bnds"]) # drop the bounds helper variables
    # The variable is divided by one day and stored as an integer count.
    # NOTE(review): astype(int) turns missing values into arbitrary integers;
    # confirm the files have no missing cells.
    ds[var] =  (ds[var] / np.timedelta64(1, 'D')).astype(int)

    # 1) heatwaves/coldwaves are rare events, n° days > 2/2.5 std
    #    Anomalies are standardized against the mean and std of the same
    #    calendar month.
    stand_anomalies = xr.apply_ufunc(
        lambda x, m, s: (x - m) / s,
        ds.groupby("time.month"),
        ds.groupby("time.month").mean("time"),
        ds.groupby("time.month").std("time"),
        dask="parallelized",
    )

    tasks += [stand_anomalies.to_netcdf(rf"{waves_dir}\{name}_standardized.nc", compute=False)]

    ds_more_than_2_std = (stand_anomalies > 2)
    ds_more_than_25_std = (stand_anomalies > 2.5)

    tasks += [ds_more_than_2_std.to_netcdf(rf"{waves_dir}\{name}_more_than_2_std.nc", compute=False)]
    tasks += [ds_more_than_25_std.to_netcdf(rf"{waves_dir}\{name}_more_than_25_std.nc", compute=False)]

    # 2)  Heatwaves/Coldwaves only occur in summer/winter months:
    #     zero the listed months in each hemisphere.
    ds[var] = xr.where(
        ds.time.dt.month.isin(params["poslat"]) & (ds.lat > 0),
        0,
        ds[var],
    )
    ds[var] = xr.where(
        ds.time.dt.month.isin(params["neglat"]) & (ds.lat < 0),
        0,
        ds[var],
    )

    # 3) More than 6 hotdays/coldays
    more_than_6_days = (ds[var] >= 6).astype(bool)
    tasks += [more_than_6_days.to_netcdf(rf"{waves_dir}\{name}_more_than_6.nc", compute=False)]

# The writes above are delayed (compute=False). Run them now: the next cell
# resets `tasks` and re-opens these files, so they must be complete on disk
# before it starts.
print(len(tasks))
dask.compute(*tasks)

In [None]:
# Compute yearly values.
# Inputs are the monthly NetCDF files of each indicator, which must already
# be written to disk when this cell runs.
tasks = []
waves_dir = rf"{DATA_PROC}\heatwaves and coldwaves"
cckp_dir = rf"{DATA_RAW}\ERA5_CCKP"

for file, params in shocks.items():
    name, var, spell_index = params["name"], params["variable"], params["spell_index"]

    # (2+3) In-season months with at least 6 event days
    more_than_6_days = xr.open_dataset(rf"{waves_dir}\{name}_more_than_6.nc", chunks={"time":60})

    # WSDI/CSDI (Warm/Cold Spell Duration Index): True where the annual
    # index is at least 6.
    sdi_var = f"timeseries-{spell_index}-annual-mean"
    sdi = xr.open_dataset(
        rf"{cckp_dir}\{sdi_var}_era_annual_era5-0.5x0.5-timeseries_mean_1950-2020.nc",
        chunks={"year":5},
    )
    sdi = sdi.sel(bnds=0) # both bands are the same
    sdi = sdi.drop_vars(["lon_bnds", "lat_bnds", "bnds"]) # drop the bounds helper variables
    sdi[sdi_var] = (sdi[sdi_var] / np.timedelta64(1, 'D')).astype(int)
    sdi = (sdi[sdi_var] >= 6)

    for level in ["2", "25"]:
        # (1) Rare events: monthly anomaly above 2 / 2.5 standard deviations
        rare_months = xr.open_dataset(rf"{waves_dir}\{name}_more_than_{level}_std.nc", chunks={"time":60})

        # (1+2+3) all together, then the yearly maximum: True when any month
        # of the year satisfies every condition
        wave_months = more_than_6_days[var] * rare_months[var]
        wave_years = wave_months.groupby('time.year').max()

        # Final computation: also require the spell-duration condition
        waves = wave_years * sdi
        tasks += [waves.to_netcdf(rf"{waves_dir}\{name}_annual_{level}_std.nc", compute=False)]

In [None]:
# Run every queued NetCDF write; `tasks` holds the delayed objects returned by to_netcdf(compute=False).
print(len(tasks))
dask.compute(*tasks)

In [None]:
# Output variable name -> stem of the yearly file it comes from.
hw_sources = {
    "hw3520": "heatdays35_annual_2_std",
    "hw3525": "heatdays35_annual_25_std",
    "hw4020": "heatdays40_annual_2_std",
    "hw4025": "heatdays40_annual_25_std",
}

# Each file holds one unnamed array (xarray's default variable name), which
# is renamed to its output name.
hws = {
    name: xr.open_dataset(rf"{DATA_PROC}\heatwaves and coldwaves\{stem}.nc")
            .rename({"__xarray_dataarray_variable__": name})
    for name, stem in hw_sources.items()
}

# Merge into one dataset, switch to x/y names and order x ascending and
# y descending.
ds = xr.combine_by_coords(list(hws.values()))
ds = ds.rename({"lat":"y", "lon":"x"})
ds = ds.sortby(ds.x)
ds = ds.sortby(ds.y, ascending=False)
ds.to_netcdf(rf"{DATA_OUT}\CCKP_heatwaves_yearly.nc")

In [None]:
# Output variable name -> stem of the yearly file it comes from.
cw_sources = {
    "fd20": "frostdays_annual_2_std",
    "fd25": "frostdays_annual_25_std",
    "id20": "icedays_annual_2_std",
    "id25": "icedays_annual_25_std",
}

# Each file holds one unnamed array (xarray's default variable name), which
# is renamed to its output name.
cws = {
    name: xr.open_dataset(rf"{DATA_PROC}\heatwaves and coldwaves\{stem}.nc")
            .rename({"__xarray_dataarray_variable__": name})
    for name, stem in cw_sources.items()
}

# Merge into one dataset, switch to x/y names and order x ascending and
# y descending.
ds = xr.combine_by_coords(list(cws.values()))
ds = ds.rename({"lat":"y", "lon":"x"})
ds = ds.sortby(ds.x)
ds = ds.sortby(ds.y, ascending=False)
ds.to_netcdf(rf"{DATA_OUT}\CCKP_coldwaves_yearly.nc")

# Intense Rain, resolución original (ERA5)

In [None]:
# Rainfall thresholds applied to both indicators.
# NOTE(review): presumably millimetres -- confirm against the CCKP metadata.
rain_thresholds = [100, 200, 300, 400, 500, 600]

shocks_by_year = []
for shock in ["rx1day", "rx5day"]:
    rain_var = f"timeseries-{shock}-monthly-mean"
    ds = xr.open_dataset(
        rf"{DATA_RAW}\ERA5_CCKP\{rain_var}_era_monthly_era5-0.5x0.5-timeseries_mean_1950-2020.nc",
        chunks={"time": 60},
    )
    ds = ds.sel(bnds=0).drop_vars(["lon_bnds", "lat_bnds", "bnds"]) # both bands are the same

    # One yearly boolean array per threshold: True when any month of the
    # year reaches the threshold.
    shocks_by_year.extend(
        (ds[rain_var] >= rainfall).rename(f"{shock}_{rainfall}").groupby('time.year').max()
        for rainfall in rain_thresholds
    )

# Merge everything, switch to x/y names and order x ascending, y descending.
full_ds = xr.combine_by_coords(shocks_by_year)
full_ds = full_ds.rename({"lat":"y", "lon":"x"})
full_ds = full_ds.sortby(full_ds.x)
full_ds = full_ds.sortby(full_ds.y, ascending=False)
full_ds.to_netcdf(rf"{DATA_OUT}\CCKP_intenserain_yearly.nc")