# Data Processing
## Purpose
State the purpose of the notebook.
## Methodology
Briefly describe the assumptions and processing steps.
## WIP - improvements
Use this section only if the notebook is not final.

Notable TODOs:

- Todo 1;
- Todo 2;

## Results
Describe and comment the most important results.

# Setup
## Library import
We import all the required Python libraries

In [None]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
import rioxarray
import rasterio
import regionmask
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

## Utils

**set_lat_lon_attrs**

In [None]:
def set_lat_lon_attrs(ds):
    """Attach CF-style metadata to the ``lon`` and ``lat`` coordinates.

    Parameters
    ----------
    ds : object
        Anything exposing ``ds.lon`` / ``ds.lat`` (each with an
        ``assign_attrs`` method) and supporting ``ds[name] = value``;
        presumably an xarray Dataset or DataArray.

    Returns
    -------
    object
        The same ``ds``, with ``ds["lon"]`` and ``ds["lat"]`` reassigned to
        the attribute-carrying coordinates.
    """
    # NOTE(review): the longitude range below (-180 .. 360) covers both the
    # -180..180 and the 0..360 conventions -- confirm this is intended.
    coordinate_attrs = {
        "lon": {
            'axis': 'X',
            'long_name': 'longitude',
            'standard_name': 'longitude',
            'stored_direction': 'increasing',
            'type': 'double',
            'units': 'degrees_east',
            'valid_max': 360.0,
            'valid_min': -180.0,
        },
        "lat": {
            'axis': 'Y',
            'long_name': 'latitude',
            'standard_name': 'latitude',
            'stored_direction': 'increasing',
            'type': 'double',
            'units': 'degrees_north',
            'valid_max': 90.0,
            'valid_min': -90.0,
        },
    }

    # Longitude first, then latitude (dicts preserve insertion order).
    for coord_name, attrs in coordinate_attrs.items():
        ds[coord_name] = getattr(ds, coord_name).assign_attrs(attrs)
    return ds

**create_ds_mask**

In [None]:
def create_ds_mask(df, ds, name, lon_name='lon', lat_name='lat'):
    """Rasterise the geometries of ``df`` onto the grid of ``ds``.

    Parameters
    ----------
    df : table with a ``geometry`` column
        Presumably a GeoDataFrame. If it has no ``index`` column, one is
        created from a fresh 0..n-1 index and used as the region number.
    ds : gridded object passed to ``regionmask.Regions.mask``.
    name : str
        Name given to the regions and to the resulting mask.
    lon_name, lat_name : str
        Coordinate names forwarded to ``regionmask.Regions.mask``.

    Returns
    -------
    tuple
        ``(da_mask, id_ints_not_in_mask)``: the mask (its ``id_ints``
        attribute holds the sorted region numbers actually present in it)
        and the array of region numbers that did not end up in the mask.
    """
    # Guarantee an integer 'index' column to serve as region number.
    if 'index' not in df:
        df = df.reset_index(drop=True).reset_index()

    region_numbers = df['index'].values
    region_outlines = df['geometry'].values

    print(f'Number of indexes: {len(region_numbers)}')
    print(f'Number of geoms: {len(region_outlines)}')

    # Build the regions and burn them onto the grid of ``ds``.
    regions = regionmask.Regions(
        name=name,
        numbers=region_numbers,
        outlines=region_outlines,
    )
    da_mask = regions.mask(ds, lon_name=lon_name, lat_name=lat_name).rename(name)

    # Region numbers that were actually written to at least one grid cell.
    written_numbers = da_mask.to_dataframe().dropna()[name].unique()
    written_numbers = np.sort(written_numbers).astype('int')

    print(f'Number of ints in mask: {len(written_numbers)}')

    # Region numbers that never made it into the mask.
    was_written = df['index'].isin(written_numbers)
    missing_numbers = df.loc[~was_written, 'index'].values

    if len(missing_numbers) > 0:
        print(f'Ints not in mask: {missing_numbers}')

    # Record the written numbers on the mask and add the lon/lat metadata.
    da_mask.attrs['id_ints'] = written_numbers
    da_mask = set_lat_lon_attrs(da_mask)

    return da_mask, missing_numbers

**find_nearest**

In [None]:
def find_nearest(array, value):
    """Return the indices of the entries of ``array`` closest to ``value``.

    The match is deliberately tolerant: every entry whose absolute distance
    to ``value`` is within one grid step of the smallest distance is
    reported, so several indices are usually returned.

    Parameters
    ----------
    array : array_like
        Values to search (any dimensionality >= 1).
    value : scalar
        Target value.

    Returns
    -------
    numpy.ndarray
        Result of ``np.argwhere``: one row per match, one column per
        dimension of ``array``.

    Raises
    ------
    ValueError
        If ``array`` is empty (raised by the NumPy reduction).
    """
    array = np.asarray(array)

    # Tolerance: the largest absolute spacing between neighbours along the
    # last axis. With a single element along that axis there is no spacing,
    # so fall back to an exact match instead of failing on an empty max().
    spacing = np.abs(np.diff(array))
    step = spacing.max() if spacing.size else 0

    # Keep everything within one step of the best distance. A lower bound
    # (diff >= min - step) would always hold for signed/float data, so it is
    # omitted; that also avoids wrap-around on unsigned dtypes.
    diff = np.abs(array - value)
    idx = np.argwhere(diff <= np.amin(diff) + step)

    return idx

**get_xy_from_latlon**

In [None]:
def get_xy_from_latlon(ds, lat, lon):
    """Locate the grid cell of ``ds`` nearest to a latitude/longitude pair.

    Parameters
    ----------
    ds : object exposing ``lon``, ``lat``, ``rlon`` and ``rlat`` (each with
        a ``.data`` array). Assumes ``lon``/``lat`` are 2-D (y, x) arrays --
        the index rows are reshaped to two columns below; TODO confirm.
    lat, lon : scalar
        Target latitude and longitude.

    Returns
    -------
    tuple
        ``(x_position, y_position, x, y)``: the integer positions along
        ``rlon``/``rlat`` and the ``rlon``/``rlat`` values found there.

    Raises
    ------
    Exception
        If no grid cell is a candidate for both the longitude and the
        latitude.
    """
    lon_values = ds.lon.data
    lat_values = ds.lat.data

    # Candidate index rows for each coordinate separately.
    lon_candidates = find_nearest(lon_values, lon)
    lat_candidates = find_nearest(lat_values, lat)

    # Keep the longitude candidates that are also latitude candidates.
    in_both = (lon_candidates[:, None] == lat_candidates).all(-1).any(-1)
    yx_positions = lon_candidates[in_both]

    n_matches = yx_positions.shape[0]
    if n_matches == 0:
        raise Exception("Sorry, lat/lon values outside data domain")
    if n_matches > 1:
        # Several cells qualify: collapse them to their (truncated) mean index.
        yx_positions = np.mean(yx_positions, axis=0).astype(int).reshape(1, 2)

    # Index rows are stored as (y, x).
    best = yx_positions[0]
    y_position = best[0]
    x_position = best[1]

    return x_position, y_position, ds.rlon.data[x_position], ds.rlat.data[y_position]

# Data import

## Fenómenos climáticos extremos
- ### Fire danger indicators
    **[Data source](https://cds.climate.copernicus.eu/cdsapp#!/dataset/sis-tourism-fire-danger-indicators?tab=overview)**

    The dataset presents projections of fire danger indicators for Europe based upon the Canadian Fire Weather Index System (FWI) under future climate conditions. The FWI is a meteorologically based index used worldwide to estimate the fire danger and is implemented in the Global ECMWF Fire Forecasting model (GEFF).

    **Variables:**
    - **Seasonal fire weather index:** 
    The mean fire weather index value over the European fire season (June-September). This is calculated as the sum of the daily fire weather index over the European fire season divided by the total number of days within this date range. The higher the index value, the more favorable the meteorological conditions to trigger a wildfire are.
   
- ### Bioclimatic indicators 
    **[Data source](https://cds.climate.copernicus.eu/cdsapp#!/dataset/sis-biodiversity-cmip5-regional?tab=overview)**

    The dataset provides bioclimatic indicators derived from CMIP5 climate projections at 1 km x 1 km resolution for selected regions; Europe, Northern Brazil and Central Africa. This comprehensive set of bioclimatic indicators is specifically relevant for applications within the biodiversity and ecosystem services community.

    **Variables:**
    - **Precipitation in wettest quarter (BIO16):** 
    The mean of monthly mean precipitation during the wettest quarter, defined as the quarter with the highest monthly mean (of the daily mean) precipitation using a moving average of 3 consecutive months. To compute the total precipitation sum over the quarter, a conversion factor of 3600x24x91.3 (average number of days per quarter)x1000 should be applied. This indicator corresponds to the official BIOCLIM variable BIO16.
    - **Annual precipitation (BIO12):** 
    Annual mean of the daily mean precipitation rate (both liquid and solid phases). This indicator corresponds to the official BIOCLIM variable BIO12. To compute the total precipitation sum over the year, a conversion factor should be applied of 3600x24x365x1000 (mm year-1).
    - **Aridity:** 
    Monthly potential evaporation divided by the monthly mean precipitation, averaged over the year.
    - **Dry spells:** 
    Maximum number of consecutive days of the dry spells within a year.
    - **Koeppen-Geiger class:** 
    A climate classification that divides worldwide climates into separate classes depending on temperature and precipitation thresholds.
    
- ### Heat waves and cold spells 
    **[Data source](https://cds.climate.copernicus.eu/cdsapp#!/dataset/sis-heat-and-cold-spells?tab=overview)**

    The dataset contains the number of hot and cold spell days using different European-wide and national/regional definitions developed within the C3S European Health service. These heat wave and cold spell days are available for different future time periods and use different climate change scenarios.

    **Variables:**
    - **Heat wave days:** 
    Number of hot days in a year using specific definitions.
    - **Cold spell days:** 
    Number of cold days in a year using specific definitions.
    
- ### Temperature statistics 
    **[Data source](https://cds.climate.copernicus.eu/cdsapp#!/dataset/sis-temperature-statistics?tab=overview)**

    This dataset contains temperature exposure statistics for Europe (e.g. percentiles) derived from the daily 2 metre mean, minimum and maximum air temperature for the entire year, winter (DJF: December-January-February) and summer (JJA: June-July-August). These statistics were derived within the C3S European Health service and are available for different future time periods and using different climate change scenarios.

    **Variables:**
    - **Maximum temperature:** 
    Daily maximum air temperature valid for a grid cell at the height of 2m above the surface, averaged over the year or season.

### Fire danger indicators

**Read data** 

In [None]:
data_dir = '../../data/raw/climate/dataset-sis-tourism-fire-danger-indicators/'

# Build one Dataset per RCP scenario (its files concatenated along 'time'),
# then stack the two scenarios along a new 'scenario' dimension.
fire_by_scenario = []
for scenario in ['rcp45', 'rcp85']:
    # os.listdir() returns names in arbitrary order; sort them so the files
    # are concatenated along 'time' in a reproducible order.
    scenario_files = sorted(f for f in os.listdir(data_dir) if scenario in f)
    if not scenario_files:
        raise FileNotFoundError(f"No '{scenario}' files found in {data_dir}")

    # Open every file and tag it with the scenario it belongs to.
    per_file = [
        xr.open_dataset(data_dir + f, engine="netcdf4").assign_coords({"scenario": scenario})
        for f in scenario_files
    ]
    # A single file needs no concatenation.
    fire_by_scenario.append(
        per_file[0] if len(per_file) == 1 else xr.concat(per_file, dim='time')
    )

ds_fire_0, ds_fire_1 = fire_by_scenario
ds_fire = xr.concat([ds_fire_0, ds_fire_1], dim='scenario')

In [None]:
data_dir = '../../data/raw/climate/dataset-sis-tourism-fire-danger-indicators/'

# Build one Dataset per RCP scenario (its files concatenated along 'time'),
# then stack the scenarios along a new 'scenario' dimension.
fire_by_scenario = []
for scenario in ['rcp45', 'rcp85']:
    # os.listdir() returns names in arbitrary order; sort them so the files
    # are concatenated along 'time' in a reproducible order.
    scenario_files = sorted(f for f in os.listdir(data_dir) if scenario in f)
    if not scenario_files:
        # Fail loudly instead of reusing the previous scenario's dataset.
        raise FileNotFoundError(f"No '{scenario}' files found in {data_dir}")

    # Open every file and tag it with the scenario it belongs to.
    per_file = [
        xr.open_dataset(data_dir + f, engine="netcdf4").assign_coords({"scenario": scenario})
        for f in scenario_files
    ]
    # A single file needs no concatenation.
    ds_fire_sce = per_file[0] if len(per_file) == 1 else xr.concat(per_file, dim='time')
    fire_by_scenario.append(ds_fire_sce)

ds_fire = xr.concat(fire_by_scenario, dim='scenario')

In [None]:
# Display the combined (both scenarios) fire-danger dataset
ds_fire

**Clip area**

Bounding box of Spain (lon_min, lat_min, lon_max, lat_max):

In [None]:
# Geographic bounding box used to clip and to zoom the maps, in the order
# (lon_min, lat_min, lon_max, lat_max) -- presumably decimal degrees
lon_min, lat_min, lon_max, lat_max = (-9.39288367353, 35.946850084, 4.32841, 43.7483377142)

The corresponding x/y values.

In [None]:
# Hard-coded window on the rlon/rlat axes used to clip the fire dataset
# NOTE(review): presumably derived from the lon/lat bounding box (e.g. with
# get_xy_from_latlon) -- confirm how these values were obtained
x_min, y_min, x_max, y_max = (-25, -15, -10, -3)

In [None]:
# Keep only the cells whose rlon/rlat labels fall inside the x/y window
fire_window = {
    "rlon": slice(x_min, x_max),
    "rlat": slice(y_min, y_max),
}
ds_fire = ds_fire.sel(fire_window).copy()
ds_fire

**Display data**

In [None]:
# Inspect the 'fwi-mean-jjas' variable of the clipped fire dataset
ds_fire['fwi-mean-jjas']

In [None]:
# Create the figure with exactly one axes, already using the PlateCarree
# projection (a separate plt.axes() call would stack a second axes on top).
fig, ax = plt.subplots(figsize=(15, 7.5), subplot_kw={'projection': ccrs.PlateCarree()})

# Last time step of the 'rcp85' scenario, drawn on the 'lon'/'lat' coordinates
ds_fire['fwi-mean-jjas'].isel(time=-1).sel(scenario='rcp85').plot.pcolormesh(
    ax=ax, cmap='magma', transform=ccrs.PlateCarree(), x='lon', y='lat', add_colorbar=True)
ax.coastlines()

# Zoom to the bounding box, padded by 1 on every side
ax.set_ylim([lat_min-1, lat_max+1]);
ax.set_xlim([lon_min-1, lon_max+1]);

### Bioclimatic indicators

Customized bioclimatic indicators were calculated on raw daily and monthly climate data timeseries per year. 
Monthly, annual and 20-year window statistics (for 1961-1980, 1981-2000, 2021-2040, 2041-2060, 2061-2080, 2081-2100) were calculated. Also median and spread (inter-quartile range) were calculated for the CMIP multi-model ensemble
for each RCP scenario.

    `1979-01-01T00:00:00.000000000` -> 1970-1989 
    `1989-01-01T00:00:00.000000000` -> 1980-1999
    `2009-01-01T00:00:00.000000000` -> 2001-2020 
    `2030-01-01T00:00:00.000000000` -> 2021-2040
    `2050-01-01T00:00:00.000000000` -> 2041-2060 
    `2070-01-01T00:00:00.000000000` -> 2061-2080
    `2090-01-01T00:00:00.000000000` -> 2081-2100
    
**Read data** 

In [None]:
# List the bioclimatic files available for one scenario
data_dir = '../../data/raw/climate/dataset-sis-biodiversity-cmip5-regional/'
scenario = 'rcp45'
[fname for fname in os.listdir(data_dir) if scenario in fname]

In [None]:
data_dir = '../../data/raw/climate/dataset-sis-biodiversity-cmip5-regional/'
dry_spells_var = 'dry-spells_maximum-length'

# Build one Dataset per RCP scenario (one data variable per file), then stack
# the scenarios along a new 'scenario' dimension.
for ns, scenario in enumerate(['rcp45', 'rcp85']):
    print(scenario)
    # NOTE(review): the first three matching files are skipped, but
    # os.listdir() gives no ordering guarantee, so which files are dropped
    # depends on the filesystem -- confirm which files must be excluded and
    # filter them by name instead.
    scenario_files = [i for i in os.listdir(data_dir) if scenario in i][3:]
    if not scenario_files:
        # Fail loudly instead of reusing the previous scenario's dataset.
        raise FileNotFoundError(f"No usable '{scenario}' files found in {data_dir}")

    for n, file in enumerate(scenario_files):
        print(file)
        if dry_spells_var in file:
            # Remove the 'units' attribute before CF decoding so the dry-spell
            # lengths are read as plain numbers and not as timedelta64.
            # Done for every position in the list, including the first file.
            raw = xr.open_dataset(data_dir+file, engine="netcdf4", decode_cf=False)
            del raw[dry_spells_var].attrs['units']
            ds = xr.decode_cf(raw)
        else:
            ds = xr.open_dataset(data_dir+file, engine="netcdf4")

        if n == 0:
            # The first file provides the base Dataset
            ds_bio_sce = ds
        else:
            # Later files contribute their last data variable only
            var_name = list(ds.keys())[-1]
            ds_bio_sce = ds_bio_sce.assign({var_name: ds[var_name]})

    # Tag the scenario once per scenario, so that a scenario made of a single
    # file is labelled as well.
    ds_bio_sce = ds_bio_sce.assign_coords({"scenario": scenario})

    if ns == 0:
        ds_bio = ds_bio_sce.copy()
    else:
        ds_bio = xr.concat([ds_bio, ds_bio_sce.copy()], dim='scenario')

In [None]:
# Display the combined (both scenarios) bioclimatic dataset
ds_bio

**Clip area**

In [None]:
# Keep only the cells inside the lon/lat bounding box
# NOTE(review): slice(lat_min, lat_max) selects nothing when 'latitude' is
# stored in descending order -- confirm the coordinate is ascending
bio_window = {
    "longitude": slice(lon_min, lon_max),
    "latitude": slice(lat_min, lat_max),
}
ds_bio = ds_bio.sel(bio_window).copy()
ds_bio

**Display data**

In [None]:
# Inspect the 'BIO16' variable of the clipped bioclimatic dataset
ds_bio['BIO16']

In [None]:
# Create the figure with exactly one axes, already using the PlateCarree
# projection (a separate plt.axes() call would stack a second axes on top).
fig, ax = plt.subplots(figsize=(15, 7.5), subplot_kw={'projection': ccrs.PlateCarree()})

# Last time step of the 'rcp85' scenario, drawn on 'longitude'/'latitude'
ds_bio['BIO16'].isel(time=-1).sel(scenario='rcp85').plot.pcolormesh(
    ax=ax, cmap='magma', transform=ccrs.PlateCarree(), x='longitude', y='latitude', add_colorbar=True)
ax.coastlines()

# Zoom to the bounding box, padded by 1 on every side
ax.set_ylim([lat_min-1, lat_max+1]);
ax.set_xlim([lon_min-1, lon_max+1]);

### Heat waves and cold spells 

**Read data** 

In [None]:
# Read the heat-wave / cold-spell files and stack the scenarios along a new
# 'scenario' dimension.
data_dir = '../../data/raw/climate/dataset-sis-heat-and-cold-spells/'
for ns, scenario in enumerate(['rcp45', 'rcp85']):
    # NOTE(review): ds_heat_sce is overwritten on every iteration, so only the
    # last matching file (in os.listdir order, which is arbitrary) is kept for
    # each scenario -- confirm there is a single file per scenario, otherwise
    # the files need to be merged.
    for n, file in enumerate([i for i in os.listdir(data_dir) if scenario in i]):
        ds_heat_sce = xr.open_dataset(data_dir+file, engine="netcdf4")
        # Add scenario coordinate
        ds_heat_sce = ds_heat_sce.assign_coords({"scenario": scenario})
            
    # The first scenario initialises the result; later ones are appended
    if ns == 0:
        ds_heat = ds_heat_sce.copy()
    else:
        ds_heat = xr.concat([ds_heat, ds_heat_sce.copy()], dim='scenario')

In [None]:
# Display the combined (both scenarios) heat-wave / cold-spell dataset
ds_heat

**Clip area**

In [None]:
# Keep only the cells inside the lon/lat bounding box
# NOTE(review): slice(lat_min, lat_max) selects nothing when 'lat' is stored
# in descending order -- confirm the coordinate is ascending
heat_window = {
    "lon": slice(lon_min, lon_max),
    "lat": slice(lat_min, lat_max),
}
ds_heat = ds_heat.sel(heat_window).copy()
ds_heat

**Display data**

In [None]:
# Create the figure with exactly one axes, already using the PlateCarree
# projection (a separate plt.axes() call would stack a second axes on top).
fig, ax = plt.subplots(figsize=(15, 7.5), subplot_kw={'projection': ccrs.PlateCarree()})

# Last time step of the 'rcp85' scenario, drawn on the 'lon'/'lat' coordinates
ds_heat['HWD_EU_climate'].isel(time=-1).sel(scenario='rcp85').plot.pcolormesh(
    ax=ax, cmap='magma', transform=ccrs.PlateCarree(), x='lon', y='lat', add_colorbar=True)
ax.coastlines()

# Zoom to the bounding box, padded by 1 on every side
ax.set_ylim([lat_min-1, lat_max+1]);
ax.set_xlim([lon_min-1, lon_max+1]);

### Temperature statistics 

**Read data** 

In [None]:
# Build one Dataset per RCP scenario (one data variable per file), then stack
# the scenarios along a new 'scenario' dimension.
# NOTE(review): this path starts with '../../datasets/' while the fire,
# bioclimatic and heat cells read from '../../data/' -- confirm which is right.
data_dir = '../../datasets/raw/climate/dataset-sis-temperature-statistics/'
for ns, scenario in enumerate(['rcp45', 'rcp85']):
    for n, file in enumerate([name for name in os.listdir(data_dir) if scenario in name]):
        file_path = data_dir + file

        if n == 0:
            # The first file provides the base Dataset
            ds_temp_sce = xr.open_dataset(file_path, engine="netcdf4")
        else:
            # Later files contribute their last data variable only
            ds = xr.open_dataset(file_path, engine="netcdf4")
            last_var = list(ds.keys())[-1]
            ds_temp_sce = ds_temp_sce.assign({last_var: ds[last_var]})

        # Tag the dataset with the scenario it belongs to
        ds_temp_sce = ds_temp_sce.assign_coords({"scenario": scenario})

    # The first scenario initialises the result; later ones are appended
    ds_temp = (
        ds_temp_sce.copy()
        if ns == 0
        else xr.concat([ds_temp, ds_temp_sce.copy()], dim='scenario')
    )

In [None]:
# Display the combined (both scenarios) temperature-statistics dataset
ds_temp

**Clip area**

In [None]:
# Keep only the cells inside the lon/lat bounding box
# NOTE(review): slice(lat_min, lat_max) selects nothing when 'lat' is stored
# in descending order -- confirm the coordinate is ascending
temp_window = {
    "lon": slice(lon_min, lon_max),
    "lat": slice(lat_min, lat_max),
}
ds_temp = ds_temp.sel(temp_window).copy()
ds_temp

**Display data**

In [None]:
# Create the figure with exactly one axes, already using the PlateCarree
# projection (a separate plt.axes() call would stack a second axes on top).
fig, ax = plt.subplots(figsize=(15, 7.5), subplot_kw={'projection': ccrs.PlateCarree()})

# Last time step of the 'rcp85' scenario, drawn on the 'lon'/'lat' coordinates
ds_temp['mean_Tmax_Summer'].isel(time=-1).sel(scenario='rcp85').plot.pcolormesh(
    ax=ax, cmap='magma', transform=ccrs.PlateCarree(), x='lon', y='lat', add_colorbar=True)
ax.coastlines()

# Zoom to the bounding box, padded by 1 on every side
ax.set_ylim([lat_min-1, lat_max+1]);
ax.set_xlim([lon_min-1, lon_max+1]);

In [None]:
# Open one bioclimatic file on its own for inspection.
# This rebinds data_dir, file and ds_bio_sce.
# NOTE(review): this path starts with '../../datasets/' while the bioclimatic
# read cell uses '../../data/' -- confirm which is right.
data_dir = '../../datasets/raw/climate/dataset-sis-biodiversity-cmip5-regional/'
file = 'aridity_wettest-quarter_noresm1-m_rcp45_r1i1p1_1960-2099-mean_v1.0.nc'

ds_bio_sce = xr.open_dataset(f"{data_dir}{file}", engine="netcdf4")

In [None]:
# Display the single bioclimatic file opened for inspection
ds_bio_sce

In [None]:
# Restrict the time axis to the labels between 2009-01-01 and 2090-01-01
ds_bio_sce.sel(time=slice('2009-01-01T00:00:00.000000000', '2090-01-01T00:00:00.000000000'))

In [None]:
# Display the clipped fire-danger dataset
ds_fire

In [None]:
# Display the clipped bioclimatic dataset
ds_bio

In [None]:
# Display the clipped heat-wave / cold-spell dataset
ds_heat

In [None]:
# Average the temperature statistics over 20-year bins between 2000-06-01
# and 2085-06-01
# NOTE(review): loffset is given the bare integer 10 -- presumably meant to
# move the bin labels by 10 years; the argument expects a time offset and is
# deprecated/removed in recent xarray releases -- confirm against the
# installed version
ds_temp.sel(time=slice('2000-06-01T00:00:00.000000000', '2085-06-01T00:00:00.000000000')).resample(time="20Y", loffset=10).mean()

In [None]:
# Average the temperature statistics over 20-year bins of the full time axis
ds_temp.resample(time="20Y").mean()

In [None]:
# 20-year window statistics (for 1961-1980, 1981-2000, 2021-2040, 2041-2060,
# 2061-2080, 2081-2100)