# Download ERA5-Land Daily Aggregated time series averaged over an Area of Interest


## Requirements: 

1. Google Earth Engine account. Sign up [here](https://earthengine.google.com/signup/).

2. GIS file of the Area of Interest (AOI) boundaries (.shp, .gpkg, or other file readable by geopandas). 

3. Lapse rates calculated from monthly ERA5 air temperatures at varying pressure levels from [Rounce et al. (2023)](https://www.science.org/doi/10.1126/science.abo1324), downloadable from the [Carnegie Mellon repository](https://cmu.app.box.com/s/p8aiby5s9f3n6ycgmhknbgo4htk3pn9j/folder/124736593075) ("ERA5_lapserates_monthly.nc"). 

4. Digital elevation model (DEM) over the area of interest for applying lapse rates.

5. ERA5 geopotential for estimating geoid heights and applying lapse rates. Downloadable from the [ERA5-Land Documentation](https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation). See Parameter Listings and download links in Table 1. 


In [None]:
import ee
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
from tqdm.auto import tqdm
import rioxarray as rxr
import xarray as xr
import numpy as np
import sys

## Authenticate and initialize Google Earth Engine

In [None]:
# Earth Engine cloud project used to initialize the API in this notebook
project_id = 'ee-raineyaberle'
try:
    ee.Initialize(project=project_id)
except Exception:
    # Initialization failed (e.g., no stored credentials): run the interactive
    # authentication flow, then retry. `Exception` (rather than a bare except)
    # lets KeyboardInterrupt / SystemExit propagate.
    ee.Authenticate()
    ee.Initialize(project=project_id)

## Define filters, etc. for ERA5-Land querying

In [None]:
# -----Paths to input files
# Folder that holds one sub-folder per study site
study_sites_path = '/Volumes/LaCie/raineyaberle/Research/PhD/snow_cover_mapping/study-sites'
# Root folder of this code package
code_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/glacier-snow-cover-analysis/'
# Make the package's "scripts" folder importable, then load the utility functions
sys.path.append(os.path.join(code_path, 'scripts'))
import utils as f

# -----Date range (passed to ee.Filter.date when querying ERA5-Land)
date_start, date_end = '2012-10-01', '2023-12-01'

# -----Bands to extract from ERA5-Land
# Full band list in the GEE data catalog:
# https://developers.google.com/earth-engine/datasets/catalog/ECMWF_ERA5_LAND_DAILY_AGGR#bands
bands = [
    'temperature_2m',
    'total_precipitation_sum',
    'snowfall_sum',
    'snowmelt_sum',
]

## Download and process data for multiple study sites

### Load study site names

Assumes all study site folder names start with "RGI" and are located in the same folder, `study_sites_path`.

In [None]:
# Grab site names from folders whose names start with "RGI"
# (plain files and names that merely contain "RGI" are ignored)
rgi_ids = [rgi_id for rgi_id in sorted(os.listdir(study_sites_path))
           if rgi_id.startswith('RGI')
           and os.path.isdir(os.path.join(study_sites_path, rgi_id))]
# Filter to sites without ERA data already downloaded
rgi_ids = [rgi_id for rgi_id in rgi_ids if not
           os.path.exists(os.path.join(study_sites_path, rgi_id, 'ERA', f'{rgi_id}_ERA5-Land_daily_means.csv'))]
print(f'Sites to run = {len(rgi_ids)}')
rgi_ids

### Iterate over sites, query GEE, and export ERA5-Land to Google Drive

Go to your GEE Task Manager to monitor exports: https://code.earthengine.google.com/tasks

In [None]:
# -----Iterate over sites
# Pixel size (m) used both for resampling and for the regional mean
scale = 30
for rgi_id in tqdm(rgi_ids):
    print(rgi_id)

    # Define AOI file name
    aoi_fn = os.path.join(study_sites_path, rgi_id, 'AOIs', f'{rgi_id}_outline.shp')

    # Load AOI and adjust for GEE querying: reproject to WGS84 lon/lat and build
    # an ee.Geometry from the exterior ring of the first feature
    aoi = gpd.read_file(aoi_fn)
    aoi = aoi.to_crs('EPSG:4326')
    aoi_lons, aoi_lats = aoi.geometry[0].exterior.coords.xy
    aoi_ee = ee.Geometry.Polygon(list(zip(aoi_lons, aoi_lats)))
    # AOI buffered by 11e3 (11 km), used for clipping before averaging
    aoi_ee_buffered = aoi_ee.buffer(11e3)

    # Query GEE for the ERA5-Land dataset
    era5 = (ee.ImageCollection("ECMWF/ERA5_LAND/DAILY_AGGR")
             .filter(ee.Filter.date(date_start, date_end))
             .filterBounds(aoi_ee)
             .select(bands))

    # Resample at 30 m resolution to improve clipping and averaging
    def resample(image):
        return (image
                .resample('bicubic')
                .reproject(crs=aoi_ee.projection(), scale=scale))
    era5_interp = era5.map(resample)

    # Clip to the buffered AOI
    # (the previous version also clipped `era5_heights_interp` here, a name that
    # was never defined, so the loop raised NameError before any export started)
    def clip_to_aoi(image):
        return ee.Image(image.clip(aoi_ee_buffered))
    era5_interp = era5_interp.map(clip_to_aoi)

    # Calculate band means over the AOI
    def average_bands_over_aoi(image):
        # Calculate the mean for all bands over the study area
        mean_dict = image.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=aoi_ee,
            scale=scale,
            bestEffort=True
        )
        # Build the output property names: each band name prefixed by 'mean_'
        band_names = image.bandNames()
        properties = band_names.map(lambda band: ee.String('mean_').cat(ee.String(band)))
        # Create a dictionary of the mean values keyed by the prefixed names
        mean_properties = ee.Dictionary.fromLists(properties, band_names.map(lambda band: mean_dict.get(band)))
        # Create a geometry-less feature with the system:time_start property and the mean values
        return ee.Feature(None, mean_properties.set('system:time_start', image.get('system:time_start')))
    era5_mean = era5_interp.map(average_bands_over_aoi)

    # Export features to Drive as CSV (one file per site)
    task = ee.batch.Export.table.toDrive(
        collection=era5_mean,
        description=f'{rgi_id}_ERA5-Land_daily_means',
        fileNamePrefix=f'{rgi_id}_ERA5-Land_daily_means',
        folder='ERA5-Land_Exports',
        fileFormat='CSV'
    )
    task.start()

print('\nExports are a-go-go!')

## Process ERA5-Land exports

Apply lapse rates to air temperatures using the ERA5 ground reference heights and a site DEM, calculate PDDs and annual sums.

Download all exported CSVs from Google Drive (folder `ERA5-Land_Exports`) and place them in one local folder: `downloads_path`

In [None]:
# Define path to your downloads
downloads_path = '/Volumes/LaCie/raineyaberle/Research/PhD/snow_cover_mapping/ERA5-Land_exports'


def rgi_id_from_filename(fn):
    """Return the RGI ID at the start of an exported CSV file name.

    Exports are named '<rgi_id>_ERA5-Land_daily_means.csv', so the ID is
    everything before the '_ERA5-Land' suffix. This works for RGI IDs of any
    length, unlike slicing a fixed number of characters. If the suffix is
    absent, the whole base name is returned.
    """
    return os.path.basename(fn).split('_ERA5-Land')[0]


# Grab file names
fns = sorted(glob.glob(os.path.join(downloads_path, '*.csv')))
# Grab RGI IDs from file names
rgi_ids = [rgi_id_from_filename(fn) for fn in fns]
print(f'Number of files = {len(rgi_ids)}')
rgi_ids

### Iterate over site names

In [None]:
# Load ERA5 geopotential
era5_heights_fn = os.path.join(code_path, 'inputs-outputs', 'geo_1279l4_0.1x0.1.grib2_v4_unpack.nc')
era5_heights = rxr.open_rasterio(era5_heights_fn).squeeze()
era5_heights.rio.write_crs('EPSG:4326', inplace=True)
# Convert geopotential to heights above the geoid by dividing by gravity
# NOTE(review): ECMWF documents g = 9.80665 m s^-2 for this conversion; 9.816 is
# kept here so existing results do not change -- confirm which value is intended.
era5_heights = era5_heights / 9.816

# Load lapse rates
lapse_rates_fn = os.path.join(study_sites_path, '..', 'Rounce_et_al_2023', 'ERA5_lapserates_monthly.nc')
lapse_rates = xr.open_dataset(lapse_rates_fn)
lapse_rates = lapse_rates.drop_dims('level')
lapse_rates = lapse_rates.sel(time=slice(np.datetime64('2012-10-01'), None)) # subset time to speed up computations
lapse_rates = lapse_rates.rename({'longitude': 'x', 'latitude': 'y'}) # rename coords for comparison with DEM
lapse_rates.rio.write_crs("EPSG:4326", inplace=True)
### Lapse rates only go to 2020-04-01, so extend to 2023 and use the mean
lapse_rates = lapse_rates.sel(time=slice(None, "2020-04-01"))
mean_data = lapse_rates["lapserate"].mean(dim="time")
# Start the extension in the month AFTER the last real record. Starting at the
# last record itself with month-end stamps added a second entry for that month,
# which replaced the real value with the mean when matched by year/month.
# "MS" (month start) is used because the "M" alias is deprecated in recent pandas.
last_time = pd.Timestamp(lapse_rates.time.data.max()).normalize()
new_time = pd.date_range(last_time + pd.offsets.MonthBegin(1), "2023-12-31", freq="MS")
# Repeat the time-mean field once per added month
new_data = np.tile(mean_data.values[np.newaxis, :, :], (len(new_time), 1, 1))
new_ds = xr.Dataset(
    {"lapserate": (("time", "y", "x"), new_data)},
    coords={"time": new_time, "y": lapse_rates.coords["y"], "x": lapse_rates.coords["x"]},
)
extended_ds = xr.concat([lapse_rates, new_ds], dim="time")
lapse_rates = extended_ds.sortby("time")

# Iterate over RGI IDs
for rgi_id in tqdm(rgi_ids):
    # Exported CSV for this site (first file whose path contains the RGI ID)
    era5_fn = [fn for fn in fns if rgi_id in fn][0]

    # Define path for outputs; skip sites whose processed CSV already exists
    out_path = os.path.join(study_sites_path, rgi_id, 'ERA')
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    out_fn = os.path.join(out_path, os.path.basename(era5_fn))
    if os.path.exists(out_fn):
        continue

    # Load ERA5-Land daily means CSV
    era5_df = pd.read_csv(era5_fn)
    era5_df['system:index'] = pd.to_datetime(era5_df['system:index'], format='%Y%m%d')
    era5_df.rename(columns={'system:index':'Date'}, inplace=True)
    era5_df.drop(columns=['.geo', 'system:time_start'], inplace=True) # remove unwanted columns
    # Year / month of each daily record, reused for lapse rates and water years
    date_year = era5_df['Date'].dt.year
    date_month = era5_df['Date'].dt.month

    # Load AOI
    aoi_fn = os.path.join(study_sites_path, rgi_id, 'AOIs', f'{rgi_id}_outline.shp')
    aoi = gpd.read_file(aoi_fn)
    aoi = aoi.to_crs("EPSG:4326")

    # Load DEM (first GeoTIFF found in the site's DEMs folder)
    dem_fn = glob.glob(os.path.join(study_sites_path, rgi_id, 'DEMs', "*.tif"))[0]
    dem = rxr.open_rasterio(dem_fn).squeeze()
    # NOTE(review): this labels the DEM as EPSG:4326 regardless of the CRS stored
    # in the file -- assumes the DEMs are already in lon/lat; confirm.
    dem = dem.rio.write_crs("EPSG:4326")
    if len(dem.data.ravel()) > 1e6: # downsample DEMs for really big glaciers
        # reproject to UTM for coordinates in meters
        epsg_utm = f.convert_wgs_to_utm(aoi.geometry[0].centroid.coords.xy[0][0],
                                        aoi.geometry[0].centroid.coords.xy[1][0])
        dem = dem.rio.reproject(epsg_utm)
        # downsample
        dem = dem.rio.reproject(resolution=(1000,1000), dst_crs=epsg_utm)
    dem = dem.rio.reproject("EPSG:4326") # make sure it's now in WGS84 lat lon projection
    if 'band' in dem.dims:
        dem = dem.isel(band=0)
    # Clip DEM to AOI
    dem = dem.rio.clip(aoi.geometry)
    # Remove wacky values (anything below 1e3 or above 1e4 is set to NaN)
    dem = xr.where((dem < 1e3) | (dem > 1e4), np.nan, dem)

    # Shift longitudes to align with ERA5 heights grid.
    # Modulo 360 equals "+ 360" for negative longitudes and leaves
    # non-negative longitudes unchanged.
    dem['x'] = dem['x'] % 360
    dem.rio.write_crs('EPSG:4326', inplace=True)

    # Difference ERA5 heights and DEM heights (ERA5 heights regridded to the DEM once)
    era5_heights_site = era5_heights.rio.reproject_match(dem)
    era5_df['ERA5_height_mean_m'] = float(era5_heights_site.mean().values)
    era5_df['DEM_height_mean_m'] = float(dem.mean().values)
    era5_df['height_diff_mean_m'] = float((dem - era5_heights_site).mean().values)

    # Get average monthly lapse rates over site
    lapse_rates_site = lapse_rates.rio.reproject_match(dem).mean(dim='x').mean(dim='y')
    # Float default so the per-month float assignments below keep the column dtype;
    # days in months without a lapse rate keep 0.0 (no adjustment)
    era5_df['lapse_rate_C/m'] = 0.0
    for t in lapse_rates.time.data:
        timestamp = pd.Timestamp(t)
        in_month = (date_year == timestamp.year) & (date_month == timestamp.month)
        era5_df.loc[in_month, 'lapse_rate_C/m'] = float(lapse_rates_site.sel(time=t).lapserate)

    # Apply monthly lapse rates to temperatures
    era5_df['mean_temperature_2m_C'] = era5_df['mean_temperature_2m'] - 273.15
    era5_df['mean_temperature_2m_C_adj'] = era5_df['mean_temperature_2m_C'] + (era5_df['lapse_rate_C/m'] * era5_df['height_diff_mean_m'])

    # Calculate positive degree days (PDDs): the adjusted temperature where it is
    # above 0, otherwise 0 (NaN temperatures also give 0, as comparisons with NaN are False)
    temperature_adj = era5_df['mean_temperature_2m_C_adj']
    era5_df['positive_degree_days'] = temperature_adj.where(temperature_adj > 0, 0)
    # Calculate cumulative PDDs starting in January
    era5_df['positive_degree_days_annual_cumsum'] = era5_df.groupby(date_year)['positive_degree_days'].cumsum()

    # Calculate annual sums for other columns starting in October
    # Add water year column: October-December keep their calendar year,
    # January-September are assigned to the previous calendar year
    era5_df['water_year'] = date_year - (date_month < 10).astype(int)
    flux_columns = [col for col in era5_df.columns
                    if ('precip' in col) or ('snowfall' in col) or ('snowmelt' in col)]
    for column in flux_columns:
        era5_df[f'{column}_wateryear_cumsum'] = era5_df.groupby('water_year')[column].cumsum()

    # Save to file
    era5_df.to_csv(out_fn, index=False)
    print('Processed ERA5 data saved to file:', out_fn)

    # Plot time series: one panel per column, excluding dates, heights, and water year
    plot_columns = [col for col in era5_df.columns
                    if (col != 'Date') and ('height' not in col) and (col != 'water_year')]
    # squeeze=False keeps `ax` two-dimensional even when there is a single panel
    fig, ax = plt.subplots(len(plot_columns), 1, figsize=(8,4*len(plot_columns)), squeeze=False)
    for axis, column in zip(ax[:, 0], plot_columns):
        axis.plot(era5_df['Date'], era5_df[column], '-k')
        axis.set_title(column)
        axis.grid()
    # Save figure to file
    fig_fn = out_fn.replace('.csv', '.png')
    fig.savefig(fig_fn, dpi=300, bbox_inches='tight')
    # Close this specific figure so figures do not accumulate across sites
    plt.close(fig)
