# Download ERA5-Land Daily Aggregated time series averaged over an Area of Interest


## Requirements: 

- Google Earth Engine account. Sign up [here](https://earthengine.google.com/signup/).
- GIS file of the Area of Interest (AOI) boundaries (.shp, .gpkg, or other file readable by geopandas). 

In [None]:
import ee
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import glob
from tqdm.auto import tqdm
import xarray as xr
import rioxarray as rxr
import geojson
import math
from rasterio.enums import Resampling
import geemap
import wxee as wx

## Authenticate and initialize Google Earth Engine

In [None]:
# Try to initialize with cached credentials first; only fall back to the
# interactive authentication flow if initialization fails.
try:
    ee.Initialize()
except Exception:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate instead of triggering the authentication flow.
    ee.Authenticate()
    ee.Initialize()

## Define filters, etc. for ERA5-Land querying

In [None]:
# -----Input locations
# Folder holding one sub-folder per study site
study_sites_path = '/Volumes/LaCie/raineyaberle/Research/PhD/snow_cover_mapping/study-sites'
# Root folder of this code package
code_path = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/snow-cover-mapping-application/'

# -----Time window used to filter the ERA5-Land collection
date_start, date_end = '2012-10-01', '2023-12-01'

# -----ERA5-Land bands to extract
# The full band list is in the GEE data catalog:
# https://developers.google.com/earth-engine/datasets/catalog/ECMWF_ERA5_LAND_DAILY_AGGR#bands
bands = [
    'temperature_2m',
    'total_precipitation_sum',
    'snowfall_sum',
    'snowmelt_sum',
]

# -----Air temperature lapse rate [deg C / m], applied to the elevation
# difference between the DEM and the ERA5 reference grid
lapse_rate = -6.0 / 1000.0

## Download and process data for multiple study sites

### Load study site names

Assumes all study site folder names start with "RGI" and are located in the same folder, `study_sites_path`.

In [None]:
# Grab site names from folder names containing "RGI", keeping only the sites
# that do not have an ERA5 daily means CSV in their "ERA" folder yet
site_folders = sorted(os.listdir(study_sites_path))
rgi_ids = []
for site_name in site_folders:
    if 'RGI' not in site_name:
        continue
    era_csv = os.path.join(study_sites_path, site_name, 'ERA', f'{site_name}_ERA5_daily_means.csv')
    if os.path.exists(era_csv):
        continue
    rgi_ids.append(site_name)
print(f'Sites to run = {len(rgi_ids)}')
rgi_ids

### Iterate over sites, query GEE, and export ERA5-Land to Google Drive

Go to your GEE Task Manager to monitor exports: https://code.earthengine.google.com/tasks

In [None]:
# For each site: build the AOI geometry, pick a DEM, resample ERA5-Land to the
# DEM projection, add a lapse-rate-adjusted air temperature band, average every
# band over the AOI, and start a Google Drive export task (one CSV per site).

# -----Load ERA5 heights and EGM96 geoid heights
era5_heights = ee.Image("projects/ee-raineyaberle/assets/ERA5_heights_NAmerica_EGM96geoid")
egm96_geoid = ee.Image("projects/ee-raineyaberle/assets/us_nga_egm96_15")

# -----Load ArcticDEM coverage
arcticdem_coverage_fn = os.path.join(code_path, 'inputs-outputs', 'ArcticDEM_Mosaic_coverage.shp')
arcticdem_coverage = gpd.read_file(arcticdem_coverage_fn)

# -----Iterate over sites
for rgi_id in tqdm(rgi_ids):
    print(f'\n{rgi_id}')
    
    # Define AOI file name
    aoi_fn = os.path.join(study_sites_path, rgi_id, 'AOIs', f'{rgi_id}_outline.shp')
        
    # Load AOI and adjust for GEE querying
    aoi = gpd.read_file(aoi_fn)
    aoi = aoi.to_crs('EPSG:4326')
    # Copy of the AOI in the coverage file's CRS, used only for the coverage test below
    aoi_polar = aoi.to_crs(arcticdem_coverage.crs)
    # NOTE(review): only the exterior ring of the first geometry is used, so this
    # assumes the AOI is a single Polygon (no MultiPolygon, holes ignored) -- confirm
    aoi_ee = ee.Geometry.Polygon(list(zip(aoi.geometry[0].exterior.coords.xy[0], 
                                          aoi.geometry[0].exterior.coords.xy[1])))
    
    # Query GEE for DEM
    # NOTE(review): `[0]` reads the containment result at label 0 only, which
    # presumably assumes the coverage file holds a single polygon -- confirm
    if arcticdem_coverage.contains(aoi_polar.geometry[0])[0]: # Check for ArcticDEM coverage
        dem = ee.Image("UMN/PGC/ArcticDEM/V3/2m_mosaic").clip(aoi_ee)
        dem = ee.Image(dem.subtract(egm96_geoid)) # Convert to geoid heights
    else:
        dem = ee.Image("NASA/NASADEM_HGT/001").select('elevation').clip(aoi_ee)    
    
    # Query GEE for the ERA5-Land dataset
    era5 = (ee.ImageCollection("ECMWF/ERA5_LAND/DAILY_AGGR")
             .filter(ee.Filter.date(date_start, date_end))
             .filterBounds(aoi_ee)
             .select(bands))

    # Interpolate ERA5 and ERA5 heights to DEM grid
    scale = 30
    def resample(image):
        # Bicubic resampling onto the DEM projection at `scale`; reads `dem` and
        # `scale` from the current loop iteration
        return (image
                .resample('bicubic')
                .reproject(crs=dem.projection(), scale=scale))
        
    era5_interp = era5.map(resample)
    era5_heights_interp = resample(era5_heights)
                        
    # Clip to AOI
    def clip_to_aoi(image):
        # Clips to the AOI buffered by 11e3 (presumably metres -- TODO confirm units)
        return ee.Image(image.clip(aoi_ee.buffer(11e3)))
    era5_interp = era5_interp.map(clip_to_aoi)
    era5_heights_interp = clip_to_aoi(era5_heights_interp)

    # Apply lapse rate adjustment to air temperatures
    def apply_lapse_rate(image):
        # Convert temperatures from Kelvin to Celsius
        temp_C = image.select('temperature_2m').subtract(273.15)
        # Calculate the elevation difference
        elevation_diff = dem.subtract(era5_heights_interp)
        # Apply the lapse rate adjustment
        temp_C_adj = temp_C.add(elevation_diff.multiply(lapse_rate))
        # Add adjusted temperature as a new band
        return image.addBands(temp_C_adj.rename('temperature_2m_C_adj'))
    era5_interp_adj = era5_interp.map(apply_lapse_rate)

    # Calculate band means over the AOI
    def average_bands_over_aoi(image):
        # Calculate the mean for all bands over the study area
        mean_dict = image.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=aoi_ee,
            scale=scale,  
            bestEffort=True
        )
        # Build the output property names: each band name prefixed by 'mean_'
        band_names = image.bandNames()
        properties = band_names.map(lambda band: ee.String('mean_').cat(ee.String(band)))
        # Create a dictionary of the mean values with new names prefixed by 'mean_'
        mean_properties = ee.Dictionary.fromLists(properties, band_names.map(lambda band: mean_dict.get(band)))
        # Create a feature with the system:time_start property and the mean values
        # (the feature is created with no geometry)
        return ee.Feature(None, mean_properties.set('system:time_start', image.get('system:time_start')))
    era5_mean = era5_interp_adj.map(average_bands_over_aoi)

    # Export features to Drive as CSV
    # The file name prefix below is the name the downloaded CSV is later matched on
    task = ee.batch.Export.table.toDrive(
        collection=era5_mean,
        description=f'{rgi_id}_ERA5_daily_means',
        fileNamePrefix=f'{rgi_id}_ERA5_daily_means',
        folder='ERA5_Exports',
        fileFormat='CSV'
    )
    # Starts the export task; the loop does not wait for it to finish
    task.start()

print('\nExports are a-go-go!')

## Process ERA5-Land exports

Download all exported CSVs from Google Drive and place them in one folder, `downloads_path`.

In [None]:
# Define path to your downloads
downloads_path = '/Users/raineyaberle/Downloads/ERA5_Exports/'

# Grab file names
fns = sorted(glob.glob(os.path.join(downloads_path, '*.csv')))
# Grab RGI IDs from file names
# (assumes the ID is exactly the first 14 characters of each file name)
rgi_ids = [os.path.basename(fn)[0:14] for fn in fns]
# Filter to sites without ERA data
# The processed CSV keeps the name of the exported file
# ("<rgi_id>_ERA5_daily_means.csv"), so that is the name to look for here.
# Checking for "..._ERA5-Land_daily_means.csv" never matched, which caused
# every site to be re-processed on each run.
rgi_ids = [rgi_id for rgi_id in rgi_ids if not 
           os.path.exists(os.path.join(study_sites_path, rgi_id, 'ERA', f'{rgi_id}_ERA5_daily_means.csv'))] 

rgi_ids

### Define some functions for processing

In [None]:
def process_dataframe(df, columns):
    """Index an ERA5 table by date and add cumulative-sum columns.

    The dataframe is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'system:index' column holding dates formatted as
        YYYYMMDD, plus the columns named in `columns`.
    columns : list of str
        Names of the data columns to process.

    Returns
    -------
    pandas.DataFrame
        The same object as `df`, indexed by 'Date', with added columns:
        'year', 'month', 'water_year', '<column>_annual_sum' for every
        non-temperature column (cumulative within each water year), and, when
        'mean_temperature_2m_C_adj' is in `columns`, 'positive_degree_days'
        and 'positive_degree_days_annual_sum' (cumulative within each
        calendar year).
    """
    # Add Date column
    # Parse the dates from the dataframe that was passed in, not from a global
    df['system:index'] = pd.to_datetime(df['system:index'], format='%Y%m%d')
    df.rename(columns={'system:index': 'Date'}, inplace=True)
    df.set_index('Date', inplace=True)

    # Add year and month columns
    df['year'] = df.index.year
    df['month'] = df.index.month

    # Calculate positive degree days (PDDs)
    if 'mean_temperature_2m_C_adj' in columns:
        temp_C = df['mean_temperature_2m_C_adj']
        # Keep temperatures above zero; everything else (including missing
        # values, which fail the comparison) counts as zero degree days
        df['positive_degree_days'] = temp_C.where(temp_C > 0, 0)
        # Calculate cumulative PDDs starting in January
        df['positive_degree_days_annual_sum'] = df.groupby('year')['positive_degree_days'].cumsum()

    # Calculate annual sums for other bands starting in October
    # Water year: October-December belong to the following year
    df['water_year'] = df['year'] + (df['month'] >= 10).astype(int)
    for column in columns:
        # Summing temperatures is not meaningful (PDDs cover that case). The
        # columns arrive with a 'mean_' prefix, so match on the substring
        # rather than on the bare band name.
        if 'temperature' in column:
            continue
        df[f'{column}_annual_sum'] = df.groupby('water_year')[column].cumsum()

    return df

def plot_time_series(df, columns, out_fn):
    """Plot each column as a time series and save the figure as a PNG.

    One subplot is drawn per entry in `columns`. Each subplot shows the column
    as a black line and, on a twin y-axis, a shaded cumulative series:
    'positive_degree_days_annual_sum' for temperature columns and
    '<column>_annual_sum' for all others. The shaded series is skipped when
    that column is not present in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index is used as the x-axis.
    columns : list of str
        Names of the columns to plot.
    out_fn : str
        Output CSV file name; the figure is saved next to it with the
        extension replaced by '.png'.

    Returns
    -------
    matplotlib.figure.Figure
        The saved figure (already closed in pyplot).
    """
    # squeeze=False always returns a 2D array of axes, so indexing also works
    # when there is only one column to plot
    fig, axes = plt.subplots(len(columns), 1, figsize=(8, 4*len(columns)), squeeze=False)
    for axis, column in zip(axes[:, 0], columns):
        if 'temperature' in column:
            column_sum = 'positive_degree_days_annual_sum'
            ylabel = 'PDD Annual sum'
        else:
            column_sum = f'{column}_annual_sum'
            ylabel = 'Annual sum'
        # Only draw the cumulative overlay when it has been computed
        if column_sum in df.columns:
            axis_sum = axis.twinx()
            axis_sum.fill_between(df.index, df[column_sum], color='k', alpha=0.2)
            axis_sum.set_ylabel(ylabel)
        axis.plot(df.index, df[column], '-k')
        axis.set_title(column)
        axis.grid()
    # Save figure to file
    # Swap only the file extension; str.replace would also alter any '.csv'
    # appearing elsewhere in the path
    fig_fn = os.path.splitext(out_fn)[0] + '.png'
    fig.savefig(fig_fn, dpi=300, bbox_inches='tight')
    # Close this specific figure after saving so figures do not accumulate
    plt.close(fig)
    return fig

### Iterate over site names

In [None]:
# Iterate over RGI IDs: load each downloaded CSV, add the derived columns,
# then save the table and a time series figure to the site's "ERA" folder
for rgi_id in tqdm(rgi_ids):
    # Load ERA5 CSV
    era5_fn = [fn for fn in fns if rgi_id in fn][0]
    era5_df = pd.read_csv(era5_fn)

    # Define path for outputs
    out_path = os.path.join(study_sites_path, rgi_id, 'ERA')
    # makedirs also creates missing parent folders and does not fail if the
    # folder already exists
    os.makedirs(out_path, exist_ok=True)
    out_fn = os.path.join(out_path, os.path.basename(era5_fn))

    # Process dataframe
    columns = [column for column in era5_df.columns if 'mean_' in column]
    # Use the returned dataframe rather than relying on in-place modification
    era5_df = process_dataframe(era5_df, columns)

    # Save to file
    era5_df.to_csv(out_fn, index=True)
    
    # Plot time series
    fig = plot_time_series(era5_df, columns, out_fn)

print('Done! :)')