# Load and save ERA5-Land Daily Aggregated data over an Area of Interest


## Requirements: 

- __Google Earth Engine account__. Sign up [here](https://earthengine.google.com/signup/).
- __GIS file__ of the Area of Interest (AOI) boundaries (.shp, .gpkg, or other file readable by geopandas). 
- __Digital Elevation Model__ (DEM) over the AOI (TIF, netCDF, or other file readable by xarray).
    - If the DEM is referenced to the ellipsoid, comment out the lines below where ERA5-Land ellipsoid heights are reprojected to the geoid. 
    - A DEM from the ArcticDEM or NASADEM can be downloaded from Google Earth Engine using the AOI boundaries and [this Python function](https://github.com/RaineyAbe/glacier-snow-cover-mapping/blob/350af45c63fc77e6bd2f777fdc255d1d7d32c719/functions/pipeline_utils.py#L133) in the `glacier-snow-cover-mapping` repository. For the ArcticDEM, the function automatically saves a second DEM file reprojected to the geoid. 
- __ERA5-Land gridded geopotential__ file, used to calculate surface heights (TIF, netCDF, or other file readable by xarray).  Options for access:
    - This code repository: "geo_1279l4_0.1x0.1.grib2_v4_unpack.nc" in the [`inputs-outputs` folder](https://github.com/RaineyAbe/snow-cover-mapping-application/tree/main/inputs-outputs). 
    - Download from the [ECMWF documentation for ERA5-Land](https://confluence.ecmwf.int/display/CKB/ERA5-Land%3A+data+documentation#ERA5Land:datadocumentation-LandSurfaceModel) (see Table 1).
- If your DEM is referenced to the geoid: __EGM96 geoid heights__, used to reproject ERA5-Land ellipsoid heights to the geoid (TIF, netCDF, or other file readable by xarray). Options for access:
    - This code repository: "us_nga_egm96_15.tif" in the [`inputs-outputs` folder](https://github.com/RaineyAbe/snow-cover-mapping-application/tree/main/inputs-outputs). 
    - Download from the US NGA (National Geospatial-Intelligence Agency) via [Agisoft](https://www.agisoft.com/downloads/geoids/). 

In [None]:
import ee
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import glob
from tqdm.auto import tqdm
import xarray as xr
import rioxarray as rxr
from shapely.geometry import Polygon, LineString
import geojson

## Authenticate and initialize Google Earth Engine

In [None]:
# Initialize the Earth Engine client. If initialization fails, run the
# authentication flow once and try again.
try:
    ee.Initialize()
except Exception:
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate instead of launching the authentication flow.
    ee.Authenticate()
    ee.Initialize()

## Define filters, etc. for ERA5-Land querying

In [None]:
# -----Query window for ERA5-Land (date strings passed to ee.Filter.date)
date_start = "2023-10-01"
date_end = "2023-12-01"

# -----ERA5-Land bands to extract
# The full band list is in the GEE data catalog:
# https://developers.google.com/earth-engine/datasets/catalog/ECMWF_ERA5_LAND_DAILY_AGGR#bands
bands = [
    "temperature_2m",
    "total_precipitation_sum",
    "snowfall_sum",
    "snowmelt_sum",
]

# -----Lapse rate used to adjust air temperatures for elevation differences
lapse_rate = 6  # deg C / km

## Load and calculate ERA5-Land ellipsoid heights, reproject to the geoid

In [None]:
# Full path to geopotential for ERA5-Land
era_geo_fn =  '/Users/raineyaberle/Research/PhD/snow_cover_mapping/snow-cover-mapping-application/inputs-outputs/geo_1279l4_0.1x0.1.grib2_v4_unpack.nc'

# Full path to geoid heights
egm96_fn = '/Users/raineyaberle/Research/PhD/snow_cover_mapping/snow-cover-mapping-application/inputs-outputs/us_nga_egm96_15.tif'

# Standard gravity (m s-2), used to convert geopotential (m2 s-2) to heights (m).
# The full value is used instead of a rounded 9.8 to avoid a ~0.07% height bias.
standard_gravity = 9.80665

# Load ERA5-Land geopotential and convert to heights in meters
era_geo = xr.open_dataset(era_geo_fn)
era_geo = era_geo / standard_gravity
# Shift longitudes > 180 to longitude - 360.
# The coordinate is replaced with assign_coords rather than edited in place,
# because writing into the `.data` of an index coordinate is not a reliable way
# to change it. The resulting longitude axis is not monotonic; xarray's interp
# sorts coordinates itself unless assume_sorted=True is passed.
lon = era_geo.longitude.values
era_geo = era_geo.assign_coords(longitude=np.where(lon > 180, lon - 360, lon))

# Load EGM96 geoid heights
egm96 = xr.open_dataset(egm96_fn)
# Sample the geoid at the ERA5-Land grid nodes (nearest neighbor)
egm96_interp = egm96.interp(x=era_geo.longitude, y=era_geo.latitude, method='nearest')

# Subtract the geoid from ERA5-Land heights
# NOTE(review): ECMWF's ERA5 documentation appears to describe the model
# orography as already referenced to the geoid (EGM96) over land. Confirm that
# this subtraction is intended for your DEM's vertical datum.
era_elevs_geoid = era_geo.z.data - egm96_interp.band_data.data
era_geo['z'] = (('time', 'latitude', 'longitude'), era_elevs_geoid)
era_geo = era_geo.rio.write_crs('EPSG:4326')

# Plot the resulting heights.
# A copy is sorted west -> east and north -> south so the image rows/columns
# line up with an extent taken from the actual coordinate ranges. `era_geo`
# itself is left in its original order.
z_plot = (era_geo.z.isel(time=0)
          .sortby('longitude')
          .sortby('latitude', ascending=False))
plot_extent = (float(z_plot.longitude.min()), float(z_plot.longitude.max()),
               float(z_plot.latitude.min()), float(z_plot.latitude.max()))
plt.figure(figsize=(10,6))
plt.imshow(z_plot.data, extent=plot_extent, cmap='terrain')
plt.title('Geoid heights calculated from ERA5-Land geopotential')
plt.colorbar(label='meters', shrink=0.5)
plt.grid()
plt.show()

## Define functions for grabbing data over the site area


In [None]:
# Define function to grab mean band values over the region of interest 
def calculate_band_means_over_area(image, geometry=None, scale=1000):
    """Attach the AOI-mean of each selected band to an image as properties.

    Intended to be mapped over an ``ee.ImageCollection``. The bands listed in
    the notebook-level ``bands`` variable are selected, reduced with
    ``ee.Reducer.mean()`` over ``geometry``, and the resulting dictionary is
    set on the returned image.

    Parameters
    ----------
    image : ee.Image
        Image to reduce.
    geometry : ee.Geometry, optional
        Area to average over. When omitted, the notebook-level variable
        ``aoi_ee`` is used if it exists, otherwise ``region``.
    scale : int or float, optional
        Value passed as ``scale`` to ``reduceRegion``. Defaults to 1000.

    Returns
    -------
    ee.Image
        The band-subset image with the mean values set as properties.

    Raises
    ------
    NameError
        If no geometry is passed and neither ``aoi_ee`` nor ``region`` is
        defined at notebook level.
    """
    if geometry is None:
        # Resolve the AOI from notebook-level names at call time. `aoi_ee` is
        # checked first so existing notebooks that define it keep working.
        notebook_vars = globals()
        geometry = notebook_vars.get('aoi_ee', notebook_vars.get('region'))
        if geometry is None:
            raise NameError("No AOI geometry available: pass `geometry=` or define "
                            "`aoi_ee` (or `region`) before mapping this function.")
    # Select specific bands
    image = image.select(bands)
    # Calculate mean for the selected bands in the AOI
    mean_values = image.reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=geometry,
        scale=scale)
    return image.set(mean_values)

## Process multiple sites in the same folder with same file structure

### Load names of sites to process

In [None]:
# Root folder holding one sub-folder per study site
study_sites_path = '/Volumes/LaCie/raineyaberle/Research/PhD/snow_cover_mapping/study-sites'

# Keep entries whose name contains "RGI" (in sorted order) and that do not
# yet have any CSV file in their "ERA" sub-folder
rgi_ids = [
    site for site in sorted(os.listdir(study_sites_path))
    if 'RGI' in site
    and not glob.glob(os.path.join(study_sites_path, site, 'ERA', '*.csv'))
]
print(f'Sites to run = {len(rgi_ids)}')

### Iterate over sites and query GEE for ERA5-Land

In [None]:
# -----Iterate over sites
# For each site: load the AOI and DEM, compute the site and ERA5-Land median
# elevations, query GEE for daily AOI-mean ERA5-Land values, derive adjusted
# temperatures and cumulative sums, then save a CSV and a PNG of all variables.
for rgi_id in tqdm(rgi_ids):

    # Define path to ERA data and make the output directory if it does not exist
    era_path = os.path.join(study_sites_path, rgi_id, 'ERA')
    os.makedirs(era_path, exist_ok=True)

    # Skip the site if ERA5-Land data for this date range already exist
    out_fn = os.path.join(era_path, rgi_id + '_ERA5-Land_' + date_start + '_' + date_end + '.csv')
    if os.path.exists(out_fn):
        continue

    # Best-effort batch run: a failure at one site is reported and the loop
    # moves on to the next site.
    try:
        # Load AOI and reproject to WGS84
        aoi_fn = os.path.join(study_sites_path, rgi_id, 'AOIs', rgi_id + '_outline.shp')
        aoi = gpd.read_file(aoi_fn)
        aoi_wgs = aoi.to_crs('EPSG:4326')

        # Pick the DEM: the "_geoid" file for ArcticDEM / USGS, otherwise NASADEM
        dem_fns = glob.glob(os.path.join(study_sites_path, rgi_id, 'DEMs', '*.tif'))
        if any(('ArcticDEM' in x) or ('USGS' in x) for x in dem_fns):
            dem_fn = [x for x in dem_fns if '_geoid.tif' in x][0]
        else:
            dem_fn = glob.glob(os.path.join(study_sites_path, rgi_id, 'DEMs', rgi_id + '*NASADEM*.tif'))[0]

        # Load the DEM as a DataArray so `dem.data` is available below
        dem = rxr.open_rasterio(dem_fn)
        dem = dem.rio.reproject('EPSG:4326') # reproject to WGS84
        # Remove no data values. The element-wise `|` is required here: Python's
        # `or` would evaluate only the first condition.
        dem = xr.where((dem > 1e38) | (dem <= -9999), np.nan, dem)
        dem = dem.rio.write_crs('EPSG:4326')

        # Reformat AOI to ee.Geometry.Polygon (exterior ring of the first geometry)
        ring_x, ring_y = aoi_wgs.geometry[0].exterior.coords.xy
        region = ee.Geometry.Polygon(list(zip(ring_x, ring_y)))
        # calculate_band_means_over_area reads the AOI from the notebook-level
        # name `aoi_ee`, so expose the polygon under that name too.
        aoi_ee = region

        # Grab median elevation over the AOI
        if 'Zmed' in aoi.columns: # If using an RGI outline, use the "Zmed" value
            zmed = float(aoi['Zmed'].values[0])
        else: # Otherwise, use the median value from the DEM
            zmed = np.nanmedian(dem.data)

        # Grab median ERA5-Land elevation over AOI
        # interpolate to DEM coordinates
        era_geo_interp = era_geo.interp(longitude=dem.x, latitude=dem.y, method='linear')
        # clip to AOI
        era_geo_interp_clip = era_geo_interp.rio.clip(aoi_wgs.geometry)
        # calculate median elevation
        elev_med_era = np.nanmedian(np.ravel(era_geo_interp_clip.z.data[0]))

        # Query GEE for the ERA5-Land dataset
        era5_land = (ee.ImageCollection("ECMWF/ERA5_LAND/DAILY_AGGR")
                     .filter(ee.Filter.date(date_start, date_end))
                     .filterBounds(region))

        # Calculate mean daily values for all bands
        era5_land_mean = era5_land.map(calculate_band_means_over_area)

        # Compile statistics into a pandas.DataFrame
        features = era5_land_mean.getInfo()['features']
        if not features:
            raise ValueError(f'No ERA5-Land images returned for {date_start} to {date_end}')
        data = {'Date': [pd.to_datetime(f['properties']['system:time_start'], unit='ms')
                         for f in features]}
        for band in bands:
            data[band] = [f['properties'][band] for f in features]
        df = pd.DataFrame(data)

        # Adjust air temperatures for elevation using defined lapse rate
        if 'temperature_2m' in df.columns:
            # Convert air temperatures to Celsius (from Kelvin)
            df['temperature_2m_C'] = df['temperature_2m'] - 273.15
            # Adjust air temperatures using reference elevations and lapse rate
            df['temperature_2m_C_adjusted'] = df['temperature_2m_C'] - lapse_rate * (zmed - elev_med_era)/1e3

            # Positive Degree Days (PDDs): adjusted temperature floored at zero
            df['positive_degree_days'] = df['temperature_2m_C_adjusted'].clip(lower=0)
            # Cumulative sum, restarted each calendar year
            df['cumulative_positive_degree_days'] = df.groupby(df['Date'].dt.year, group_keys=True)['positive_degree_days'].cumsum()
            # Shift each year's curve so it starts at zero. Note this removes the
            # first day's PDD from every later value in that year.
            df['cumulative_positive_degree_days'] = df.groupby(df['Date'].dt.year, group_keys=False)['cumulative_positive_degree_days'].apply(lambda x: x - x.iloc[0])

        # Add cumulative annual precipitation, snowfall, and snowmelt
        # Restart the count each water year (October onward is labelled with
        # the current calendar year, earlier months with the previous one)
        df['water_year'] = np.where(df['Date'].dt.month >= 10,
                                    df['Date'].dt.year,
                                    df['Date'].dt.year - 1)
        # NOTE(review): the first two output columns are spelled "cumluative".
        # The names are kept as-is because files already written, and any code
        # reading them, may use this spelling. Rename deliberately if desired.
        cumulative_columns = (
            ('total_precipitation_sum', 'cumluative_total_precipitation_sum'),
            ('snowfall_sum', 'cumluative_snowfall_sum'),
            ('snowmelt_sum', 'cumulative_snowmelt_sum'),
        )
        for band, cumulative_name in cumulative_columns:
            if band in df.columns:
                df[cumulative_name] = df.groupby('water_year')[band].cumsum()

        # Save DataFrame to CSV
        df.to_csv(out_fn, index=False)

        # Plot data variables, one panel per column
        plot_vars = [x for x in df.columns if x not in ('Date', 'water_year')]
        plt.rcParams.update({'font.size':12, 'font.sans-serif':'Arial'})
        # squeeze=False keeps `ax` two-dimensional for any number of panels
        fig, ax = plt.subplots(len(plot_vars), 1, figsize=(8, 4*len(plot_vars)), squeeze=False)
        for panel, var in zip(ax[:, 0], plot_vars):
            panel.plot(df.Date.values.astype('datetime64[ns]'), df[var].values, '.', markersize=3)
            panel.set_title(var)
            panel.grid()

        # Save figure, then close it so figures do not pile up across sites
        fig_fn = os.path.splitext(out_fn)[0] + '.png'
        fig.savefig(fig_fn, dpi=300, bbox_inches='tight')
        plt.close(fig)

    except Exception as e:
        print(rgi_id)
        print(e, '\n')


## Process a single site

### Define paths in directory

In [None]:
# Study-site name (appears in output file names)
site_name = "Hubbard"

# AOI boundaries file (full path)
aoi_fn = "/Users/raineyaberle/Research/Hubbard/velocity/center.gpkg"

# DEM file (full path)
dem_fn = "/Users/raineyaberle/Research/Hubbard/DEMs/ifsar_hubbardDEM.tif"

# ERA5-Land geopotential file (full path)
era_geo_fn = "/Users/raineyaberle/Research/PhD/snow_cover_mapping/snow-cover-mapping-application/inputs-outputs/geo_1279l4_0.1x0.1.grib2_v4_unpack.nc"

# Geoid heights file (full path)
egm96_fn = "/Users/raineyaberle/Research/PhD/snow_cover_mapping/snow-cover-mapping-application/inputs-outputs/us_nga_egm96_15.tif"

# Folder that receives the output files
out_path = "/Users/raineyaberle/Research/Hubbard/weather/"

### Query GEE for ERA5-Land

In [None]:
# -----Load AOI
aoi = gpd.read_file(aoi_fn)
# reproject to WGS84
aoi_wgs = aoi.to_crs('EPSG:4326')

# -----Load DEM
dem = rxr.open_rasterio(dem_fn)
# reproject to WGS84
dem = dem.rio.reproject('EPSG:4326')
# remove no data values
dem = xr.where((dem > 1e38) | (dem <= -9999), np.nan, dem)
dem = dem.rio.write_crs('EPSG:4326')

# -----Grab median elevation over the AOI
if 'Zmed' in aoi.columns: # If using an RGI outline, use the "Zmed" value
    zmed = float(aoi['Zmed'].values[0])
else: # Otherwise, use the median value from the DEM
    zmed = np.nanmedian(dem.data)
print(f'Median site elevation from DEM = {np.round(zmed, 2)} m')

# -----Grab median ERA5-Land elevation over the AOI
# This is needed for the lapse-rate adjustment below and must be computed for
# this site, not reused from a previous run of the multi-site loop.
# interpolate to DEM coordinates
era_geo_interp = era_geo.interp(longitude=dem.x, latitude=dem.y, method='linear')
# clip to AOI
era_geo_interp_clip = era_geo_interp.rio.clip(aoi_wgs.geometry)
# calculate median elevation
elev_med_era = np.nanmedian(np.ravel(era_geo_interp_clip.z.data[0]))
print(f'Median site elevation from ERA5-Land = {np.round(elev_med_era, 2)} m')

# -----Query GEE for ERA5-Land
# Make output directory if it does not exist
os.makedirs(out_path, exist_ok=True)

# Check if ERA5-Land data already exist in directory.
# The output name is built from `out_path` and `site_name`, the values defined
# for the single-site workflow.
out_fn = os.path.join(out_path, site_name + '_ERA5-Land_' + date_start + '_' + date_end + '.csv')
if not os.path.exists(out_fn):

    # Reformat AOI to ee.Geometry.Polygon (exterior ring of the first geometry)
    ring_x, ring_y = aoi_wgs.geometry[0].exterior.coords.xy
    region = ee.Geometry.Polygon(list(zip(ring_x, ring_y)))
    # calculate_band_means_over_area reads the AOI from the notebook-level
    # name `aoi_ee`, so expose the polygon under that name too.
    aoi_ee = region

    # Query GEE for the ERA5-Land dataset
    era5_land = (ee.ImageCollection("ECMWF/ERA5_LAND/DAILY_AGGR")
                 .filter(ee.Filter.date(date_start, date_end))
                 .filterBounds(region))

    # Calculate mean daily values for all bands
    era5_land_mean = era5_land.map(calculate_band_means_over_area)

    # Compile statistics into a pandas.DataFrame
    features = era5_land_mean.getInfo()['features']
    if not features:
        raise ValueError(f'No ERA5-Land images returned for {date_start} to {date_end}')
    data = {'Date': [pd.to_datetime(f['properties']['system:time_start'], unit='ms')
                     for f in features]}
    for band in bands:
        data[band] = [f['properties'][band] for f in features]
    df = pd.DataFrame(data)

    # Adjust air temperatures for elevation using defined lapse rate
    if 'temperature_2m' in df.columns:
        # Convert air temperatures to Celsius (from Kelvin)
        df['temperature_2m_C'] = df['temperature_2m'] - 273.15
        # Adjust air temperatures using reference elevations and lapse rate
        df['temperature_2m_C_adjusted'] = df['temperature_2m_C'] - lapse_rate * (zmed - elev_med_era)/1e3

        # Positive Degree Days (PDDs): adjusted temperature floored at zero
        df['positive_degree_days'] = df['temperature_2m_C_adjusted'].clip(lower=0)
        # Cumulative sum, restarted each calendar year
        df['cumulative_positive_degree_days'] = df.groupby(df['Date'].dt.year, group_keys=True)['positive_degree_days'].cumsum()
        # Shift each year's curve so it starts at zero. Note this removes the
        # first day's PDD from every later value in that year.
        df['cumulative_positive_degree_days'] = df.groupby(df['Date'].dt.year, group_keys=False)['cumulative_positive_degree_days'].apply(lambda x: x - x.iloc[0])

    # Add cumulative annual precipitation, snowfall, and snowmelt
    # Restart the count each water year (October onward is labelled with
    # the current calendar year, earlier months with the previous one)
    df['water_year'] = np.where(df['Date'].dt.month >= 10,
                                df['Date'].dt.year,
                                df['Date'].dt.year - 1)
    # NOTE(review): the first two output columns are spelled "cumluative".
    # The names are kept as-is because files already written, and any code
    # reading them, may use this spelling. Rename deliberately if desired.
    cumulative_columns = (
        ('total_precipitation_sum', 'cumluative_total_precipitation_sum'),
        ('snowfall_sum', 'cumluative_snowfall_sum'),
        ('snowmelt_sum', 'cumulative_snowmelt_sum'),
    )
    for band, cumulative_name in cumulative_columns:
        if band in df.columns:
            df[cumulative_name] = df.groupby('water_year')[band].cumsum()

    # Save DataFrame to CSV
    df.to_csv(out_fn, index=False)

    # Plot data variables, one panel per column
    plot_vars = [x for x in df.columns if x not in ('Date', 'water_year')]
    plt.rcParams.update({'font.size':12, 'font.sans-serif':'Arial'})
    # squeeze=False keeps `ax` two-dimensional for any number of panels
    fig, ax = plt.subplots(len(plot_vars), 1, figsize=(8, 4*len(plot_vars)), squeeze=False)
    for panel, var in zip(ax[:, 0], plot_vars):
        panel.plot(df.Date.values.astype('datetime64[ns]'), df[var].values, '.', markersize=3)
        panel.set_title(var)
        panel.grid()

    # Save figure, then close it
    fig_fn = os.path.splitext(out_fn)[0] + '.png'
    fig.savefig(fig_fn, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print('Figure saved to file: ', fig_fn)
else:
    print('ERA5-Land data already exist in file:', out_fn)
