# 09 ERA5 Pre-processing
UW Geospatial Data Analysis  
CEE467/CEWA567  
David Shean  

## Install necessary packages to open GRIB files (default ERA5 format) with xarray

https://github.com/ecmwf/cfgrib

While you wait, open a terminal and inspect the contents of the era5 directory, and review this information
* http://xarray.pydata.org/en/stable/io.html#grib-format-via-cfgrib

In [None]:
#Install cfgrib from the conda-forge channel (-y answers the confirmation prompt automatically)
!mamba install -y -c conda-forge cfgrib

* I downloaded some sample ERA5 datasets, created zip files, staged and shared on Google Drive (accessible to anyone with link)
    * https://drive.google.com/open?id=1gomQR_lvhuww_xyR6wcUiziS12x1yCSx
* We can use the `driveanon` convenience package to easily download anonymously without authentication

In [None]:
#Install directly from github repo main branch
#%pip install git+https://github.com/friedrichknuth/driveanon.git

In [None]:
#import driveanon as da

In [None]:
import zipfile

In [None]:
#Directory (relative to the current working directory) used for downloaded and derived files
outdir = 'era5_data'

In [None]:
#Imported here because this cell runs before the notebook's main import cell
import os

#Create the output directory; exist_ok makes this a no-op if it is already there
os.makedirs(outdir, exist_ok=True)

## Function to load and combine grib files into a single xarray Dataset
* Also creates a new, compressed netcdf (nc) file to store the data for future use
* See relevant doc on opening and writing files: http://xarray.pydata.org/en/stable/io.html

In [None]:
import os
from glob import glob
import xarray as xr

In [None]:
#Change the working directory to the output directory, so later relative paths resolve inside it
%cd $outdir

In [None]:
#blob_id = '1Gwkg21LPKxvZsjwMrwVESGi2ZaVLQP58'
#out_fn = f'{outdir}/ecv-for-climate-change_t2m_monthlymean.zip'
#Zip archives to fetch: 2-m temperature climatology, anomaly and monthly mean
zip_fn_list = [
    'ecv-for-climate-change_t2m_climatology.zip',
    'ecv-for-climate-change_t2m_anomaly.zip',
    'ecv-for-climate-change_t2m_monthlymean.zip',
]
#NOTE(review): download() is not defined in the visible part of this notebook - confirm where it comes from
for zip_fn in zip_fn_list:
    download(zip_fn)

In [None]:
#Data directory name is the zip filename with its extension removed
#NOTE(review): out_fn is only assigned in a later cell (the assignment above is commented out),
#so this raises NameError when cells are run top to bottom - confirm the intended execution order
datadir = os.path.splitext(out_fn)[0]

In [None]:
#Change the working directory to the extracted data directory
%cd $datadir

In [None]:
#Names of the combined netcdf files, one per zip archive
#NOTE(review): this list is not referenced anywhere else in the visible notebook
out_fn_list = [
    'climatology_0.25g_ea_2t.nc',
    '1month_anomaly_Global_ea_2t.nc',
    '1month_mean_Global_ea_2t.nc',
]

In [None]:
def zip2nc(zip_fn, v='2t', parallel=True, writeout=True, compress=False):
    """Extract a zip of GRIB files and combine them into a single netCDF file.

    The archive is extracted into a directory named after the zip file
    (extension removed) unless that directory already exists. Every
    ``*.grib`` file in that directory is opened with the cfgrib engine,
    concatenated along ``time`` in sorted filename order and, if requested,
    written to a netCDF file next to the GRIB files. Nothing is opened or
    written when the output file already exists.

    Parameters
    ----------
    zip_fn : str
        Path to the zip archive.
    v : str, optional
        Variable short name embedded in the GRIB filenames. The output name
        is the first GRIB filename cut after the first occurrence of ``v``,
        with ``.nc`` appended.
    parallel : bool, optional
        Passed through to ``xarray.open_mfdataset``.
    writeout : bool, optional
        If True, write the combined dataset to the netCDF file.
    compress : bool, optional
        If True, write every data variable with zlib compression (level 9).

    Returns
    -------
    str
        Path of the netCDF output file.

    Raises
    ------
    FileNotFoundError
        If the data directory contains no ``*.grib`` files.
    """
    datadir = os.path.splitext(zip_fn)[0]
    if not os.path.exists(datadir):
        with zipfile.ZipFile(zip_fn, 'r') as zip_ref:
            zip_ref.extractall(datadir)
    #Get all grib filenames in the directory
    paths = sorted(glob(os.path.join(datadir, '*.grib')))
    if not paths:
        raise FileNotFoundError(f"No .grib files found in {datadir}")
    #Derive the output name from the basename only, so an occurrence of v
    #in a directory name cannot truncate the path
    grib_dir, grib_name = os.path.split(paths[0])
    out_fn = os.path.join(grib_dir, grib_name.split(v)[0] + f'{v}.nc')
    print(out_fn)
    if os.path.exists(out_fn):
        #Already converted on a previous run
        return out_fn
    print(f"Opening {len(paths)} grib datasets")
    #Context manager closes the underlying grib files when done
    with xr.open_mfdataset(paths, engine='cfgrib', combine="nested",
                           concat_dim="time", parallel=parallel) as opened:
        #Drop unnecessary coordinates; errors='ignore' tolerates files lacking one of them
        combined = opened.drop_vars(['number', 'surface', 'step', 'valid_time'],
                                    errors='ignore')
        if writeout:
            encoding = {}
            if compress:
                #Set up encoding parameters to use compression when writing netcdf file
                comp = dict(zlib=True, complevel=9)
                encoding = {var: comp for var in combined.data_vars}
            #Write out
            print(f"Writing out: {out_fn}")
            combined.to_netcdf(out_fn, encoding=encoding)
    return out_fn

In [None]:
#Convert each zip archive in the list to netcdf, timing every call
for zip_fn in zip_fn_list:
    %time zip2nc(zip_fn)

In [None]:
#Google Drive file id and local filename for the Washington 6-hourly archive
#NOTE(review): blob_id is assigned but never passed to download_unzip - confirm how the id is meant to be used
blob_id = '1nWjcjlqzkSqi-3u2vXt-5ya4VRSKS7v5'
out_fn = f'{outdir}/era5_WA_1979-2021_6hr.zip'
#NOTE(review): download_unzip() is not defined in the visible part of this notebook - confirm where it comes from
download_unzip(out_fn)

In [None]:
#Data directory name is the zip filename with its extension removed
datadir = os.path.splitext(out_fn)[0]

In [None]:
#Change the working directory to the extracted data directory
%cd $datadir

In [None]:
#Per-variable netcdf files from the 1979-2021 6-hourly archive
#NOTE(review): the following cell reassigns fn_list, so this value is not the one that gets merged
fn_list = [
    'era5_WA_1979-2021_6hr_2m_temperature.nc',
    'era5_WA_1979-2021_6hr_total_precipitation.nc',
    'era5_WA_1979-2021_6hr_snow_depth.nc',
]

In [None]:
#ERA5-Land netcdf files to merge, one per time-of-day suffix in the filename
fn_list = [
    'era5_data/WA_ERA5-Land_hourly_1950-2022_0000.nc',
    'era5_data/WA_ERA5-Land_hourly_1950-2022_0600.nc',
    'era5_data/WA_ERA5-Land_hourly_1950-2022_1200.nc',
    'era5_data/WA_ERA5-Land_hourly_1950-2022_1800.nc',
]

#### Use open_mfdataset to merge when opening
* Could have used `open_dataset` on each nc, then combined
* http://xarray.pydata.org/en/stable/generated/xarray.open_mfdataset.html
* See more details on merge/combine in xarray: http://xarray.pydata.org/en/stable/combining.html

In [None]:
#Open only the first file, to inspect its contents before merging everything
test = xr.open_dataset(fn_list[0])

In [None]:
#Display the dataset summary as the cell output
test

In [None]:
#Open all files as one dataset, concatenated along time in list order
#The filenames suggest one file per time of day, so the result is not in
#chronological order until it is sorted with sortby('time')
wa_merge = xr.open_mfdataset(fn_list, combine='nested', concat_dim='time')

In [None]:
#Output filename for the merged dataset
#NOTE(review): this is relative to the current working directory, which earlier %cd cells change - confirm it resolves where intended
merge_fn = os.path.join(outdir, 'WA_ERA5-Land_hourly_1950-2022_6hr.nc')

In [None]:
#Put the merged data in chronological order (expensive), then save it as a new netcdf
wa_sorted = wa_merge.sortby('time')
wa_sorted.to_netcdf(merge_fn)