In [1]:
import xarray as xr
import grib2io
import pandas as pd
import datetime
from glob import glob
from tqdm.auto import tqdm
import numpy as np
import multiprocessing
from numba import jit
import scipy
from netCDF4 import Dataset


import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
import matplotlib
from mpl_toolkits.basemap import Basemap
# Notebook-wide plot styling, grouped by the part of the figure it affects.
matplotlib.rcParams.update({
    # canvas: white backgrounds on screen and in saved files
    'figure.figsize': (8, 6),
    'figure.facecolor': 'w',
    'savefig.facecolor': 'w',
    # text: large black text, STIX for regular text, "cm" fontset for math
    'font.family': 'STIXGeneral',
    'font.size': 30,
    'mathtext.fontset': 'cm',
    'text.color': 'k',
    'legend.fontsize': 20,
    # axes frame
    'axes.edgecolor': 'k',
    'axes.labelcolor': 'k',
    'axes.linewidth': 3,
    # ticks: identical settings for both axes
    'xtick.color': 'k',
    'ytick.color': 'k',
    'xtick.labelsize': 25,
    'ytick.labelsize': 25,
    'xtick.major.size': 12,
    'ytick.major.size': 12,
    'xtick.major.width': 2,
    'ytick.major.width': 2,
})

How impactful is the addition of supplemental locations to the results of QMD? 
Let’s compare biases between GEFS (in lieu of full blend stack) and URMA with and without supplemental locations.

First, how are supplemental locations actually used? 
       
        "Presumably, the population of CDFs using the forecast and analyzed data at additional supplemental locations will help ameliorate sampling  error while still preserving the ability to correctly estimate location-dependent biases," (Hamill+2017). 


IOW, the supplemental locations are chosen to provide additional data points to increase the sample size of the precipitation CDFs, increasing forecast reliability and skill without the need for actually *new, independent* data measurements.

So what we should see is a decrease in bias between forecast and analysis when using supplemental locations.


## utils

In [33]:
from nimbl import stations  #.from_mos2ktbl

#selected stations for analysis
station_data = '//scratch1/NCEPDEV/mdl/Eric.Engle/gitrepos/blend/fix/common/mdl_station.tbl'
station_ids = ['KSEA','KMSP','KBOS', 'KLAS','KLIT','KMCO', 'KDEN', 'KCVG']
station_points = stations.from_mos2ktbl(station_data, station_ids)

#rename weird ones
# Positions are derived from the call-sign list rather than hard-coded.
# The previous hard-coded index for Cincinnati was 8, which is out of range
# for 8 stations (valid indices 0-7); KCVG is the last entry, i.e. index 7.
# NOTE(review): assumes from_mos2ktbl returns stations in the order requested
# (the original indices 3 -> KLAS and 6 -> KDEN rely on the same ordering) -- confirm.
display_names = {'KLAS': 'Las Vegas', 'KDEN': 'Denver', 'KCVG': 'Cincinnati'}
st_names = station_points['name'].copy()
for call_sign, display_name in display_names.items():
    st_names[station_ids.index(call_sign)] = display_name

In [34]:
#get CONUS lat/lons
# The context manager closes the file even if reading a variable raises,
# instead of relying on reaching an explicit ds.close().
with xr.open_dataset('/scratch2/STI/mdl-sti/Sidney.Lower/supplemental_locations/limit_water_v4/blend.supplemental_locations_4.co.2p5.nc',
                     mode="r") as ds:
    #unpack lats and lons to index over (read while the file is still open)
    lat_arr = ds.latitude.data
    lon_arr = ds.longitude.data

In [35]:
def get_nearest_grid(lat_input, long_input, lat_grid=None, lon_grid=None):
    """Find the grid cell closest to each requested (lat, lon) point.

    Closeness is measured as |dlat| + |dlon| in degrees, evaluated over the
    whole grid, and the cell with the smallest value is selected.

    Parameters
    ----------
    lat_input, long_input : sequence of float
        Latitudes and longitudes of the points to locate. Longitudes must use
        the same convention as ``lon_grid``.
    lat_grid, lon_grid : 2-D array, optional
        Latitude and longitude of every grid cell (same shape). When omitted,
        the notebook-level ``lat_arr`` / ``lon_arr`` are used.

    Returns
    -------
    (lat_index, lon_index) : tuple of lists
        Axis-0 and axis-1 indices of the nearest grid cell for each input
        point, in input order.
    """
    # Fall back to the notebook-level grid so existing two-argument calls keep working.
    if lat_grid is None:
        lat_grid = lat_arr
    if lon_grid is None:
        lon_grid = lon_arr

    lat_index, lon_index = [], []
    # Use the function's own arguments: the previous version ignored them and
    # read the global station_lats / station_lons instead.
    for lat, lon in zip(lat_input, long_input):
        distance = np.abs(lat_grid - lat) + np.abs(lon_grid - lon)
        row, col = np.unravel_index(distance.argmin(), distance.shape)
        lat_index.append(row)
        lon_index.append(col)
    return lat_index, lon_index

def to360(x):
    """Convert longitude(s) in degrees to the [0, 360) degrees-east convention.

    Works on scalars and on array-likes that support ``%``. Negative (west)
    longitudes are wrapped, e.g. -122 -> 238; values already in [0, 360) are
    returned unchanged.

    The previous formula, ``(x - 180) % 360 + 180``, mapped into [180, 540),
    so it was only correct for western-hemisphere input and returned values
    360 too large for longitudes in [0, 180).
    """
    return x % 360

# Station coordinates; longitudes are converted from E/W to the 0-360
# convention before being matched against the grid.
station_lats = station_points['lat']
station_lons = to360(station_points['lon'])

# Grid indices of the cell nearest each station, kept both as a pair and unpacked.
conus_gp = get_nearest_grid(station_lats, station_lons)
lat_idx, lon_idx = conus_gp

# Gather data

We'll select 3 seasons worth of data: **January**, **April**, and **July** to get a feel for any variability in the effectiveness of SLs with season (something we've already identified as a possible obstacle in previous analysis).

In QMD processing, 60 days worth of data are used to construct the precipitation CDFs. So we'll choose a day in the 3 months from above, and load in the previous 60 days. I'll stick with **2023 t0z** data at a lead time of **96h** for this analysis. (One bookkeeping thing with this is when using xarray to load in multiple files, I am not sure how to deal with different days **+** different months. So I think I will just do each month's analysis serially?)

### January 15 2023

In [7]:
# Dask distributed client used by the parallel=True file loads further down.
# NOTE(review): the worker count (24) is hard-coded -- confirm it suits the
# node this notebook runs on.
from dask.distributed import Client

client = Client(n_workers=24)

In [19]:
### GEFS ###

# All t0z, f96 (96 h lead) GEFS files across the per-date subdirectories,
# sorted by path so the load order is deterministic.
gefs_files = sorted(
    glob('/scratch2/STI/mdl-sti/Sidney.Lower/data/gefs/supplemental_locations_QMD/*/gefs*.t0z.f96')
)

# Stack every file along refDate in the sorted order; files are opened in parallel.
gefs = xr.open_mfdataset(
    gefs_files,
    engine='grib2io',
    combine="nested",
    concat_dim=['refDate'],
    parallel=True,
)

In [20]:
# Show the dataset repr to check dimensions and chunking before processing.
gefs

Unnamed: 0,Array,Chunk
Bytes,1.98 MiB,1.98 MiB
Shape,"(361, 720)","(361, 720)"
Dask graph,1 chunks in 9145 graph layers,1 chunks in 9145 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.98 MiB 1.98 MiB Shape (361, 720) (361, 720) Dask graph 1 chunks in 9145 graph layers Data type float64 numpy.ndarray",720  361,

Unnamed: 0,Array,Chunk
Bytes,1.98 MiB,1.98 MiB
Shape,"(361, 720)","(361, 720)"
Dask graph,1 chunks in 9145 graph layers,1 chunks in 9145 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.98 MiB,1.98 MiB
Shape,"(361, 720)","(361, 720)"
Dask graph,1 chunks in 9145 graph layers,1 chunks in 9145 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.98 MiB 1.98 MiB Shape (361, 720) (361, 720) Dask graph 1 chunks in 9145 graph layers Data type float64 numpy.ndarray",720  361,

Unnamed: 0,Array,Chunk
Bytes,1.98 MiB,1.98 MiB
Shape,"(361, 720)","(361, 720)"
Dask graph,1 chunks in 9145 graph layers,1 chunks in 9145 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.77 GiB,0.99 MiB
Shape,"(1830, 361, 720)","(1, 361, 720)"
Dask graph,1830 chunks in 5491 graph layers,1830 chunks in 5491 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.77 GiB 0.99 MiB Shape (1830, 361, 720) (1, 361, 720) Dask graph 1830 chunks in 5491 graph layers Data type float32 numpy.ndarray",720  361  1830,

Unnamed: 0,Array,Chunk
Bytes,1.77 GiB,0.99 MiB
Shape,"(1830, 361, 720)","(1, 361, 720)"
Dask graph,1830 chunks in 5491 graph layers,1830 chunks in 5491 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


Spent a fair bit of time ruminating on how to concat these files, since there's ensemble member **and** refDate...but since all of this data will be combined to make a CDF for January 15 (the chosen analysis date), it shouldn't actually matter to keep track of what ensemble/refDate each file corresponds to? But I'll check on this assumption later...

In [32]:
#first, interpolate APCP to CONUS grid
from nimbl import get_metadata
import grib2io

# Look up the grid definition for model='blend', region='co' and unpack it
# into a Grib2GridDef that serves as the interpolation target.
# NOTE(review): presumably 'co' is the CONUS 2.5 km blend grid -- confirm against nimbl.
conus_grid_def = get_metadata.get_metadata('grib2_grid', model='blend',region='co' )
conus_grid = grib2io.Grib2GridDef(*conus_grid_def)
# Bilinear interpolation of the GEFS dataset onto that grid via the grib2io accessor.
# NOTE(review): bilinear interpolation smooths precipitation fields -- confirm
# this is the method intended for APCP.
gefs_conus = gefs.grib2io.interp('bilinear', conus_grid)

Ok, need to think about this for a sec (refDate, leadTime, etc.). If we're using January 15 0z, lead time 96h, I believe we use URMA data from that same initialization date (January 15 0z) for QMD and then we analyze bias at the actual validDate (January 19 0z). So load in URMA on the same days as GEFS above for QMD, then load in the validDate URMA later for bias analysis.

In [61]:
## URMA

# All URMA 2.5 km pcp_06h files across the per-date subdirectories,
# sorted by path so the load order is deterministic.
urma_files = sorted(
    glob('/scratch2/STI/mdl-sti/Sidney.Lower/data/urma/supplemental_locations_QMD/*/urma2p5.*.pcp_06h.wexp.grb2')
)

# Stack every file along refDate in the sorted order; files are opened in parallel.
urma = xr.open_mfdataset(
    urma_files,
    engine='grib2io',
    combine="nested",
    concat_dim=['refDate'],
    parallel=True,
)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Co

In [62]:
# Show the dataset repr to check dimensions and chunking before processing.
urma

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 290 graph layers,1 chunks in 290 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 28.57 MiB 28.57 MiB Shape (1597, 2345) (1597, 2345) Dask graph 1 chunks in 290 graph layers Data type float64 numpy.ndarray",2345  1597,

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 290 graph layers,1 chunks in 290 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 290 graph layers,1 chunks in 290 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 28.57 MiB 28.57 MiB Shape (1597, 2345) (1597, 2345) Dask graph 1 chunks in 290 graph layers Data type float64 numpy.ndarray",2345  1597,

Unnamed: 0,Array,Chunk
Bytes,28.57 MiB,28.57 MiB
Shape,"(1597, 2345)","(1597, 2345)"
Dask graph,1 chunks in 290 graph layers,1 chunks in 290 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,842.87 MiB,14.29 MiB
Shape,"(59, 1597, 2345)","(1, 1597, 2345)"
Dask graph,59 chunks in 178 graph layers,59 chunks in 178 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 842.87 MiB 14.29 MiB Shape (59, 1597, 2345) (1, 1597, 2345) Dask graph 59 chunks in 178 graph layers Data type float32 numpy.ndarray",2345  1597  59,

Unnamed: 0,Array,Chunk
Bytes,842.87 MiB,14.29 MiB
Shape,"(59, 1597, 2345)","(1, 1597, 2345)"
Dask graph,59 chunks in 178 graph layers,59 chunks in 178 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


Next step: get data at selected stations and generate CDFs for both GEFS and URMA. The way this is done in the blend is probably how we should be doing this analysis (at each grid point, collect sums and number of positive vals over the 60 days, calculate alpha, beta, and then construct quantiles from that), but that seems like too many steps if we're just interested in analyzing a couple of grid points (i.e., not too worried about the memory load). So I will use scipy to fit each grid point's data to a gamma dist and do quantile mapping from there (roughly following [this notebook](https://github.com/SidneyLower-NOAA/SMD_Notebooks/blob/main/QMD.ipynb))