## Import libraries

In [1]:
# XCLIM and xarray
import xclim.indices as xci
import xclim.temperature as temp
import xclim.precip as prec
import numpy as np
import xclim.utils as xut
import xarray as xr
import dask

# file handling libraries
import os
import glob
import time




## 1. Setting up the Dask client - Parallel processing / workers

In [2]:
from distributed import Client

# Local Dask cluster: 5 workers x 6 threads, 3GB memory limit per worker,
# diagnostics dashboard served on port 8787.
client = Client(
    n_workers=5,
    threads_per_worker=6,
    memory_limit='3GB',
    diagnostics_port=8787,
)
# Single-worker alternative:
#client=Client(n_workers=1)
client

DEBUG:asyncio:Using selector: EpollSelector
DEBUG:asyncio:Using selector: EpollSelector


0,1
Client  Scheduler: tcp://127.0.0.1:39481  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 5  Cores: 30  Memory: 15.00 GB


## 2. Finding files using thredds server (TODO using siphon)

In [3]:
# Input root (one sub-directory per model) and the folder receiving this
# exercise's outputs.
infolder = '/home/logan/boreas/ouranos/cb-oura-1.0/'
outfolder = '/home/logan/boreas/testdata/XCLIM_workshop/Exercise1/'
# exist_ok=True replaces the separate os.path.exists() check, which could race
# with another process creating the folder between the check and the call.
os.makedirs(outfolder, exist_ok=True)

# Each sub-directory of infolder is taken as one model; sorted for a stable order.
# os.scandir is called directly rather than through the incidental glob.os attribute.
models = sorted(f.path for f in os.scandir(infolder) if f.is_dir())
print(models)

rcps = ['rcp45','rcp85']
v = 'tasmax'
# This exercise only works on the first model and the first scenario.
m = models[0]
r = rcps[0]
# Get list of files for tasmax (layout: <model>/<rcp>/day/<variable>/*.nc)
sim_files= sorted(glob.glob(os.path.join(m,r,'day',v, '*.nc')))
print(len(sim_files))

['/home/logan/boreas/ouranos/cb-oura-1.0/ACCESS1-3', '/home/logan/boreas/ouranos/cb-oura-1.0/BNU-ESM', '/home/logan/boreas/ouranos/cb-oura-1.0/CMCC-CMS', '/home/logan/boreas/ouranos/cb-oura-1.0/CanESM2', '/home/logan/boreas/ouranos/cb-oura-1.0/GFDL-ESM2M', '/home/logan/boreas/ouranos/cb-oura-1.0/HadGEM2-CC', '/home/logan/boreas/ouranos/cb-oura-1.0/INM-CM4', '/home/logan/boreas/ouranos/cb-oura-1.0/IPSL-CM5A-LR', '/home/logan/boreas/ouranos/cb-oura-1.0/IPSL-CM5B-LR', '/home/logan/boreas/ouranos/cb-oura-1.0/MPI-ESM-LR', '/home/logan/boreas/ouranos/cb-oura-1.0/NorESM1-M']
151


## 3. Creating Xarray datasets & dask chunks

In [4]:
# chunked vs unchunked
# Unchunked case: open the first file without a `chunks` argument.
ds = xr.open_dataset(sim_files[0])
print(ds['tasmax'])

<xarray.DataArray 'tasmax' (time: 365, lat: 700, lon: 1064)>
[271852000 values with dtype=float32]
Coordinates:
  * time     (time) datetime64[ns] 1950-01-01 1950-01-02 ... 1950-12-31
  * lat      (lat) float32 83.28931 83.20598 83.12265 ... 25.12497 25.04164
  * lon      (lon) float32 -141.04314 -140.9598 ... -52.54667 -52.46334
Attributes:
    units:          K
    long_name:      air_temperature
    standard_name:  air_temperature


In [5]:
# Chunked case: same file, split into dask chunks of 31 steps along time.
ds = xr.open_dataset(sim_files[0], chunks=dict(time=31))
print(ds['tasmax'])

<xarray.DataArray 'tasmax' (time: 365, lat: 700, lon: 1064)>
dask.array<shape=(365, 700, 1064), dtype=float32, chunksize=(31, 700, 1064)>
Coordinates:
  * time     (time) datetime64[ns] 1950-01-01 1950-01-02 ... 1950-12-31
  * lat      (lat) float32 83.28931 83.20598 83.12265 ... 25.12497 25.04164
  * lon      (lon) float32 -141.04314 -140.9598 ... -52.54667 -52.46334
Attributes:
    units:          K
    long_name:      air_temperature
    standard_name:  air_temperature


## 4. Multifile dataset - treat 151 yearly netcdf files as a single dataset

In [6]:
# Open every file in sim_files as a single dataset, chunked along all
# three dimensions.
ds = xr.open_mfdataset(
    sim_files,
    chunks=dict(time=365*3, lat=50*2, lon=56*2),
)
# Drop 'time_vectors' and 'ts'; they are not needed for the index calculation.
ds = ds.drop('time_vectors').drop('ts')
print(ds)

<xarray.Dataset>
Dimensions:  (lat: 700, lon: 1064, time: 55152)
Coordinates:
  * lat      (lat) float32 83.28931 83.20598 83.12265 ... 25.12497 25.04164
  * lon      (lon) float32 -141.04314 -140.9598 ... -52.54667 -52.46334
  * time     (time) datetime64[ns] 1950-01-01 1950-01-02 ... 2100-12-31
Data variables:
    tasmax   (time, lat, lon) float32 dask.array<shape=(55152, 700, 1064), chunksize=(365, 100, 112)>
Attributes:
    Conventions:     CF-1.5
    title:           ACCESS1-3 model output prepared for CMIP5 historical
    history:         CMIP5 compliant file produced from raw ACCESS model outp...
    institution:     CSIRO (Commonwealth Scientific and Industrial Research O...
    source:          ACCESS1-3 2011. Atmosphere: AGCM v1.0 (N96 grid-point, 1...
    redistribution:  Redistribution prohibited. For internal use only.


## 5. Subsetting data

In [7]:
# Bounding box used to cut the dataset down to the region of interest.
lat_bnds = [45, 60]
lon_bnds = [-55, -82]

# NOTE: this overwrites `ds` with its subset, so re-running the cell subsets
# the already-subset data.
ds = xut.subset_bbox(
    ds,
    lat_bnds=lat_bnds,
    lon_bnds=lon_bnds,
)
print(ds)

<xarray.Dataset>
Dimensions:  (lat: 180, lon: 324, time: 55152)
Coordinates:
  * lat      (lat) float32 59.95691 59.87358 59.79025 ... 45.12417 45.04084
  * lon      (lon) float32 -81.96216 -81.87883 -81.7955 ... -55.1299 -55.04657
  * time     (time) datetime64[ns] 1950-01-01 1950-01-02 ... 2100-12-31
Data variables:
    tasmax   (time, lat, lon) float32 dask.array<shape=(55152, 180, 324), chunksize=(365, 20, 75)>
Attributes:
    Conventions:     CF-1.5
    title:           ACCESS1-3 model output prepared for CMIP5 historical
    history:         CMIP5 compliant file produced from raw ACCESS model outp...
    institution:     CSIRO (Commonwealth Scientific and Industrial Research O...
    source:          ACCESS1-3 2011. Atmosphere: AGCM v1.0 (N96 grid-point, 1...
    redistribution:  Redistribution prohibited. For internal use only.


## 6. Climate index calculation & Resampling frequencies
http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases

**Example : Maximum of daily tasmax (tx_max)**

In [8]:
# Resampling frequency (pandas offset alias); alternatives to try are listed inline.
fr = 'YS' #MS #QS-DEC #AS-AUG #W #2W #5D
# Maximum of daily tasmax for each period defined by `fr`.
out = temp.tx_max(ds.tasmax, freq=fr)
# Build the output dataset from the input with its data variables removed, and
# drop the daily 'time' coordinate so the resampled time axis of `out` is used.
dsOut = ds.drop(ds.data_vars)
dsOut = dsOut.drop('time')
dsOut[out.name] = out
print('Number of time-steps using freq == ', fr, ' : ', len(out.time),'\n')
print(out.time)
print(dsOut)

# Derive the output path from the configured output folder and scenario instead
# of repeating the absolute path, so it follows any change to `outfolder` or `r`.
outfile = os.path.join(outfolder, r, 'tx_max', 'test.nc')
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(os.path.dirname(outfile), exist_ok=True)
print(outfile)


  warn(e)



Number of time-steps using freq ==  YS  :  151 

<xarray.DataArray 'time' (time: 151)>
array(['1950-01-01T00:00:00.000000000', '1951-01-01T00:00:00.000000000',
       '1952-01-01T00:00:00.000000000', '1953-01-01T00:00:00.000000000',
       '1954-01-01T00:00:00.000000000', '1955-01-01T00:00:00.000000000',
       '1956-01-01T00:00:00.000000000', '1957-01-01T00:00:00.000000000',
       '1958-01-01T00:00:00.000000000', '1959-01-01T00:00:00.000000000',
       '1960-01-01T00:00:00.000000000', '1961-01-01T00:00:00.000000000',
       '1962-01-01T00:00:00.000000000', '1963-01-01T00:00:00.000000000',
       '1964-01-01T00:00:00.000000000', '1965-01-01T00:00:00.000000000',
       '1966-01-01T00:00:00.000000000', '1967-01-01T00:00:00.000000000',
       '1968-01-01T00:00:00.000000000', '1969-01-01T00:00:00.000000000',
       '1970-01-01T00:00:00.000000000', '1971-01-01T00:00:00.000000000',
       '1972-01-01T00:00:00.000000000', '1973-01-01T00:00:00.000000000',
       '1974-01-01T00:00:00.000000000

## 7. Lazy computation
**Up until now we have only created a schedule of tasks with a small preview**

**Writing the file to netcdf will actually calculate the values for all years**

In [9]:
# Same output path as in the index-calculation cell, derived from the configured
# output folder and scenario rather than a repeated absolute path.
outfile = os.path.join(outfolder, r, 'tx_max', 'test.nc')
# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() can jump if the system clock is adjusted during the run.
start= time.perf_counter()

# Writing the file is what triggers the actual computation of the task graph.
dsOut.to_netcdf( outfile,format='NETCDF4')

end = time.perf_counter()
print('calculation took ',end-start, 's')

calculation took  73.00857853889465 s


In [None]:

# Disabled example, kept for reference: the same workflow as above using
# temp.tx_days_above with a '30 C' threshold, an output file name derived from
# the first input file, and an explicit encoding for the time coordinate.
# client.restart()
# f = 'YS'
# start= time.time()
# out = temp.tx_days_above(ds.tasmax, thresh='30 C', freq=f)
# dsOut = ds.drop(ds.data_vars)
# dsOut = dsOut.drop('time')
# dsOut[out.name] = out
# outfile = os.path.join(outfolder,r,'txgt_30','txgt_30_'+ f +'_' + os.path.basename(sim_files[0].replace('1950','1950-2100')))
# if not os.path.exists(os.path.dirname(outfile)):
#     os.makedirs(os.path.dirname(outfile))
# outfile = outfile.replace('tasmax','').replace('__','_')
# #comp = dict(zlib=True, complevel=5, dtype='single')
# #encoding = {var: comp for var in dsOut.data_vars}
# encoding = {}
# encoding['time'] = dict(dtype='single')

# dsOut.to_netcdf( outfile,format='NETCDF4', encoding=encoding)

# end = time.time()
# print(end-start)