## Import libraries

In [1]:
# XCLIM and xarray
import xclim.indices as xci
import xclim.atmos as atmos
import numpy as np
import xarray as xr
import dask

# file handling libraries
import os
import glob
import time




## 1. Setting up the Dask client - Parralel processing / workers

In [2]:
from distributed import Client
client=Client(n_workers=2, threads_per_worker=12, diagnostics_port=8787, memory_limit='6GB') 
#client=Client(n_workers=1)
client

DEBUG:asyncio:Using selector: EpollSelector
DEBUG:asyncio:Using selector: EpollSelector
  "diagnostics_port has been deprecated. "

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.



0,1
Client  Scheduler: tcp://127.0.0.1:35701  Dashboard: http://127.0.0.1:40606/status,Cluster  Workers: 2  Cores: 24  Memory: 12.00 GB


## 2. Finding data files 

In [3]:
infolder = '/scen3/scenario/netcdf/ouranos/cb-oura-1.0/'
outfolder = '/scen3/logan/testdata/XCLIM_workshop/Exercise1/'
if not os.path.exists(outfolder):
    os.makedirs(outfolder)
rcps = ['rcp45','rcp85']
v = 'tasmax'
r = rcps[0]
# Get list of files for tasmax
search_str = os.path.join(infolder, '{v}*CanESM*{r}*.nc'.format(v=v,r=r))
sim_files= sorted(glob.glob(search_str))
print(len(sim_files))

151


## 3. Creating Xarray datasets & dask chunks

### Default (unchunked)

In [4]:
ds = xr.open_dataset(sim_files[0])
print(ds.tasmax)

<xarray.DataArray 'tasmax' (time: 365, lat: 700, lon: 1064)>
[271852000 values with dtype=float32]
Coordinates:
  * time     (time) object 1950-01-01 00:00:00 ... 1950-12-31 00:00:00
  * lat      (lat) float32 83.28931 83.20598 83.12265 ... 25.12497 25.04164
  * lon      (lon) float32 -141.04314 -140.9598 ... -52.54667 -52.46334
Attributes:
    units:          K
    long_name:      air_temperature
    standard_name:  air_temperature


### Chunked in memory - Data-type is now 'Dask array'.  Xarray will automatically use client workers 

In [5]:
ds = xr.open_dataset(sim_files[0], chunks={'time':31})
print(ds.tasmax)

<xarray.DataArray 'tasmax' (time: 365, lat: 700, lon: 1064)>
dask.array<shape=(365, 700, 1064), dtype=float32, chunksize=(31, 700, 1064)>
Coordinates:
  * time     (time) object 1950-01-01 00:00:00 ... 1950-12-31 00:00:00
  * lat      (lat) float32 83.28931 83.20598 83.12265 ... 25.12497 25.04164
  * lon      (lon) float32 -141.04314 -140.9598 ... -52.54667 -52.46334
Attributes:
    units:          K
    long_name:      air_temperature
    standard_name:  air_temperature


## 4. Multifile dataset - treat 151 yearly netcdf files as a single dataset

In [6]:
sim_files

['/scen3/scenario/netcdf/ouranos/cb-oura-1.0/tasmax_day_CanESM2_rcp45_r1i1p1_na10kgrid_qm-moving-50bins-detrend_1950.nc',
 '/scen3/scenario/netcdf/ouranos/cb-oura-1.0/tasmax_day_CanESM2_rcp45_r1i1p1_na10kgrid_qm-moving-50bins-detrend_1951.nc',
 '/scen3/scenario/netcdf/ouranos/cb-oura-1.0/tasmax_day_CanESM2_rcp45_r1i1p1_na10kgrid_qm-moving-50bins-detrend_1952.nc',
 '/scen3/scenario/netcdf/ouranos/cb-oura-1.0/tasmax_day_CanESM2_rcp45_r1i1p1_na10kgrid_qm-moving-50bins-detrend_1953.nc',
 '/scen3/scenario/netcdf/ouranos/cb-oura-1.0/tasmax_day_CanESM2_rcp45_r1i1p1_na10kgrid_qm-moving-50bins-detrend_1954.nc',
 '/scen3/scenario/netcdf/ouranos/cb-oura-1.0/tasmax_day_CanESM2_rcp45_r1i1p1_na10kgrid_qm-moving-50bins-detrend_1955.nc',
 '/scen3/scenario/netcdf/ouranos/cb-oura-1.0/tasmax_day_CanESM2_rcp45_r1i1p1_na10kgrid_qm-moving-50bins-detrend_1956.nc',
 '/scen3/scenario/netcdf/ouranos/cb-oura-1.0/tasmax_day_CanESM2_rcp45_r1i1p1_na10kgrid_qm-moving-50bins-detrend_1957.nc',
 '/scen3/scenario/netcdf

In [7]:
# create multifile data & chunks 
ds = xr.open_mfdataset(sim_files, chunks={'time':365, 'lat':50*2, 'lon':56*2})
ds = ds.drop('time_vectors')
ds = ds.drop('ts')
print(ds)

<xarray.Dataset>
Dimensions:  (lat: 700, lon: 1064, time: 55115)
Coordinates:
  * lat      (lat) float32 83.28931 83.20598 83.12265 ... 25.12497 25.04164
  * lon      (lon) float32 -141.04314 -140.9598 ... -52.54667 -52.46334
  * time     (time) object 1950-01-01 00:00:00 ... 2100-12-31 00:00:00
Data variables:
    tasmax   (time, lat, lon) float32 dask.array<shape=(55115, 700, 1064), chunksize=(365, 100, 112)>
Attributes:
    Conventions:     CF-1.5
    title:           CanESM2 model output prepared for CMIP5 historical
    history:         2011-04-14T00:21:01Z CMOR rewrote data to comply with CF...
    institution:     CCCma (Canadian Centre for Climate Modelling and Analysi...
    source:          CanESM2 2010 atmosphere: CanAM4 (AGCM15i, T63L35) ocean:...
    redistribution:  Redistribution prohibited. For internal use only.


## 5. Subsetting utilities
### Using latitude longitude bounds

In [8]:
from xclim import subset
lat_bnds = [45, 60]
lon_bnds = [-55, -82]

ds1 = subset.subset_bbox(ds,lat_bnds=lat_bnds,lon_bnds=lon_bnds)
print(ds1)

<xarray.Dataset>
Dimensions:  (lat: 180, lon: 324, time: 55115)
Coordinates:
  * lat      (lat) float64 59.96 59.87 59.79 59.71 ... 45.29 45.21 45.12 45.04
  * lon      (lon) float64 -81.96 -81.88 -81.8 -81.71 ... -55.21 -55.13 -55.05
  * time     (time) object 1950-01-01 00:00:00 ... 2100-12-31 00:00:00
Data variables:
    tasmax   (time, lat, lon) float32 dask.array<shape=(55115, 180, 324), chunksize=(365, 20, 75)>
Attributes:
    Conventions:     CF-1.5
    title:           CanESM2 model output prepared for CMIP5 historical
    history:         2011-04-14T00:21:01Z CMOR rewrote data to comply with CF...
    institution:     CCCma (Canadian Centre for Climate Modelling and Analysi...
    source:          CanESM2 2010 atmosphere: CanAM4 (AGCM15i, T63L35) ocean:...
    redistribution:  Redistribution prohibited. For internal use only.


### Add start and/or end years 

In [10]:
ds2 = subset.subset_bbox(ds,lat_bnds=lat_bnds,lon_bnds=lon_bnds, start_yr = 1981, end_yr = 2010)
print(ds2)
print(' ')

# subset years only
ds2 = subset.subset_bbox(ds, start_yr = 1981, end_yr = 2010)
print(ds2)

<xarray.Dataset>
Dimensions:  (lat: 180, lon: 324, time: 10950)
Coordinates:
  * lat      (lat) float64 59.96 59.87 59.79 59.71 ... 45.29 45.21 45.12 45.04
  * lon      (lon) float64 -81.96 -81.88 -81.8 -81.71 ... -55.21 -55.13 -55.05
  * time     (time) object 1981-01-01 00:00:00 ... 2010-12-31 00:00:00
Data variables:
    tasmax   (time, lat, lon) float32 dask.array<shape=(10950, 180, 324), chunksize=(365, 20, 75)>
Attributes:
    Conventions:     CF-1.5
    title:           CanESM2 model output prepared for CMIP5 historical
    history:         2011-04-14T00:21:01Z CMOR rewrote data to comply with CF...
    institution:     CCCma (Canadian Centre for Climate Modelling and Analysi...
    source:          CanESM2 2010 atmosphere: CanAM4 (AGCM15i, T63L35) ocean:...
    redistribution:  Redistribution prohibited. For internal use only.
 
<xarray.Dataset>
Dimensions:  (lat: 700, lon: 1064, time: 10950)
Coordinates:
  * lat      (lat) float64 83.29 83.21 83.12 83.04 ... 25.29 25.21 25.12 

### Select a single grid point 

In [11]:
lon_pt = -70.0
lat_pt = 50.0

ds3 = subset.subset_gridpoint(ds,lon=lon_pt,lat=lat_pt, start_yr=1981)
print(ds3)

<xarray.Dataset>
Dimensions:  (time: 43800)
Coordinates:
    lat      float32 50.04064
    lon      float32 -69.96264
  * time     (time) object 1981-01-01 00:00:00 ... 2100-12-31 00:00:00
Data variables:
    tasmax   (time) float32 dask.array<shape=(43800,), chunksize=(365,)>
Attributes:
    Conventions:     CF-1.5
    title:           CanESM2 model output prepared for CMIP5 historical
    history:         2011-04-14T00:21:01Z CMOR rewrote data to comply with CF...
    institution:     CCCma (Canadian Centre for Climate Modelling and Analysi...
    source:          CanESM2 2010 atmosphere: CanAM4 (AGCM15i, T63L35) ocean:...
    redistribution:  Redistribution prohibited. For internal use only.


## 6. Climate index calculation & Resampling frequencies
http://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases

**Example : Maximum of daily tasmax (tx_max)**

In [12]:
fr = 'YS' #MS #QS-DEC #AS-AUG #W #2W #5D
out = atmos.tx_max(ds1.tasmax, freq=fr)
dsOut = ds1.drop(ds1.data_vars)
dsOut = dsOut.drop('time')
dsOut[out.name] = out
print('Number of time-steps using freq == ', fr, ' : ', len(out.time),'\n')
print(out.time)
print(dsOut)

outfile = '/home/logan/boreas/testdata/XCLIM_workshop/Exercise1/rcp45/tx_max/test.nc'
if not os.path.exists(os.path.dirname(outfile)):
    os.makedirs(os.path.dirname(outfile))
print(outfile)


  warn(e)



Number of time-steps using freq ==  YS  :  151 

<xarray.DataArray 'time' (time: 151)>
array([cftime.DatetimeNoLeap(1950, 1, 1, 0, 0, 0, 0, 4, 1),
       cftime.DatetimeNoLeap(1951, 1, 1, 0, 0, 0, 0, 5, 1),
       cftime.DatetimeNoLeap(1952, 1, 1, 0, 0, 0, 0, 6, 1),
       cftime.DatetimeNoLeap(1953, 1, 1, 0, 0, 0, 0, 0, 1),
       cftime.DatetimeNoLeap(1954, 1, 1, 0, 0, 0, 0, 1, 1),
       cftime.DatetimeNoLeap(1955, 1, 1, 0, 0, 0, 0, 2, 1),
       cftime.DatetimeNoLeap(1956, 1, 1, 0, 0, 0, 0, 3, 1),
       cftime.DatetimeNoLeap(1957, 1, 1, 0, 0, 0, 0, 4, 1),
       cftime.DatetimeNoLeap(1958, 1, 1, 0, 0, 0, 0, 5, 1),
       cftime.DatetimeNoLeap(1959, 1, 1, 0, 0, 0, 0, 6, 1),
       cftime.DatetimeNoLeap(1960, 1, 1, 0, 0, 0, 0, 0, 1),
       cftime.DatetimeNoLeap(1961, 1, 1, 0, 0, 0, 0, 1, 1),
       cftime.DatetimeNoLeap(1962, 1, 1, 0, 0, 0, 0, 2, 1),
       cftime.DatetimeNoLeap(1963, 1, 1, 0, 0, 0, 0, 3, 1),
       cftime.DatetimeNoLeap(1964, 1, 1, 0, 0, 0, 0, 4, 1),
       cftime

## 7. Lazy computation
**Up until we have ony created a schedule of tasks with a small preview**

**Writing the file to netcdf will actually calculate the values for all years**

In [13]:
outfile = os.path.join(outfolder, 'test.nc')
start= time.time()

dsOut.to_netcdf( outfile,format='NETCDF4')

end = time.time()
print('calculation took ',end-start, 's')

calculation took  176.1228289604187 s


### Rechunk data - bigger memory chunks

In [14]:
print(ds1)


<xarray.Dataset>
Dimensions:  (lat: 180, lon: 324, time: 55115)
Coordinates:
  * lat      (lat) float64 59.96 59.87 59.79 59.71 ... 45.29 45.21 45.12 45.04
  * lon      (lon) float64 -81.96 -81.88 -81.8 -81.71 ... -55.21 -55.13 -55.05
  * time     (time) object 1950-01-01 00:00:00 ... 2100-12-31 00:00:00
Data variables:
    tasmax   (time, lat, lon) float32 dask.array<shape=(55115, 180, 324), chunksize=(365, 20, 75)>
Attributes:
    Conventions:     CF-1.5
    title:           CanESM2 model output prepared for CMIP5 historical
    history:         2011-04-14T00:21:01Z CMOR rewrote data to comply with CF...
    institution:     CCCma (Canadian Centre for Climate Modelling and Analysi...
    source:          CanESM2 2010 atmosphere: CanAM4 (AGCM15i, T63L35) ocean:...
    redistribution:  Redistribution prohibited. For internal use only.


In [15]:
ds1 = ds1.chunk(chunks={'time':365, 'lon':-1, 'lat':-1})
print(ds1)

<xarray.Dataset>
Dimensions:  (lat: 180, lon: 324, time: 55115)
Coordinates:
  * lat      (lat) float64 59.96 59.87 59.79 59.71 ... 45.29 45.21 45.12 45.04
  * lon      (lon) float64 -81.96 -81.88 -81.8 -81.71 ... -55.21 -55.13 -55.05
  * time     (time) object 1950-01-01 00:00:00 ... 2100-12-31 00:00:00
Data variables:
    tasmax   (time, lat, lon) float32 dask.array<shape=(55115, 180, 324), chunksize=(365, 180, 324)>
Attributes:
    Conventions:     CF-1.5
    title:           CanESM2 model output prepared for CMIP5 historical
    history:         2011-04-14T00:21:01Z CMOR rewrote data to comply with CF...
    institution:     CCCma (Canadian Centre for Climate Modelling and Analysi...
    source:          CanESM2 2010 atmosphere: CanAM4 (AGCM15i, T63L35) ocean:...
    redistribution:  Redistribution prohibited. For internal use only.


In [16]:
out = atmos.tx_max(ds1.tasmax, freq=fr)
dsOut = ds1.drop(ds1.data_vars)
dsOut = dsOut.drop('time')
dsOut[out.name] = out
outfile = os.path.join(outfolder, 'test.nc')
start= time.time()

dsOut.to_netcdf( outfile,format='NETCDF4')

end = time.time()
print('calculation took ',end-start, 's')

  warn(e)



calculation took  114.31398057937622 s
