# Step 2: Notebook for extracting depth-integrated mesozooplankton for a set of stations by day (for a given year)


#### (for running your own extractions, it may be best to put this notebook in a .py script and run under tmux or something - this is in STEP2_extract.py)

- (it's run in parallel using Process), which should speed everything up ~6x - a year of signal extracting should take ~3 hours and one click

In [1]:
import arrow
import netCDF4 as nc
import glob
import numpy as np


In [2]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

#### open the grid file and find cell volumes in m3, and cell thicknesses (for multiplying by mesozooplankton concentrations)

variable: domain_volume, e3t_0

In [3]:
grid = nc.Dataset('/data/tjarniko/MEOPAR/grid/mesh_mask201702.nc')

tmask = grid['tmask'][0,:,:,:]
e1t = grid['e1t'][0,:,:]
e2t = grid['e2t'][0,:,:]
e3t_0 = grid['e3t_0'][0,:,:,:]

e1t_proj = np.zeros([40,898,398])
e2t_proj = np.zeros([40,898,398])
for d in range(0,40):
    e1t_proj[d,:] = e1t
    e2t_proj[d,:] = e2t
    
domain_volume = tmask*e1t_proj*e2t_proj*e3t_0



#### Make a list of ncfile-path strings and ymd strings based on a start and end date that you give the function

variables: nclist, ymdlist

In [4]:
start ='2012-01-01'
end ='2012-12-31'

def get_list_of_model_ncs(start,end, verbose = False, \
                          ncpath = '/results2/SalishSea/nowcast-green.201905',\
                          filetype = 'ptrc_T'):
    "returns a list of model output, need to specify "
    
    nclist = []
    ymdlist = []

    start_run = arrow.get(start)
    end_run = arrow.get(end)

    arrow_array = []

    for r in arrow.Arrow.span_range('day', start_run, end_run):
        arrow_array.append(r)

    dayslen = len(arrow_array)    

    for i in range(0,dayslen):
        if i%50 == 0:
            print(i)
        tdate = arrow_array[i][0]
        ymd = tdate.format('YYYYMMDD')
        ymdlist.append(ymd)
        ncnam = f'{ncpath}/*/SalishSea_1d*{ymd}*{filetype}.nc'            
        t_nc = glob.glob(ncnam)
        nclist.append(t_nc[0])
        
        
    if verbose:
        print(f'first day nc: {nclist[0]}')
        print(f'last day nc: {nclist[-1]}')
        
    return ymdlist, nclist
        
ymdlist, nclist = get_list_of_model_ncs(start,end, verbose = True, \
                          ncpath = '/results2/SalishSea/nowcast-green.201905',\
                         filetype = 'ptrc_T')


0
50
100
150
200
250
300
350
first day nc: /results2/SalishSea/nowcast-green.201905/01jan12/SalishSea_1d_20120101_20120101_ptrc_T.nc
last day nc: /results2/SalishSea/nowcast-green.201905/31dec12/SalishSea_1d_20121231_20121231_ptrc_T.nc


### Load the station coordinates found in step 1

variables: ycoords, xcoords

In [5]:
stns = nc.Dataset('./DATASETS/X_AND_Y_COORDS.nc')

ycoords = stns['stn_ycoords']
xcoords = stns['stn_xcoords']

### write an extraction function for depth-integrated mesozoo (change for whatever else you want)
### test for one day

In [6]:
stn_x = 250; stn_y = 250
ncfile = '/results2/SalishSea/nowcast-green.201905/01jan12/SalishSea_1d_20120101_20120101_ptrc_T.nc'

def extract_signal_mesozoo(stn_x,stn_y,ncfile):
    
    t_nc = nc.Dataset(ncfile)
    #get depth profile of mesozoo at a given station
    t_mesozoo = (t_nc['mesozooplankton'][0,:,stn_y,stn_x])
    
    #multiply out by cell thickness and sum to get depth_integrated mesozo
    meso_integ = np.nansum(t_mesozoo*e3t_0[:,stn_y,stn_x])
    
    #this returns meso_integ in mmol N / m2
    return meso_integ
    
meso_integ = extract_signal_mesozoo(stn_x,stn_y,ncfile)

print(f'test of integrated mesozooplankton (Jan 1, 2012, mmol N/m2): {meso_integ}')

test of integrated mesozooplankton (Jan 1, 2012, mmol N/m2): 45.968838563620764


### write a looping function called big_extractor that extracts depth-integrated mesozooplankton for each day and station
- takes nclist, ymdlist, ycoords, xcoords, start_index, end_index, ncstring 
- (start_index and end_index are DOY, this is to make it parallelizable!)
- ncstring gives the pattern that the extracted file will be saved under 
- saves by-day in ./DATASETS, as an nc file.  (one ncfile per day of extraction)
- one day of extracting the signal for ~600 stations takes about 3 minutes
- if you wanted to extract a different signal, you would put something else in place of extract_signal_mesozoo


In [12]:
import time

def big_extractor(nclist, ymdlist,ycoords,xcoords, start_index, end_index, ncstring):
    
    for day in range(start_index,end_index):
        
        #ymd
        t_ymd = ymdlist[day]
        print(t_ymd)
        t_nc = nclist[day]
        
        #for each day, extract the signal for each station
        extracted_signals = np.zeros(len(xcoords))
        
        for stn in range(0,len(xcoords)):
            if stn%50 == 0:
                print(stn)
            stn_x = int(xcoords[stn]); stn_y = int(ycoords[stn])
            extracted_signals[stn] = extract_signal_mesozoo(stn_x,stn_y,t_nc)
            
        ### save those signals in an ncfile    
        ncname = f'./DATASETS/{ncstring}_{t_ymd}.nc'
        f = nc.Dataset(ncname,'w', format='NETCDF4') #'w' stands for write
        #g = f.createGroup('model_output')
        f.createDimension('stn', len(xcoords))
        ts2 = f.createVariable('depthint_mesozoo','f4',('stn'))
        ts2[:] = extracted_signals
        f.close()
    
ncstring = 'DEPTHINT_MESOZOO'
w = time.time()
big_extractor(nclist, ymdlist,ycoords,xcoords, 300, 301, ncstring)
w2 = time.time()
print(w2-w)

20121027
0
50
100
150
200
250
300
350
400
450
500
550
600
130.2552342414856


## use multiprocessing.Process to run big_extractor in parallel. If 1 day takes 3 minutes and 6 processes are running in parallel, this should take roughly 3 hours?

note that I started at i = 1, not i = 0, because i = 0 was done in the cell above!

In [8]:
from multiprocessing import Process
    
def func1():
    print('func1: starting')
    big_extractor(nclist, ymdlist,ycoords,xcoords, 1, 60, ncstring)

def func2():
    print('func2: starting')
    big_extractor(nclist, ymdlist,ycoords,xcoords, 60, 120, ncstring)
      
def func3():
    print('func3: starting')
    big_extractor(nclist, ymdlist,ycoords,xcoords, 120, 180, ncstring)    
    
def func4():
    print('func4: starting')
    big_extractor(nclist, ymdlist,ycoords,xcoords, 180, 240, ncstring)
        
def func5():
    print('func5: starting')
    big_extractor(nclist, ymdlist,ycoords,xcoords, 240, 300, ncstring)
        
def func6():
    print('func6: starting')
    big_extractor(nclist, ymdlist,ycoords,xcoords, 300, 365, ncstring)
    
if __name__ == '__main__':
  p1 = Process(target=func1)
  p1.start()
  p2 = Process(target=func2)
  p2.start()
  p3 = Process(target=func3)
  p3.start()
  p4 = Process(target=func4)
  p4.start()
  p5 = Process(target=func5)
  p5.start()
  p6 = Process(target=func6)
  p6.start()                
    
  p1.join()
  p2.join()
  p3.join()
  p4.join()
  p5.join()
  p6.join()

func1: starting
20120102
func2: starting
20120301
func3: starting
20120430
func4: starting
20120629
func5: starting
20120828
func6: starting
20121027
20120829


Process Process-1:
Traceback (most recent call last):
  File "/home/tjarniko/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/tjarniko/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-8-abda31d42e40>", line 5, in func1
    big_extractor(nclist, ymdlist,ycoords,xcoords, 1, 60, ncstring)
  File "<ipython-input-7-a8dd4d9d7a95>", line 21, in big_extractor
    f = nc.Dataset(ncname,'w', format='NETCDF4') #'w' stands for write
  File "netCDF4/_netCDF4.pyx", line 2135, in netCDF4._netCDF4.Dataset.__init__
  File "netCDF4/_netCDF4.pyx", line 1752, in netCDF4._netCDF4._ensure_nc_success
PermissionError: [Errno 13] Permission denied: b'./DATASETS/DEPTHINT_MESOZOO_20120102.nc'
Process Process-6:
Traceback (most recent call last):
  File "/home/tjarniko/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/h

20120501


  File "netCDF4/_netCDF4.pyx", line 2135, in netCDF4._netCDF4.Dataset.__init__
  File "netCDF4/_netCDF4.pyx", line 1752, in netCDF4._netCDF4._ensure_nc_success
PermissionError: [Errno 13] Permission denied: b'./DATASETS/DEPTHINT_MESOZOO_20121027.nc'


20120302
20120630


Process Process-5:
Traceback (most recent call last):
  File "/home/tjarniko/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/tjarniko/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-8-abda31d42e40>", line 21, in func5
    big_extractor(nclist, ymdlist,ycoords,xcoords, 240, 300, ncstring)
  File "<ipython-input-7-a8dd4d9d7a95>", line 17, in big_extractor
    extracted_signals[stn] = extract_signal_mesozoo(stn_x,stn_y,t_nc)
  File "<ipython-input-6-1da739072178>", line 6, in extract_signal_mesozoo
    t_nc = nc.Dataset(ncfile)
  File "netCDF4/_netCDF4.pyx", line 2157, in netCDF4._netCDF4.Dataset.__init__
  File "netCDF4/_netCDF4.pyx", line 1741, in netCDF4._netCDF4._get_vars
  File "netCDF4/_netCDF4.pyx", line 3640, in netCDF4._netCDF4.Variable.__init__
  File "/home/tjarniko/anaconda3/lib/python3.7/site-packages/netCDF4/utils.py", line 40, i

KeyboardInterrupt: 