In [18]:
import numpy as np
import xarray as xr
import os
import re
import pandas as pd
from datetime import datetime
import salishsea_tools.river_202108 as rivers  # if you want to find river inputs for a different (future?) version of rivers, you'll have to change this the files_between_dates fn!

In [None]:
# Draft layout: several named river sets, each a list of [w_shed, r_call] pairs.
# NOTE: the "Example usage" cell further down rebinds rivers_list to a plain
# list of pairs, and that list is what the rest of the notebook consumes.
rivers_list = {
    set_name: [['fraser', 'Fraser'], ['skagit', 'Skagit1']]
    for set_name in ('test', 'test2')
}

In [22]:
# Example usage -- edit the values in this cell to configure a run.

# Inclusive date range, written as YYYYMMDD strings.
start_str, end_str = '20210801', '20210810'

# Each entry is a [w_shed, r_call] pair identifying one river.
rivers_list = [
    ['fraser', 'Fraser'],
    ['skagit', 'Skagit1'],
]

# Directory holding the daily river forcing files.
source_directory = '/results/forcing/rivers/'

# Tag used in the name of the (optional) CSV export.
save_name = 'test'

In [12]:
# Turn the YYYYMMDD strings into datetime.date objects so they can be
# compared against the dates parsed out of the file names.
start_date, end_date = (
    datetime.strptime(stamp, '%Y%m%d').date()
    for stamp in (start_str, end_str)
)

In [13]:
def files_between_dates(start_date, end_date, directory,
                        date_pattern=r'R202108Dailies_y(\d{4})m(\d{2})d(\d{2})'):
    """Find the daily river files in ``directory`` dated within a range.

    Parameters
    ----------
    start_date, end_date : datetime.date
        Inclusive bounds of the date range.
    directory : str
        Directory whose entries are scanned (not recursive).
    date_pattern : str, optional
        Regular expression searched for in each file name. It must have
        exactly three capture groups giving year, month and day, in that
        order. The default matches the R202108Dailies naming; pass a
        different pattern to pick up another version of the river files.

    Returns
    -------
    files : list of str
        Matching file names (without the directory), in chronological order.
    file_dates : list of str
        The date of each returned file, formatted as 'MM-DD-YYYY'.

    Raises
    ------
    ValueError
        If a file name matches the pattern but its digits do not form a
        valid calendar date.
    """
    dated = []

    for filename in sorted(os.listdir(directory)):
        match = re.search(date_pattern, filename)
        if match is None:
            continue  # not one of the dated river files

        year, month, day = map(int, match.groups())
        file_date = datetime(year, month, day).date()
        if start_date <= file_date <= end_date:
            dated.append((file_date, filename))

    # Order by the parsed date rather than relying on the lexical order of
    # the names. The sort is stable, so files sharing a date keep their
    # alphabetical order from the listing above.
    dated.sort(key=lambda pair: pair[0])

    files = [filename for _, filename in dated]
    file_dates = [file_date.strftime('%m-%d-%Y') for file_date, _ in dated]

    return files, file_dates

In [14]:
# the entire set of rivers and their w_shed/r_call pairs is located in /ocean/cdonaldson/MEOPAR/tools/SalishSeaTools/salishsea_tools/river_202108.py
# this fn looks up and returns the river input coordinates and widths
def river_bounds(river):
    """Return where one river enters the model grid.

    ``river`` is a ``[w_shed, r_call]`` pair; the two entries are used as
    successive keys into ``rivers.prop_dict``.

    Returns the tuple ``(y, dy, x, dx)``: the starting index and the number
    of boxes along the grid Y-axis, followed by the same for the X-axis.
    """
    watershed, call_name = river[0], river[1]
    props = rivers.prop_dict[watershed][call_name]

    # 'i' / 'di' describe the model grid Y-axis, 'j' / 'dj' the X-axis.
    y, x, dy, dx = (props[key] for key in ('i', 'j', 'di', 'dj'))

    return y, dy, x, dx

# When selecting a river's box from the full array, slice it as [y:y+dy, x:x+dx] (row range first, then column range).
# e.g. np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])[0:1, 2:3] evaluates to array([[3]])

In [15]:
def files_to_timeseries(directory, file_names, rivers):
    """Build a (files x rivers) array of summed runoff from daily river files.

    Parameters
    ----------
    directory : str
        Directory containing the files. It is combined with each file name
        using ``os.path.join``, so a trailing separator is optional.
    file_names : sequence of str
        Names of the files to read; one output row per file, in this order.
    rivers : sequence of [w_shed, r_call] pairs
        Rivers to extract; one output column per river, in this order. Each
        pair is handed to ``river_bounds``. Note that inside this function
        the parameter hides the module-level ``rivers`` import; the name is
        kept so existing keyword calls continue to work.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(file_names), len(rivers))``. Element
        ``[f, r]`` is the sum of ``'rorunoff'`` over river ``r``'s box
        ``[y:y+dy, x:x+dx]`` in file ``f``.
    """
    # The box of each river does not depend on the file, so look the bounds
    # up once here instead of once per file. Unknown rivers therefore fail
    # before any file is opened.
    bounds = [river_bounds(river) for river in rivers]

    # Allocate based on the number of days (files) and the number of rivers.
    result = np.zeros((len(file_names), len(rivers)), dtype=float)

    for row_idx, file in enumerate(file_names):
        fname = os.path.join(directory, file)

        # The context manager closes the dataset for this day even when
        # reading or slicing raises, so a failure cannot leak an open file.
        with xr.open_dataset(fname) as ds:
            # Drop the leading axis; the original shape was noted as (1, 898, 398).
            array = ds['rorunoff'].values[0, :, :]

            for col_idx, (y, dy, x, dx) in enumerate(bounds):
                # Sum over the river's box; slice end points are exclusive.
                result[row_idx, col_idx] = array[y:y+dy, x:x+dx].sum()

    return result

In [16]:
# Pick out the daily files inside the requested date range ...
file_names, file_dates = files_between_dates(
    start_date=start_date,
    end_date=end_date,
    directory=source_directory,
)

# ... and reduce each one to a single summed value per river.
result = files_to_timeseries(
    directory=source_directory,
    file_names=file_names,
    rivers=rivers_list,
)

In [17]:
# Column k of `result` holds the series for rivers_list[k]; label each
# column with the second entry (r_call name) of that river's pair.
data_dict = {
    pair[1] + ' [kg/m2/s]': result[:, col]
    for col, pair in enumerate(rivers_list)
}
df_data = pd.DataFrame(data_dict)

# Per-row metadata: the source file and its date.
metas = {'filename': file_names, 'date': file_dates}
df_metas = pd.DataFrame(metas)

# Metadata columns first, followed by one column per river.
df_all = pd.concat([df_metas, df_data], axis='columns')
# Optional export -- uncomment to write the table to CSV:
# df_all.to_csv('river_dailies_to_ts_{}_{}_{}.csv'.format(save_name, start_str, end_str))