# Set Up Environment

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# this command is executed twice for interactive plotting in Jupyter notebook
%matplotlib notebook
%matplotlib notebook

In [3]:
from pathlib import Path
import sys
import matplotlib.pyplot as plt
import numpy as np
import copy
from tqdm.notebook import tqdm
import h5py
import os
import pickle as pkl

In [4]:
working_dir = Path('/home/gsnearing/projects/lstm_based_hydrology/') # lstm codebase
config_file = working_dir / 'configs/extreme_years.yml' # config file

In [5]:
sys.path.append(str(working_dir))
from codebase.config import read_config
from codebase.data.utils import load_basin_file
from codebase.data.climateindices import precalculate_dyn_climate_indices

In [6]:
# number of train and test years
n_train_years = 9
n_test_years = 3
n_skip_years = [0,4,8,12]
holdout_types = ['low', 'median', 'high']

# Load Data

In [7]:
# load configuration file for this run
cfg = read_config(config_file)

In [8]:
# basin lists
basins = load_basin_file(cfg['train_basin_file'])
nBasins = len(basins)
print(f'There are {nBasins} basins.')

There are 531 basins.


# Calculate Dynamic Climate Indexes

In [40]:
# calculate all climate indexes
climate_indexes = precalculate_dyn_climate_indices(data_dir=cfg['data_dir'], 
                                                   basin_file=cfg['train_basin_file'], 
                                                   window_length=cfg['seq_length'],
                                                   forcings=cfg['forcings'][1])

100%|██████████| 531/531 [01:50<00:00,  4.79it/s]


Precalculated features successfully stored at /home/gsnearing/projects/lstm_based_hydrology/data/dyn_climate_indices_daymet_531basins_365lookback.p


In [41]:
# remove leading nans
for basin in basins:
    climate_indexes[basin] = climate_indexes[basin].iloc[364*4:,:]

In [42]:
# dimensions
assert nBasins == len(climate_indexes) # basins
nTimes, nClimate = climate_indexes[basins[0]].shape
climate_index_names = list(climate_indexes[basins[0]].columns)
print('Climate Index are: ', climate_index_names)

Climate Index are:  ['p_mean_dyn', 'pet_mean_dyn', 'aridity_dyn', 't_mean_dyn', 'frac_snow_dyn', 'high_prec_freq_dyn', 'high_prec_dur_dyn', 'low_prec_freq_dyn', 'low_prec_dur_dyn']


In [43]:
# extract timestamps
dates = climate_indexes[basins[0]].index

In [44]:
# convert to numpy array
climate_indexes_np = np.full([nTimes, nBasins, nClimate], np.nan)
for b, basin in enumerate(basins):
    climate_indexes_np[:,b,:] = climate_indexes[basin].values
assert np.all(~np.isnan(climate_indexes_np))

# Find Extreme Years in Each Basin by Each Climate Index

In [45]:
# find water years
start_mask = ((dates.month == 10) & (dates.day == 1))
water_year_start_dexes = np.where(start_mask)[0][:-1]
end_mask = ((dates.month == 9) & (dates.day == 30))
water_year_end_dexes = np.where(end_mask)[0][1:]

# list water years
years = np.unique(dates[start_mask].year)
years = years[:-1]
nYears = len(years)
print(f'There are {nYears} water years.')

There are 30 water years.


In [46]:
# pull and sort water years from climate indexes
climate_water_years = climate_indexes_np[water_year_start_dexes,:,:]
sorted_climate_water_years_indexes = np.argsort(climate_water_years, axis=0)

In [47]:
# save climate indexes for analysis script
climate_water_years = {}
for basin in enumerate(basins):
    year_start_dexes = np.where((climate_indexes[basin].index.month == 1) & (climate_indexes[basin].index.day == 1))
    climate_water_years[basin].iloc[year_start_dexes]

file = 'notebook_env_saves/extreme_year_climate_indexes.pkl'
with open(file, 'wb') as f:
    pkl.dump(climate_water_years, f)

KeyError: (0, '01022500')

# Create Training Dates Lists

In [None]:
def filter_years(original_list, remove_first_n, skip_n):
    remove_list = []
    for i in range(remove_first_n):
        remove_list.append(original_list[i])
        remove_list.append(original_list[i]+1)
        remove_list.append(original_list[i]-1)
    pruned_list = [ele for ele in original_list if ele not in remove_list] 
    return pruned_list

In [None]:
def train_test_split(idx, htype, skip):

    if htype == 'low':
        test = idx[:n_test_years]
        pruned_list = filter_years(idx, n_test_years, skip)
        train = pruned_list[skip:skip+n_train_years]
    
    elif htype =='high':
        test = idx[-n_test_years:]
        idx.reverse()
        pruned_list = filter_years(idx, n_test_years, skip)
        train = pruned_list[skip:skip+n_train_years]

    elif htype == 'median':
        sdex = int(np.ceil((len(idx) - n_test_years) / 2))
        edex = int(np.floor((len(idx) - n_test_years) / 2))
        test = idx[sdex:-edex]
        train = test.copy()
        for step in range(len(idx)):
            try:
                train.append(idx[sdex-step])
                train.append(idx[edex+step+1])
            except:
                pass
        pruned_list = filter_years(train, n_test_years, skip)
        train = pruned_list[skip:skip+n_train_years]

    return test, train


In [None]:
# test dates
test_dates = {}
train_dates = {}
for i, index in enumerate(tqdm(climate_index_names)):
    for htype in holdout_types:
        for skip in n_skip_years:
            
            test_dates[(index,htype,skip)] = {}
            train_dates[(index,htype,skip)] = {}

            test_dates[(index,htype,skip)]['start_dates'] = {}
            test_dates[(index,htype,skip)]['end_dates'] = {}
            train_dates[(index,htype,skip)]['start_dates'] = {}
            train_dates[(index,htype,skip)]['end_dates'] = {}
            
            for b, basin in enumerate(basins):
                test_dex, train_dex = train_test_split(list(sorted_climate_water_years_indexes[:,b,i]), htype, skip)
                
                test_dates[(index,htype,skip)]['start_dates'][basin] = dates[water_year_start_dexes[test_dex]]
                test_dates[(index,htype,skip)]['end_dates'][basin] = dates[water_year_end_dexes[test_dex]]
                
                train_dates[(index,htype,skip)]['start_dates'][basin] = dates[water_year_start_dexes[train_dex]]
                train_dates[(index,htype,skip)]['end_dates'][basin] = dates[water_year_end_dexes[train_dex]]
                

# Create Train/Test Dates Files

In [None]:
for i, index in enumerate(tqdm(climate_index_names)):
    for htype in holdout_types:
        for skip in n_skip_years:
            test_fname = Path(f'{working_dir}/extreme_year_dates/test_{index}_{htype}_{skip}.pkl')
            with open(test_fname, 'wb') as f:
                pkl.dump(test_dates[(index,htype,skip)], f)
            
            train_fname = Path(f'{working_dir}/extreme_year_dates/train_{index}_{htype}_{skip}.pkl')
            with open(train_fname, 'wb') as f:
                pkl.dump(train_dates[(index,htype,skip)], f)
             

In [None]:
train_dates[(index,htype,skip)]['start_dates']

# Create Config Files

In [None]:
base_config_file = Path(f'{working_dir}/configs/extreme_years.yml')

In [None]:
use_climate_indexes = ['p_mean_dyn', 'aridity_dyn']
for index in use_climate_indexes:
    print(index)

In [None]:
# number of ensembles
num_seeds = 8
first_seed = 200
seeds = list(range(first_seed, first_seed + num_seeds))
seeds

In [None]:
# cretate training files for high-end of climate indexes
for index in use_climate_indexes:
    for htype in holdout_types:
        for skip in n_skip_years:
            for seed in seeds:

                # read basefile
                with open(base_config_file, 'r') as file :
                    filedata = file.read()

                # replace experiment name
                exp_name = f'{index}_{htype}_{skip}_{seed}'
                filedata = filedata.replace('extreme_years', exp_name)

                # replace train dates file
                train_dates_fname = f'train_{index}_{htype}_{skip}.pkl'
                filedata = filedata.replace('train_high_p_mean_dyn_0.pkl', train_dates_fname)

                # replace test dates file
                test_dates_fname = f'test_{index}_{htype}_{skip}.pkl'
                filedata = filedata.replace('test_high_p_mean_dyn_0.pkl', test_dates_fname)

                # replace random seed
                filedata = filedata.replace('seed: 100', f'seed: {seed}')

                # write to new config file
                new_config_file = Path(f'{working_dir}/extreme_year_configs/{exp_name}.yml')
                with open(new_config_file, 'w') as file:
                    file.write(filedata)