In [None]:
#------------------------------------------------------------------#
#------------------------- User Settings --------------------------#
#------------------------------------------------------------------#

#### 1. Input variables fed to the network. Pick any subset of:
# 'SAC','Exp','SJR','DICU','Vern','SF_Tide','DXC'

input_var = [
    'SAC',
    'Exp',
    'SJR',
    'DICU',
    'Vern',
    'SF_Tide',
    'DXC',
]

#### 2. Output stations to predict. Pick any subset of:
# 'Emmaton', 'Jersey Point', 'Collinsville', 'Rock Slough', 'Antioch', 'Mallard',
# 'LosVaqueros', 'Martinez', 'MiddleRiver', 'Vict Intake', 'CVP Intake', 'CCFB_OldR'

output_stations = [
    'Emmaton', 'Jersey Point', 'Collinsville', 'Rock Slough',
    'Antioch', 'Mallard', 'LosVaqueros', 'Martinez',
    'MiddleRiver', 'Vict Intake', 'CVP Intake', 'CCFB_OldR',
]

#### 3. Folder (name only) that holds the CSV datasets and the helper script
google_drive_dir = 'python_ANN'

#### 4. Number of nodes in each hidden layer (one list entry per layer)
hidden_layer_nodes = [8*12, 2*12]

#### 5. Lead days (days forecast ahead) for each station
lead_day = 0
# Every station starts with the same lead; to change a single station,
# override its entry afterwards, e.g. lead_day_for_station['Martinez'] = 1
lead_day_for_station = dict.fromkeys(
    ('Emmaton', 'Jersey Point', 'Collinsville', 'Rock Slough',
     'Antioch', 'Mallard', 'LosVaqueros', 'Martinez',
     'MiddleRiver', 'Vict Intake', 'CVP Intake', 'CCFB_OldR'),
    lead_day)

#------------------------------------------------------------------#
#------------------- User Settings Finished -----------------------#
#------------------------------------------------------------------#


### Set this to False if you want to train locally instead of on Google Colab.
data_in_google_drive = True

import os
import numpy as np

if data_in_google_drive:
    # Imported only on this branch: google.colab is provided by the Colab
    # runtime, so importing it unconditionally would make the local-training
    # option above fail before any path is set.
    from google.colab import drive

    # Mount Google Drive (forcing a remount) and point at the datasets in
    # 'My Drive/<google_drive_dir>'.
    drive.mount('/content/drive',force_remount=True)
    input_data_path = os.path.join('/content/drive','My Drive',google_drive_dir,"input_2.csv")
    output_data_path = os.path.join('/content/drive','My Drive',google_drive_dir,"output_2.csv")
else:
    ### Local training: set these to the dataset paths on your machine.
    input_data_path = '/Users/siyuqi/.spyder-py3/DSM2/input_2.csv'
    output_data_path = '/Users/siyuqi/.spyder-py3/DSM2/output_2.csv'

# Check user settings

In [None]:
# Every name that may appear in the user settings
full_input_variable_list = ['SAC','Exp','SJR','DICU','Vern','SF_Tide','DXC']
full_output_station_list = ['Emmaton', 'Jersey Point', 'Collinsville', 'Rock Slough', 'Antioch', 'Mallard',
                            'LosVaqueros', 'Martinez', 'MiddleRiver', 'Vict Intake', 'CVP Intake', 'CCFB_OldR']


def _assert_valid_selection(selected, allowed, duplicate_prefix, invalid_prefix):
    """Assert that `selected` has no repeated entries and only names from `allowed`.

    The assertion message is the given prefix followed by the offending names.
    """
    repeated = {name for name in selected if selected.count(name) > 1}
    assert len(set(selected)) == len(selected), (
        duplicate_prefix + ", ".join(str(e) for e in repeated))

    unknown = {name for name in selected if allowed.count(name) == 0}
    assert len(set(selected) & set(allowed)) == len(selected), (
        invalid_prefix + ", ".join(str(e) for e in unknown))


# check if input variables are valid
_assert_valid_selection(input_var, full_input_variable_list,
                        'Duplicate input variable(s): ', 'Invalid input variable(s): ')

# check if output stations are valid
_assert_valid_selection(output_stations, full_output_station_list,
                        'Duplicate output station(s): ', 'Invalid output station(s): ')


# Import helper functions

In [None]:
import sys
# NOTE(review): a `!` line is executed in its own subshell, so this export
# presumably does not change the environment of the running kernel — confirm
# whether it is still needed.
!export PYTHONPATH=""

# Add the Drive folder to the module search path so the `ann_helper` import
# that follows can be resolved from there.
sys.path.append(os.path.join('/content/drive','My Drive',google_drive_dir))

from ann_helper import read_csv,normalize_in,process_data,process_data_vary_pred,initnw,conv_filter_generator
import tensorflow as tf
import math
from sklearn.model_selection import train_test_split
import time
# from sklearn.utils import shuffle
from scipy import stats

# Stop right away when TensorFlow does not report the first GPU device.
# (To allow CPU runs, replace the raise with a warning print.)
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

# Read and prepare data

In [None]:
# Layout of the input history handed to the helper functions and the conv layer:
# `single_daily` values plus `window_num` windows of `window_width` values each
# (presumably days — confirm against ann_helper).
single_daily = 8
window_num = 10
window_width = 11

# One network output per selected station
output_shape = len(output_stations)
input_shape = (1, len(input_var)*(single_daily+window_num))

# Layer sizes: input history length, the hidden layers, then the outputs
nn_shape = [single_daily+window_width*window_num, *hidden_layer_nodes, output_shape]



# Canonical position of every station; the selection is arranged in this order
locs = {station: position for position, station in enumerate([
    'Emmaton', 'Jersey Point', 'Collinsville', 'Rock Slough', 'Antioch',
    'Mallard', 'LosVaqueros', 'Martinez', 'MiddleRiver', 'Vict Intake',
    'CVP Intake', 'CCFB_OldR'])}

# Lower-case station name -> abbreviation
abbrev_map = {
    'rock slough': 'ORRSL',
    'rockslough': 'ORRSL',
    'emmaton': 'EMM',
    'jersey point': 'JP',
    'jerseypoint': 'JP',
    'antioch': 'antioch',
    'collinsville': 'CO',
    'mallard': 'Mallard',
    'mallard island': 'Mallard',
    'los vaqueros': 'LosVaqueros',
    'losvaqueros': 'LosVaqueros',
    'martinez': 'MTZ',
    'middle river': 'MidR_intake',
    'middleriver': 'MidR_intake',
    'victoria cannal': 'Victoria_intake',
    'vict intake': 'Victoria_intake',
    'cvp intake': 'CVP_intake',
    'clfct forebay': 'CCFB',
    'clfct forebay intake': 'ccfb_intake',
    'x2': 'X2',
}

output_stations = sorted(output_stations, key=locs.__getitem__)
# NOTE(review): the keys of abbrev_map are lower-case while the selectable
# station names are capitalised, so this lookup leaves valid station names
# unchanged. The lead-day lookup below indexes by the full names and relies on
# that — confirm before "fixing" the case mismatch.
output_stations = [abbrev_map.get(name, name) for name in output_stations]

# Lead days in the same order as the sorted stations
pred = [lead_day_for_station[name] for name in output_stations]

# Per-station loss weights: every selected station gets the same weight.
# The array is sized from the selection rather than a fixed 12 so that it
# broadcasts against the model output when only some stations are chosen.
class_weights = np.ones(len(output_stations))
# To emphasise one station, raise its weight, e.g.:
# if 'Martinez' in output_stations:
#   class_weights[output_stations.index('Martinez')] = 10


# Short names used in the settings -> names handed to read_csv
# (presumably the dataset column names — confirm against ann_helper).
input_var_map = {'SAC':'SAC 0', 'Exp':'Exports',
                 'SJR':'SJR','DICU':'DICU','Vern':'Vern EC',
                 'SF_Tide':'SF_Tide', 'DXC':'DXC'}
# Canonical ordering of the mapped names
input_var_order = {'SAC 0':0, 'Exports':1,
                 'SJR':2,'DICU':3,'Vern EC':4,
                 'SF_Tide':5, 'DXC':6}

# Names that are already in mapped form pass through unchanged, so running
# this cell a second time keeps working.
input_var = [input_var_map.get(var, var) for var in input_var]

# Fail with a clear message instead of printing and carrying on with names
# that cannot be ordered.
unknown_input_var = [var for var in input_var if var not in input_var_order]
if unknown_input_var:
    raise ValueError(
        'Input variables can only be selected in: SAC, Exp, SJR, DICU, Vern, SF_Tide, DXC. '
        'But got %s.' % ', '.join(str(var) for var in unknown_input_var))

# sorted() returns a new list; assign it so the canonical order is actually used
input_var = sorted(input_var,key=lambda x: input_var_order[x])


# Load the selected input variables and output stations from the csv files
x_data, y_data = read_csv(input_data_path, output_data_path, input_var, output_stations)
# When True, later cells train on the first 100 samples for 5 epochs only
test_mode = True

# Rescale inputs and outputs to 0.1 ~ 0.9, keeping the slope and bias of each scaling
x_data, x_slope, x_bias = normalize_in(x_data)
y_data, y_slope, y_bias = normalize_in(y_data)

# Build the input/output sample pairs using the per-station lead days in `pred`
# (process_data with the same first five arguments is the variant without per-station leads)
history_len = single_daily + window_num*window_width
x_data, y_data = process_data_vary_pred(x_data, y_data, history_len, 0, 0, predict_list=pred)

# Determine hyper-parameters

In [None]:
# Mini-batch size used when fitting with the Adam optimizer
batch_size = 32

train_loc = locs[output_stations[0]]
# Model name: at most four characters per station, taken from the abbreviation
# when the lower-cased station name has one and from the station name otherwise
ann_name = '_'.join(abbrev_map.get(station.lower(), station)[:4] for station in output_stations)
start = time.time()


# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
x_train_ori, x_test_ori, y_train0, y_test0 = train_test_split(
    x_data, y_data, test_size=0.2, random_state=0)

# Quick run: only the first 100 samples of each split and a handful of epochs
epochs = 5 if test_mode else 200
if test_mode:
    x_train_ori, x_test_ori, y_train0, y_test0 = [
        subset[:100] for subset in (x_train_ori, x_test_ori, y_train0, y_test0)]

train_err = []
test_err = []
train_shape = len(x_train_ori)

def _scheduled_lr(epoch):
    """Return the learning rate assigned to `epoch`, without printing anything."""
    progress = epoch/epochs
    lr = 1e-3
    if progress > 0.9:
        lr *= 0.5e-3
    elif progress > .8:
        lr *= 1e-3
    elif progress > .6:
        lr *= 1e-2
    elif progress > .4:
        lr *= 1e-1
    elif progress < .05:
        # NOTE(review): this warm-up gives a rate of 0 at epoch 0 and goes above
        # the base rate once epoch > 5 — confirm that both are intended.
        lr *= epoch/5
    return lr


def lr_schedule(epoch):
    """Learning Rate Schedule

    The base rate is 1e-3. It is scaled by epoch/5 during the first 5% of
    training and reduced once more than 40%, 60%, 80% and 90% of the global
    `epochs` have passed (i.e. after epochs 80, 120, 160, 180 when
    `epochs` is 200).
    Called automatically every epoch as part of callbacks during training.

    The rate is printed on every warm-up epoch and afterwards only on the
    epoch where it actually differs from the previous epoch's rate.

    # Arguments
        epoch (int): Zero-based index of the epoch about to start

    # Returns
        lr (float32): learning rate
    """
    lr = _scheduled_lr(epoch)
    if epoch/epochs < .05:
        print('Learning rate: ', lr)
    elif lr != _scheduled_lr(epoch - 1):
        # Compare with the previous epoch instead of testing the progress ratio
        # for float equality with the thresholds: the thresholds are strict
        # (`>`), so the rate only changes on the epoch after the ratio hits them.
        print('Learning rate changed to: ', lr)
    return lr


# Loss factory: mean squared error with one weight per output
def weighted_mse(class_weights):
    """Build a loss function that weights the squared error of each output.

    :param class_weights: weights multiplied element-wise with the squared error
    :return: function ``custom_mse(y_true, y_pred)`` usable as a Keras loss
    """
    def custom_mse(y_true, y_pred):
        """
        :param y_true: A tensor of target values with the same shape as `y_pred`
        :param y_pred: A tensor of model predictions
        :return: Tensor of weighted squared errors averaged over the last axis.
        """
        backend = tf.keras.backend
        weighted_sq_err = class_weights * backend.square(y_pred - y_true)
        return backend.mean(weighted_sq_err, axis=-1)
    return custom_mse

# Build and train ANN

In [None]:
# Build model
# The conv kernel starts from the pre-determined filter produced by the helper
conv_filter_init = tf.constant_initializer(
    conv_filter_generator(single_days=single_daily,
                          window_num=window_num,
                          window_size=window_width))

# One row per input variable, one column per value of the input history
inputs = tf.keras.Input(shape=(len(input_var), single_daily+window_num*window_width))
conv_layer = tf.keras.layers.Conv1D(
    single_daily+window_num, 1,
    activation='relu',
    kernel_initializer=conv_filter_init,
    kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0, l2=0))
hidden = conv_layer(inputs)
hidden = tf.keras.layers.Flatten()(hidden)

# Fully connected sigmoid layers, sized by `hidden_layer_nodes`
for width in hidden_layer_nodes:
    hidden = tf.keras.layers.Dense(width, activation='sigmoid')(hidden)
# One output per station, passed through a leaky ReLU
hidden = tf.keras.layers.Dense(output_shape)(hidden)
outputs = tf.keras.layers.LeakyReLU(alpha=0.3)(hidden)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=weighted_mse(class_weights),
    metrics=[tf.keras.metrics.MeanAbsolutePercentageError()])

model.summary()

# define callbacks
checkpoint_filepath = os.path.join('/content/drive','My Drive',google_drive_dir,"models/Conv_MTL/%s/model.ckpt"%ann_name)
# Applies lr_schedule at the start of every epoch
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_schedule, verbose=0)
# Saves the weights whenever the validation loss reaches a new minimum
best_weights_saver = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)
callbacks = [lr_scheduler, best_weights_saver]

# Train, timing the whole fit
start = time.time()
history_callback = model.fit(
    x_train_ori, y_train0,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test_ori, y_test0),
    callbacks=callbacks,
    verbose=2)
end = time.time()
print('Training finished in %d seconds' % (end-start) )

# Print results in terms of mean squared error (MSE) and mean absolute percentage error (MAPE) on training and test sets


In [None]:
print("With trainable conv layer (initiated with pre-determined weights):\n")

# Predictions are clipped to the [0, 1] interval before scoring
y_train_pred = np.clip(model.predict(x_train_ori),0,1)
y_test_pred = np.clip(model.predict(x_test_ori),0,1)

# Table layout: five left/centre aligned cells of equal width
cell_width = 15
header_format = "|" + "|".join(["{:^%d}" % cell_width]*5) + "|"
row_format = "|" + "|".join(
    "{:<%d%s}" % (cell_width, spec) for spec in ('', '.1f', '.2%', '.1f', '.2%')) + "|"
rule = "-"*(cell_width*5+6)


def _station_errors(predicted, actual, slope):
    """Return (MSE, MAPE) for one station.

    The squared error is divided by `slope` before averaging; the percentage
    error is taken on the values as given.
    NOTE(review): MAPE therefore stays on the normalised scale while MSE is
    rescaled — confirm this is intended.
    """
    mse = np.mean(((predicted-actual)/slope)**2)
    mape = np.mean(abs(predicted-actual)/actual)
    return mse, mape


# Header is only printed when there is at least one station row to show
if output_shape > 0:
    print(rule)
    print(header_format.format('Station', 'Train MSE', 'Train MAPE', 'Test MSE', 'Test MAPE'))
    print(rule)
    print(rule)

for col in range(output_shape):
    train_mse, train_mape = _station_errors(y_train_pred[:,col], y_train0[:,col], y_slope[col])
    test_mse, test_mape = _station_errors(y_test_pred[:,col], y_test0[:,col], y_slope[col])

    print(row_format.format(output_stations[col],train_mse, train_mape, test_mse, test_mape))
    print(rule)
