In [1]:
import datetime
import os
import sys

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from tqdm.notebook import tqdm

# Make the project-local modules under ../src importable before importing them.
sys.path.append("../src")

from dataio import *

In [2]:
def make_df(idx, daily=False):
    """Build the paired (x, y) frame for one dam.

    ``x`` is whatever ``load_dam_rivout(idx, daily=daily)`` returns and ``y``
    is the cleaned discharge assembled from ``load_dam_discharge(idx)``.

    Parameters
    ----------
    idx
        Dam identifier forwarded unchanged to the ``load_dam_*`` helpers.
    daily : bool, default False
        When True, the discharge is averaged per calendar date and
        ``daily=True`` is forwarded to ``load_dam_rivout``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'x'`` (rivout) and ``'y'`` (discharge), restricted to
        the index labels where both values are present.
    """
    rivout = load_dam_rivout(idx, daily=daily)

    # Load the discharge pieces once and stitch them together.
    discharge = pd.concat(load_dam_discharge(idx))

    # Missing-data handling: negative readings are treated as missing, gaps of
    # up to 3 consecutive samples are forward-filled, and whatever is still
    # missing is dropped.
    discharge = discharge.mask(discharge < 0).ffill(limit=3).dropna()

    if daily:
        # Collapse the timestamps to calendar dates, average per date, then
        # restore a datetime index so it aligns with the daily rivout.
        # NOTE(review): assumes the discharge index is a DatetimeIndex — confirm.
        discharge.index = discharge.index.date
        discharge.index.name = 'date'
        discharge = discharge.groupby(level=['date']).mean()
        discharge.index = pd.to_datetime(discharge.index)

    # Align both series on the index and keep only fully observed rows.
    df = pd.concat([rivout, discharge], axis=1).dropna(how='any')
    df.columns = ['x', 'y']
    return df

In [3]:
# Load the per-dam details table, then keep the ids of dams whose flags say:
# data is not lacking, a network exists, no other dams, and a gauge is available.
df = pd.read_pickle('data_details.pkl')
idxs = df.loc[
    ~df['lack of data']
    & df['network exist']
    & ~df['other dams']
    & df['gauge available']
].index

## 1. Daily Evaluation

In [4]:
# Dam id excluded from the evaluation.
# NOTE(review): the reason for skipping this id is not recorded in this notebook — confirm.
SKIP_IDX = 1368060475060
# Rows strictly before this timestamp are used to fit the calibration.
TRAIN_END = datetime.datetime(2018, 9, 1)

results = {}

for idx in tqdm(idxs):
    if idx == SKIP_IDX:
        continue

    # Dedicated name so the notebook-level `df` (dam details table) is not overwritten.
    dam_df = make_df(idx, daily=True)
    xy = dam_df.values
    x = xy[:, 0].reshape(-1, 1)
    y = xy[:, 1].reshape(-1, 1)

    # Training rows: everything before TRAIN_END.
    msk_tr = dam_df.index < TRAIN_END
    train_x, train_y = x[msk_tr], y[msk_tr]

    # Test rows: the timestamps of get_prediction(idx), looked up once.
    test = dam_df.loc[get_prediction(idx).index]
    test_x = test['x'].values.reshape(-1, 1)

    # calibration: fit a linear map from x (rivout) to y (discharge)
    lr = LinearRegression()
    lr.fit(train_x, train_y)
    predict = lr.predict(test_x).reshape(-1,)

    results[idx] = pd.DataFrame(
        {"pred": predict, "y": test['y'].values.reshape(-1,)},
        index=test.index,
    )

# save results: one pickle per dam, written only after every dam was processed
dirname = "./predictions/daily/CaMa-Flood"
os.makedirs(dirname, exist_ok=True)

for k, result in results.items():
    result.to_pickle(f"{dirname}/{k}.pkl")

  0%|          | 0/86 [00:00<?, ?it/s]

In [5]:
# Column-wise mean of what get_results returns for the CaMa-Flood model
# (the output below lists RMSE, MSE, NSE, KGE and bias).
# NOTE(review): presumably one row per dam, read from the saved daily predictions — confirm.
get_results(model='CaMa-Flood').mean()

RMSE     14.247957
MSE     573.989450
NSE       0.439285
KGE       0.525098
bias     -1.948944
dtype: float64

## 2. Hourly Evaluation

In [6]:
# Dam id excluded from the evaluation.
# NOTE(review): the reason for skipping this id is not recorded in this notebook — confirm.
SKIP_IDX = 1368060475060
# Rows strictly before this timestamp are used for fitting; the rest are evaluated.
TRAIN_END = datetime.datetime(2018, 9, 1)

results = {}

for idx in tqdm(idxs):
    if idx == SKIP_IDX:
        continue

    # use daily data to get bias and scale
    # Dedicated names so the notebook-level `df` (dam details table) is not overwritten.
    daily_df = make_df(idx, daily=True)
    daily_xy = daily_df.values
    fit_mask = daily_df.index < TRAIN_END

    # calibration: fit a linear map from x (rivout) to y (discharge) on the daily training rows
    lr = LinearRegression()
    lr.fit(
        daily_xy[fit_mask, 0].reshape(-1, 1),
        daily_xy[fit_mask, 1].reshape(-1, 1),
    )

    # HOURLY prediction: apply the daily-fitted model to the hourly rows that
    # are not before TRAIN_END.
    hourly_df = make_df(idx, daily=False)
    hourly_xy = hourly_df.values
    test_mask = ~(hourly_df.index < TRAIN_END)

    test_x = hourly_xy[test_mask, 0].reshape(-1, 1)
    test_y = hourly_xy[test_mask, 1].reshape(-1,)
    predict = lr.predict(test_x).reshape(-1,)

    results[idx] = pd.DataFrame(
        {"pred": predict, "y": test_y},
        index=hourly_df.index[test_mask],
    )

# save results: one pickle per dam, written only after every dam was processed
dirname = "./predictions/hourly/CaMa-Flood"
os.makedirs(dirname, exist_ok=True)

for k, result in results.items():
    result.to_pickle(f"{dirname}/{k}.pkl")

  0%|          | 0/86 [00:00<?, ?it/s]

In [7]:
# Column-wise mean of what get_results returns for the CaMa-Flood model with hourly=True
# (the output below lists RMSE, MSE, NSE, KGE and bias).
# NOTE(review): presumably one row per dam, read from the saved hourly predictions — confirm.
get_results(model='CaMa-Flood', hourly=True).mean()

RMSE      24.230551
MSE     1193.432396
NSE       -0.044781
KGE        0.415215
bias      -1.298303
dtype: float64