In [1]:
import pandas as pd
import numpy as np
from utils.concentration import rainfall_events, emc_cal, conc_interpolate, event_emc
import datetime

# read the discrete storm events
# Read daily loads and flow
from common_settings import obspath, outpath, events_name, \
    obs_events, day_load_flow, hour_load_flow, conct_name, modpath, mod_load_flow

In [2]:
from utils.concentration import cumulative_lq, excel_save
from utils.signatures import update_cumul_df, load_flow_loc

## Linear regression of C-Q

In [3]:
# import necessary packages
from sklearn.metrics import r2_score
from utils.signatures import residual, nonlinear_fit
from utils.plotting import regression_plot
import lmfit

In [4]:
# define x and y
time_range = [['/7/1', '/10/1'], ['/10/1', '/1/1'], ['/1/1', '/4/1'], ['/4/1', '/7/1']]
cols = day_load_flow.columns
# day_load_flow.loc[:, cols[0]] = day_load_flow.loc[:, cols[0]]*1000
cols

Index(['Loads (kg)', 'Concentration (mg/L)', 'Flow (ML)', 'Flow  (Cumecs)',
       'Load(t)'],
      dtype='object')

In [21]:
x_dict, y_dict = {}, {}
k = 1
for tt in time_range:
    x, y = np.array([]), np.array([])
    for year in range(2009, 2019):
        start = pd.to_datetime(f'{year}{tt[0]}')
        if tt[1] == '/1/1':
            end = pd.to_datetime(f'{year+1}{tt[1]}') - datetime.timedelta(days=1)
        else:
            end = pd.to_datetime(f'{year}{tt[1]}') - datetime.timedelta(days=1)
        df_temp = load_flow_loc([start, end], day_load_flow, timestep ='d')
        x = np.append(x, df_temp.values[:, 2])
        y = np.append(y, df_temp.values[:, 0])
    x_dict[f'{k}_x'] = x
    x_dict[f'{k}_y'] = y
    k += 1

In [22]:
# variables are x and y
index_labs=['R2', 'a', 'b', 'c', 'a_error', 'b_error', 'c_error']
coeff_regress = pd.DataFrame(columns = np.arange(1, 5), index=index_labs)
columns=['value', 'min', 'max', 'stderr', 'vary', 'expr', 'brute_step']
for k in range(1, 5):
    x = x_dict[f'{k}_x']
    y = x_dict[f'{k}_y']
    p = lmfit.Parameters()
    p.add_many(('a', 0.1, True, 0, 10), ('b', 2, True, 0, 2), ('c', 0, True, 0, 10))
    out1, out2, ci, trace = nonlinear_fit(p, residual, x, y, opti_method='differential_evolution')# lmfit, x=x_input, y=y_output,
    
    #  Extract standard error of coefficients
    for name, values in sorted(out2.params.items()):
        pvalues = {j: getattr(values, j) for j in columns}
        # stderr is a special case: it is either numeric or None (i.e. str)
        coeff_regress.loc[name + '_error':, k] = pvalues['stderr'] 

    # compare coefficient of determination
    para_values = {}
    for param in ['a', 'b', 'c']: 
        para_values[param] = np.round(trace['a'][param][0], 4)
    y_mod = para_values['a'] * x ** para_values['b']+ para_values['c']
    r2 = r2_score(np.log(y), np.log(y_mod))
    abs_bias = np.abs(np.average(y_mod - y))
    rel_bias = abs_bias / np.average(y)
    coeff_regress.loc[index_labs[0:4], k] = [r2, para_values['a'], para_values['b'], para_values['c']]

  spercent = '({:.2%})'.format(abs(par.stderr/par.value))


[[Variables]]
    a:  0.31785112 +/- 0.05376697 (16.92%) (init = 0.3178511)
    b:  1.04411256 +/- 0.04606186 (4.41%) (init = 1.044113)
    c:  0.00000000 +/- 1.1682e-09 (inf%) (init = 0)
[[Correlations]] (unreported correlations are < 0.100)
    C(a, b) = -0.952
    C(a, c) = -0.794
    C(b, c) =  0.675


  warn(errmsg)


      95.45%    68.27%    _BEST_    68.27%    95.45%
 a:  -0.04761  -0.00435   0.31785  +0.01180  +0.02462
 b:  -0.00896  -0.00242   1.04411  +0.01097  +0.02300
 c:      -inf      -inf   0.00000  +0.01769  +0.06771


  spercent = '({:.2%})'.format(abs(par.stderr/par.value))


[[Variables]]
    a:  0.24968621 +/- 0.03539623 (14.18%) (init = 0.2496862)
    b:  1.06874892 +/- 0.03341643 (3.13%) (init = 1.068749)
    c:  0.00000000 +/- 6.8172e-10 (inf%) (init = 0)
[[Correlations]] (unreported correlations are < 0.100)
    C(a, b) = -0.886
    C(a, c) = -0.813
    C(b, c) =  0.629


  warn(errmsg)


      95.45%    68.27%    _BEST_    68.27%    95.45%
 a:  -0.02163  -0.01103   0.24969  +0.01150  +0.02351
 b:  -0.02464  -0.01220   1.06875  +0.01202  +0.02386
 c:      -inf      -inf   0.00000  +0.00401  +0.01608


  spercent = '({:.2%})'.format(abs(par.stderr/par.value))


[[Variables]]
    a:  0.88971081 +/- 0.07462014 (8.39%) (init = 0.8897108)
    b:  0.78196957 +/- 0.01383442 (1.77%) (init = 0.7819696)
    c:  0.00000000 +/- 1.4798e-09 (inf%) (init = 0)
[[Correlations]] (unreported correlations are < 0.100)
    C(a, b) = -0.943
    C(a, c) = -0.557
    C(b, c) =  0.481


  warn(errmsg)


      95.45%    68.27%    _BEST_    68.27%    95.45%
 a:  -0.04542  -0.02299   0.88971  +0.02714  +0.04794
 b:  -0.00917  -0.00459   0.78197  +0.00460  +0.00920
 c:      -inf      -inf   0.00000  +0.03787  +0.15094


  spercent = '({:.2%})'.format(abs(par.stderr/par.value))


[[Variables]]
    a:  1.40701102 +/- 0.12209723 (8.68%) (init = 1.407011)
    b:  0.78150124 +/- 0.02046426 (2.62%) (init = 0.7815012)
    c:  0.00000000 +/- 1.7560e-09 (inf%) (init = 0)
[[Correlations]] (unreported correlations are < 0.100)
    C(a, b) = -0.960
    C(a, c) = -0.455
    C(b, c) =  0.396


  warn(errmsg)


      95.45%    68.27%    _BEST_    68.27%    95.45%
 a:  -0.05131  -0.03237   1.40701  +0.03296  +0.06672
 b:  -0.01521  -0.00375   0.78150  +0.00560  +0.01170
 c:      -inf      -inf   0.00000  +0.06085      +inf


  warn(errmsg)


In [24]:
coeff_regress.to_csv(outpath+'obs_cl_regress_error.csv')

## Calculate Mass First Flush Ratio (MFFR)

In [3]:
# Read cumulative of normalized data
fn_day = 'obs_year_cumulative_ratio_day'
df_day = pd.read_excel(f'{outpath}{fn_day}.xlsx', sheet_name=[f'obs_year_{i}' for i in range(9)]);
fn_mod = 'mod_year_cumulative_ratio_day'
df_mod = pd.read_excel(f'{outpath}{fn_mod}.xlsx', None);

In [4]:
import scipy.interpolate
def cal_mffr(r_flow, r_load, flow_ratio_define = 0.20):
    """Obtain the flow ratio closest to flow_ratio_define (here, 0.20 if not given)"""
    def find_nearest(array, value):
        idx = np.searchsorted(array, value, side='right')
        return idx
    
    # Interpolate to obtain the load ratio when the flow ratio is at flow_ratio_define
    idx = find_nearest(r_flow, flow_ratio_define)
    y_interp = scipy.interpolate.interp1d(r_flow[idx-1:idx+1], r_load[idx-1:idx+1])
    r_load_res = y_interp(flow_ratio_define)
    
    return r_load_res / flow_ratio_define

In [7]:
# Calculate MFFR for all years with observed and simulated datasets.
mffr = pd.DataFrame(index = np.arange(2009, 2018), columns = ['obs', 'mod'])
ii = 2009
for k, v in df_day.items():
    mod_key = 'mod_storm' + k[-2:]
    mffr.loc[ii, 'obs'] = cal_mffr(v.cumul_flow_ratio, v.cumul_load_ratio, flow_ratio_define = 0.30)
    mffr.loc[ii, 'mod'] = cal_mffr(df_mod[mod_key].cumul_flow_ratio, df_mod[mod_key].cumul_load_ratio, flow_ratio_define = 0.30)
    ii += 1
mffr.to_csv(outpath + 'mffr_30.csv')

## Calculate the cumulative ratio for each event.

In [3]:
time_ranges = [[f'{year}/7/1', f'{year+1}/6/30'] for year in range(2009, 2020)]
# Read loads explained by each parameter.
fn_cons = 'contribution_each_param.csv'
data_cons = pd.read_csv(f'../output/{fn_cons}')
data_cons['Flow (Cumecs)'] = day_load_flow['Flow (Cumecs)'].values
data_cons.set_index('Date', inplace=True)


In [4]:
for col in data_cons.columns[1:-1]:
    data_temp = data_cons.filter([col, data_cons.columns[-1]])
    double_mass_ratio = {}
    for ii in range(0, len(time_ranges)-2):
        df_temp = load_flow_loc(time_ranges[ii], data_temp, timestep='d')
        df_temp = update_cumul_df(df_temp, df_temp.values[:, 0], df_temp.values[:, 1])
        double_mass_ratio[f'obs_year_{time_ranges[ii][0][0:4]}'] = df_temp
        # save outputs into one excel
    fn = f'{outpath}{col}_cumulative_ratio_day.xlsx'
    excel_save(double_mass_ratio, fn, True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'cumul_flow_ratio'] = cumulative_ratio['flow_ratio']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'cumul_load_ratio'] = cumulative_ratio['loads_ratio']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'cumul_flow_ratio'] = cumulative_ratio['flow_ratio']
A value is tryi