In [30]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

# PyRQP (forward calculation)

## Input parameters

In [31]:
# River
riv_flow_mean = 100
riv_flow_sd = 93  # TODO Need to use 5th percentile
riv_wq_mean = 2
riv_wq_sd = 1

# Discharge
dis_flow_mean = 20
dis_flow_sd = 8
dis_wq_mean = 15
dis_wq_sd = 7

# Correlations
corr_riv_dis_flow = 0.6
corr_riv_flow_wq = -0.3
corr_dis_flow_wq = -0.2

In [32]:
# from scipy.stats import norm
# https://stats.stackexchange.com/questions/586114/percentile-of-the-arithmetic-mean-of-a-lognormal-distribution
# norm.ppf(0.05, loc=100, scale=59)
# np.log(20)

## Prepare all functionality

In [33]:
def calculate_std_from_percentile(lg_mean, lg_perc):
    """ """
    # Look at stackoverflow answer
    return lg_sd

In [34]:
def transform_log_to_normal(lg_mean, lg_sd):
    """ """
    mean = np.log(lg_mean / ((1 + ((lg_sd**2) / (lg_mean**2))) ** 0.5))
    sd = (np.log(1 + (lg_sd**2) / (lg_mean**2))) ** 0.5
    return mean, sd

In [35]:
def calculate_covariance(corr, std_1, std_2):
    """
    This formula takes a correlation and two std
    and calculates the covariance matrix
    """
    x = corr * np.sqrt(std_1**2 * std_2**2)
    cov = [[std_1**2, x], [x, std_2**2]]
    return cov

In [36]:
def calculate_multivariate_normal(mean_1, mean_2, cov):
    """
    This formula takes a covariation matrix and two
    mean values and calculates a two random series of
    multivariate variables of the specified size
    """
    data = np.random.multivariate_normal([mean_1, mean_2], cov, size=10000)
    return data

In [37]:
def calculate_multivariate_log_normal(corr, mean_1, mean_2, std_1, std_2):
    """ """
    # Transform to 'normal' statistical moments
    mean_1, std_1 = transform_log_to_normal(mean_1, std_1)
    mean_2, std_2 = transform_log_to_normal(mean_2, std_2)
    # Calculate covariance
    cov = calculate_covariance(corr, std_1, std_2)
    # Calcualate random multivariate data
    pts = calculate_multivariate_normal(mean_1, mean_2, cov)
    # Transform to lognormal
    pts = np.exp(pts)
    return pts

## Calculate correlated lognormal random variable for each correlated series of data

In [38]:
# Correlation between river flow and quality
pts_1 = calculate_multivariate_log_normal(
    corr_riv_flow_wq, riv_flow_mean, riv_wq_mean, riv_flow_sd, riv_wq_sd
)
pts_1 = pd.DataFrame(pts_1, columns=["River flow", "River quality"]).sort_values(
    "River flow"
)

# Correlation between river flow and discharge flow
pts_3 = calculate_multivariate_log_normal(
    corr_riv_dis_flow, riv_flow_mean, dis_flow_mean, riv_flow_sd, dis_flow_sd
)
pts_3 = pd.DataFrame(pts_3, columns=["River flow", "Discharge flow"]).sort_values(
    "River flow"
)

# Correlation between discharge flow and quality
pts_2 = calculate_multivariate_log_normal(
    corr_dis_flow_wq, dis_flow_mean, dis_wq_mean, dis_flow_sd, dis_wq_sd
)
pts_2 = pd.DataFrame(
    pts_2, columns=["Discharge flow", "Discharge quality"]
).sort_values("Discharge flow")

In [52]:
pts_3.corr()
# TODO Correlations are coming up too small
# TODO Correlations do not seem to be preserved in the final dataframe

Unnamed: 0,River flow,Discharge flow
River flow,1.0,0.531224
Discharge flow,0.531224,1.0


## Build master dataframe with all data

In [39]:
df = pd.concat(
    [pts_1, pts_2["Discharge quality"], pts_3["Discharge flow"]],
    axis=1,
    ignore_index=True,
)
df.columns = ["River_flow", "River_quality", "Discharge_quality", "Discharge_flow"]

In [50]:
df.corr()

Unnamed: 0,River_flow,River_quality,Discharge_quality,Discharge_flow,Downstream_flow,Downstream_wq
River_flow,1.0,-0.220542,0.002271,-0.000693,0.996153,-0.475134
River_quality,-0.220542,1.0,0.00096,0.007439,-0.219055,0.426252
Discharge_quality,0.002271,0.00096,1.0,0.004605,0.002666,0.558358
Discharge_flow,-0.000693,0.007439,0.004605,1.0,0.086944,0.280303
Downstream_flow,0.996153,-0.219055,0.002666,0.086944,1.0,-0.448771
Downstream_wq,-0.475134,0.426252,0.558358,0.280303,-0.448771,1.0


## Calculate downstream quality column

In [40]:
df = df.eval("Downstream_flow = River_flow + Discharge_flow")

df = df.eval(
    "Downstream_wq = (River_flow * River_quality + Discharge_flow * Discharge_quality) / Downstream_flow"
)

## Calculate descriptive statistics

In [42]:
stats = df.describe().T

In [43]:
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
River_flow,10000.0,99.669924,90.535833,2.296989,43.033032,73.5671,125.723831,1642.252863
River_quality,10000.0,2.005442,1.001687,0.315449,1.302094,1.798922,2.459119,10.53113
Discharge_quality,10000.0,14.85495,6.929022,2.688474,9.957376,13.4074,18.154576,84.648378
Discharge_flow,10000.0,19.929258,7.964164,4.127645,14.183467,18.413503,24.026687,71.293885
Downstream_flow,10000.0,119.599182,90.879955,16.906494,63.175466,94.472096,146.031128,1662.076759
Downstream_wq,10000.0,4.944275,2.823561,0.710699,3.008313,4.275745,6.069018,32.052629


In [44]:
stats["90pc"] = df.quantile(0.90)
stats["95pc"] = df.quantile(0.95)
stats["99pc"] = df.quantile(0.99)

# TODO

- Transform into a proper GitHub library
- Add extra functionality
    - Backward calculations
    - Decay (with optional decay rate and time or distance plus alpha/beta values)
    - Visualisation (e.g., histograms)