# Fitting Mixed Distributions when Floods come from Different Drivers

In [None]:
# Install third-party packages used by this notebook.
# dataretrieval is imported in the next cell and pynverse in the cell that
# defines the mixed distribution. lmoments3 is never imported directly in this
# notebook -- presumably utils.py needs it (TODO confirm).
!pip install dataretrieval
!pip install lmoments3
!pip install pynverse

In [None]:
from google.colab import drive
import numpy as np
import scipy.stats as ss
import pandas as pd
import matplotlib.pyplot as plt
import dataretrieval.nwis as nwis

# allow access to google drive
drive.mount('/content/drive')

# Copy the helper module from Drive into the working directory so that it can
# be imported. The source path is relative, so this assumes the working
# directory is the one that contains the "drive" mount point (TODO confirm).
!cp "drive/MyDrive/Colab Notebooks/CE6280/CodingExamples/utils.py" .
# NOTE(review): the star import hides where names come from. LogNormal, used in
# later cells, is presumably defined in utils.py -- confirm.
from utils import *

Load annual maxima from rainfall and snowmelt-driven floods at North Saint Vrain Creek at Longmont Dam near Lyons, CO. This is USGS site [06722000](https://waterdata.usgs.gov/nwis/inventory?site_no=06722000&agency_cd=USGS). This report shows the annual peaks from each type of flood: https://pubs.usgs.gov/of/1982/0426/report.pdf.

In [None]:
# Read the flood peaks table from Drive. Later cells rely on its "Year",
# "Rainfall" and "Snowmelt" columns.
flow_df = pd.read_csv("drive/MyDrive/Colab Notebooks/CE6280/Data/N_St_Vrain_CO.csv")
# Display the loaded table.
flow_df

In [None]:
# Index the table by year so that plots and row lookups use the year directly.
# The guard keeps this cell re-runnable: once "Year" has become the index it is
# no longer a column, and an unguarded second run would raise a KeyError.
if "Year" in flow_df.columns:
  flow_df = flow_df.set_index("Year")
flow_df

In [None]:
# Plot the table's columns against its index (the year, set in the previous cell).
flow_df.plot()

Get the annual maxima across the two types of floods.

In [None]:
# Annual maximum = element-wise larger of the rainfall and snowmelt peaks.
flow_df["Max"] = np.maximum(flow_df["Rainfall"].values,flow_df["Snowmelt"].values)
# Display the table with the new "Max" column.
flow_df

In [None]:
# Same plot as before, now including the "Max" column added above.
flow_df.plot()

Snowmelt-driven peaks tend to be more moderate, with higher lows but lower maxima. Let's use the two-sample K-S test to see whether the conditional distributions of annual maxima, given that they came from rainfall or from snowmelt, are statistically different.

In [None]:
# Split the annual maxima by the flood type that produced them, using boolean
# masks instead of a positional np.where / iloc lookup.
from_rain = flow_df["Max"] == flow_df["Rainfall"]
# A year in which both flood types have the same peak matches both columns.
# Count it once (as rainfall-driven) so that no year lands in both samples,
# which would also inflate the sample sizes used for the mixing probability.
from_snow = (flow_df["Max"] == flow_df["Snowmelt"]) & ~from_rain
rain_peaks = flow_df.loc[from_rain, "Max"]
snow_peaks = flow_df.loc[from_snow, "Max"]

# Two-sample K-S test of whether the two sets of peaks share one distribution.
D, p_value = ss.ks_2samp(rain_peaks.values, snow_peaks.values)
print("p-value of K-S test: " + f"{p_value:.2E}")

We reject the null hypothesis that these peaks come from the same distribution! To fit a mixture distribution, we need to fit the conditional distributions of annual maxima 1) given they came from rainfall and 2) given they came from snowmelt, as well as calculate the probability they came from each type. Let's use an LN2 or LN3 distribution for each fit.

In [None]:
# Empirical share of annual maxima that were rainfall-driven. This is the
# weight given to the rainfall component of the mixed distribution.
n_rain = len(rain_peaks)
n_snow = len(snow_peaks)
prob_rain = n_rain / (n_rain + n_snow)
print(f"Probability of rainfall-driven peak: {prob_rain:.2f}")

In [None]:
# Fitting methods and parameter counts (LN2 / LN3) to try; later cells reuse
# both lists.
methods = ["MOM", "MLE", "Lmom"]
npars = [2, 3]

# Plotting positions of the rainfall peaks, sorted ascending.
p = np.sort(ss.mstats.plotting_positions(rain_peaks))

# Fit, plot and K-S test every method / parameter-count combination.
for method, npar in [(m, k) for m in methods for k in npars]:
  distfit = LogNormal()
  distfit.fit(rain_peaks, method, npar, initialize=False)
  distfit.plotHistPDF(rain_peaks, 0, 2500, f"LN{npar} {method} Fit")
  # Quantiles of the fitted distribution at the plotting positions.
  fitted_quantiles = ss.lognorm.ppf(p, s=distfit.sigma, loc=distfit.tau, scale=np.exp(distfit.mu))
  result = ss.kstest(rain_peaks, fitted_quantiles, alternative='two-sided')
  print(f"p-value of 2-sided K-S test for LN{npar} {method} fit to rain peaks: {result.pvalue:f}")

Several of these distributions had p-values of 1.0 with the K-S test (PPCC would probably distinguish them better). For simplicity, let's use the LN2 MOM fit.

Now fit the snowmelt-driven annual maxima.

In [None]:
# Plotting positions of the snowmelt peaks, sorted ascending.
p = ss.mstats.plotting_positions(snow_peaks)
p = np.sort(p)

# Fit, plot and K-S test every method / parameter-count combination.
for method in methods:
  for npar in npars:
    distfit = LogNormal()
    try:
      distfit.fit(snow_peaks, method, npar, initialize=False)
      distfit.plotHistPDF(snow_peaks, 0, 1000, "LN" + str(npar) + " " + str(method) + " Fit")
      result = ss.kstest(snow_peaks, ss.lognorm.ppf(p, s=distfit.sigma, loc=distfit.tau, scale=np.exp(distfit.mu)), alternative='two-sided')
      print("p-value of 2-sided K-S test for LN%d %s fit to snow peaks: %f" % (npar, method, result.pvalue))
    except Exception as err:
      # Best effort: a combination that cannot be fit is skipped, but report it
      # rather than dropping it silently. Catching Exception instead of using a
      # bare except lets KeyboardInterrupt / SystemExit still stop the cell.
      print("LN%d %s fit to snow peaks failed: %s" % (npar, method, err))

LN3 MLE had the highest p-value, so we'll use that.

Now make a histogram of the observations vs. the PDF of the mixed distribution between these two types of floods and perform a K-S test between them.

In [None]:
import scipy.integrate as integrate
from pynverse import inversefunc
from scipy.optimize import minimize

class MixedLogNormalDist:
  """Two-component mixture of (shifted) lognormal distributions.

  Component i is ``scipy.stats.lognorm`` with shape ``sigma_i``, location
  ``tau_i`` and scale ``exp(mu_i)``. Component 1 carries weight ``prob1`` and
  component 2 carries weight ``1 - prob1``.
  """

  def __init__(self, sigma1, tau1, mu1, sigma2, tau2, mu2, prob1):
    """Store the parameters of both components and the weight of component 1."""
    self.sigma1 = sigma1
    self.tau1 = tau1
    self.mu1 = mu1
    self.sigma2 = sigma2
    self.tau2 = tau2
    self.mu2 = mu2
    self.prob1 = prob1

  def _lower_bound(self):
    """Return the smaller location parameter; neither component has mass below it."""
    return min(self.tau1, self.tau2)

  def pdf(self, x):
    """Return the mixture density at x (scalar or array): weighted sum of the component PDFs."""
    f_x = self.prob1 * ss.lognorm.pdf(x, s=self.sigma1, loc=self.tau1, scale=np.exp(self.mu1)) + \
          (1-self.prob1) * ss.lognorm.pdf(x, s=self.sigma2, loc=self.tau2, scale=np.exp(self.mu2))
    return f_x

  def cdf(self, x):
    """Return the mixture CDF at x (scalar or array).

    The CDF of a mixture is the same weighted sum of the component CDFs, so no
    numerical integration of the PDF is needed.
    """
    F_x = self.prob1 * ss.lognorm.cdf(x, s=self.sigma1, loc=self.tau1, scale=np.exp(self.mu1)) + \
          (1-self.prob1) * ss.lognorm.cdf(x, s=self.sigma2, loc=self.tau2, scale=np.exp(self.mu2))
    return F_x

  def ppf(self, q, x0):
    """Return the quantile x for which cdf(x) equals q.

    The mixture CDF has no closed-form inverse, so the root of cdf(x) - q is
    found numerically inside a bracket that is guaranteed to contain it. This
    replaces an unconstrained minimisation of |cdf(x) - q|, which returned the
    starting point unchanged whenever it lay in a flat tail of the CDF.

    Parameters
    ----------
    q : float
      Non-exceedance probability.
    x0 : float
      Initial guess for the quantile (kept for backward compatibility). It is
      only used to seed the search bracket, so a poor guess does not change
      the result.

    Returns
    -------
    float
      The quantile. Returns the lower bound of the support for q = 0, inf for
      q = 1, and nan when q is nan or outside [0, 1].
    """
    # Local import: the rest of the file does not bring brentq into scope.
    from scipy.optimize import brentq

    q = float(q)
    lower = self._lower_bound()
    if np.isnan(q) or q < 0.0 or q > 1.0:
      return np.nan
    if q == 0.0:
      return float(lower)
    if q == 1.0:
      return np.inf

    def excess(x):
      # Signed distance of the CDF from the target probability.
      return float(self.cdf(x)) - q

    # Accept a scalar or a one-element array, as the optimiser-based version did.
    guess = float(np.ravel(x0)[0])
    if not np.isfinite(guess) or guess <= lower:
      guess = lower + 1.0

    # cdf(lower) is 0 < q, so `lower` is always a valid left end. Walk the right
    # end away from `lower`, doubling the distance, until the CDF reaches q.
    lo, hi = lower, guess
    while excess(hi) < 0.0:
      lo = hi
      hi = lower + 2.0 * (hi - lower)
    return float(brentq(excess, lo, hi))

  def moment(self, m):
    """Return the m-th raw moment, E[X^m].

    Integrates x^m * f(x) numerically from the lower bound of the support to
    infinity.
    """
    return integrate.quad(lambda x: (x**m * self.pdf(x)), self._lower_bound(), np.inf)[0]

  def central_moment(self, m):
    """Return the m-th central moment, E[(X - mean)^m].

    Integrates (x - mean)^m * f(x) numerically from the lower bound of the
    support to infinity, where mean is the first raw moment.
    """
    mu = self.moment(1)
    return integrate.quad(lambda x: ((x-mu)**m * self.pdf(x)), self._lower_bound(),  np.inf)[0]

In [None]:
# Refit the two conditional distributions selected above: the LN2 "MOM" fit for
# the rainfall peaks and the LN3 "MLE" fit for the snowmelt peaks.
rainfit = LogNormal()
rainfit.fit(rain_peaks, "MOM", 2, initialize=False)

snowfit = LogNormal()
snowfit.fit(snow_peaks, "MLE", 3, initialize=False)

# Combine them, weighting the rainfall component by its empirical probability.
mixedfit = MixedLogNormalDist(
    rainfit.sigma, rainfit.tau, rainfit.mu,
    snowfit.sigma, snowfit.tau, snowfit.mu,
    prob_rain,
)

# Overlay the mixed PDF on a density histogram of all annual maxima.
x = np.linspace(0, 2500, 100)
f_x = mixedfit.pdf(x)

plt.hist(flow_df["Max"], density=True)
plt.plot(x, f_x)
plt.title("Mixed Distribution of All Peaks")
plt.xlabel('Flow')
plt.ylabel('Probability Density')
plt.xlim([0, 2500])
plt.show()

# Invert the mixed CDF at each observation's plotting position, using the
# observation itself as the initial guess, to get the fitted quantiles.
all_peaks = flow_df['Max']
p = ss.mstats.plotting_positions(all_peaks)
iF_x = np.array([mixedfit.ppf(p[k], all_peaks.iloc[k]) for k in range(len(all_peaks))])

# Compare the observations with the fitted quantiles.
result = ss.kstest(all_peaks, np.sort(iF_x), alternative='two-sided')
print(f"p-value of 2-sided K-S test for mixed distribution fit: {result.pvalue:f}")

Compare the mixed distribution fit to the fit of an LN2 or LN3 distribution to all of the data.

In [None]:
# Fit a single LN2 / LN3 distribution to all annual maxima with each method and
# K-S test it at the plotting positions `p` computed in the previous cell.
all_peaks = flow_df['Max']
for method, npar in [(m, k) for m in methods for k in npars]:
  distfit = LogNormal()
  distfit.fit(all_peaks, method, npar, initialize=False)
  distfit.plotHistPDF(all_peaks, 0, 2500, f"LN{npar} {method} Fit")
  # Quantiles of the fitted distribution at the plotting positions.
  fitted_quantiles = ss.lognorm.ppf(p, s=distfit.sigma, loc=distfit.tau, scale=np.exp(distfit.mu))
  result = ss.kstest(all_peaks, fitted_quantiles, alternative='two-sided')
  print(f"p-value of 2-sided K-S test for LN{npar} {method} fit to all peaks: {result.pvalue:f}")

Every single-distribution fit has a K-S p-value of 0.997, and so does the mixed distribution fit.

How do the 100-yr flood estimates compare between these?

In [None]:
# 100-yr flood from each single-distribution fit to all annual maxima.
for method, npar in [(m, k) for m in methods for k in npars]:
  distfit = LogNormal()
  distfit.fit(flow_df['Max'], method, npar, initialize=False)
  q100_all = distfit.findReturnPd(100)
  print(f"100-yr flood based on LN{npar} {method} fit to all peaks: " + "%f" % q100_all)

# 100-yr flood of the mixed distribution is its 0.99 quantile. q100_all still
# holds the estimate from the last fit in the loop and serves as the initial
# guess for the inversion.
q100_mixed = mixedfit.ppf(0.99, q100_all)
print("100-yr flood based on mixed Lognormal fit: " + "%f" % q100_mixed)

The mixed distribution generally estimated higher 100-year floods. How different do these distributions look?

In [None]:
# Evaluate the mixed PDF and its two conditional components on a common grid.
x = np.linspace(0, 2500, 100)
f_mixed = mixedfit.pdf(x)
f_rain = ss.lognorm.pdf(x, s=rainfit.sigma, loc=rainfit.tau, scale=np.exp(rainfit.mu))
f_snow = ss.lognorm.pdf(x, s=snowfit.sigma, loc=snowfit.tau, scale=np.exp(snowfit.mu))

# One figure per single-distribution fit, each with the same three reference curves.
for method, npar in [(m, k) for m in methods for k in npars]:
  distfit = LogNormal()
  distfit.fit(flow_df['Max'], method, npar, initialize=False)
  f_all = ss.lognorm.pdf(x, s=distfit.sigma, loc=distfit.tau, scale=np.exp(distfit.mu))

  # Plot order fixes the default colour assigned to each curve.
  curves = [
      (f_rain, 'Rainfall Conditional'),
      (f_snow, 'Snowmelt Conditional'),
      (f_all, 'Annual Maxima Distribution'),
      (f_mixed, 'Mixed Distribution'),
  ]
  for density, curve_label in curves:
    plt.plot(x, density, label=curve_label)
  plt.title(f"LN{npar} {method} Fit to Annual Maxima")
  plt.xlabel('Flow')
  plt.ylabel('Probability Density')
  handles, labels = plt.gca().get_legend_handles_labels()
  plt.legend(handles, labels)
  plt.show()

Let's zoom in on the lower and upper tails.

In [None]:
# Two zoom windows: the lower tail and the upper tail of the flow range.
x1 = np.linspace(0,200,100)
x2 = np.linspace(1500,2500,100)
xs = [x1, x2]

# The mixed PDF and its two components do not depend on the single-distribution
# fit, so evaluate them once per window.
f_mixed = [mixedfit.pdf(x) for x in xs]
f_rain = [ss.lognorm.pdf(x, s=rainfit.sigma, loc=rainfit.tau, scale=np.exp(rainfit.mu)) for x in xs]
f_snow = [ss.lognorm.pdf(x, s=snowfit.sigma, loc=snowfit.tau, scale=np.exp(snowfit.mu)) for x in xs]

for method in methods:
  for npar in npars:
    distfit = LogNormal()
    distfit.fit(flow_df['Max'], method, npar, initialize=False)
    # Rebuild the single-distribution PDFs for THIS fit. Keeping one list that
    # grows across fits while indexing it with the window index would make
    # every figure after the first re-plot the first fit's curve.
    f_all = [ss.lognorm.pdf(x, s=distfit.sigma, loc=distfit.tau, scale=np.exp(distfit.mu)) for x in xs]
    fig, axes = plt.subplots(1,2,figsize=(8,4))
    for i, x in enumerate(xs):
      axes[i].plot(x, f_rain[i], label='Rainfall Conditional')
      axes[i].plot(x, f_snow[i], label='Snowmelt Conditional')
      axes[i].plot(x, f_all[i], label='Annual Maxima Distribution')
      axes[i].plot(x, f_mixed[i], label='Mixed Distribution')
      axes[i].set_xlabel('Flow')
      # Legend and y-label only on the left panel to avoid repeating them.
      if i == 0:
        axes[i].legend()
        axes[i].set_ylabel('Probability Density')
    plt.suptitle("LN" + str(npar) + " " + str(method) + " Fit to Annual Maxima")
    plt.show()

As you can see, the lower tail of the mixed distribution (red) tends to be closer to the snowmelt distribution (orange) than the single distribution fit to all annual maxima (green) is, while the upper tail of the mixed distribution (red) tends to be closer to the rainfall distribution (blue) than the single fit (green) is. Because the largest rainfall-driven floods (blue) tend to be larger than the largest snowmelt-driven floods (orange), the mixed distribution (red) tends to estimate higher 100-yr floods than a single distribution fit to all annual maxima (green).