# Example code for fitting Normal and Log-Normal Distributions using MLE and MOM

First, import the following libraries
- pandas: to read in data  
- numpy: for basic mathematical functions over arrays  
- scipy.stats: for distribution-fitting functions
- matplotlib.pyplot: for plotting distribution fits
- google.colab.drive: for accessing data on Google Drive

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
from google.colab import drive

Load data of annual maxima at Azibe Soltane on the Sebou River in Morocco

In [None]:
# Mount Google Drive so files stored there are visible to this Colab runtime
drive.mount('/content/drive')

# Read the annual-maximum flow record. The path is anchored at the mount point
# used above so the load does not depend on the current working directory.
maxQ = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/CE6280/Data/Problem6.17.csv")
maxQ.head()

Visualize the data in a histogram

In [None]:
# Histogram of the annual maxima, drawn with density=True
flow_series = maxQ["Flow"]
plt.hist(flow_series, density=True)
plt.ylabel("Density")
plt.xlabel("Flow (cms)")
plt.title("PDF of Annual Maxima of Sebou River at Azibe Soltane")

Calculate the mean, variance/standard deviation, and skewness using default numpy functions

In [None]:
# Sample moments using the library defaults (no ddof / bias arguments given)
mu = np.mean(maxQ["Flow"])
sigmaSq = np.var(maxQ["Flow"])
sigma = np.std(maxQ["Flow"])
gamma = ss.skew(maxQ["Flow"])

# what are they? (labels all follow the same "name: value" pattern)
print("mean: %0.2f" % mu)
print("variance: %0.2f" % sigmaSq)
print("std. dev.: %0.2f" % sigma)
print("skewness: %0.2f" % gamma)

With the exception of the mean, these estimates are actually biased. How do they compare with the unbiased estimates?

In [None]:
# Unbiased counterparts of the estimates above: ddof=1 for the variance and
# standard deviation, bias=False for the skewness
sigmaSq_unbiased = np.var(maxQ["Flow"], ddof=1)
sigma_unbiased = np.std(maxQ["Flow"], ddof=1)
gamma_unbiased = ss.skew(maxQ["Flow"], bias=False)

# report the unbiased estimates
print("unbiased variance: %0.2f" % sigmaSq_unbiased)
print("unbiased std. dev.: %0.2f" % sigma_unbiased)
print("unbiased skewness: %0.2f" % gamma_unbiased)

# percent difference of each pair, relative to the average of the two values
pct_diff_var = np.abs((sigmaSq - sigmaSq_unbiased)*100 / (0.5*(sigmaSq + sigmaSq_unbiased)))
pct_diff_std = np.abs((sigma - sigma_unbiased)*100 / (0.5*(sigma + sigma_unbiased)))
pct_diff_skew = np.abs((gamma - gamma_unbiased)*100 / (0.5*(gamma + gamma_unbiased)))

print("percent difference in biased vs. unbiased variance: %0.1f" % pct_diff_var)
print("percent difference in biased vs. unbiased std. dev.: %0.1f" % pct_diff_std)
print("percent difference in biased vs. unbiased skewness: %0.1f" % pct_diff_skew)


Fit a normal distribution to the data using MLE. This is the approach used by the `fit` method of `scipy.stats` distributions (here `ss.norm.fit`), which returns the location and scale parameters of the distribution.

In [None]:
# MLE fit of a normal distribution; fit() hands back (loc, scale)
normal_mle_params = ss.norm.fit(maxQ["Flow"])
loc, scale = normal_mle_params

print("loc: %0.2f" % loc)
print("scale: %0.2f" % scale)

This prints the location and scale parameters, which in this case are $\mu$ and $\sigma$. Comparing with the output above, you can see that these are the same as the mean and the BIASED standard deviation. Let's rename them mu_fit and sigma_fit, the fitted values of $\mu$ and $\sigma$.

In [None]:
# For the normal distribution, loc and scale are the fitted mu and sigma
mu_fit, sigma_fit = loc, scale

How does the fit look? Let's compare the fitted PDF with the histogram of the data.  
ss.norm.pdf(x, loc, scale) calculates the value of a normal PDF, f(x), with parameters loc and scale at input values x

In [None]:
# Evaluate the fitted normal PDF on a grid of flows from 0 up to (not including) 4500
x = np.arange(0, 4500, 10)
f_x = ss.norm(loc=mu_fit, scale=sigma_fit).pdf(x)

# Overlay the fitted curve on the histogram of the data
plt.hist(maxQ["Flow"], density=True)
plt.plot(x, f_x)
plt.xlim(0, 4500)
plt.ylim(0, 0.0015)
plt.title('Normal MLE fit')
plt.xlabel('Flow (cms)')
plt.ylabel('Probability Density')

Clearly this is not a good fit! We need a skewed distribution. Let's use a log-normal.  
ss.lognorm.fit returns the shape, location and scale parameters of a 3-parameter log-normal distribution. You can fit a 2-parameter log-normal distribution by fixing the lower bound (location) parameter at 0 with floc=0.  
The parameter $\mu$ of a LN distribution is the log of the scale parameter reported by scipy.stats. The parameter $\sigma$ is the shape parameter.

In [None]:
# 2-parameter log-normal MLE: floc=0 holds the location parameter at zero
ln2_mle_params = ss.lognorm.fit(maxQ['Flow'], floc=0)
shape, loc, scale = ln2_mle_params

# mu is the log of scipy's scale parameter; sigma is scipy's shape parameter
mu_LN_MLE, sigma_LN_MLE = np.log(scale), shape

print('mu: %0.2f' % mu_LN_MLE)
print('sigma: %0.2f' % sigma_LN_MLE)

Compare these estimates with what we get from the formulas computed in class

In [None]:
# MLE equations from class: the mean of the log flows, and the root of the
# mean squared deviation of the log flows about that mean
log_flow = np.log(maxQ["Flow"])
mu_LN_MLE_check = np.mean(log_flow)
sq_dev = (log_flow - mu_LN_MLE_check)**2
sigma_LN_MLE_check = np.sqrt(np.mean(sq_dev))

# side-by-side comparison with the scipy fit
print("Python mu: %0.2f" % mu_LN_MLE)
print("Class mu: %0.2f" % mu_LN_MLE_check)
print("Python sigma: %0.2f" % sigma_LN_MLE)
print("Class sigma: %0.2f" % sigma_LN_MLE_check)

How does this fit look?

In [None]:
# Evaluate the fitted log-normal PDF on the same flow grid used for the normal fit
x = np.arange(0, 4500, 10)
f_x = ss.lognorm(shape, loc=loc, scale=scale).pdf(x)

# Overlay the fitted curve on the histogram of the data
plt.hist(maxQ["Flow"], density=True)
plt.plot(x, f_x)
plt.xlim(0, 4500)
plt.ylim(0, 0.0015)
plt.title('2-parameter log-normal MLE fit')
plt.xlabel('Flow (cms)')
plt.ylabel('Probability Density')

Much better! What is our estimate of the 100-year flood from this fit? The 100-year flood is exceeded on average once every 100 years. 1/100 = 0.01, so there is a 1\% chance of it being exceeded each year. That means it is the 0.99 quantile of the distribution.  
We can estimate quantiles of distributions in scipy with ppf, the "percent point function" (the inverse of the CDF).

In [None]:
# 100-year flood = 0.99 quantile of the fitted 2-parameter log-normal
q0_99_LN_MLE = ss.lognorm.ppf(0.99, shape, loc, scale)
# reported in cms, the unit used on the "Flow (cms)" axis labels of the plots
print("100-year flood estimate: %0.0f cms" % q0_99_LN_MLE)

What if we wanted to fit the Log-normal distribution using MOM? scipy.stats does not do this, so we need to write our own code for it. Use the formulas we computed in class.

In [None]:
# Method-of-moments equations from class for the 2-parameter log-normal,
# based on the sample mean and the unbiased (ddof=1) sample variance
xbar_flow = np.mean(maxQ["Flow"])
var_flow = np.var(maxQ["Flow"], ddof=1)
sigma_LN_MOM = np.sqrt( np.log( 1 + var_flow / (xbar_flow**2) ) )
mu_LN_MOM = np.log(xbar_flow) - 0.5*sigma_LN_MOM**2

# scipy's lognorm takes (shape, loc, scale) = (sigma, 0, exp(mu))
q0_99_LN_MOM = ss.lognorm.ppf(0.99, sigma_LN_MOM, 0, np.exp(mu_LN_MOM))

print("MOM mu: %0.2f" % mu_LN_MOM)
print("MOM sigma: %0.2f" % sigma_LN_MOM)
# reported in cms, the unit used on the "Flow (cms)" axis labels of the plots
print("100-year flood estimate: %0.0f cms" % q0_99_LN_MOM)

x = np.arange(0,4500,10)
f_x = ss.lognorm.pdf(x, sigma_LN_MOM, 0, np.exp(mu_LN_MOM))

plt.hist(maxQ["Flow"], density=True)
plt.plot(x,f_x)
plt.ylim([0,0.0015])
plt.xlim([0,4500])
# this curve is the MOM fit, so the title says MOM rather than MLE
plt.title('2-parameter log-normal MOM fit')
plt.xlabel('Flow (cms)')
plt.ylabel('Probability Density')

Small differences in the parameter estimates can result in big differences in the estimates of extreme events! Clearly uncertainty will be important to quantify here.  
What if we used the 3-parameter log-normal distribution? First, let's use MLE.

In [None]:
# fit a 3-parameter log-normal distribution to the data (location left free)
shape, loc, scale = ss.lognorm.fit(maxQ['Flow'])
q0_99_LN3_MLE = ss.lognorm.ppf(0.99, shape, loc, scale)

# convert shape, loc and scale to estimates of sigma, tau and mu
mu_LN3_MLE = np.log(scale)
sigma_LN3_MLE = shape
tau_LN3_MLE = loc

print('tau: %0.2f' % tau_LN3_MLE)
print('mu: %0.2f' % mu_LN3_MLE)
print('sigma: %0.2f' % sigma_LN3_MLE)
# reported in cms, the unit used on the "Flow (cms)" axis labels of the plots
print("100-year flood estimate: %0.0f cms" % q0_99_LN3_MLE)

x = np.arange(0,4500,10)
f_x = ss.lognorm.pdf(x, shape, loc, scale)

plt.hist(maxQ["Flow"], density=True)
plt.plot(x,f_x)
plt.ylim([0,0.002])
plt.xlim([0,4500])
plt.title('3-parameter log-normal MLE fit')
plt.xlabel('Flow (cms)')
plt.ylabel('Probability Density')

And for LN3 using MOM. For this, we'll need to use root-finding to find the value of $\sigma$ that makes the difference between the theoretical and empirical skewness 0. This parameter is bounded below by 0 and above by std(log(x)). However, the skewness equation is undefined at 0, so we will pass a lower bound of 0.01.

In [None]:
from scipy.optimize import brentq as root

# Sample statistics are computed once here rather than on every evaluation
# of the root-finder's objective function
flow_skew = ss.skew(maxQ["Flow"],bias=False)
sigma_upper = np.std(np.log(maxQ["Flow"]),ddof=1)

# sigma: root of (theoretical skewness as a function of sigma) - (sample skewness),
# searched between 0.01 and the std. dev. of the log flows
sigma_LN3_MOM = root(lambda x: (np.exp(3*x**2)-3*np.exp(x**2)+2) / (np.exp(x**2)-1)**(3/2) - flow_skew,
             0.01, sigma_upper)
# mu and tau follow from the sample variance and mean once sigma is known
mu_LN3_MOM = 0.5 * (np.log(np.var(maxQ["Flow"],ddof=1) / (np.exp(sigma_LN3_MOM**2)-1)) - sigma_LN3_MOM**2)
tau_LN3_MOM = np.mean(maxQ["Flow"]) - np.exp(mu_LN3_MOM + 0.5*sigma_LN3_MOM**2)
q0_99_LN3_MOM = ss.lognorm.ppf(0.99, sigma_LN3_MOM, tau_LN3_MOM, np.exp(mu_LN3_MOM))

print('tau: %0.2f' % tau_LN3_MOM)
print('mu: %0.2f' % mu_LN3_MOM)
print('sigma: %0.2f' % sigma_LN3_MOM)
# reported in cms, the unit used on the "Flow (cms)" axis labels of the plots
print("100-year flood estimate: %0.0f cms" % q0_99_LN3_MOM)

x = np.arange(0,4500,10)
f_x = ss.lognorm.pdf(x, sigma_LN3_MOM, tau_LN3_MOM, np.exp(mu_LN3_MOM))

plt.hist(maxQ["Flow"], density=True)
plt.plot(x,f_x)
plt.ylim([0,0.002])
plt.xlim([0,4500])
plt.title('3-parameter log-normal MOM fit')
plt.xlabel('Flow (cms)')
plt.ylabel('Probability Density')

We can write a function to fit LN2 or LN3 using MOM or MLE depending on the input arguments.

In [None]:
def findMoments(data):
  """Return the sample mean, variance, skewness and kurtosis of `data`.

  The variance uses ddof=1; the skewness and kurtosis are computed by
  scipy.stats with bias=False.

  Returns:
    A 4-tuple (mean, variance, skewness, kurtosis).
  """
  return (
    np.mean(data),
    np.var(data, ddof=1),
    ss.skew(data, bias=False),
    ss.kurtosis(data, bias=False),
  )

def fitLN(data, method, npars):
  """Fit a 2- or 3-parameter log-normal distribution to `data`.

  Args:
    data: sample to fit (passed to scipy/numpy as given).
    method: 'MLE' (maximum likelihood, via ss.lognorm.fit) or 'MOM'
      (method of moments).
    npars: 2 fixes the location parameter tau at 0; 3 estimates it.

  Returns:
    (sigma, tau, mu): sigma is scipy's shape parameter, tau its location
    parameter, and mu the log of its scale parameter.

  Raises:
    AssertionError: if `method` or `npars` is not one of the allowed values.
    NOTE(review): for MOM with npars=3 the root-finder presumably raises
    ValueError when the sample skewness is not bracketed by the search
    interval - confirm against the brentq documentation.
  """
  assert method == 'MLE' or method == 'MOM',"method must = 'MLE' or 'MOM'"
  assert npars == 2 or npars == 3,"npars must = 2 or 3"

  if method == 'MLE':
    # floc=0 pins the location at zero for the 2-parameter fit
    fixed = {'floc': 0} if npars == 2 else {}
    shape, loc, scale = ss.lognorm.fit(data, **fixed)
    return shape, loc, np.log(scale)

  # Sample moments are only needed by the method-of-moments branches
  xbar, var, skew, _ = findMoments(data)
  if npars == 2:
    sigma = np.sqrt(np.log(1+var/xbar**2))
    mu = np.log(xbar) - 0.5*sigma**2
    tau = 0
  else:
    # Imported locally so the function does not rely on an alias that is
    # only defined if another notebook cell has been run first
    from scipy.optimize import brentq

    # sigma: root of (theoretical skewness as a function of sigma) minus the
    # sample skewness, searched between 0.01 and the std. dev. of log(data)
    sigma = brentq(lambda s: (np.exp(3*s**2)-3*np.exp(s**2)+2) / (np.exp(s**2)-1)**(3/2) - skew,
                   0.01, np.std(np.log(data),ddof=1))
    mu = 0.5 * (np.log(var / (np.exp(sigma**2)-1)) - sigma**2)
    tau = xbar - np.exp(mu + 0.5*sigma**2)

  return sigma, tau, mu

def findLNreturnPd(sigma, tau, mu, T):
  """Return the T-year event of a log-normal distribution.

  The T-year event is the quantile with non-exceedance probability 1 - 1/T
  of the log-normal with shape `sigma`, location `tau` and scale exp(`mu`).
  """
  non_exceedance = 1-1/T
  fitted = ss.lognorm(sigma, loc=tau, scale=np.exp(mu))
  return fitted.ppf(non_exceedance)

Now run it and report the parameters

In [None]:
# Fit all four model/estimator combinations
sigma_LN2_MOM, tau_LN2_MOM, mu_LN2_MOM = fitLN(maxQ["Flow"], "MOM", 2)
sigma_LN2_MLE, tau_LN2_MLE, mu_LN2_MLE = fitLN(maxQ["Flow"], "MLE", 2)
sigma_LN3_MOM, tau_LN3_MOM, mu_LN3_MOM = fitLN(maxQ["Flow"], "MOM", 3)
sigma_LN3_MLE, tau_LN3_MLE, mu_LN3_MLE = fitLN(maxQ["Flow"], "MLE", 3)

# 100-year flood (T = 100) from each fit
q0_99_LN2_MOM = findLNreturnPd(sigma_LN2_MOM, tau_LN2_MOM, mu_LN2_MOM, 100)
q0_99_LN2_MLE = findLNreturnPd(sigma_LN2_MLE, tau_LN2_MLE, mu_LN2_MLE, 100)
q0_99_LN3_MOM = findLNreturnPd(sigma_LN3_MOM, tau_LN3_MOM, mu_LN3_MOM, 100)
q0_99_LN3_MLE = findLNreturnPd(sigma_LN3_MLE, tau_LN3_MLE, mu_LN3_MLE, 100)

# 2-parameter log-normal, method of moments
print("LN2 MOM mu: %0.2f" % mu_LN2_MOM)
print("LN2 MOM sigma: %0.2f" % sigma_LN2_MOM)
print("LN2 MOM tau: %0.2f" % tau_LN2_MOM)
print("LN2 MOM 100-yr flood: %0.0f" % q0_99_LN2_MOM)

# 2-parameter log-normal, maximum likelihood
print("LN2 MLE mu: %0.2f" % mu_LN2_MLE)
print("LN2 MLE sigma: %0.2f" % sigma_LN2_MLE)
print("LN2 MLE tau: %0.2f" % tau_LN2_MLE)
print("LN2 MLE 100-yr flood: %0.0f" % q0_99_LN2_MLE)

# 3-parameter log-normal, method of moments
print("LN3 MOM mu: %0.2f" % mu_LN3_MOM)
print("LN3 MOM sigma: %0.2f" % sigma_LN3_MOM)
print("LN3 MOM tau: %0.2f" % tau_LN3_MOM)
print("LN3 MOM 100-yr flood: %0.0f" % q0_99_LN3_MOM)

# 3-parameter log-normal, maximum likelihood
print("LN3 MLE mu: %0.2f" % mu_LN3_MLE)
print("LN3 MLE sigma: %0.2f" % sigma_LN3_MLE)
print("LN3 MLE tau: %0.2f" % tau_LN3_MLE)
print("LN3 MLE 100-yr flood: %0.0f" % q0_99_LN3_MLE)

We could even make these methods part of a LogNormal class, which could be a subclass of the Distribution class.

In [None]:
class Distribution:
  """Base class that stores the sample moments of a data set.

  Attributes xbar, var, skew and kurtosis are None until findMoments is
  called.
  """

  def __init__(self):
    self.xbar = self.var = self.skew = self.kurtosis = None

  def findMoments(self, data):
    """Compute and store the sample moments of `data` on this instance.

    The variance uses ddof=1; the skewness and kurtosis are computed by
    scipy.stats with bias=False.
    """
    self.xbar, self.var = np.mean(data), np.var(data, ddof=1)
    self.skew, self.kurtosis = ss.skew(data, bias=False), ss.kurtosis(data, bias=False)

class LogNormal(Distribution):
  """2- or 3-parameter log-normal distribution fitted by MLE or MOM.

  Attributes mu, sigma and tau are None until fit() is called. In scipy's
  parameterization sigma is the shape, tau the location and exp(mu) the scale.
  """

  def __init__(self):
    super().__init__()
    self.mu = None
    self.sigma = None
    self.tau = None

  def fit(self, data, method, npars):
    """Estimate mu, sigma and tau from `data` and store them on the instance.

    The sample moments of `data` are also stored (via findMoments).

    Args:
      data: sample to fit (passed to scipy/numpy as given).
      method: 'MLE' (maximum likelihood, via ss.lognorm.fit) or 'MOM'
        (method of moments).
      npars: 2 fixes tau at 0; 3 estimates it.

    Raises:
      AssertionError: if `method` or `npars` is not one of the allowed values.
      NOTE(review): for MOM with npars=3 the root-finder presumably raises
      ValueError when the sample skewness is not bracketed by the search
      interval - confirm against the brentq documentation.
    """
    assert method == 'MLE' or method == 'MOM',"method must = 'MLE' or 'MOM'"
    assert npars == 2 or npars == 3,"npars must = 2 or 3"

    self.findMoments(data)
    if method == 'MLE':
      if npars == 2:
        # floc=0 pins the location at zero for the 2-parameter fit
        shape, loc, scale = ss.lognorm.fit(data, floc=0)
      elif npars == 3:
        shape, loc, scale = ss.lognorm.fit(data)

      self.mu = np.log(scale)
      self.sigma = shape
      self.tau = loc
    elif method == 'MOM':
      if npars == 2:
        self.sigma = np.sqrt(np.log(1+self.var/self.xbar**2))
        self.mu = np.log(self.xbar) - 0.5*self.sigma**2
        self.tau = 0
      elif npars == 3:
        # Imported locally so the class does not rely on an alias that is
        # only defined if another notebook cell has been run first
        from scipy.optimize import brentq

        # sigma: root of (theoretical skewness as a function of sigma) minus
        # the sample skewness, searched between 0.01 and std(log(data))
        self.sigma = brentq(lambda x: (np.exp(3*x**2)-3*np.exp(x**2)+2) / (np.exp(x**2)-1)**(3/2) - self.skew,
                   0.01, np.std(np.log(data),ddof=1))
        self.mu = 0.5 * (np.log(self.var / (np.exp(self.sigma**2)-1)) - self.sigma**2)
        self.tau = self.xbar - np.exp(self.mu + 0.5*self.sigma**2)

  def findReturnPd(self, T):
    """Return the T-year event: the quantile with non-exceedance probability 1 - 1/T."""
    q_T = ss.lognorm.ppf(1-1/T, self.sigma, self.tau, np.exp(self.mu))
    return q_T

  def plotHistPDF(self, data, min, max):
    """Plot a density histogram of `data` with the fitted PDF drawn over [min, max).

    The parameter names `min` and `max` shadow the builtins inside this
    method; they are kept so existing keyword calls continue to work.
    """
    # 100 evenly spaced points starting at min and stopping short of max.
    # linspace fixes the point count, whereas arange with a float step can
    # produce one point more or less depending on rounding.
    x = np.linspace(min, max, 100, endpoint=False)
    f_x = ss.lognorm.pdf(x, self.sigma, self.tau, np.exp(self.mu))

    plt.hist(data, density=True)
    plt.plot(x,f_x)
    plt.xlim([min, max])
    plt.title('Log-normal fit')
    plt.xlabel('Flow')
    plt.ylabel('Probability Density')

Now run the code again using these classes.

In [None]:
# Create one LogNormal object per model/estimator combination and fit it
LN2_MOM_Fit, LN2_MLE_Fit = LogNormal(), LogNormal()
LN3_MOM_Fit, LN3_MLE_Fit = LogNormal(), LogNormal()

LN2_MOM_Fit.fit(maxQ["Flow"],"MOM", 2)
LN2_MLE_Fit.fit(maxQ["Flow"],"MLE", 2)
LN3_MOM_Fit.fit(maxQ["Flow"],"MOM", 3)
LN3_MLE_Fit.fit(maxQ["Flow"],"MLE", 3)

# 100-year flood (T = 100) from each fit
LN2_MOM_q100 = LN2_MOM_Fit.findReturnPd(100)
LN2_MLE_q100 = LN2_MLE_Fit.findReturnPd(100)
LN3_MOM_q100 = LN3_MOM_Fit.findReturnPd(100)
LN3_MLE_q100 = LN3_MLE_Fit.findReturnPd(100)

# Histogram plus fitted PDF for each fit, drawn in the same order as before
LN2_MOM_Fit.plotHistPDF(maxQ["Flow"], 0, 4500)
LN2_MLE_Fit.plotHistPDF(maxQ["Flow"], 0, 4500)
LN3_MOM_Fit.plotHistPDF(maxQ["Flow"], 0, 4500)
LN3_MLE_Fit.plotHistPDF(maxQ["Flow"], 0, 4500)

# 2-parameter log-normal, method of moments
print("LN2 MOM mu: %0.2f" % LN2_MOM_Fit.mu)
print("LN2 MOM sigma: %0.2f" % LN2_MOM_Fit.sigma)
print("LN2 MOM tau: %0.2f" % LN2_MOM_Fit.tau)
print("LN2 MOM 100-yr flood: %0.0f" % LN2_MOM_q100)

# 2-parameter log-normal, maximum likelihood
print("LN2 MLE mu: %0.2f" % LN2_MLE_Fit.mu)
print("LN2 MLE sigma: %0.2f" % LN2_MLE_Fit.sigma)
print("LN2 MLE tau: %0.2f" % LN2_MLE_Fit.tau)
print("LN2 MLE 100-yr flood: %0.0f" % LN2_MLE_q100)

# 3-parameter log-normal, method of moments
print("LN3 MOM mu: %0.2f" % LN3_MOM_Fit.mu)
print("LN3 MOM sigma: %0.2f" % LN3_MOM_Fit.sigma)
print("LN3 MOM tau: %0.2f" % LN3_MOM_Fit.tau)
print("LN3 MOM 100-yr flood: %0.0f" % LN3_MOM_q100)

# 3-parameter log-normal, maximum likelihood
print("LN3 MLE mu: %0.2f" % LN3_MLE_Fit.mu)
print("LN3 MLE sigma: %0.2f" % LN3_MLE_Fit.sigma)
print("LN3 MLE tau: %0.2f" % LN3_MLE_Fit.tau)
print("LN3 MLE 100-yr flood: %0.0f" % LN3_MLE_q100)