# Testing for change points with regression, and non-parametric tests for trends and change points

In [None]:
!pip install dataretrieval

Load annual maxima from the Turkey River in Garber, Iowa. This is USGS site [05412500](https://waterdata.usgs.gov/nwis/inventory?site_no=05412500&agency_cd=USGS), which has data from Aug 8, 1913 to the present, but is only complete from 1932-10-01, so we'll load the data from then on.

In [None]:
import pandas as pd
import numpy as np
import dataretrieval.nwis as nwis

# Annual peak-flow record for the Turkey River at Garber, IA (USGS site 05412500)
flow_df = nwis.get_record(
    sites='05412500',
    service='peaks',
    start='1932-10-01',
    end='2024-09-30',
)
flow_df.head()

In [None]:
# First look at the raw annual maxima; title and y-label let the figure stand on its own.
ax = flow_df["peak_va"].plot(title="Annual peak flows, Turkey River at Garber, IA")
ax.set_ylabel("Annual peak flow (peak_va)")

## Testing for Normality

In [None]:
!pip install lmoments3

In [None]:
from google.colab import drive

# allow access to google drive (Colab-only; the drive is mounted at /content/drive)
drive.mount('/content/drive')

# Copy the helper module next to the notebook so it can be imported.
# NOTE(review): the relative source path assumes the working directory is /content - confirm.
!cp "drive/MyDrive/Colab Notebooks/CE6280/CodingExamples/utils.py" .
# NOTE(review): wildcard import; the Normal class defined later in this notebook
# subclasses Distribution, which is presumably provided by utils.py.
from utils import *

In [None]:
import scipy.stats as ss
import matplotlib.pyplot as plt
from lmoments3 import distr

class Normal(Distribution):
  """Normal distribution: fitting, quantiles, PDF plot, PPCC test and quantile CIs.

  Relies on the ``Distribution`` base class for ``findMoments`` and
  ``plotDistFit``. ``findMoments`` presumably sets ``self.xbar`` and
  ``self.var`` (read by the 'MOM' fit) -- confirm against utils.py.

  Attributes:
    mu, sigma     -- fitted mean and standard deviation (None until ``fit`` runs)
    ppcc_rho      -- PPCC test statistic, set by ``ppccTest``
    p_value_PPCC  -- Monte Carlo p-value of the PPCC test, set by ``ppccTest``
  """

  def __init__(self):
    super().__init__()
    self.mu = None
    self.sigma = None

  def fit(self, data, method):
    """Estimate ``mu`` and ``sigma`` from ``data``.

    method: 'MLE' (scipy maximum likelihood), 'MOM' (method of moments, using
    the moments stored by ``findMoments``) or 'Lmom' (L-moments via lmoments3).
    Any other value fails the assertion below.
    """
    assert method in ('MLE', 'MOM', 'Lmom'),"method must = 'MLE', 'MOM' or 'Lmom'"

    self.findMoments(data)
    if method == 'MLE':
      self.mu, self.sigma = ss.norm.fit(data)
    elif method == 'MOM':
      self.mu = self.xbar
      self.sigma = np.sqrt(self.var)
    elif method == 'Lmom':
      norm_params = distr.nor.lmom_fit(data)
      self.mu = norm_params["loc"]
      self.sigma = norm_params["scale"]

  def findReturnPd(self, T):
    """Return the fitted quantile with non-exceedance probability 1 - 1/T."""
    q_T = ss.norm.ppf(1-1/T, self.mu, self.sigma)
    return q_T

  def plotHistPDF(self, data, min, max, title):
    """Evaluate the fitted normal PDF on 100 steps over [min, max) and pass it,
    together with ``data``, to ``plotDistFit`` for plotting."""
    x = np.arange(min, max,(max-min)/100)
    # This class models a normal distribution, so evaluate the normal density;
    # ss.lognorm.pdf would interpret mu as a shape parameter and sigma as a shift.
    f_x = ss.norm.pdf(x, self.mu, self.sigma)
    self.plotDistFit(data, x, f_x, min, max, title)

  def ppccTest(self, data, title, m=10000):
    """Probability-plot correlation coefficient test against the fitted normal.

    Stores the test statistic in ``self.ppcc_rho`` and a Monte Carlo p-value
    (from ``m`` synthetic samples of ``len(data)`` draws) in
    ``self.p_value_PPCC``, then shows a Q-Q plot titled ``title``.
    Note: reseeds NumPy's global random generator on every synthetic sample.
    """
    # calculate test statistic, rho
    x_sorted = np.sort(data)
    p_observed = ss.mstats.plotting_positions(x_sorted)
    x_fitted = ss.norm.ppf(p_observed, self.mu, self.sigma)
    self.ppcc_rho = np.corrcoef(x_sorted, x_fitted)[0,1]

    # generate m synthetic samples of n observations to estimate null distribution of rho
    rhoVector = np.zeros(m)
    for i in range(m):
      np.random.seed(i)  # one fixed seed per sample keeps the test reproducible
      x = ss.norm.rvs(self.mu, self.sigma, size=len(data))
      rhoVector[i] = np.corrcoef(np.sort(x), x_fitted)[0,1]

    # calculate p-value of test: share of synthetic rhos that do not exceed the
    # observed rho (int() keeps the stored p-value a plain Python float)
    count = int(np.count_nonzero(self.ppcc_rho < rhoVector))
    self.p_value_PPCC = 1 - count/(len(rhoVector) + 1)

    # make Q-Q plot
    plt.scatter(x_sorted,x_fitted,color='b')
    plt.plot(x_sorted,x_sorted,color='r')
    plt.xlabel('Observations')
    plt.ylabel('Fitted Values')
    plt.title(title)
    plt.show()

  def calcCI(self, data, p, CI, method, npars, seed):
    """Return (LB, UB), the CI-percent confidence interval of the p-quantile.

    ``CI`` is given in percent (e.g. 95). ``method``, ``npars`` and ``seed``
    are not used by this implementation; they are presumably kept so the
    signature matches other distributions -- confirm against utils.py.
    """
    n = len(data)
    alpha = (100.0-CI)/100.0
    # calculate theoretical confidence interval using formula from slides
    z_p = ss.norm.ppf(p)
    z_crit = ss.norm.ppf(1-alpha/2)
    x_p = self.mu + z_p*self.sigma
    half_width = z_crit * np.sqrt(self.sigma**2 * (1+0.5*z_p**2)/n)
    LB = x_p - half_width
    UB = x_p + half_width
    return LB, UB

In [None]:
# Fit a normal distribution to the untransformed peaks by maximum likelihood,
# then run the PPCC test and display its p-value.
peak_flows = flow_df["peak_va"]

dist = Normal()
dist.fit(peak_flows, "MLE")
dist.ppccTest(peak_flows, "Normal Fit")
dist.p_value_PPCC

What if we use a log-transformation?

In [None]:
# Store the natural log of the peaks as a new column and repeat the normal fit
# and PPCC test on the transformed values.
flow_df["logQ"] = np.log(flow_df["peak_va"])
log_flows = flow_df["logQ"]

dist = Normal()
dist.fit(log_flows, "MLE")
dist.ppccTest(log_flows, "Normal Fit")
dist.p_value_PPCC

That looks pretty good, and we fail to reject the hypothesis that the log-transformed data are normal! Let's proceed with that.

## Looking for seasonality

We'll look for seasonality using the periodogram.

In [None]:
from scipy.signal import periodogram

# look at periodogram of transformed annual maxima
f, P = periodogram(flow_df["logQ"])
plt.plot(f, P)
plt.xlabel('Frequency')
plt.ylabel('Squared Amplitude')

# what are the periods of the largest values?
Psorted = np.argsort(P)[::-1] # sort from largest to smallest
# The first frequency bin is 0, whose period is infinite; suppress the
# divide-by-zero RuntimeWarning that 1/0 would otherwise raise.
with np.errstate(divide='ignore'):
  periods = 1/f[Psorted] # find corresponding periods
periods

Most prominent peak corresponds to a period of 4.8 years, followed by a period of 2.5 years. There's no reason to believe annual maxima would have seasonality at exactly those periods; this could be noise or elements of ENSO, but that will be more cyclic than seasonal. So we'll assume there is no seasonality.

## Looking for Autocorrelation

In [None]:
import statsmodels.api as sm

# ACF (top) and PACF (bottom) of the log-transformed peaks, first 10 lags
fig, (ax_acf, ax_pacf) = plt.subplots(2, 1)

sm.graphics.tsa.plot_acf(flow_df["logQ"], ax=ax_acf)
ax_acf.set_xlim([0,10])

sm.graphics.tsa.plot_pacf(flow_df["logQ"], ax=ax_pacf)
ax_pacf.set_xlim([0,10])

fig.tight_layout()
# plt.show() rather than fig.show(): Figure.show() warns on non-GUI (inline) backends
plt.show()

There is no significant auto-correlation we need to control for either! So we can go straight to testing for a trend.

## Testing for a trend

In [None]:
# Linear trend in time: regress logQ on an intercept and the water year
y = flow_df["logQ"]
years = np.arange(1933, 2025)
X = np.column_stack([np.ones(len(y)), years])

trend_model = sm.OLS(y, X)
trend_result = trend_model.fit()
print(trend_result.summary())

The trend is not statistically significant. What does it look like?

In [None]:
# Observed log-peaks and the fitted trend line on the same axes
plt.plot(flow_df["logQ"], color="tab:blue", label='True Values')
plt.plot(trend_result.fittedvalues, color="tab:red", label='Predictions')
plt.legend(loc='upper left')

Check the regression assumptions of independence, normality, and constant variance of residuals.

### Independence of Residuals

In [None]:
# ACF (top) and PACF (bottom) of the trend-model residuals, first 20 lags
fig, (ax_acf, ax_pacf) = plt.subplots(2, 1)

sm.graphics.tsa.plot_acf(trend_result.resid, ax=ax_acf)
ax_acf.set_xlim([0,20])
ax_acf.set_ylim([-1.2,1.2])

sm.graphics.tsa.plot_pacf(trend_result.resid, ax=ax_pacf)
ax_pacf.set_xlim([0,20])
ax_pacf.set_ylim([-1.2,1.2])

fig.tight_layout()

# plt.show() rather than fig.show(): Figure.show() warns on non-GUI (inline) backends
plt.show()

This looks good - we have removed all autocorrelation (there wasn't any to begin with, so not surprising).

### Normality of Residuals

In [None]:
# Q-Q plot of the trend-model residuals against a fitted normal, with a 45-degree reference line
residuals = trend_result.resid
sm.qqplot(residuals, dist=ss.norm, fit=True, line='45')

This looks good as well. By making the data normal, we ensured the residuals were as well.

### Constant Variance of Residuals

In [None]:
import seaborn as sns

# Residuals vs. fitted values with a LOWESS smoother to check for constant variance
scatter_style = {'alpha': 0.5}
smoother_style = {'color': 'red', 'lw': 1, 'alpha': 0.8}

sns.residplot(
    x=trend_result.fittedvalues,
    y=trend_result.resid,
    lowess=True,
    scatter_kws=scatter_style,
    line_kws=smoother_style,
)

This isn't bad. The mean is relatively constant at 0 (red line is close to 0 throughout). There is a little more variance in the residuals at lower fitted values, though.

# Non-Parametric Trend Test

We were actually able to meet the assumptions of regression fairly well, but what if we did not? Then we'd have to use the Mann-Kendall non-parametric trend test. We can get a function for this from the [pymannkendall](https://pypi.org/project/pymannkendall/) library. Examples on how to use this library can be found [here](https://github.com/mmhs013/pyMannKendall/blob/master/Examples/Example_pyMannKendall.ipynb). We'll show it on this data for illustrative purposes.

In [None]:
!pip install pymannkendall

In [None]:
import pymannkendall as mk
# Mann-Kendall trend test on the log-transformed peaks at the 5% significance level
mk_result = mk.original_test(flow_df["logQ"], alpha=0.05)
mk_result

This also says there is no statistically significant trend. How about a change point?

## Change Point Detection

### For change in mean via regression

In [None]:
import statsmodels.formula.api as smf

# Regression-based scan for a shift in the mean:
#   * "Dummy" is 0 for observations up to a candidate time T and 1 afterwards
#   * for each candidate T, regress logQ on Dummy and keep the Dummy p-value
#   * the T with the smallest p-value is the most likely change point
n_obs = len(flow_df['logQ'])
flow_df['Dummy'] = np.ones(n_obs)
p_values = np.ones(n_obs - 1)

# each pass moves one more observation into the "before" group (Dummy = 0),
# refits the model and records the p-value of the Dummy coefficient
for i, row_label in enumerate(flow_df.index[:-1]):
  flow_df.loc[row_label, "Dummy"] = 0
  p_values[i] = smf.ols(formula='logQ ~ Dummy', data=flow_df).fit().pvalues["Dummy"]

candidate_years = range(1934, 1934 + n_obs - 1)
plt.plot(candidate_years, p_values, c='tab:red', label='p-values')
plt.plot([1934, 1934 + n_obs - 1], [0.05, 0.05], c='k', label='0.05 threshold')
plt.legend(bbox_to_anchor=(0.5, 0.15), loc='center')
plt.xlabel('Water Year')
plt.ylabel('p-value')
plt.title('Change Point Detection')

best = np.argmin(p_values)
print("Minimum p-value: %0.2f" % p_values[best])
print("Most likely change point: %d" % (1934 + best))

The p-value is never below 0.05, so there is no location with a statistically significant change point in the mean of the annual maxima.

### For change in median with Wilcoxon rank sum test

In [None]:
# Wilcoxon rank-sum scan: split the series after each observation and compare
# the segment before the split with the segment after it
log_flows = flow_df["logQ"]
n_obs = len(log_flows)
p_values = np.array([
    ss.ranksums(log_flows.iloc[:split], log_flows.iloc[split:]).pvalue
    for split in range(1, n_obs)
])

candidate_years = range(1934, 1934 + n_obs - 1)
plt.plot(candidate_years, p_values, c='tab:red', label='p-values')
plt.plot([1934, 1934 + n_obs - 1], [0.05, 0.05], c='k', label='0.05 threshold')
plt.legend(bbox_to_anchor=(0.5, 0.15), loc='center')
plt.xlabel('Water Year')
plt.ylabel('p-value')
plt.title('Change Point Detection')

best = np.argmin(p_values)
print("Minimum p-value: %0.2f" % p_values[best])
print("Most likely change point: %d" % (1934 + best))

There are a few locations where the p-value dips below 0.05. The lowest p-value occurs in 2013, indicating this is the most likely time of a change in the median of the annual maxima.

### For change in distribution with K-S test

In [None]:
# Two-sample Kolmogorov-Smirnov scan: split the series after each observation
# and compare the distribution before the split with the one after it
log_flows = flow_df["logQ"]
n_obs = len(log_flows)
p_values = np.array([
    ss.ks_2samp(log_flows.iloc[:split], log_flows.iloc[split:]).pvalue
    for split in range(1, n_obs)
])

candidate_years = range(1934, 1934 + n_obs - 1)
plt.plot(candidate_years, p_values, c='tab:red', label='p-values')
plt.plot([1934, 1934 + n_obs - 1], [0.05, 0.05], c='k', label='0.05 threshold')
plt.legend(bbox_to_anchor=(0.5, 0.15), loc='center')
plt.xlabel('Water Year')
plt.ylabel('p-value')
plt.title('Change Point Detection')

best = np.argmin(p_values)
print("Minimum p-value: %0.2f" % p_values[best])
print("Most likely change point: %d" % (1934 + best))

There are several locations where the p-value dips below 0.05. The lowest p-value occurs in 2013, indicating this is the most likely time of a change in the distribution of the annual maxima.

### For change in mean with bootstrapping

In [None]:
!pip install astropy

In [None]:
from astropy.stats import bootstrap as bootstrap

# Bootstrap scan for a change in the mean: for every split, resample the
# segments before and after it and count how often the "before" mean is larger
nSamples=1000
log_flows = np.array(flow_df["logQ"])
n_obs = len(log_flows)
p_values = np.ones(n_obs - 1)
for i in range(n_obs - 1):
  np.random.seed(i+1)  # fixed seed per split; "before" is resampled first, then "after"
  before_boot = bootstrap(log_flows[:(i+1)], nSamples)
  after_boot = bootstrap(log_flows[(i+1):], nSamples)
  n_larger = np.count_nonzero(
      [np.mean(b) > np.mean(a) for b, a in zip(before_boot, after_boot)]
  )

  # two-sided p-value: twice the smaller tail fraction
  frac = n_larger/nSamples
  p_values[i] = 2*(1-frac) if frac > 0.5 else 2*frac

candidate_years = range(1934, 1934 + n_obs - 1)
plt.plot(candidate_years, p_values, c='tab:red', label='p-values')
plt.plot([1934, 1934 + n_obs - 1], [0.05, 0.05], c='k', label='0.05 threshold')
plt.legend(bbox_to_anchor=(0.5, 0.15), loc='center')
plt.xlabel('Water Year')
plt.ylabel('p-value')
plt.title('Change Point Detection')

best = np.argmin(p_values)
print("Minimum p-value: %0.2f" % p_values[best])
print("Most likely change point: %d" % (1934 + best))

According to bootstrapping, the most likely change in the mean occurs in 1934, but there's only one year before that, so there isn't really enough data before then to be confident in that conclusion.

### For change in variance with bootstrapping

In [None]:
# Bootstrap scan for a change in the variance: for every split, resample the
# segments before and after it and count how often the "before" variance is larger
nSamples=1000
log_flows = np.array(flow_df["logQ"])
n_obs = len(log_flows)
p_values = np.ones(n_obs - 1)
for i in range(n_obs - 1):
  np.random.seed(i+1)  # fixed seed per split; "before" is resampled first, then "after"
  before_boot = bootstrap(log_flows[:(i+1)], nSamples)
  after_boot = bootstrap(log_flows[(i+1):], nSamples)
  n_larger = np.count_nonzero(
      [np.var(b) > np.var(a) for b, a in zip(before_boot, after_boot)]
  )

  # two-sided p-value: twice the smaller tail fraction
  frac = n_larger/nSamples
  p_values[i] = 2*(1-frac) if frac > 0.5 else 2*frac

candidate_years = range(1934, 1934 + n_obs - 1)
plt.plot(candidate_years, p_values, c='tab:red', label='p-values')
plt.plot([1934, 1934 + n_obs - 1], [0.05, 0.05], c='k', label='0.05 threshold')
plt.legend(bbox_to_anchor=(0.5, 0.15), loc='center')
plt.xlabel('Water Year')
plt.ylabel('p-value')
plt.title('Change Point Detection')

best = np.argmin(p_values)
print("Minimum p-value: %0.2f" % p_values[best])
print("Most likely change point: %d" % (1934 + best))

The most likely change point in the variance is again 1934, but there is no variance before that because there is only one data point to sample. This is true at the last year as well. There are several years in between that also show a statistically significant difference before and after, so there isn't an obvious single change point.