# Detecting distribution shifts

## Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.random.seed(0)
from scipy.optimize import fsolve
from scipy.special import erf
from scipy.integrate import quad
from functools import partial
from scipy.stats import binomtest

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from statsmodels.stats.multitest import multipletests


from crepes import ConformalRegressor, ConformalPredictiveSystem

from crepes.fillings import (sigma_variance, 
                            sigma_variance_oob,
                            sigma_knn,
                            binning)

In this notebook we explain how to implement the shift detection algorithm. We start downloading the data and modeling it.

In [2]:
# First we load the dataset
dataset = fetch_openml(name="house_sales",version=3)

X = dataset.data.values.astype(float)
y = dataset.target.values.astype(float)
print('Before normalization range:',y.min(), y.max())

# and also normalize it such that the y values are in the range [0,1]
#y = (np.tanh(np.array([(y[i]-y.min())/(y.max()-y.min()) for i in range(len(y))])*2-1)+1)/2.
#y = np.array([(y[i]-y.min())/(y.max()-y.min()) for i in range(len(y))])
y = np.tanh(y/y.mean())
print('After normalization range:',y.min(), y.max())

  warn(


Before normalization range: 75000.0 7700000.0
After normalization range: 0.13798043023679007 0.9999999999991728


Now we spit the data in the main and shifted data, and form tuples for comparison.

In [3]:
# Then we split the data such that we can later on introduce a distribution shift
X_main, X_shift, y_main, y_shift = X[(4e4 >= X[:, 3])] , X[(4e4 < X[:, 3])] , y[(4e4 >= X[:, 3])] , y[(4e4 < X[:, 3])]

# We want to have an even number of samples, to create tuples
if len(X_shift) % 2 == 1:
    X_shift = X_shift[:-1]
    y_shift = y_shift[:-1]
if len(X_main) % 2 == 1:
    X_main = X_main[:-1]
    y_main = y_main[:-1]

## The input will be two copies of X, one for each house
X_main = X_main.reshape(X_main.shape[0]//2,-1)
X_shift = X_shift.reshape(X_shift.shape[0]//2,-1)
print('After creating tuples',X_main.shape, X_shift.shape)

## We want to estimate the difference between the two y values
y_main = y_main.reshape(y_main.shape[0]//2,-1)
y_main = y_main[:,0] - y_main[:,1]
print(y_main.shape)

## and also in the shifted dataset
y_shift = y_shift.reshape(y_shift.shape[0]//2,-1)
y_shift = y_shift[:,0] - y_shift[:,1]
print(y_shift.shape)

After creating tuples (10202, 42) (604, 42)
(10202,)
(604,)


Finally we create a calibration dataset.

In [4]:
X_train, X_cal, y_train, y_cal = train_test_split(X_main, y_main, test_size=500)

Now we create the model and train it.

In [5]:
# Then we train a random forest regressor on the training set
random_forest_model = RandomForestRegressor(n_jobs=-1, n_estimators=500) 
random_forest_model.fit(X_train, y_train)

## Conformal model

We will target a success probability $1-\delta = 0.95$. Then we train the conformal model on the residuals on the predictions of the random forest.

In [6]:
delta = 0.05

cr_std = ConformalRegressor()
y_hat_cal = random_forest_model.predict(X_cal)
residuals_cal = y_cal - y_hat_cal
# and fit it to the residuals
cr_std.fit(residuals=residuals_cal)

ConformalRegressor(fitted=True, normalized=False, mondrian=False)

We can now make prediction on a given test set.

In [7]:
# Using such model we can now predict the residuals on the test set
y_hat_shift = random_forest_model.predict(X_shift)
intervals_std = cr_std.predict(y_hat=y_hat_shift, confidence=1-delta)

### Optional -- More complex models

The previous models assigns the same uncertainty interval to all predictions. We can alternatively use two more models that callibrate the confidence interval for each prediction based on their heuristic difficulty using k-nearest neighbours.

In [8]:
# Second model based on k-nn
sigmas_cal_knn = sigma_knn(X=X_cal, residuals=residuals_cal)
cr_norm_knn = ConformalRegressor()
cr_norm_knn.fit(residuals=residuals_cal, sigmas=sigmas_cal_knn)
sigmas_test_knn = sigma_knn(X=X_cal, residuals=residuals_cal, X_test=X_shift)
intervals_norm_knn = cr_norm_knn.predict(y_hat=y_hat_shift, 
                                        sigmas=sigmas_test_knn,
                                        y_min=0, y_max=1)

# Third model based on binning
bins_cal, bin_thresholds = binning(values=sigmas_cal_knn, bins=20)
cr_mond = ConformalRegressor()
cr_mond.fit(residuals=residuals_cal, bins=bins_cal)
bins_test = binning(values=sigmas_test_knn, bins=bin_thresholds)
intervals_mond = cr_mond.predict(y_hat=y_hat_shift, bins=bins_test, 
                                        y_min=0, y_max=1)

We can compute the coverage of the three models

In [9]:
coverages = []
mean_sizes = []
median_sizes = []

prediction_intervals = {
    "Std CR":intervals_std,
    "Norm CR knn":intervals_norm_knn,
    "Mond CR":intervals_mond,
}

for name in prediction_intervals.keys():
    intervals = prediction_intervals[name]
    coverages.append(np.sum([1 if (y_shift[i]>=intervals[i,0] and 
                                   y_shift[i]<=intervals[i,1]) else 0 
                            for i in range(len(y_shift))])/len(y_shift))
    mean_sizes.append((intervals[:,1]-intervals[:,0]).mean())
    median_sizes.append(np.median((intervals[:,1]-intervals[:,0])))

pred_int_df = pd.DataFrame({"Coverage":coverages, 
                            "Mean size":mean_sizes, 
                            "Median size":median_sizes}, 
                           index=list(prediction_intervals.keys()))

pred_int_df.loc["Mean"] = [pred_int_df["Coverage"].mean(), 
                           pred_int_df["Mean size"].mean(),
                           pred_int_df["Median size"].mean()]

display(pred_int_df.round(4))

Unnamed: 0,Coverage,Mean size,Median size
Std CR,0.8791,0.4294,0.4294
Norm CR knn,0.4603,0.2744,0.2657
Mond CR,0.4354,0.2328,0.2333
Mean,0.5916,0.3122,0.3095


## Auxiliary functions

In the previous section we have used pre-defined conformal models to compute the confidence intervals. We can also assume gaussian intervals derived from these residuals and reverse engineer the confidence intervals.

First we define function that computes the standard deviation from an interval and $1-\delta$.

In [10]:
def compute_sigma(delta, initial_interval, mean):
    """ Computes the standard deviation of a normal distribution such that the probability of the interval is 1-delta"""
    def cdf(sigma_):
        return 0.5*(erf((initial_interval[1]-mean)/(np.sqrt(2)*sigma_))-erf((initial_interval[0]-mean)/(np.sqrt(2)*sigma_))) - (1-delta)

    sigma = fsolve(cdf, (initial_interval[1]-initial_interval[0])/2)[0]
    return sigma

Then we define a function to compute the interval in which the (assumed normal) probability distribution is larger than $\alpha$. In other words, $\mathcal{C}_\alpha$.

In [11]:
def confidence_interval(mean, sigma, alpha, initial_interval_guess):
    """ We want to compute the conformal integral in which the probability density is larger than alpha. """
    def normal_distribution(x, mean, sigma):
        return np.exp(-((x-mean)**2)/(2*sigma**2))

    interval_alpha = fsolve(lambda x: normal_distribution(x, mean, sigma) - alpha, x0 = initial_interval_guess)
    return interval_alpha

Finally, we define a function to integrate the loss on a given interval.

In [12]:
def compute_loss(y_hat, y_gt, sigma, interval_alpha):
    """ Integrates the  quadratic loss over the interval defined by the probability density being larger than alpha."""
    def normal_loss(x, y, mean, sigma):
        return np.exp(-((x-mean)**2)/(2*sigma**2))*np.abs(x-y)

    normal_l = partial(normal_loss, y=y_gt, mean=y_hat, sigma = sigma)
    loss = quad(normal_l, interval_alpha[0], interval_alpha[1])[0]
    return loss

We also want to compute the p-value of the hypothesis $\mathcal{H}_\alpha$ that the risk $R(\alpha)\leq \lambda$ for a chosen $\lambda$.

In [13]:
lambda_ = 0.5

def pvalue(y_gt, y_hat, sigma, lambda_):
    """ Computes the p-value of the hypothesis that the risk is lower than some \lambda."""

    # For each y compute confidence interval first compute the loss
    alpha = np.linspace(0.01, 0.99, 100)
    interval_alpha = confidence_interval(mean = y_hat, sigma = sigma, alpha = alpha, initial_interval_guess=[y_hat - sigma, y_hat+sigma])
    loss = compute_loss(y_hat, y_gt, sigma, interval_alpha)

    # Step 1: Compute p-values
    pvalues = np.exp(-2*len(y_hat)*(lambda_ - loss)**2)

    # Step 2: Family-wise error correction
    reject, pvals_corrected, _, bonferroni_delta = multipletests(pvalues, delta, method = 'bonferroni')

    return pvals_corrected, reject

## Full conformal prediction and inductive conformal predictors


We are going to compute the loss of exchanging one element of the training set with the shifted data. Then we will compute the corresponding p-values.

In [14]:
new_x = np.expand_dims(X_shift[0], axis = 0)
new_y = np.expand_dims(y_shift[0], axis = 0)

alpha = 1e-5
ncal = len(y_cal)

# First we fit a model to the data
random_forest_model = RandomForestRegressor(n_estimators=100)
random_forest_model.fit(X_train, y_train)

def find_lamdas(new_x, new_y, delta, random_forest_model: RandomForestRegressor, X_cal, y_cal):
    lambdas_i = []
    ncal = len(X_cal)
    for i in range(ncal + 1):
        if i < ncal:
            j = np.random.randint(0, len(X_cal))
        
            X_cal_j = np.concatenate([X_cal[:j],new_x, X_cal[j+1:]])
            y_cal_j = np.concatenate([y_cal[:j],new_y, y_cal[j+1:]])

            X_test_j = np.expand_dims(X_cal[j], axis = 0)
            y_test_j = np.expand_dims(y_cal[j], axis = 0)

        else:
            X_cal_j = X_cal
            y_cal_j = y_cal

            X_test_j = new_x
            y_test_j = new_y

        # We can now compute the residuals
        y_hat_cal_j = random_forest_model.predict(X_cal_j)
        residuals_cal = y_cal_j - y_hat_cal_j

        # We can now fit a model to the residuals
        cr_std = ConformalRegressor()
        cr_std.fit(residuals=residuals_cal)

        # Compute confidence intervals on sample X_train[j]
        y_hat_j = random_forest_model.predict(X_test_j)
        interval_j_std = cr_std.predict(y_hat=y_hat_j, confidence=1-delta)[0]

        # Compute loss
        sigma = compute_sigma(delta = delta, initial_interval = interval_j_std, mean = y_hat_j)
        #interval_alpha = confidence_interval(mean = y_hat_j, sigma = sigma, alpha = alpha, initial_interval_guess = interval_j_std)
        interval_alpha = interval_j_std
        lambda_i = compute_loss(y_hat_j, y_test_j, sigma, interval_alpha)

        # Compute lambda_i
        lambdas_i.append(lambda_i)

        # Save the last sigma, corresponding to the new sample, which we will use later on.
        if i == ncal: 
            sigma_new = sigma
            interval_alpha_new = interval_alpha

    lambdas_i, lambda_y = lambdas_i[:-1], lambdas_i[-1]

    return lambdas_i, lambda_y, sigma_new, interval_alpha_new

lambdas_i, lambda_y, sigma_new, interval_alpha_new = find_lamdas(new_x, new_y, delta, random_forest_model, X_cal, y_cal)



We now want to compute the possible values of y such that the corresponding p-value is larger than $\epsilon$, $p^y \geq \epsilon$ for 
$$p^y := \frac{|\{ i \in 0,\ldots,n_{cal}| \lambda_i \leq \lambda_y\}|+1}{n_{cal}+1}\geq \epsilon.$$
First we compute how the p-value should be


In [15]:
epsilon = 0.05
lambda_y_lower_bound = np.quantile(lambdas_i, (epsilon*(ncal + 1) - 1)/len(lambdas_i))
if lambda_y < lambda_y_lower_bound:
    print("We detect distribution shift, based on the p-value above")
else:
    print("We do not detect distribution shift, based on the p-value above")

We do not detect distribution shift, based on the p-value above


We now want to find the values of y that satisfies such bound

In [16]:
compute_loss_partial = partial(compute_loss, y_gt = new_y, sigma = sigma_new, interval_alpha = interval_alpha_new)

y_lower_bound = fsolve(lambda x: compute_loss_partial(x) - lambda_y_lower_bound, x0 = new_y-0.5)
y_upper_bound = fsolve(lambda x: compute_loss_partial(x) - lambda_y_lower_bound, x0 = new_y+0.5)

In practice we want to compute, for each value of x in the shifted dataset, the corresponding bounds on y

In [17]:
def compute_bounds(x, y, epsilon, delta, delta_prime, X_cal, Y_cal, model):
    """ Computes the bounds on y for a given x."""
    # Compute the lambda_i values in the full conformal model
    find_lamdas_partial = partial(find_lamdas, delta = delta, random_forest_model = random_forest_model, X_cal = X_cal, y_cal = y_cal)
    lambdas_i, lambda_y, sigma_new, interval_alpha_new = find_lamdas_partial(x, y)

    # Compute the lower bound on lambda_y
    ncal = len(X_cal)
    epsilon = epsilon + np.sqrt(-np.log(delta_prime)/(2*ncal))
    lambda_y_lower_bound = np.quantile(lambdas_i, (epsilon*(ncal + 1) - 1)/len(lambdas_i))

    # Compute the bounds on y
    compute_loss_partial = partial(compute_loss, y_gt = y, sigma = sigma_new, interval_alpha = interval_alpha_new)
    y_lower_bound = fsolve(lambda x: compute_loss_partial(x) - lambda_y_lower_bound, x0 = y-0.5) #todo: this does not seem to work very well
    y_upper_bound = fsolve(lambda x: compute_loss_partial(x) - lambda_y_lower_bound, x0 = y+0.5)

    return y_lower_bound, y_upper_bound


Now we can check whether the shifted dataset is in the confidence interval of the full conformal model with probability $1-\epsilon$.

In [18]:
X_shift, y_shift = X_shift[:10], y_shift[:10]

In [19]:
from tqdm import tqdm

cummulative = 0
delta_prime = 1/2**4 # This is the delta in the definition of the inductive conformal predictor.
delta = 1/2**4 # This is the delta used in the for the estimation of uncertainty intervals in the conformal model, to integrate the loss

for new_x, new_y in tqdm(zip(X_shift, y_shift)):
    new_x, new_y = np.expand_dims(new_x, axis = 0), np.expand_dims(new_y, axis = 0)
    lower_bound, upper_bound = compute_bounds(new_x, new_y, epsilon, delta, delta_prime, X_cal, y_cal, random_forest_model)
    if new_y > lower_bound and new_y < upper_bound: cummulative += 1

cummulative /= len(X_shift)
if cummulative > 1 - epsilon: #todo: study the certainty of this statement
    print('No distribution shift detected')
else:
    print('Distribution shift detected')

10it [10:11, 61.18s/it]

Distribution shift detected





We can furthermore compute the p-value of the hypothesis that the shifted dataset is in the confidence interval of the full conformal model, using a binomial test.

In [27]:
test = binomtest(int(cummulative*len(X_shift)), len(X_shift), 1-epsilon, alternative='greater')
print('p-value: ', test.pvalue)
print('Confidence interval: ', test.proportion_ci())

p-value:  1.0
Confidence interval:  ConfidenceInterval(low=0.0, high=1.0)


**Important question}: here we want to disprove that with probability $1-\delta$ sampled from all possible training sets, the loss of the test set is larger than $\lambda$ with probability $1-\epsilon$. Would we have to run many binomial tests, get the pvalue of each of them and then compute the p-value over the hypothesis that the failure happens with probability $1-\delta$?**

In [None]:
new_y, lower_bound, upper_bound

Note that a simpler non-full conformal version of this test can be found in the appendix D of Angelopolous paper. A slightly modified version is the following: If $(X_i, Y_i)$ and $X_{test}, Y_{test}$ are iid, and we define
$$\hat{\lambda} = \inf\left\{\lambda: \frac{|\{i: L(X, Y)\leq \lambda\}|}{n}\geq \frac{\lceil (n+1)(1-\epsilon) \rceil}{n}\right\}, $$
the probability of $Y_{test}$ being in the conformal prediction set of $X_{test}$ is
$$\mathbb{P}(Y_{test}\in \mathcal{C}_{\hat{\lambda}}(X_{test})) \geq 1-\epsilon, $$
where 
$$\mathcal{C}_{\hat{\lambda}}(X_{test}) = \{y: L(X,y)\leq \hat{\lambda}\}.$$
Thus a simple binomial test can be used to compute the p-value of the hypothesis that the shifted dataset is in the confidence interval of the full conformal model. Eg, if $$\mathbb{P}(Y_{test}\in \mathcal{C}_{\hat{\lambda}}(X_{test})) \geq 1-\epsilon$$

## Time-stratified loss metric

An alternative easy way is to check whether the loss of in the new dataset is larger than in the original model.

In [21]:
y_train_hat = random_forest_model.predict(X_train)
intervals_y_train = cr_std.predict(y_hat=y_hat_shift, confidence=1-delta)

In [22]:
def quick_loss(y_hat, y_train, alpha, intervals, delta):
    """ Computes the loss function for a given sample."""
    
    sigma = compute_sigma(delta, intervals, y_train)
    interval_alpha = confidence_interval(y_hat, sigma, alpha, [y_hat - sigma, y_hat + sigma])
    return compute_loss(y_hat, y_train, sigma, interval_alpha)


In [23]:
losses = []
for j in tqdm(range(20)):
    X_train_subset = X_train[j*485:(j+1)*485]
    y_train_subset = y_train_hat[j*485:(j+1)*485]

    y_train_hat_subset = random_forest_model.predict(X_train_subset)
    intervals_y_train = cr_std.predict(y_hat=y_train_hat_subset, confidence=1-delta)
    losses_ = []
    for i in range(len(y_train_subset)):
        losses_.append(quick_loss(y_train_hat_subset[i], y_train_subset[i], alpha, intervals_y_train[i], delta))

    loss_train = np.mean(losses_)
    losses.append(loss_train)

y_shift_hat = random_forest_model.predict(X_shift)
intervals_shift = cr_std.predict(y_hat=y_shift_hat, confidence=1-delta)
loss_test = np.mean([quick_loss(y_shift_hat[i], y_shift[i], alpha, intervals_shift[i], delta) for i in range(len(y_shift))])

if loss_test > np.quantile(losses, 0.95): #todo: change this by certainty level
    print('Distribution shift detected')
else:
    print('No distribution shift detected')

100%|██████████| 20/20 [00:18<00:00,  1.10it/s]

Distribution shift detected



  improvement from the last ten iterations.


In [24]:
loss_test

0.13951661722836595

In [25]:
losses

[0.024523803530562934,
 0.02452380353056293,
 0.024523803530562934,
 0.024523803530562934,
 0.02452380353056293,
 0.024523803530562934,
 0.02452380353056293,
 0.02452380353056293,
 0.024523803530562934,
 0.02452380353056293,
 0.024523803530562934,
 0.02452380353056293,
 0.024523803530562934,
 0.02452380353056293,
 0.02452380353056293,
 0.02452380353056293,
 0.02452380353056293,
 0.02452380353056293,
 0.02452380353056293,
 0.02452380353056293]

In [None]:
y_train