In [1]:
import numpy as np
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.palettes import Pastel1
from bokeh.models import Span

In [2]:
# @title
def plot_data(dist_a, dist_b, dist_c, drifts=None, warnings=None):
    """Scatter-plot three consecutive stream segments and mark detections.

    The segments are laid end to end on the x axis: ``dist_a`` starts at
    sample 0, ``dist_b`` starts where ``dist_a`` ends, and ``dist_c`` starts
    where ``dist_b`` ends. Segment lengths are taken from the data, so the
    function works for segments of any (possibly different) size.

    Parameters
    ----------
    dist_a, dist_b, dist_c : sized sequences of numbers
        Values of the first, second and third concept (must support ``len``).
    drifts : iterable of numbers, optional
        Sample positions at which a solid red vertical line is drawn.
    warnings : iterable of numbers, optional
        Sample positions at which a dashed blue vertical line is drawn.

    Returns
    -------
    None
        The figure is rendered with ``show``; ``output_notebook`` is called
        on every invocation.
    """
    output_notebook()
    palette = Pastel1[3]

    graph = figure(width=900, height=400,
                   tools="pan,box_zoom,reset,save",
                   title="drift stream",
                   x_axis_label='samples', y_axis_label='value',
                   background_fill_color="#fafafa")

    # One scatter renderer per concept. `offset` tracks where the next
    # segment begins so that x and y always have the same length.
    offset = 0
    labelled_segments = (
        (dist_a, "concept_a"),
        (dist_b, "concept_b"),
        (dist_c, "concept_c"),
    )
    for (segment, label), color in zip(labelled_segments, palette):
        n_points = len(segment)
        graph.scatter(x=list(range(offset, offset + n_points)), y=segment,
                      legend_label=label,
                      fill_color=color, line_color=color, size=4, marker='circle')
        offset += n_points

    # Vertical markers: drifts first, then warnings (same order as the
    # layouts were originally added).
    marker_styles = (
        (drifts, {'line_color': 'red', 'line_width': 2}),
        (warnings, {'line_color': 'blue', 'line_width': 2, 'line_dash': 'dashed'}),
    )
    for locations, line_props in marker_styles:
        if locations is None:
            continue
        for location in locations:
            graph.add_layout(Span(location=location, dimension='height', **line_props))

    show(graph)

In [3]:
np.random.seed(42)  # seed NumPy's global random state

# Three constant-valued concepts of 1000 samples each: 0, then 2, then 1.
concept_a = np.zeros(1000, dtype=int)
concept_b = np.full_like(concept_a, 2)
concept_c = np.ones(1000, dtype=int)

# Join the concepts back to back, so the value changes at samples 1000 and 2000.
data_stream = np.concatenate([concept_a, concept_b, concept_c])

In [None]:
# Show the raw stream, without any detection markers.
plot_data(dist_a=concept_a, dist_b=concept_b, dist_c=concept_c)

As observed above, the data stream has **2 drifts**.

The goal is to detect that drift has occurred after samples **1000** and **2000**.

## CUSUM
---
It gives an alarm when the mean of the input data is significantly different from zero.
- $sum_0 = 0$
- update the running mean $\hat{x}$ with $x_t$
- $sum_t = \max(0,\ sum_{t-1} + (x_t - \hat{x}) - \delta)$
- $n \leftarrow n + 1$
- if $n > min_{obs}$ and $sum_t > \lambda$: signal a change

Use $\delta=0.005$, $\lambda=50$, and $min_{obs}=30$

In [None]:
class CUSUM():
    """One-sided CUSUM change detector for a stream of numeric values.

    Each call to :meth:`update` refreshes a running mean of the values seen
    so far and accumulates the deviation of the new value from that mean,
    minus a slack ``delta``. The accumulator is clamped at zero, so only
    upward shifts build up. ``drift_detected`` is raised once the
    accumulator exceeds ``lamb`` and enough samples have been seen; it stays
    raised until :meth:`reset` is called.

    Parameters
    ----------
    delta : float
        Slack subtracted from every deviation before it is accumulated.
    lamb : float
        Threshold the accumulated sum must exceed to signal a drift.
    min_obs : int
        Minimum value the internal counter must reach before a drift can be
        signalled. The counter starts at 1 and is incremented before the
        check, so the check first passes on sample number ``min_obs - 1``.

    Attributes
    ----------
    drift_detected : bool
        True once a drift has been signalled since the last reset.
    warning_detected : bool
        Never set by this class; kept so the detector exposes the same two
        flags and is cleared by :meth:`reset`.
    """

    def __init__(self, delta, lamb, min_obs):
        # Hyper-parameters survive a reset; the running state does not.
        self._delta = delta
        self._lambda = lamb
        self._min_obs = min_obs
        self.reset()

    def update(self, value):
        """Feed one observation and raise ``drift_detected`` if warranted."""
        # Incremental mean: `_n` is the 1-based index of the current sample.
        self._x_mean += (value - self._x_mean) / self._n
        # Accumulate the positive deviation from the mean, floored at zero.
        self._sum = max(0, self._sum + value - self._x_mean - self._delta)
        self._n += 1

        if self._n >= self._min_obs and self._sum > self._lambda:
            self.drift_detected = True

    def reset(self):
        """Forget all running statistics and clear both detection flags."""
        self._n = 1
        self._x_mean = 0.0
        self._sum = 0.0
        self.warning_detected = False
        self.drift_detected = False

In [None]:
drift_detector = CUSUM(delta=0.005, lamb=50, min_obs=30)
drifts = []

# Feed the stream to the detector one sample at a time.
for i, val in enumerate(data_stream):
    drift_detector.update(val)
    if not drift_detector.drift_detected:
        continue

    print(f'Change detected at index {i}')
    drifts.append(i)
    # Start afresh after each detection so the next concept is judged on its own.
    drift_detector.reset()

Change detected at index 1025


In [None]:
# Overlay the positions flagged by CUSUM on the stream.
plot_data(concept_a, concept_b, concept_c, drifts=drifts)

## Page Hinkley
---
In this example, we will use the [Page Hinkley](https://riverml.xyz/0.21.2/api/drift/PageHinkley/) drift detection method. This change detection method works by computing the observed values and their mean up to the current moment. Page-Hinkley does not signal warning zones, only change detections. The method works by means of the Page-Hinkley test. In general terms, it detects a concept drift if the observed mean at some instant is greater than a threshold value lambda.

In [6]:
from river.drift import PageHinkley

drift_detector = PageHinkley()
drifts = []

# Feed the stream to the detector one sample at a time and record every alarm.
for i, val in enumerate(data_stream):
    drift_detector.update(val)
    if not drift_detector.drift_detected:
        continue

    print(f'Change detected at index {i}')
    drifts.append(i)

Change detected at index 1025
Change detected at index 2051


In [7]:
# Overlay the positions flagged by Page-Hinkley on the stream.
plot_data(concept_a, concept_b, concept_c, drifts=drifts)

## SEA Generator

SEA synthetic dataset.

Each observation is composed of 3 features. Only the first two features are relevant. The target is binary, and is positive if the sum of the features exceeds a certain threshold. There are 4 thresholds to choose from. Concept drift can be introduced by switching the threshold anytime during the stream.

- **Variant 0**: True if att1+att2>8

- **Variant 1**: True if att1+att2>9

- **Variant 2**: True if att1+att2>7

- **Variant 3**: True if att1+att2>9.5

**Parameter**:
- **noise**: determines the amount of observations for which the target sign will be flipped.

In [8]:
# @title
def plot_data2(stream, drifts=None, warnings=None):
    """Scatter-plot a series (e.g. a running error rate) and mark detections.

    The x axis is the sample position ``0 .. len(stream) - 1``; it is derived
    from the data, so series of any length can be plotted.

    Parameters
    ----------
    stream : sized sequence of numbers
        Values to plot, one per sample (must support ``len``).
    drifts : iterable of numbers, optional
        Sample positions at which a solid red vertical line is drawn.
    warnings : iterable of numbers, optional
        Sample positions at which a dashed blue vertical line is drawn.

    Returns
    -------
    None
        The figure is rendered with ``show``; ``output_notebook`` is called
        on every invocation.
    """
    output_notebook()
    point_color = Pastel1[3][0]

    graph = figure(width=900, height=400,
                   tools="pan,box_zoom,reset,save",
                   title="Error rate",
                   x_axis_label='samples', y_axis_label='value',
                   background_fill_color="#fafafa")

    # x must have exactly as many entries as y, whatever the series length.
    graph.scatter(x=list(range(len(stream))), y=stream, legend_label="error_rate",
                  fill_color=point_color, line_color=point_color,
                  size=4, marker='circle')

    # Vertical markers: drifts first, then warnings (same order as the
    # layouts were originally added).
    marker_styles = (
        (drifts, {'line_color': 'red', 'line_width': 2}),
        (warnings, {'line_color': 'blue', 'line_width': 2, 'line_dash': 'dashed'}),
    )
    for locations, line_props in marker_styles:
        if locations is None:
            continue
        for location in locations:
            graph.add_layout(Span(location=location, dimension='height', **line_props))

    show(graph)

In [24]:
from itertools import chain
from river.datasets import synth
from river.drift import ADWIN
from river.naive_bayes import GaussianNB

# Three consecutive 1000-sample SEA segments. They share the same variant and
# seed and differ only in the `noise` argument (0.0, then 0.5, then 0.1).
segments = [
    synth.SEA(variant=0, noise=noise, seed=42).take(1000)
    for noise in (0.0, 0.5, 0.1)
]

# Materialise the samples into a list. A bare `chain.from_iterable(...)` is a
# single-pass iterator: the first detector loop would exhaust it and every
# later loop over `stream` would silently process nothing.
stream = list(chain.from_iterable(segments))

## ADWIN
---
In this example, we will use the [ADaptive WINdowing (`ADWIN`)](https://riverml.xyz/0.21.2/api/drift/ADWIN/) drift detection method.

In [19]:
from river.evaluate import progressive_val_score
from river.metrics import Accuracy

adwin = ADWIN()
model = GaussianNB()
metric = Accuracy()

drifts, error_rate = [], []
cum_err = 0

for i, (x, y) in enumerate(stream, start=1):
    # Prequential protocol: predict on the sample first, then learn from it.
    y_pred = model.predict_one(x)
    model.learn_one(x, y)

    # The detector monitors the 0/1 misclassification signal.
    err = int(y_pred != y)
    adwin.update(err)

    # Running error rate over the samples seen so far.
    cum_err += err
    error_rate.append(cum_err / i)

    if not adwin.drift_detected:
        continue
    print(f"Change detected at index {i}")
    drifts.append(i)

Change detected at index 1056
Change detected at index 2080


In [20]:
# Running error rate with the ADWIN detections overlaid.
plot_data2(error_rate, drifts=drifts)

## DDM
---
In this example, we will use the [DDM](https://riverml.xyz/0.21.2/api/drift/binary/DDM/) drift detection method. It is based on the PAC learning model premise that the learner's error rate will decrease as the number of analysed samples increases, as long as the data distribution is stationary.

If the algorithm detects an increase in the error rate that surpasses a calculated threshold, either a change is detected or the algorithm warns the user that a change may occur in the near future; the latter state is called the warning zone.

In [22]:
from river.drift.binary import DDM

ddm = DDM()
model = GaussianNB()
metric = Accuracy()

drifts, error_rate = [], []
cum_err = 0

for i, (x, y) in enumerate(stream, start=1):
    # Prequential protocol: predict on the sample first, then learn from it.
    y_pred = model.predict_one(x)
    model.learn_one(x, y)

    # The detector monitors the 0/1 misclassification signal.
    err = int(y_pred != y)
    ddm.update(err)

    # Running error rate over the samples seen so far.
    cum_err += err
    error_rate.append(cum_err / i)

    if not ddm.drift_detected:
        continue
    print(f"Change detected at index {i}")
    drifts.append(i)

Change detected at index 1035


In [23]:
# Running error rate with the DDM detections overlaid.
plot_data2(error_rate, drifts=drifts)

## EDDM
---
In this example, we will use the [EDDM](https://riverml.xyz/0.21.2/api/drift/binary/EDDM/) drift detection method. It works by keeping track of the average distance between two errors instead of only the error rate. For this, it is necessary to keep track of the running average distance and the running standard deviation, as well as the maximum distance and the maximum standard deviation.

In [25]:
from river.drift.binary import EDDM

eddm = EDDM()
model = GaussianNB()
metric = Accuracy()

drifts, error_rate = [], []
cum_err = 0

for i, (x, y) in enumerate(stream, start=1):
    # Prequential protocol: predict on the sample first, then learn from it.
    y_pred = model.predict_one(x)
    model.learn_one(x, y)

    # The detector monitors the 0/1 misclassification signal.
    err = int(y_pred != y)
    eddm.update(err)

    # Running error rate over the samples seen so far.
    cum_err += err
    error_rate.append(cum_err / i)

    if not eddm.drift_detected:
        continue
    print(f"Change detected at index {i}")
    drifts.append(i)

Change detected at index 1028
Change detected at index 1180
Change detected at index 1258
Change detected at index 1323
Change detected at index 1437


In [26]:
# Running error rate with the EDDM detections overlaid.
plot_data2(error_rate, drifts=drifts)