Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add named function kernels for sliding windows #86

Merged
merged 5 commits into from Nov 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.rst
Expand Up @@ -94,7 +94,7 @@ To manually install this package:
Help and Support
----------------

This project is still young. The documentation is still growing. In the meantime please
This project is still young. The `documentation <https://vectorizers.readthedocs.io/en/latest/>`_ is still growing. In the meantime please
`open an issue <https://github.com/TutteInstitute/vectorizers/issues/new>`_
and we will try to provide any help and guidance that we can. Please also check
the docstrings on the code, which provide some descriptions of the parameters.
Expand Down
34 changes: 34 additions & 0 deletions azure-pipelines.yml
@@ -0,0 +1,34 @@
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python

# Run CI on every push to master.
trigger:
- master

# Single Ubuntu agent pool; the version matrix below fans the job out.
pool:
vmImage: ubuntu-latest
strategy:
matrix:
Python37:
python.version: '3.7'
Python38:
python.version: '3.8'
Python39:
python.version: '3.9'

steps:
# Select the interpreter for this matrix entry.
- task: UsePythonVersion@0
inputs:
versionSpec: '$(python.version)'
displayName: 'Use Python $(python.version)'

# Install the package's runtime dependencies.
- script: |
python -m pip install --upgrade pip
pip install -r requirements.txt
displayName: 'Install dependencies'

# Run the test suite with Azure-friendly pytest reporting.
- script: |
pip install pytest pytest-azurepipelines
pytest vectorizers/tests
displayName: 'pytest'
73 changes: 71 additions & 2 deletions vectorizers/_window_kernels.py
@@ -1,8 +1,7 @@
import numpy as np
import numba

from vectorizers.utils import flatten

EPSILON = 1e-8

# The window function

Expand Down Expand Up @@ -181,3 +180,73 @@ def gaussian_weight_kernel(n_cols, sigma, *kernel_params):
"weight": weight_kernel,
"gaussian_weight": gaussian_weight_kernel,
}

# Copied from the SciPy implementation
@numba.njit()
def binom(n, k):
    """Exact binomial coefficient C(n, k) via integer arithmetic.

    Arguments are truncated to ints so float-valued window entries can be
    passed straight in. Returns 0 when k > n or either argument is negative.
    """
    n = int(n)
    k = int(k)

    # Out-of-range combinations contribute nothing.
    if k > n or n < 0 or k < 0:
        return 0

    # Use the symmetry C(n, k) == C(n, n - k) to minimize the term count.
    terms = min(k, n - k)
    top = n + 1

    num = 1
    den = 1
    for i in range(1, terms + 1):
        num *= top - i
        den *= i

    # The product of any `terms` consecutive integers is divisible by
    # terms!, so this floor division is exact.
    return num // den

# A couple of changepoint based kernels that can be useful. The goal
# is to detect changepoints in sequences of count or time-interval
# data (where the intervals are between events).
#
# We can model count data with Poisson distributions, and interval data as
# inter-arrival times (which we can convert to count-like data by taking
# reciprocals).
#
# Essentially we start with a baseline prior given by a gamma distribution,
# and then update the prior with the data in the window up to, but not
# including, the last element. The return value is then the predictive
# posterior (a negative binomial) of observing the final element of
# the window.

def count_changepoint_kernel(alpha=1.0, beta=1):
    """Build a Gamma-Poisson changepoint kernel for count data.

    The leading window elements update a Gamma(alpha, beta) prior on the
    Poisson rate; the returned kernel yields the negative log of the
    negative-binomial predictive probability of the window's final element,
    so larger scores flag more surprising (changepoint-like) observations.

    Parameters
    ----------
    alpha: float (default 1.0)
        Shape parameter of the gamma prior.
    beta: float (default 1)
        Rate parameter of the gamma prior.

    Returns
    -------
    A numba-jitted kernel mapping a 1d count window to a length-1 array.
    """

    @numba.njit()
    def _kernel(window):
        history = window[:-1]
        latest = window[-1]
        # Posterior gamma parameters after observing the history.
        post_alpha = alpha + history.sum()
        post_beta = beta + len(history)
        # Negative-binomial predictive distribution parameters.
        nb_r = post_alpha
        nb_p = 1.0 / (1.0 + post_beta)

        prob = (
            binom(latest + nb_r - 1, latest)
            * (1 - nb_p) ** nb_r
            * nb_p ** latest
        )

        return np.array([-np.log(prob)])

    return _kernel

def inter_arrival_changepoint_kernel(alpha=1.0, beta=1):
    """Build a changepoint kernel for inter-arrival (timespan) data.

    Timespans are converted to count-like rates by taking reciprocals
    (EPSILON guards against division by zero); thereafter the computation
    matches count_changepoint_kernel: the window's history updates a
    Gamma(alpha, beta) prior and the kernel returns the negative log of the
    negative-binomial predictive probability of the final element.

    Parameters
    ----------
    alpha: float (default 1.0)
        Shape parameter of the gamma prior.
    beta: float (default 1)
        Rate parameter of the gamma prior.

    Returns
    -------
    A numba-jitted kernel mapping a 1d timespan window to a length-1 array.
    """

    @numba.njit()
    def _kernel(window):
        # Reciprocals turn inter-arrival times into rate-like counts.
        history = 1.0 / (window[:-1] + EPSILON)
        latest = 1.0 / (window[-1] + EPSILON)
        post_alpha = alpha + history.sum()
        post_beta = beta + len(history)
        nb_r = post_alpha
        nb_p = 1.0 / (1.0 + post_beta)

        prob = (
            binom(latest + nb_r - 1, latest)
            * (1 - nb_p) ** nb_r
            * nb_p ** latest
        )

        return np.array([-np.log(prob)])

    return _kernel

# Registry mapping public kernel names to their factory functions; consumed
# by the sliding-window transformer when resolving named function kernels.
_SLIDING_WINDOW_FUNCTION_KERNELS = {
    "count_changepoint" : count_changepoint_kernel,
    "timespan_changepoint": inter_arrival_changepoint_kernel,
}
13 changes: 12 additions & 1 deletion vectorizers/tests/test_transformers.py
Expand Up @@ -37,6 +37,10 @@
np.random.random(size=44),
]

# Synthetic changepoint fixture: a low-rate Poisson count sequence with a
# single injected spike (value 10) at a random position.
changepoint_position = np.random.randint(11, 100) # changepoint position must be at least window_width into the sequence
changepoint_sequence = np.random.poisson(0.75, size=100)
changepoint_sequence[changepoint_position] = 10


@pytest.mark.parametrize("include_column_name", [True, False])
@pytest.mark.parametrize("unique_values", [True, False])
Expand Down Expand Up @@ -291,7 +295,7 @@ def test_sliding_window_transformer_basic(pad_width, kernel, sample):
("weight", np.array([0.1, 0.75, 1.5, 1.0, 0.25])),
("gaussian_weight", 2),
np.random.random((5, 5)),
numba.njit(lambda x: x.cumsum()),
numba.njit(lambda x: x.cumsum(), cache=True),
],
)
@pytest.mark.parametrize("sample", [None, np.arange(5), [4, 1, 3, 2, 0]])
Expand Down Expand Up @@ -320,6 +324,13 @@ def test_sliding_window_generator_matches_transformer(pad_width, kernel, sample)
for j, point in enumerate(point_cloud):
assert np.allclose(point, generator_result[i][j])

@pytest.mark.parametrize("window_width", [5, 10])
def test_sliding_window_count_changepoint(window_width):
    """The count_changepoint kernel should score highest on the window
    whose final element is the injected spike."""
    transformer = SlidingWindowTransformer(
        window_width=window_width,
        kernels=[("count_changepoint", 1.0, 2.0)],
    )
    scores = transformer.fit_transform([changepoint_sequence])[0].flatten()
    # Window i ends at sequence index i + window_width - 1.
    assert np.argmax(scores) + window_width - 1 == changepoint_position

@pytest.mark.parametrize("pad_width", [0, 1])
@pytest.mark.parametrize(
Expand Down
57 changes: 47 additions & 10 deletions vectorizers/transformers/sliding_windows.py
Expand Up @@ -3,7 +3,7 @@
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from vectorizers._window_kernels import _SLIDING_WINDOW_KERNELS
from vectorizers._window_kernels import _SLIDING_WINDOW_KERNELS, _SLIDING_WINDOW_FUNCTION_KERNELS


@numba.njit(nogil=True)
Expand Down Expand Up @@ -93,19 +93,54 @@ def _kernel_func(data):

def build_callable_kernel(kernel_list, test_window):
    """Resolve a list of function-kernel specs into one composed callable.

    Parameters
    ----------
    kernel_list: sequence
        Each entry is either a callable kernel, a name registered in
        _SLIDING_WINDOW_FUNCTION_KERNELS, or a tuple/list of
        (name_or_callable, *params) where params are forwarded to the
        named kernel's factory.
    test_window: ndarray
        A sample window used once to probe the composed kernel's output
        size and dtype.

    Returns
    -------
    (kernel_func, output_size, output_dtype)

    Raises
    ------
    ValueError
        If an entry is neither callable nor a recognised kernel name.
    """
    resolved_kernels = []
    for kernel in kernel_list:
        # Unpack (kernel, *params) specs; bare kernels get no params.
        if isinstance(kernel, (tuple, list)):
            kernel, *kernel_params = kernel
        else:
            kernel_params = ()

        if isinstance(kernel, str) and kernel in _SLIDING_WINDOW_FUNCTION_KERNELS:
            resolved_kernels.append(
                _SLIDING_WINDOW_FUNCTION_KERNELS[kernel](*kernel_params)
            )
        elif callable(kernel):
            # NOTE(review): kernel_params supplied alongside a raw callable
            # are silently ignored -- confirm this is intended.
            resolved_kernels.append(kernel)
        else:
            raise ValueError(f"Bad kernel {kernel} in kernel list")

    resolved_kernels = tuple(resolved_kernels)

    if len(resolved_kernels) == 1:
        # A single kernel needs no composition wrapper (and avoids an
        # extra layer of jit compilation).
        _kernel_func = resolved_kernels[0]
    else:

        @numba.njit(nogil=True)
        def _kernel_func(data):
            # Apply the kernels left-to-right, feeding each output forward.
            result = data
            for kernel in resolved_kernels:
                result = kernel(result)
            return result

    # Probe once so callers can preallocate output storage.
    kernel_output = _kernel_func(test_window)

    return _kernel_func, kernel_output.shape[0], kernel_output.dtype

def check_function_kernels(kernels):

if kernels is None or len(kernels) < 1:
return False

for kernel in kernels:
if callable(kernel):
return True

if type(kernel) in (tuple, list):
kernel, *kernel_params = kernel
else:
kernel_params = ()

if type(kernel) is str and kernel in _SLIDING_WINDOW_FUNCTION_KERNELS:
return True
else:
return False


def sliding_window_generator(
Expand Down Expand Up @@ -197,7 +232,7 @@ def sliding_window_generator(
else:
window_sample_ = np.asarray(window_sample, dtype=np.int32)

if any(callable(x) for x in kernels):
if check_function_kernels(kernels):
if test_window is None:
raise ValueError("Callable kernels need to also provide a test sequence to "
"determine kernel output size and type")
Expand Down Expand Up @@ -356,7 +391,9 @@ def fit(self, X, y=None, **fit_params):
"kernels must be None or a list or tuple of kernels to apply"
)

if self.kernels is not None and any(callable(x) for x in self.kernels):
use_function_kernels = check_function_kernels(self.kernels)

if use_function_kernels:
test_window = np.asarray(X[0])[:self.window_width][self.window_sample_]
(
self.kernel_,
Expand Down