diff --git a/README.rst b/README.rst
index 88f04f3..97cb280 100644
--- a/README.rst
+++ b/README.rst
@@ -94,7 +94,7 @@ To manually install this package:
Help and Support
----------------
-This project is still young. The documentation is still growing. In the meantime please
+This project is still young. The `documentation `_ is still growing. In the meantime please
`open an issue `_
and we will try to provide any help and guidance that we can. Please also check
the docstrings on the code, which provide some descriptions of the parameters.
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
new file mode 100644
index 0000000..2edd822
--- /dev/null
+++ b/azure-pipelines.yml
@@ -0,0 +1,34 @@
+# Python package
+# Create and test a Python package on multiple Python versions.
+# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
+# https://docs.microsoft.com/azure/devops/pipelines/languages/python
+
+trigger:
+- master
+
+pool:
+ vmImage: ubuntu-latest
+strategy:
+ matrix:
+ Python37:
+ python.version: '3.7'
+ Python38:
+ python.version: '3.8'
+ Python39:
+ python.version: '3.9'
+
+steps:
+- task: UsePythonVersion@0
+ inputs:
+ versionSpec: '$(python.version)'
+ displayName: 'Use Python $(python.version)'
+
+- script: |
+ python -m pip install --upgrade pip
+ pip install -r requirements.txt
+ displayName: 'Install dependencies'
+
+- script: |
+ pip install pytest pytest-azurepipelines
+ pytest vectorizers/tests
+ displayName: 'pytest'
diff --git a/vectorizers/_window_kernels.py b/vectorizers/_window_kernels.py
index 9eea9ff..b4c9435 100644
--- a/vectorizers/_window_kernels.py
+++ b/vectorizers/_window_kernels.py
@@ -1,8 +1,7 @@
import numpy as np
import numba
-from vectorizers.utils import flatten
-
+EPSILON = 1e-8
# The window function
@@ -181,3 +180,73 @@ def gaussian_weight_kernel(n_cols, sigma, *kernel_params):
"weight": weight_kernel,
"gaussian_weight": gaussian_weight_kernel,
}
+
+# Copied from the SciPy implementation
+@numba.njit()
+def binom(n, k):
+ n = int(n)
+ k = int(k)
+
+ if k > n or n < 0 or k < 0:
+ return 0
+
+ m = n + 1
+ nterms = min(k, n - k)
+
+ numerator = 1
+ denominator = 1
+ for j in range(1, nterms + 1):
+ numerator *= m - j
+ denominator *= j
+
+ return numerator // denominator
+
+# A couple of changepoint based kernels that can be useful. The goal
+# is to detect changepoints in sequences of count or time interval
+# data (where the intervals are between events).
+#
+# We can model count data with Poisson distributions and interval data as inter-arrival
+# times (which we can convert to count-like data by taking reciprocals).
+#
+# Essentially we start with a baseline prior given by a gamma distribution,
+# and then update the prior with the data in the window up to, but not
+# including, the last element. The return value is then the predictive
+# posterior (a negative binomial) of observing the final element of
+# the window.
+
+def count_changepoint_kernel(alpha=1.0, beta=1):
+ @numba.njit()
+ def _kernel(window):
+ model_window = window[:-1]
+ observation = window[-1]
+ alpha_prime = alpha + model_window.sum()
+ beta_prime = beta + len(model_window)
+ nb_r = alpha_prime
+ nb_p = 1.0 / (1.0 + beta_prime)
+
+ prob = binom(observation + nb_r - 1, observation) * (1 - nb_p) ** nb_r * nb_p ** observation
+
+ return np.array([-np.log(prob)])
+
+ return _kernel
+
+def inter_arrival_changepoint_kernel(alpha=1.0, beta=1):
+ @numba.njit()
+ def _kernel(window):
+ model_window = 1.0 / (window[:-1] + EPSILON)
+ observation = 1.0 / (window[-1] + EPSILON)
+ alpha_prime = alpha + model_window.sum()
+ beta_prime = beta + len(model_window)
+ nb_r = alpha_prime
+ nb_p = 1.0 / (1.0 + beta_prime)
+
+ prob = binom(observation + nb_r - 1, observation) * (1 - nb_p) ** nb_r * nb_p ** observation
+
+ return np.array([-np.log(prob)])
+
+ return _kernel
+
+_SLIDING_WINDOW_FUNCTION_KERNELS = {
+ "count_changepoint" : count_changepoint_kernel,
+ "timespan_changepoint": inter_arrival_changepoint_kernel,
+}
\ No newline at end of file
diff --git a/vectorizers/tests/test_transformers.py b/vectorizers/tests/test_transformers.py
index 8267fd0..c82d1e9 100644
--- a/vectorizers/tests/test_transformers.py
+++ b/vectorizers/tests/test_transformers.py
@@ -37,6 +37,10 @@
np.random.random(size=44),
]
+changepoint_position = np.random.randint(11, 100) # changepoint position must be at least window_width into the sequence
+changepoint_sequence = np.random.poisson(0.75, size=100)
+changepoint_sequence[changepoint_position] = 10
+
@pytest.mark.parametrize("include_column_name", [True, False])
@pytest.mark.parametrize("unique_values", [True, False])
@@ -291,7 +295,7 @@ def test_sliding_window_transformer_basic(pad_width, kernel, sample):
("weight", np.array([0.1, 0.75, 1.5, 1.0, 0.25])),
("gaussian_weight", 2),
np.random.random((5, 5)),
- numba.njit(lambda x: x.cumsum()),
+ numba.njit(lambda x: x.cumsum(), cache=True),
],
)
@pytest.mark.parametrize("sample", [None, np.arange(5), [4, 1, 3, 2, 0]])
@@ -320,6 +324,13 @@ def test_sliding_window_generator_matches_transformer(pad_width, kernel, sample)
for j, point in enumerate(point_cloud):
assert np.allclose(point, generator_result[i][j])
+@pytest.mark.parametrize("window_width", [5, 10])
+def test_sliding_window_count_changepoint(window_width):
+ swt = SlidingWindowTransformer(
+ window_width=window_width, kernels=[("count_changepoint", 1.0, 2.0)],
+ )
+ changepoint_scores = swt.fit_transform([changepoint_sequence])[0].flatten()
+ assert np.argmax(changepoint_scores) + window_width - 1 == changepoint_position
@pytest.mark.parametrize("pad_width", [0, 1])
@pytest.mark.parametrize(
diff --git a/vectorizers/transformers/sliding_windows.py b/vectorizers/transformers/sliding_windows.py
index 18d4db4..a1a89fd 100644
--- a/vectorizers/transformers/sliding_windows.py
+++ b/vectorizers/transformers/sliding_windows.py
@@ -3,7 +3,7 @@
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
-from vectorizers._window_kernels import _SLIDING_WINDOW_KERNELS
+from vectorizers._window_kernels import _SLIDING_WINDOW_KERNELS, _SLIDING_WINDOW_FUNCTION_KERNELS
@numba.njit(nogil=True)
@@ -93,19 +93,54 @@ def _kernel_func(data):
def build_callable_kernel(kernel_list, test_window):
- tuple_of_kernels = tuple(kernel_list)
+ tuple_of_kernels = []
+ for kernel in kernel_list:
+ if type(kernel) in (tuple, list):
+ kernel, *kernel_params = kernel
+ else:
+ kernel_params = ()
- @numba.njit(nogil=True)
- def _kernel_func(data):
- result = data
- for kernel in tuple_of_kernels:
- result = kernel(result)
- return result
+ if type(kernel) == str and kernel in _SLIDING_WINDOW_FUNCTION_KERNELS:
+ tuple_of_kernels.append(_SLIDING_WINDOW_FUNCTION_KERNELS[kernel](*kernel_params))
+ elif callable(kernel):
+ tuple_of_kernels.append(kernel)
+ else:
+ raise ValueError(f"Bad kernel {kernel} in kernel list")
+
+ tuple_of_kernels = tuple(tuple_of_kernels)
+
+ if len(tuple_of_kernels) == 1:
+ _kernel_func = tuple_of_kernels[0]
+ else:
+ @numba.njit(nogil=True)
+ def _kernel_func(data):
+ result = data
+ for kernel in tuple_of_kernels:
+ result = kernel(result)
+ return result
kernel_output = _kernel_func(test_window)
return _kernel_func, kernel_output.shape[0], kernel_output.dtype
+def check_function_kernels(kernels):
+
+ if kernels is None or len(kernels) < 1:
+ return False
+
+ for kernel in kernels:
+ if callable(kernel):
+ return True
+
+ if type(kernel) in (tuple, list):
+ kernel, *kernel_params = kernel
+ else:
+ kernel_params = ()
+
+ if type(kernel) is str and kernel in _SLIDING_WINDOW_FUNCTION_KERNELS:
+ return True
+ else:
+ return False
def sliding_window_generator(
@@ -197,7 +232,7 @@ def sliding_window_generator(
else:
window_sample_ = np.asarray(window_sample, dtype=np.int32)
- if any(callable(x) for x in kernels):
+ if check_function_kernels(kernels):
if test_window is None:
raise ValueError("Callable kernels need to also provide a test sequence to "
"determine kernel output size and type")
@@ -356,7 +391,9 @@ def fit(self, X, y=None, **fit_params):
"kernels must be None or a list or tuple of kernels to apply"
)
- if self.kernels is not None and any(callable(x) for x in self.kernels):
+ use_function_kernels = check_function_kernels(self.kernels)
+
+ if use_function_kernels:
test_window = np.asarray(X[0])[:self.window_width][self.window_sample_]
(
self.kernel_,