diff --git a/README.rst b/README.rst index 88f04f3..97cb280 100644 --- a/README.rst +++ b/README.rst @@ -94,7 +94,7 @@ To manually install this package: Help and Support ---------------- -This project is still young. The documentation is still growing. In the meantime please +This project is still young. The `documentation `_ is still growing. In the meantime please `open an issue `_ and we will try to provide any help and guidance that we can. Please also check the docstrings on the code, which provide some descriptions of the parameters. diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 0000000..2edd822 --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,34 @@ +# Python package +# Create and test a Python package on multiple Python versions. +# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/python + +trigger: +- master + +pool: + vmImage: ubuntu-latest +strategy: + matrix: + Python37: + python.version: '3.7' + Python38: + python.version: '3.8' + Python39: + python.version: '3.9' + +steps: +- task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + displayName: 'Use Python $(python.version)' + +- script: | + python -m pip install --upgrade pip + pip install -r requirements.txt + displayName: 'Install dependencies' + +- script: | + pip install pytest pytest-azurepipelines + pytest vectorizers/tests + displayName: 'pytest' diff --git a/vectorizers/_window_kernels.py b/vectorizers/_window_kernels.py index 9eea9ff..b4c9435 100644 --- a/vectorizers/_window_kernels.py +++ b/vectorizers/_window_kernels.py @@ -1,8 +1,7 @@ import numpy as np import numba -from vectorizers.utils import flatten - +EPSILON = 1e-8 # The window function @@ -181,3 +180,73 @@ def gaussian_weight_kernel(n_cols, sigma, *kernel_params): "weight": weight_kernel, "gaussian_weight": gaussian_weight_kernel, } + +# Copied from the 
SciPy implementation +@numba.njit() +def binom(n, k): + n = int(n) + k = int(k) + + if k > n or n < 0 or k < 0: + return 0 + + m = n + 1 + nterms = min(k, n - k) + + numerator = 1 + denominator = 1 + for j in range(1, nterms + 1): + numerator *= m - j + denominator *= j + + return numerator // denominator + +# A couple of changepoint-based kernels that can be useful. The goal +# is to detect changepoints in sequences of count or time interval +# data (where the intervals are between events). +# +# We can model count data with Poisson distributions and interval data as inter-arrival +# times (which we can convert to count-like data by taking reciprocals). +# +# Essentially we start with a baseline prior given by a gamma distribution, +# and then update the prior with the data in the window up to, but not +# including, the last element. The return value is then the predictive +# posterior (a negative binomial) of observing the final element of +# the window. + +def count_changepoint_kernel(alpha=1.0, beta=1): + @numba.njit() + def _kernel(window): + model_window = window[:-1] + observation = window[-1] + alpha_prime = alpha + model_window.sum() + beta_prime = beta + len(model_window) + nb_r = alpha_prime + nb_p = 1.0 / (1.0 + beta_prime) + + prob = binom(observation + nb_r - 1, observation) * (1 - nb_p) ** nb_r * nb_p ** observation + + return np.array([-np.log(prob)]) + + return _kernel + +def inter_arrival_changepoint_kernel(alpha=1.0, beta=1): + @numba.njit() + def _kernel(window): + model_window = 1.0 / (window[:-1] + EPSILON) + observation = 1.0 / (window[-1] + EPSILON) + alpha_prime = alpha + model_window.sum() + beta_prime = beta + len(model_window) + nb_r = alpha_prime + nb_p = 1.0 / (1.0 + beta_prime) + + prob = binom(observation + nb_r - 1, observation) * (1 - nb_p) ** nb_r * nb_p ** observation + + return np.array([-np.log(prob)]) + + return _kernel + +_SLIDING_WINDOW_FUNCTION_KERNELS = { + "count_changepoint" : count_changepoint_kernel, + "timespan_changepoint": 
inter_arrival_changepoint_kernel, +} \ No newline at end of file diff --git a/vectorizers/tests/test_transformers.py b/vectorizers/tests/test_transformers.py index 8267fd0..c82d1e9 100644 --- a/vectorizers/tests/test_transformers.py +++ b/vectorizers/tests/test_transformers.py @@ -37,6 +37,10 @@ np.random.random(size=44), ] +changepoint_position = np.random.randint(11, 100) # changepoint position must be at least window_width in +changepoint_sequence = np.random.poisson(0.75, size=100) +changepoint_sequence[changepoint_position] = 10 + @pytest.mark.parametrize("include_column_name", [True, False]) @pytest.mark.parametrize("unique_values", [True, False]) @@ -291,7 +295,7 @@ def test_sliding_window_transformer_basic(pad_width, kernel, sample): ("weight", np.array([0.1, 0.75, 1.5, 1.0, 0.25])), ("gaussian_weight", 2), np.random.random((5, 5)), - numba.njit(lambda x: x.cumsum()), + numba.njit(lambda x: x.cumsum(), cache=True), ], ) @pytest.mark.parametrize("sample", [None, np.arange(5), [4, 1, 3, 2, 0]]) @@ -320,6 +324,13 @@ def test_sliding_window_generator_matches_transformer(pad_width, kernel, sample) for j, point in enumerate(point_cloud): assert np.allclose(point, generator_result[i][j]) +@pytest.mark.parametrize("window_width", [5, 10]) +def test_sliding_window_count_changepoint(window_width): + swt = SlidingWindowTransformer( + window_width=window_width, kernels=[("count_changepoint", 1.0, 2.0)], + ) + changepoint_scores = swt.fit_transform([changepoint_sequence])[0].flatten() + assert np.argmax(changepoint_scores) + window_width - 1 == changepoint_position @pytest.mark.parametrize("pad_width", [0, 1]) @pytest.mark.parametrize( diff --git a/vectorizers/transformers/sliding_windows.py b/vectorizers/transformers/sliding_windows.py index 18d4db4..a1a89fd 100644 --- a/vectorizers/transformers/sliding_windows.py +++ b/vectorizers/transformers/sliding_windows.py @@ -3,7 +3,7 @@ from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation 
import check_is_fitted -from vectorizers._window_kernels import _SLIDING_WINDOW_KERNELS +from vectorizers._window_kernels import _SLIDING_WINDOW_KERNELS, _SLIDING_WINDOW_FUNCTION_KERNELS @numba.njit(nogil=True) @@ -93,19 +93,54 @@ def _kernel_func(data): def build_callable_kernel(kernel_list, test_window): - tuple_of_kernels = tuple(kernel_list) + tuple_of_kernels = [] + for kernel in kernel_list: + if type(kernel) in (tuple, list): + kernel, *kernel_params = kernel + else: + kernel_params = () - @numba.njit(nogil=True) - def _kernel_func(data): - result = data - for kernel in tuple_of_kernels: - result = kernel(result) - return result + if type(kernel) == str and kernel in _SLIDING_WINDOW_FUNCTION_KERNELS: + tuple_of_kernels.append(_SLIDING_WINDOW_FUNCTION_KERNELS[kernel](*kernel_params)) + elif callable(kernel): + tuple_of_kernels.append(kernel) + else: + raise ValueError(f"Bad kernel {kernel} in kernel list") + + tuple_of_kernels = tuple(tuple_of_kernels) + + if len(tuple_of_kernels) == 1: + _kernel_func = tuple_of_kernels[0] + else: + @numba.njit(nogil=True) + def _kernel_func(data): + result = data + for kernel in tuple_of_kernels: + result = kernel(result) + return result kernel_output = _kernel_func(test_window) return _kernel_func, kernel_output.shape[0], kernel_output.dtype +def check_function_kernels(kernels): + + if kernels is None or len(kernels) < 1: + return False + + for kernel in kernels: + if callable(kernel): + return True + + if type(kernel) in (tuple, list): + kernel, *kernel_params = kernel + else: + kernel_params = () + + if type(kernel) is str and kernel in _SLIDING_WINDOW_FUNCTION_KERNELS: + return True + else: + return False def sliding_window_generator( @@ -197,7 +232,7 @@ def sliding_window_generator( else: window_sample_ = np.asarray(window_sample, dtype=np.int32) - if any(callable(x) for x in kernels): + if check_function_kernels(kernels): if test_window is None: raise ValueError("Callable kernels need to also provide a test sequence to 
" "determine kernel output size and type") @@ -356,7 +391,9 @@ def fit(self, X, y=None, **fit_params): "kernels must be None or a list or tuple of kernels to apply" ) - if self.kernels is not None and any(callable(x) for x in self.kernels): + use_function_kernels = check_function_kernels(self.kernels) + + if use_function_kernels: test_window = np.asarray(X[0])[:self.window_width][self.window_sample_] ( self.kernel_,