Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add named function kernels for sliding windows #86

Merged
merged 5 commits into from Nov 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.rst
Expand Up @@ -94,7 +94,7 @@ To manually install this package:
Help and Support
----------------

This project is still young. The documentation is still growing. In the meantime please
This project is still young. The `documentation <https://vectorizers.readthedocs.io/en/latest/>`_ is still growing. In the meantime please
`open an issue <https://github.com/TutteInstitute/vectorizers/issues/new>`_
and we will try to provide any help and guidance that we can. Please also check
the docstrings on the code, which provide some descriptions of the parameters.
Expand Down
34 changes: 34 additions & 0 deletions azure-pipelines.yml
@@ -0,0 +1,34 @@
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python

# Run CI on every push to master.
trigger:
- master

# Single Ubuntu agent pool; the version matrix below fans the job out.
pool:
vmImage: ubuntu-latest
strategy:
matrix:
Python37:
python.version: '3.7'
Python38:
python.version: '3.8'
Python39:
python.version: '3.9'

steps:
# Select the interpreter for this matrix entry.
- task: UsePythonVersion@0
inputs:
versionSpec: '$(python.version)'
displayName: 'Use Python $(python.version)'

# Install the package's runtime dependencies.
- script: |
python -m pip install --upgrade pip
pip install -r requirements.txt
displayName: 'Install dependencies'

# Run the test suite with Azure-friendly pytest reporting.
- script: |
pip install pytest pytest-azurepipelines
pytest vectorizers/tests
displayName: 'pytest'
73 changes: 71 additions & 2 deletions vectorizers/_window_kernels.py
@@ -1,8 +1,7 @@
import numpy as np
import numba

from vectorizers.utils import flatten

EPSILON = 1e-8

# The window function

Expand Down Expand Up @@ -181,3 +180,73 @@ def gaussian_weight_kernel(n_cols, sigma, *kernel_params):
"weight": weight_kernel,
"gaussian_weight": gaussian_weight_kernel,
}

# Copied from the SciPy implementation
@numba.njit()
def binom(n, k):
    """Exact binomial coefficient C(n, k) via integer arithmetic.

    Arguments are truncated to ints so float-valued window entries can be
    passed straight in. Returns 0 when k > n or either argument is negative.
    """
    n = int(n)
    k = int(k)

    # Out-of-range combinations contribute nothing.
    if k > n or n < 0 or k < 0:
        return 0

    # Use the symmetry C(n, k) == C(n, n - k) to minimize the term count.
    terms = min(k, n - k)
    top = n + 1

    num = 1
    den = 1
    for i in range(1, terms + 1):
        num *= top - i
        den *= i

    # The product of any `terms` consecutive integers is divisible by
    # terms!, so this floor division is exact.
    return num // den

# A couple of changepoint based kernels that can be useful. The goal
# is to detect changepoints in sequences of count or time-interval
# data (where the intervals are between events).
#
# We can model count data with Poisson distributions, and interval data as
# inter-arrival times (which we can convert to count-like data by taking
# reciprocals).
#
# Essentially we start with a baseline prior given by a gamma distribution,
# and then update the prior with the data in the window up to, but not
# including, the last element. The return value is then the predictive
# posterior (a negative binomial) of observing the final element of
# the window.

def count_changepoint_kernel(alpha=1.0, beta=1):
    """Build a Gamma-Poisson changepoint kernel for count data.

    The leading window elements update a Gamma(alpha, beta) prior on the
    Poisson rate; the returned kernel yields the negative log of the
    negative-binomial predictive probability of the window's final element,
    so larger scores flag more surprising (changepoint-like) observations.

    Parameters
    ----------
    alpha: float (default 1.0)
        Shape parameter of the gamma prior.
    beta: float (default 1)
        Rate parameter of the gamma prior.

    Returns
    -------
    A numba-jitted kernel mapping a 1d count window to a length-1 array.
    """

    @numba.njit()
    def _kernel(window):
        history = window[:-1]
        latest = window[-1]
        # Posterior gamma parameters after observing the history.
        post_alpha = alpha + history.sum()
        post_beta = beta + len(history)
        # Negative-binomial predictive distribution parameters.
        nb_r = post_alpha
        nb_p = 1.0 / (1.0 + post_beta)

        prob = (
            binom(latest + nb_r - 1, latest)
            * (1 - nb_p) ** nb_r
            * nb_p ** latest
        )

        return np.array([-np.log(prob)])

    return _kernel

def inter_arrival_changepoint_kernel(alpha=1.0, beta=1):
    """Build a changepoint kernel for inter-arrival (timespan) data.

    Timespans are converted to count-like rates by taking reciprocals
    (EPSILON guards against division by zero); thereafter the computation
    matches count_changepoint_kernel: the window's history updates a
    Gamma(alpha, beta) prior and the kernel returns the negative log of the
    negative-binomial predictive probability of the final element.

    Parameters
    ----------
    alpha: float (default 1.0)
        Shape parameter of the gamma prior.
    beta: float (default 1)
        Rate parameter of the gamma prior.

    Returns
    -------
    A numba-jitted kernel mapping a 1d timespan window to a length-1 array.
    """

    @numba.njit()
    def _kernel(window):
        # Reciprocals turn inter-arrival times into rate-like counts.
        history = 1.0 / (window[:-1] + EPSILON)
        latest = 1.0 / (window[-1] + EPSILON)
        post_alpha = alpha + history.sum()
        post_beta = beta + len(history)
        nb_r = post_alpha
        nb_p = 1.0 / (1.0 + post_beta)

        prob = (
            binom(latest + nb_r - 1, latest)
            * (1 - nb_p) ** nb_r
            * nb_p ** latest
        )

        return np.array([-np.log(prob)])

    return _kernel

# Registry mapping public kernel names to their factory functions; consumed
# by the sliding-window transformer when resolving named function kernels.
_SLIDING_WINDOW_FUNCTION_KERNELS = {
    "count_changepoint" : count_changepoint_kernel,
    "timespan_changepoint": inter_arrival_changepoint_kernel,
}
13 changes: 12 additions & 1 deletion vectorizers/tests/test_transformers.py
Expand Up @@ -37,6 +37,10 @@
np.random.random(size=44),
]

# Synthetic changepoint fixture: a low-rate Poisson count sequence with a
# single injected spike (value 10) at a random position.
changepoint_position = np.random.randint(11, 100) # changepoint position must be at least window_width into the sequence
changepoint_sequence = np.random.poisson(0.75, size=100)
changepoint_sequence[changepoint_position] = 10


@pytest.mark.parametrize("include_column_name", [True, False])
@pytest.mark.parametrize("unique_values", [True, False])
Expand Down Expand Up @@ -291,7 +295,7 @@ def test_sliding_window_transformer_basic(pad_width, kernel, sample):
("weight", np.array([0.1, 0.75, 1.5, 1.0, 0.25])),
("gaussian_weight", 2),
np.random.random((5, 5)),
numba.njit(lambda x: x.cumsum()),
numba.njit(lambda x: x.cumsum(), cache=True),
],
)
@pytest.mark.parametrize("sample", [None, np.arange(5), [4, 1, 3, 2, 0]])
Expand Down Expand Up @@ -320,6 +324,13 @@ def test_sliding_window_generator_matches_transformer(pad_width, kernel, sample)
for j, point in enumerate(point_cloud):
assert np.allclose(point, generator_result[i][j])

@pytest.mark.parametrize("window_width", [5, 10])
def test_sliding_window_count_changepoint(window_width):
    """The count_changepoint kernel should score highest on the window
    whose final element is the injected spike."""
    transformer = SlidingWindowTransformer(
        window_width=window_width,
        kernels=[("count_changepoint", 1.0, 2.0)],
    )
    scores = transformer.fit_transform([changepoint_sequence])[0].flatten()
    # Window i ends at sequence index i + window_width - 1.
    assert np.argmax(scores) + window_width - 1 == changepoint_position

@pytest.mark.parametrize("pad_width", [0, 1])
@pytest.mark.parametrize(
Expand Down
57 changes: 47 additions & 10 deletions vectorizers/transformers/sliding_windows.py
Expand Up @@ -3,7 +3,7 @@
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from vectorizers._window_kernels import _SLIDING_WINDOW_KERNELS
from vectorizers._window_kernels import _SLIDING_WINDOW_KERNELS, _SLIDING_WINDOW_FUNCTION_KERNELS


@numba.njit(nogil=True)
Expand Down Expand Up @@ -93,19 +93,54 @@ def _kernel_func(data):

def build_callable_kernel(kernel_list, test_window):
    """Resolve a list of function-kernel specs into one composed callable.

    Parameters
    ----------
    kernel_list: sequence
        Each entry is either a callable kernel, a name registered in
        _SLIDING_WINDOW_FUNCTION_KERNELS, or a tuple/list of
        (name_or_callable, *params) where params are forwarded to the
        named kernel's factory.
    test_window: ndarray
        A sample window used once to probe the composed kernel's output
        size and dtype.

    Returns
    -------
    (kernel_func, output_size, output_dtype)

    Raises
    ------
    ValueError
        If an entry is neither callable nor a recognised kernel name.
    """
    resolved_kernels = []
    for kernel in kernel_list:
        # Unpack (kernel, *params) specs; bare kernels get no params.
        if isinstance(kernel, (tuple, list)):
            kernel, *kernel_params = kernel
        else:
            kernel_params = ()

        if isinstance(kernel, str) and kernel in _SLIDING_WINDOW_FUNCTION_KERNELS:
            resolved_kernels.append(
                _SLIDING_WINDOW_FUNCTION_KERNELS[kernel](*kernel_params)
            )
        elif callable(kernel):
            # NOTE(review): kernel_params supplied alongside a raw callable
            # are silently ignored -- confirm this is intended.
            resolved_kernels.append(kernel)
        else:
            raise ValueError(f"Bad kernel {kernel} in kernel list")

    resolved_kernels = tuple(resolved_kernels)

    if len(resolved_kernels) == 1:
        # A single kernel needs no composition wrapper (and avoids an
        # extra layer of jit compilation).
        _kernel_func = resolved_kernels[0]
    else:

        @numba.njit(nogil=True)
        def _kernel_func(data):
            # Apply the kernels left-to-right, feeding each output forward.
            result = data
            for kernel in resolved_kernels:
                result = kernel(result)
            return result

    # Probe once so callers can preallocate output storage.
    kernel_output = _kernel_func(test_window)

    return _kernel_func, kernel_output.shape[0], kernel_output.dtype

def check_function_kernels(kernels):

if kernels is None or len(kernels) < 1:
return False

for kernel in kernels:
if callable(kernel):
return True

if type(kernel) in (tuple, list):
kernel, *kernel_params = kernel
else:
kernel_params = ()

if type(kernel) is str and kernel in _SLIDING_WINDOW_FUNCTION_KERNELS:
return True
else:
return False


def sliding_window_generator(
Expand Down Expand Up @@ -197,7 +232,7 @@ def sliding_window_generator(
else:
window_sample_ = np.asarray(window_sample, dtype=np.int32)

if any(callable(x) for x in kernels):
if check_function_kernels(kernels):
if test_window is None:
raise ValueError("Callable kernels need to also provide a test sequence to "
"determine kernel output size and type")
Expand Down Expand Up @@ -356,7 +391,9 @@ def fit(self, X, y=None, **fit_params):
"kernels must be None or a list or tuple of kernels to apply"
)

if self.kernels is not None and any(callable(x) for x in self.kernels):
use_function_kernels = check_function_kernels(self.kernels)

if use_function_kernels:
test_window = np.asarray(X[0])[:self.window_width][self.window_sample_]
(
self.kernel_,
Expand Down