Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Numericalizer abc #227

Open
wants to merge 28 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f82aee7
WIP: DatasetABC
ivansmokovic Oct 26, 2020
7dde35a
WIP: DatasetABC
ivansmokovic Oct 27, 2020
eced615
updated Dataset
ivansmokovic Oct 27, 2020
09a7b5e
updated ArrowDataset
ivansmokovic Oct 27, 2020
4f9d055
updated iterator.py
ivansmokovic Oct 27, 2020
9884c06
flake8 fixes
ivansmokovic Oct 27, 2020
73f711a
getattr -> dict notation
ivansmokovic Oct 28, 2020
7395f93
fixed datasetABC __repr__
ivansmokovic Oct 29, 2020
ed11aad
_get -> __getitem__
ivansmokovic Oct 29, 2020
09d401d
Small changes
ivansmokovic Oct 29, 2020
44bfbcd
made ._fields a tuple
ivansmokovic Nov 3, 2020
3b9c4aa
Added shuffled
ivansmokovic Nov 3, 2020
15be987
removed as_dataset
ivansmokovic Nov 3, 2020
7fcec4d
Added type hint to shuffled
ivansmokovic Nov 3, 2020
cb0e447
Added type hint overloads to __getitem__
ivansmokovic Nov 3, 2020
aa192b3
black correction
ivansmokovic Nov 3, 2020
23234dd
WIP: NumericalizerABC prototype
ivansmokovic Nov 5, 2020
07ec682
style changes
ivansmokovic Nov 5, 2020
bca8b63
Merge remote-tracking branch 'origin/master' into numericalizer-abc
ivansmokovic Dec 9, 2020
65906ec
Merged master
ivansmokovic Dec 9, 2020
125a015
minor style changes
ivansmokovic Dec 9, 2020
7a26a04
TODO docs
ivansmokovic Dec 9, 2020
5668253
_finalize -> mark_finalized
ivansmokovic Dec 10, 2020
f110c5f
black
ivansmokovic Dec 10, 2020
e28a777
docs
ivansmokovic Dec 10, 2020
affa494
Merge remote-tracking branch 'origin/master' into numericalizer-abc
ivansmokovic Dec 10, 2020
dac186c
added error message
ivansmokovic Dec 10, 2020
56beaea
black compliance
ivansmokovic Dec 10, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion podium/datasets/dataset_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def finalize_fields(self, *datasets: "DatasetABC") -> None:
for example in dataset:
for field in fields_to_build:
_, tokenized = example[field.name]
field.update_vocab(tokenized)
field.update_numericalizer(tokenized)

for field in self.fields:
field.finalize()
Expand Down
1 change: 1 addition & 0 deletions podium/preproc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
TextCleanUp,
)
from .lemmatizer import CroatianLemmatizer
from .numericalizer_abc import NumericalizerABC
from .sentencizers import SpacySentencizer
from .stemmer import CroatianStemmer
from .tokenizers import get_tokenizer
93 changes: 93 additions & 0 deletions podium/preproc/numericalizer_abc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from abc import ABC, abstractmethod
from typing import List

import numpy as np


class NumericalizerABC(ABC):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would rename to Numericalizer, to follow naming conventions in collections.abc

"""ABC that contains the interface for Podium numericalizers. Numericalizers are used
to transform tokens into vectors or any other custom datatype during batching.

Attributes
----------
finalized: bool
Whether this numericalizer was finalized and is able to be used for
numericalization.
"""

def __init__(self, eager=True):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Type hint for eager is missing (eager: bool = True)

"""Initialises the Numericalizer.

Parameters
----------
eager: bool
Whether the Numericalizer is to be updated during loading of the dataset, or
after all data is loaded.

"""
self._finalized = False
self._eager = eager

@abstractmethod
def numericalize(self, tokens: List[str]) -> np.ndarray:
    """Convert ``tokens`` into the numericalized format used in batches.

    The result is most often a numpy vector, but concrete implementations
    may produce any custom datatype.

    Parameters
    ----------
    tokens: List[str]
        Tokens of a single data point. May also be any other datatype,
        provided this Numericalizer supports it.

    Returns
    -------
    The numericalization used in batches; most often a numpy vector, but
    any custom datatype is supported.
    """
    pass

def finalize(self):
    """Finalizes the Numericalizer and prepares it for numericalization.

    The base implementation only marks the Numericalizer as finalized.
    Subclasses that require work before numericalization (e.g. building a
    vocabulary) must override this method and call ``mark_finalized`` after
    successful completion.
    """
    # `mark_finalized` flips the `finalized` flag so numericalization can
    # proceed. (Removed the dead `pass` statement that followed it, and fixed
    # the docstring typo `mark_finalize` -> `mark_finalized`.)
    self.mark_finalized()

def update(self, tokens: List[str]) -> None:
    """Update this Numericalizer with a single data point.

    The base implementation is a no-op. Numericalizers that must be updated
    example-by-example override this method. Eager numericalizers are updated
    while the dataset loads; non-eager ones are updated after loading
    finishes, once every eager numericalizer has been fully updated.

    Parameters
    ----------
    tokens: List[str]
        Tokens of a single data point. May also be any other datatype,
        provided this Numericalizer supports it.
    """
    pass

def mark_finalized(self) -> None:
    """Marks the Numericalizer as finalized. This method must be called after
    finalization completes successfully."""
    # Sole writer of the `_finalized` flag exposed by the `finalized` property.
    self._finalized = True

Comment on lines +74 to +78
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Honestly, not a huge fan of mark_finalized, but let's see what @mttk and @FilipBolt have to say.

Copy link
Collaborator

@FilipBolt FilipBolt Dec 11, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd stay with finalize as the method name to keep things consistent and make the internal variable: is_finalized. Mark finalized feels redundant.

@property
def finalized(self) -> bool:
    """Whether this Numericalizer was finalized and is ready for numericalization."""
    # False until `mark_finalized` is called (set in __init__).
    return self._finalized

@property
def eager(self) -> bool:
    """Whether this Numericalizer is eager. Numericalizers that
    are eager get updated during the dataset loading process, while non-eager ones get
    updated after loading is finished, after all eager numericalizers were fully
    updated."""
    # Read-only view of the `eager` constructor argument; not written elsewhere
    # in this class.
    return self._eager

def __call__(self, tokens: List[str]) -> np.ndarray:
    """Alias for `numericalize`: lets the Numericalizer be used as a callable."""
    return self.numericalize(tokens)
72 changes: 49 additions & 23 deletions podium/storage/field.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@

import numpy as np

from podium.preproc import NumericalizerABC
from podium.preproc.tokenizers import get_tokenizer
from podium.storage.vocab import Vocab


PretokenizationHookType = Callable[[Any], Any]
PosttokenizationHookType = Callable[[Any, List[str]], Tuple[Any, List[str]]]
TokenizerType = Optional[Union[str, Callable[[Any], List[str]]]]
NumericalizerType = Callable[[str], Union[int, float]]
NumericalizerCallableType = Callable[[str], Union[int, float]]
NumericalizerType = Union[NumericalizerABC, NumericalizerCallableType]


class PretokenizationPipeline:
Expand Down Expand Up @@ -205,6 +207,16 @@ def remove_pretokenize_hooks(self):
self._pretokenization_pipeline.clear()


class NumericalizerCallableWrapper(NumericalizerABC):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NumericalizerCallableWrapper seems too verbose. Plus, since this class is used internally, it makes sense to prefix its name with an underscore.

def __init__(self, numericalizer: NumericalizerType):
    """Wraps a per-token numericalizer callable in the NumericalizerABC interface.

    Parameters
    ----------
    numericalizer: NumericalizerType
        Callable applied to every token during numericalization.
        NOTE(review): in practice this is always a plain callable (Field only
        wraps non-Vocab callables), so `NumericalizerCallableType` may be the
        more precise annotation — confirm with callers.
    """
    # Plain callables need no per-example updates, so the wrapper is eager.
    super().__init__(eager=True)
    self._wrapped_numericalizer = numericalizer

def numericalize(self, tokens: List[str]) -> np.ndarray:
    """Numericalize `tokens` by applying the wrapped callable to each token."""
    per_token = map(self._wrapped_numericalizer, tokens)
    return np.array(list(per_token))


class Field:
"""Holds the preprocessing and numericalization logic for a single
field of a dataset.
Expand Down Expand Up @@ -321,12 +333,16 @@ def __init__(
else:
self._tokenizer = get_tokenizer(tokenizer)

if isinstance(numericalizer, Vocab):
self._vocab = numericalizer
self._numericalizer = self.vocab.__getitem__
else:
self._vocab = None
if isinstance(numericalizer, NumericalizerABC) or numericalizer is None:
self._numericalizer = numericalizer
elif isinstance(numericalizer, Callable):
self._numericalizer = NumericalizerCallableWrapper(numericalizer)
else:
err_msg = (
f"Field {name}: unsupported numericalizer type "
f'"{type(numericalizer).__name__}"'
)
raise TypeError(err_msg)
Comment on lines +341 to +345
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would put this error message directly inside the TypeError.


self._keep_raw = keep_raw

Expand Down Expand Up @@ -385,12 +401,20 @@ def eager(self):
whether this field has a Vocab and whether that Vocab is
marked as eager
"""
return self.vocab is not None and self.vocab.eager
# Pretend to be eager if no numericalizer provided
return self._numericalizer is None or self._numericalizer.eager

@property
def vocab(self):
""""""
return self._vocab
if not self.use_vocab:
numericalizer_type = type(self._numericalizer).__name__
err_msg = (
f'Field "{self.name}" has no vocab, numericalizer type is '
f"{numericalizer_type}."
)
raise TypeError(err_msg)
return self._numericalizer

@property
def use_vocab(self):
Expand All @@ -402,7 +426,7 @@ def use_vocab(self):
Whether the field uses a vocab or not.
"""

return self.vocab is not None
return isinstance(self._numericalizer, Vocab)

@property
def is_target(self):
Expand Down Expand Up @@ -547,6 +571,7 @@ def preprocess(

# Preprocess the raw input
# TODO keep unprocessed or processed raw?
# Keeping processed for now, may change in the future
processed_raw = self._run_pretokenization_hooks(data)
tokenized = (
self._tokenizer(processed_raw)
Expand All @@ -556,7 +581,7 @@ def preprocess(

return (self._process_tokens(processed_raw, tokenized),)

def update_vocab(self, tokenized: List[str]):
def update_numericalizer(self, tokenized: Union[str, List[str]]) -> None:
"""Updates the vocab with a data point in its tokenized form.
If the field does not do tokenization,

Expand All @@ -567,11 +592,11 @@ def update_vocab(self, tokenized: List[str]):
updated with.
"""

if not self.use_vocab:
if self._numericalizer is None:
return # TODO throw Error?

data = tokenized if isinstance(tokenized, (list, tuple)) else (tokenized,)
self._vocab += data
self._numericalizer.update(data)

@property
def finalized(self) -> bool:
Expand All @@ -584,13 +609,13 @@ def finalized(self) -> bool:
Whether the field's Vocab vas finalized. If the field has no
vocab, returns True.
"""
return True if self.vocab is None else self.vocab.finalized
return self._numericalizer is None or self._numericalizer.finalized

def finalize(self):
"""Signals that this field's vocab can be built."""

if self.use_vocab:
self.vocab.finalize()
if self._numericalizer is not None:
self._numericalizer.finalize()

def _process_tokens(
self, raw: Any, tokens: Union[Any, List[str]]
Expand All @@ -616,8 +641,12 @@ def _process_tokens(
raw, tokenized = self._run_posttokenization_hooks(raw, tokens)
raw = raw if self._keep_raw else None

if self.eager and not self.vocab.finalized:
self.update_vocab(tokenized)
if (
self.eager
and self._numericalizer is not None
and not self._numericalizer.finalized
):
self.update_numericalizer(tokenized)
return self.name, (raw, tokenized)

def get_default_value(self) -> Union[int, float]:
Expand Down Expand Up @@ -679,10 +708,7 @@ def numericalize(

tokens = tokenized if isinstance(tokenized, (list, tuple)) else [tokenized]

if self.use_vocab:
return self.vocab.numericalize(tokens)
else:
return np.array([self._numericalizer(t) for t in tokens])
return self._numericalizer.numericalize(tokens)

def _pad_to_length(
self,
Expand Down Expand Up @@ -1030,16 +1056,16 @@ def __init__(

def finalize(self):
"""Signals that this field's vocab can be built."""
super().finalize()
if self._num_of_classes is None:
self.fixed_length = self._num_of_classes = len(self.vocab)
self._fixed_length = self._num_of_classes = len(self.vocab)

if self.use_vocab and len(self.vocab) > self._num_of_classes:
raise ValueError(
"Number of classes in data is greater than the declared number "
f"of classes. Declared: {self._num_of_classes}, "
f"Actual: {len(self.vocab)}"
)
super().finalize()

def numericalize(
self, data: Tuple[Optional[Any], Optional[Union[Any, List[str]]]]
Expand Down
2 changes: 1 addition & 1 deletion podium/storage/vectorizers/tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def fit(self, dataset, field):
ValueError
If the vocab or fields vocab are None
"""
if self._vocab is None and (field is None or field.vocab is None):
if self._vocab is None and (field is None or not field.use_vocab):
raise ValueError(
"Vocab is not defined. User should define vocab in constructor "
"or by providing field with a non-empty vocab property."
Expand Down
14 changes: 9 additions & 5 deletions podium/storage/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
from collections import Counter
from enum import Enum
from itertools import chain
from typing import Iterable, Union
from typing import Iterable, List, Union

import numpy as np

from podium.preproc import NumericalizerABC


def unique(values: Iterable):
"""Generator that iterates over the first occurrence of every value in values,
Expand Down Expand Up @@ -60,7 +62,7 @@ class SpecialVocabSymbols(Enum):
PAD = "<pad>"


class Vocab:
class Vocab(NumericalizerABC):
"""Class for storing vocabulary. It supports frequency counting and size
limiting.

Expand Down Expand Up @@ -97,6 +99,7 @@ def __init__(
if true word frequencies will be saved for later use on
the finalization
"""
super(Vocab, self).__init__(eager)
self._freqs = Counter()
self._keep_freqs = keep_freqs
self._min_freq = min_freq
Expand All @@ -112,8 +115,6 @@ def __init__(
self.stoi.update({k: v for v, k in enumerate(self.itos)})

self._max_size = max_size
self.eager = eager
self.finalized = False # flag to know if we're ready to numericalize

@staticmethod
def _init_default_unk_index(specials):
Expand Down Expand Up @@ -194,6 +195,9 @@ def padding_index(self):
raise ValueError("Padding symbol is not in the vocabulary.")
return self.stoi[SpecialVocabSymbols.PAD]

def update(self, tokens: List[str]) -> None:
    """Updates this Vocab with the tokens of a single data point (delegates to `+=`)."""
    self.__iadd__(tokens)

def __iadd__(self, values: Union["Vocab", Iterable]):
"""Adds additional values or another Vocab to this Vocab.

Expand Down Expand Up @@ -375,7 +379,7 @@ def finalize(self):

if not self._keep_freqs:
self._freqs = None # release memory
self.finalized = True
self.mark_finalized()

def numericalize(self, data):
"""Method numericalizes given tokens.
Expand Down
12 changes: 7 additions & 5 deletions tests/arrow/test_pyarrow_tabular_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,21 +139,23 @@ def test_dump_and_load(pyarrow_dataset):
def test_finalize_fields(data, fields, mocker):
for field in fields:
mocker.spy(field, "finalize")
mocker.spy(field, "update_vocab")
mocker.spy(field, "update_numericalizer")

dataset = pyarrow_dataset(data, fields)

for f in fields:
# before finalization, no field's dict was updated
if f.vocab is not None:
assert not f.finalized
if f._numericalizer is not None:
assert not f._numericalizer.finalized

dataset.finalize_fields()

fields_to_finalize = [f for f in fields if not f.eager and f.use_vocab]
fields_to_finalize = [
f for f in fields if not f.eager and f._numericalizer is not None
]
for f in fields_to_finalize:
# during finalization, only non-eager field's dict should be updated
assert f.update_vocab.call_count == (len(data) if (not f.eager) else 0)
assert f.update_numericalizer.call_count == (len(data) if (not f.eager) else 0)
f.finalize.assert_called_once()
# all fields should be finalized
assert f.finalized
Expand Down
2 changes: 1 addition & 1 deletion tests/storage/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def preprocess(self, data):

return ((self.name, (raw, tokenized)),)

def update_vocab(self, tokenized):
def update_numericalizer(self, tokenized):
assert not self.eager
self.updated_count += 1

Expand Down
Loading