This repository has been archived by the owner on Jul 4, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 258
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #42 from benjamin-work/feature/remove-lambdas-for-pickle

remove lambdas for pickle
- Loading branch information
Showing
27 changed files
with
307 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,36 @@ | ||
import pickle | ||
|
||
import pytest | ||
|
||
from torchnlp.text_encoders import CharacterEncoder | ||
from torchnlp.text_encoders import UNKNOWN_TOKEN | ||
from torchnlp.text_encoders.reserved_tokens import RESERVED_ITOS | ||
|
||
|
||
def test_character_encoder(): | ||
sample = ['The quick brown fox jumps over the lazy dog'] | ||
encoder = CharacterEncoder(sample) | ||
@pytest.fixture
def sample():
    """A single-sentence corpus used to build the character encoder."""
    return ['The quick brown fox jumps over the lazy dog']
|
||
|
||
@pytest.fixture
def encoder(sample):
    """A ``CharacterEncoder`` fitted on the ``sample`` corpus."""
    return CharacterEncoder(sample)
|
||
|
||
def test_character_encoder(encoder, sample):
    """Encode/decode round-trip: vocab size, length, and unknown-token mapping."""
    input_ = 'english-language pangram'
    output = encoder.encode(input_)
    # Vocab holds each distinct character of the corpus plus the reserved tokens.
    assert encoder.vocab_size == len(set(list(sample[0]))) + len(RESERVED_ITOS)
    # One encoded symbol per input character.
    assert len(output) == len(input_)
    # '-' never appeared in the corpus, so it decodes to the unknown token.
    assert encoder.decode(output) == input_.replace('-', UNKNOWN_TOKEN)
|
||
|
||
def test_character_encoder_min_occurrences(): | ||
sample = ['The quick brown fox jumps over the lazy dog'] | ||
def test_character_encoder_min_occurrences(sample):
    """With a high ``min_occurrences`` cutoff, every character is out-of-vocab."""
    encoder = CharacterEncoder(sample, min_occurrences=10)
    input_ = 'English-language pangram'
    output = encoder.encode(input_)
    # No character occurs 10 times, so the whole string decodes to unknowns.
    assert encoder.decode(output) == ''.join([UNKNOWN_TOKEN] * len(input_))
|
||
|
||
def test_is_pickleable(encoder):
    """The encoder must serialize cleanly (no lambdas in its state)."""
    pickle.dumps(encoder)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,23 @@ | ||
import pickle | ||
|
||
import pytest | ||
|
||
from torchnlp.text_encoders import DelimiterEncoder | ||
from torchnlp.text_encoders import UNKNOWN_TOKEN | ||
from torchnlp.text_encoders import EOS_TOKEN | ||
|
||
|
||
def test_delimiter_encoder(): | ||
@pytest.fixture
def encoder():
    """A ``DelimiterEncoder`` splitting on '/' with EOS appended."""
    sample = ['people/deceased_person/place_of_death', 'symbols/name_source/namesakes']
    return DelimiterEncoder('/', sample, append_eos=True)
|
||
|
||
def test_delimiter_encoder(encoder):
    """Known segments survive the round-trip; unseen ones become unknown tokens."""
    input_ = 'symbols/namesake/named_after'
    output = encoder.encode(input_)
    # Only 'symbols' is in-vocab; the EOS token is appended on decode.
    assert encoder.decode(output) == '/'.join(['symbols', UNKNOWN_TOKEN, UNKNOWN_TOKEN]) + EOS_TOKEN
|
||
|
||
def test_is_pickleable(encoder):
    """The delimiter encoder must serialize cleanly with pickle."""
    pickle.dumps(encoder)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import pickle | ||
|
||
import pytest | ||
|
||
from torchnlp.text_encoders import StaticTokenizerEncoder | ||
|
||
|
||
@pytest.fixture
def input_():
    """The sentence used both to fit the encoder and as test input."""
    return 'This is a sentence'
|
||
|
||
@pytest.fixture
def encoder(input_):
    """A ``StaticTokenizerEncoder`` fitted on the single fixture sentence."""
    return StaticTokenizerEncoder([input_])
|
||
|
||
def test_static_tokenizer_encoder(encoder, input_):
    """Encoding then decoding the training sentence is lossless."""
    tokens = encoder.encode(input_)
    assert encoder.decode(tokens) == input_
|
||
|
||
def test_is_pickleable(encoder):
    """The static tokenizer encoder must serialize cleanly with pickle."""
    pickle.dumps(encoder)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,31 +1,37 @@ | ||
import unittest | ||
import pickle | ||
|
||
import pytest | ||
|
||
from torchnlp.text_encoders import SubwordEncoder | ||
from torchnlp.text_encoders import EOS_INDEX | ||
|
||
|
||
class TestSubwordEncoder:
    """Tests for ``SubwordEncoder``: vocab sizing, round-trips, EOS, pickling."""

    @pytest.fixture(scope='module')
    def corpus(self):
        """Small mixed corpus (quotes, blanks, author names) for fitting."""
        return [
            "One morning I shot an elephant in my pajamas. How he got in my pajamas, I don't",
            'know.', '', 'Groucho Marx',
            "I haven't slept for 10 days... because that would be too long.", '', 'Mitch Hedberg'
        ]

    @pytest.fixture
    def encoder(self, corpus):
        """A ``SubwordEncoder`` fitted on ``corpus`` with a bounded vocab."""
        return SubwordEncoder(corpus, target_vocab_size=86, min_occurrences=2, max_occurrences=6)

    def test_build_vocab_target_size(self, encoder):
        # NOTE: `target_vocab_size` is approximate; therefore, it may not be exactly the target size
        assert len(encoder.vocab) == 86

    def test_encode(self, encoder):
        """Out-of-alphabet text still round-trips through encode/decode."""
        input_ = 'This has UPPER CASE letters that are out of alphabet'
        assert encoder.decode(encoder.encode(input_)) == input_

    def test_eos(self, corpus):
        """With ``append_eos=True`` the last id of any encoding is EOS."""
        encoder = SubwordEncoder(corpus, append_eos=True)
        input_ = 'This is a sentence'
        assert encoder.encode(input_)[-1] == EOS_INDEX

    def test_is_pickleable(self, encoder):
        """The subword encoder must serialize cleanly with pickle."""
        pickle.dumps(encoder)
Oops, something went wrong.