Skip to content
This repository has been archived by the owner on Jul 4, 2023. It is now read-only.

Commit

Permalink
Merge pull request #22 from floscha/spacy-encoder-language-arg
Browse files Browse the repository at this point in the history
SpacyEncoder language argument
  • Loading branch information
PetrochukM committed Apr 25, 2018
2 parents 7632a17 + fee9caf commit 049f534
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 14 deletions.
10 changes: 5 additions & 5 deletions build_tools/travis/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,16 @@ pip install spacy
pip install nltk

# SpaCy English model (the short-hand 'en' link resolves to the default
# English model; matches the `language` argument used by SpacyEncoder)
python -m spacy download en

# NLTK data needed for Moses tokenizer
python -m nltk.downloader perluniprops nonbreaking_prefixes

# Install PyTorch dependencies (CPU wheels pinned per Python version)
if [[ $TRAVIS_PYTHON_VERSION == '3.6' ]]; then
    pip install http://download.pytorch.org/whl/cpu/torch-0.3.1-cp36-cp36m-linux_x86_64.whl
fi
if [[ $TRAVIS_PYTHON_VERSION == '3.5' ]]; then
    pip install http://download.pytorch.org/whl/cpu/torch-0.3.1-cp35-cp35m-linux_x86_64.whl
fi
pip install torchvision
21 changes: 21 additions & 0 deletions tests/text_encoders/test_spacy_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,24 @@ def test_spacy_encoder():
encoder = SpacyEncoder([input_])
tokens = encoder.encode(input_)
assert encoder.decode(tokens) == input_


def test_spacy_encoder_not_installed_language():
    """A supported language whose model is not installed must raise ValueError.

    'fr' is in the encoder's supported list, so the failure comes from
    `spacy.load` (OSError), which the encoder re-raises as ValueError with an
    install hint.
    """
    error_message = ''
    try:
        SpacyEncoder([], language='fr')
    except ValueError as e:  # narrow catch: encoder raises ValueError here
        error_message = str(e)

    assert error_message.startswith("Language 'fr' not found.")


def test_spacy_encoder_unsupported_language():
    """A language outside the supported list must raise ValueError up front."""
    error_message = ''
    try:
        SpacyEncoder([], language='python')
    except ValueError as e:  # narrow catch: encoder raises ValueError here
        error_message = str(e)

    # Adjacent string literals concatenate at compile time; no '+' needed.
    assert error_message.startswith("No tokenizer available for language "
                                    "'python'.")
38 changes: 29 additions & 9 deletions torchnlp/text_encoders/spacy_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@


class SpacyEncoder(StaticTokenizerEncoder):
""" Encodes the text using the Spacy `en_core_web_sm` tokenizer.
""" Encodes the text using spaCy's tokenizer.
**Tokenizer Reference:**
https://spacy.io/api/tokenizer
Args:
sample (list of strings): Sample of data to build dictionary on
language (string, optional): Language to use for parsing. Accepted values
are 'en', 'de', 'es', 'pt', 'fr', 'it', 'nl' and 'xx'.
For details see https://spacy.io/models/#available-models
min_occurrences (int, optional): Minimum number of occurrences for a token to be added to
dictionary.
append_eos (bool, optional): If `True` append EOS token onto the end to the encoded vector.
Expand Down Expand Up @@ -36,15 +39,32 @@ def __init__(self, *args, **kwargs):

try:
import spacy
from spacy.lang.en import English

# Use the SpacyEncoder by downloading en_core_web_sm via:
# `python -m spacy download en_core_web_sm`
_MODEL = spacy.load('en_core_web_sm')
tokenizer = English().Defaults.create_tokenizer(_MODEL)
except ImportError:
print("Please install Spacy: "
"`pip install spacy` `python -m spacy download en_core_web_sm`")
print("Please install spaCy: "
"`pip install spacy`")
raise

# Use English as default when no language was specified
language = kwargs.get('language', 'en')

# All languages supported by spaCy can be found here:
# https://spacy.io/models/#available-models
supported_languages = ['en', 'de', 'es', 'pt', 'fr', 'it', 'nl', 'xx']

if language in supported_languages:
# Load the spaCy language model if it has been installed
try:
nlp = spacy.load(language)
except OSError:
raise ValueError(("Language '{0}' not found. Install using " +
"spaCy: `python -m spacy download {0}`"
).format(language))

from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
else:
raise ValueError(("No tokenizer available for language '%s'. " +
"Currently supported are %s")
% (language, supported_languages))

super().__init__(*args, tokenize=lambda s: [w.text for w in tokenizer(s)], **kwargs)

0 comments on commit 049f534

Please sign in to comment.