Skip to content
This repository has been archived by the owner on Jul 4, 2023. It is now read-only.

Commit

Permalink
Merge pull request #22 from floscha/spacy-encoder-language-arg
Browse files Browse the repository at this point in the history
SpacyEncoder language argument
  • Loading branch information
PetrochukM committed Apr 25, 2018
2 parents 7632a17 + fee9caf commit 049f534
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 14 deletions.
10 changes: 5 additions & 5 deletions build_tools/travis/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,16 @@ pip install spacy
pip install nltk

# SpaCy English model (the short-hand 'en' link resolves to the default
# English model; matches the `language` argument used by SpacyEncoder)
python -m spacy download en

# NLTK data needed for Moses tokenizer
python -m nltk.downloader perluniprops nonbreaking_prefixes

# Install PyTorch dependencies (CPU wheels pinned per Python version)
if [[ $TRAVIS_PYTHON_VERSION == '3.6' ]]; then
    pip install http://download.pytorch.org/whl/cpu/torch-0.3.1-cp36-cp36m-linux_x86_64.whl
fi
if [[ $TRAVIS_PYTHON_VERSION == '3.5' ]]; then
    pip install http://download.pytorch.org/whl/cpu/torch-0.3.1-cp35-cp35m-linux_x86_64.whl
fi
pip install torchvision
21 changes: 21 additions & 0 deletions tests/text_encoders/test_spacy_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,24 @@ def test_spacy_encoder():
encoder = SpacyEncoder([input_])
tokens = encoder.encode(input_)
assert encoder.decode(tokens) == input_


def test_spacy_encoder_not_installed_language():
    """A supported language whose model is not installed must raise ValueError.

    'fr' is in the encoder's supported list, so the failure comes from
    `spacy.load` (OSError), which the encoder re-raises as ValueError with an
    install hint.
    """
    error_message = ''
    try:
        SpacyEncoder([], language='fr')
    except ValueError as e:  # narrow catch: encoder raises ValueError here
        error_message = str(e)

    assert error_message.startswith("Language 'fr' not found.")


def test_spacy_encoder_unsupported_language():
    """A language outside the supported list must raise ValueError up front."""
    error_message = ''
    try:
        SpacyEncoder([], language='python')
    except ValueError as e:  # narrow catch: encoder raises ValueError here
        error_message = str(e)

    # Adjacent string literals concatenate at compile time; no '+' needed.
    assert error_message.startswith("No tokenizer available for language "
                                    "'python'.")
38 changes: 29 additions & 9 deletions torchnlp/text_encoders/spacy_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@


class SpacyEncoder(StaticTokenizerEncoder):
""" Encodes the text using the Spacy `en_core_web_sm` tokenizer.
""" Encodes the text using spaCy's tokenizer.
**Tokenizer Reference:**
https://spacy.io/api/tokenizer
Args:
sample (list of strings): Sample of data to build dictionary on
language (string, optional): Language to use for parsing. Accepted values
are 'en', 'de', 'es', 'pt', 'fr', 'it', 'nl' and 'xx'.
For details see https://spacy.io/models/#available-models
min_occurrences (int, optional): Minimum number of occurrences for a token to be added to
dictionary.
append_eos (bool, optional): If `True` append EOS token onto the end to the encoded vector.
Expand Down Expand Up @@ -36,15 +39,32 @@ def __init__(self, *args, **kwargs):

try:
import spacy
from spacy.lang.en import English

# Use the SpacyEncoder by downloading en_core_web_sm via:
# `python -m spacy download en_core_web_sm`
_MODEL = spacy.load('en_core_web_sm')
tokenizer = English().Defaults.create_tokenizer(_MODEL)
except ImportError:
print("Please install Spacy: "
"`pip install spacy` `python -m spacy download en_core_web_sm`")
print("Please install spaCy: "
"`pip install spacy`")
raise

# Use English as default when no language was specified
language = kwargs.get('language', 'en')

# All languages supported by spaCy can be found here:
# https://spacy.io/models/#available-models
supported_languages = ['en', 'de', 'es', 'pt', 'fr', 'it', 'nl', 'xx']

if language in supported_languages:
# Load the spaCy language model if it has been installed
try:
nlp = spacy.load(language)
except OSError:
raise ValueError(("Language '{0}' not found. Install using " +
"spaCy: `python -m spacy download {0}`"
).format(language))

from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
else:
raise ValueError(("No tokenizer available for language '%s'. " +
"Currently supported are %s")
% (language, supported_languages))

super().__init__(*args, tokenize=lambda s: [w.text for w in tokenizer(s)], **kwargs)

0 comments on commit 049f534

Please sign in to comment.