Permalink
Browse files

Fix spaCy preprocessing option

  • Loading branch information...
1 parent f59e881 · commit cb4c2c3fdf5a78e6e9ad75a30b2140e1c3c58d5c · Matthew Honnibal committed Mar 8, 2016
Showing 1 changed file with 3 additions and 3 deletions.
  1. +3 −3 prepro.py
View
@@ -17,8 +17,7 @@
import h5py
from nltk.tokenize import word_tokenize
import json
-from spacy.tokenizer import Tokenizer as SpacyTokenizer
-from spacy.vocab import Vocab as SpacyVocab
+import spacy.en
import re
@@ -256,7 +255,8 @@ def main(params):
parser.add_argument('--max_length', default=26, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.')
parser.add_argument('--word_count_threshold', default=0, type=int, help='only words that occur more than this number of times will be put in vocab')
parser.add_argument('--num_test', default=0, type=int, help='number of test images (to withold until very very end)')
- parser.add_argument('--token_method', default='nltk', help='token method, nltk is much more slower.')
+ parser.add_argument('--token_method', default='nltk', help='token method. set "spacy" for unigram paraphrasing')
+ parser.add_argument('--spacy_data', default='spacy_data', help='location of spacy NLP model')
parser.add_argument('--batch_size', default=10, type=int)

0 comments on commit cb4c2c3

Please sign in to comment.