Skip to content

Commit

Permalink
Add script to create spaCy unigram paraphrase model
Browse files: browse the repository at this point in the history
  • Loading branch information
Matthew Honnibal committed Mar 8, 2016
1 parent 58b898d commit 4af143f
Showing 1 changed file with 2 additions and 5 deletions.
7 changes: 2 additions & 5 deletions create_spacy_paraphraser.py
@@ -39,8 +39,6 @@ def main(params):
input_train_json = json.load(open(params['input_train_json'], 'r'))
print("Load spaCy with GloVe vectors")
nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
-nlp = spacy.en.English(data_dir=params['spacy_data'],
-parser=False, tagger=False, entity=False)
words_to_keep = build_vocab(
nlp.tokenizer,
[img['question'] for img in input_train_json],
@@ -73,11 +71,10 @@ def main(params):
if not os.path.exists(os.path.join(params['spacy_data'], 'tokenizer')):
os.mkdir(os.path.join(params['spacy_data'], 'tokenizer'))


nlp.vocab.dump(os.path.join(params['spacy_data'], 'vocab', 'lexemes.bin'))
-with io.open(os.path.join(params['spacy_data'], 'vocab', 'strings.json'),
+with io.open(os.path.join(params['spacy_data'], 'vocab', 'strings.json'), 'w',
 encoding='utf8') as file_:
-spacy.vocab.strings.dump(file_)
+nlp.vocab.strings.dump(file_)


if __name__ == '__main__':
Expand Down

0 comments on commit 4af143f

Please sign in to comment.