Skip to content

Commit

Permalink
Removing Doc2Vec defaults so that it won't override Word2Vec defaults.
Browse files Browse the repository at this point in the history
…Fix #795 (#929)
  • Loading branch information
markroxor authored and tmylk committed Nov 22, 2016
1 parent 80e9c98 commit 14f12f4
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 13 deletions.
17 changes: 9 additions & 8 deletions gensim/models/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,9 +529,8 @@ def repeat(self, word_count):

class Doc2Vec(Word2Vec):
"""Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf"""
- def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
- max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001,
- dm=1, hs=1, negative=0, dbow_words=0, dm_mean=0, dm_concat=0, dm_tag_count=1,
+ def __init__(self, documents=None, dm_mean=None,
+ dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1,
docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **kwargs):
"""
Initialize the model from an iterable of `documents`. Each document is a
Expand Down Expand Up @@ -600,18 +599,20 @@ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
of the model.
"""

super(Doc2Vec, self).__init__(
- size=size, alpha=alpha, window=window, min_count=min_count, max_vocab_size=max_vocab_size,
- sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
- sg=(1+dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean,
+ sg=(1 + dm) % 2,
null_word=dm_concat, **kwargs)

+ if dm_mean is not None:
+ self.cbow_mean = dm_mean

self.dbow_words = dbow_words
self.dm_concat = dm_concat
self.dm_tag_count = dm_tag_count
if self.dm and self.dm_concat:
self.layer1_size = (self.dm_tag_count + (2 * self.window)) * self.vector_size
else:
self.layer1_size = size

self.docvecs = docvecs or DocvecsArray(docvecs_mapfile)
self.comment = comment
if documents is not None:
Expand Down
11 changes: 6 additions & 5 deletions gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def test_int_doctags(self):
model = doc2vec.Doc2Vec(min_count=1)
model.build_vocab(corpus)
self.assertEqual(len(model.docvecs.doctag_syn0), 300)
- self.assertEqual(model.docvecs[0].shape, (300,))
+ self.assertEqual(model.docvecs[0].shape, (100,))
self.assertRaises(KeyError, model.__getitem__, '_*0')

def test_missing_string_doctag(self):
Expand All @@ -109,9 +109,10 @@ def test_string_doctags(self):

model = doc2vec.Doc2Vec(min_count=1)
model.build_vocab(corpus)

self.assertEqual(len(model.docvecs.doctag_syn0), 300)
- self.assertEqual(model.docvecs[0].shape, (300,))
- self.assertEqual(model.docvecs['_*0'].shape, (300,))
+ self.assertEqual(model.docvecs[0].shape, (100,))
+ self.assertEqual(model.docvecs['_*0'].shape, (100,))
self.assertTrue(all(model.docvecs['_*0'] == model.docvecs[0]))
self.assertTrue(max(d.offset for d in model.docvecs.doctags.values()) < len(model.docvecs.doctags))
self.assertTrue(max(model.docvecs._int_index(str_key) for str_key in model.docvecs.doctags.keys()) < len(model.docvecs.doctag_syn0))
Expand Down Expand Up @@ -175,15 +176,15 @@ def model_sanity(self, model):
def test_training(self):
"""Test doc2vec training."""
corpus = DocsLeeCorpus()
- model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20)
+ model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20, workers=1)
model.build_vocab(corpus)
self.assertEqual(model.docvecs.doctag_syn0.shape, (300, 100))
model.train(corpus)

self.model_sanity(model)

# build vocab and train in one step; must be the same as above
- model2 = doc2vec.Doc2Vec(corpus, size=100, min_count=2, iter=20)
+ model2 = doc2vec.Doc2Vec(corpus, size=100, min_count=2, iter=20, workers=1)
self.models_equal(model, model2)

def test_dbow_hs(self):
Expand Down

0 comments on commit 14f12f4

Please sign in to comment.