diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
index fb73581c15..5ccb1d442c 100644
--- a/gensim/corpora/dictionary.py
+++ b/gensim/corpora/dictionary.py
@@ -213,7 +213,6 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
 
         # do the actual filtering, then rebuild dictionary to remove gaps in ids
         self.filter_tokens(good_ids=good_ids)
-        self.compactify()
         logger.info("resulting dictionary: %s" % self)
 
 
@@ -240,6 +239,7 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
             self.dfs = dict((tokenid, freq)
                             for tokenid, freq in iteritems(self.dfs)
                             if tokenid in good_ids)
+        self.compactify()
 
 
     def compactify(self):
diff --git a/gensim/test/test_corpora_dictionary.py b/gensim/test/test_corpora_dictionary.py
index a8c9cc0e5c..f47e354f95 100644
--- a/gensim/test/test_corpora_dictionary.py
+++ b/gensim/test/test_corpora_dictionary.py
@@ -101,12 +101,38 @@ def testBuild(self):
                     'system': 5, 'time': 6, 'trees': 9, 'user': 7}
         self.assertEqual(d.token2id, expected)
 
+    def testMerge(self):
+        d = Dictionary(self.texts)
+        f = Dictionary(self.texts[:3])
+        g = Dictionary(self.texts[3:])
+
+        f.merge_with(g)
+        self.assertEqual(sorted(d.token2id.keys()), sorted(f.token2id.keys()))
+
     def testFilter(self):
         d = Dictionary(self.texts)
         d.filter_extremes(no_below=2, no_above=1.0, keep_n=4)
         expected = {0: 3, 1: 3, 2: 3, 3: 3}
         self.assertEqual(d.dfs, expected)
 
+    def testFilterTokens(self):
+        self.maxDiff = 10000
+        d = Dictionary(self.texts)
+
+        removed_word = d[0]
+        d.filter_tokens([0])
+
+        expected = {'computer': 0, 'eps': 8, 'graph': 10, 'human': 1,
+                    'interface': 2, 'minors': 11, 'response': 3, 'survey': 4,
+                    'system': 5, 'time': 6, 'trees': 9, 'user': 7}
+        del expected[removed_word]
+        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))
+
+        expected[removed_word] = len(expected)
+        d.add_documents([[removed_word]])
+        self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))
+
+
     def test_doc2bow(self):
         d = Dictionary([["žluťoučký"], ["žluťoučký"]])