improved wikipedia docs&comments

1 parent 5b35b34, commit 553f1cd4440cc5bd1dacdc2e2c72af403257379a, piskvorky committed Mar 2, 2012
Showing with 13 additions and 4 deletions.
  1. +6 −1 gensim/corpora/wikicorpus.py
  2. +7 −3 gensim/utils.py
gensim/corpora/wikicorpus.py (7 changed lines)
@@ -21,7 +21,10 @@
disk space; gensim's corpus iterators can work with compressed input, too.
`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
-removing all tokens that appear in more than 10 percent documents). Defaults to 100,000.
+removing tokens that appear in more than 10%% of all documents). Defaults to 50,000.
+
+If you have the `pattern` package installed, this script will use a fancy lemmatization
+to get a lemma of each token (instead of plain alphabetic tokenizer).
Example: ./wikicorpus.py ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en
"""
@@ -215,6 +218,8 @@ def get_texts(self, return_raw=False):
yield result
if LEMMATIZE:
+ logger.info("all %i articles read; waiting for lemmatizer to finish the %i remaining jobs" %
+ (articles, articles - yielded))
while yielded < articles:
_, result = lemmatizer.read()
positions += len(result)
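
The logging added above fires once the article stream is exhausted, while the asynchronous lemmatizer workers still hold unfinished jobs; the final articles are only yielded after that point. A hedged usage sketch that drives this produce/consume loop (the dump path is a placeholder, and the constructor is assumed to take just that path):

    from gensim.corpora.wikicorpus import WikiCorpus

    # iterating get_texts() feeds articles to the lemmatizer pool and
    # yields results as they complete
    wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')
    for tokens in wiki.get_texts():
        pass  # replace with downstream processing
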
gensim/utils.py (10 changed lines)
@@ -653,9 +653,13 @@ def has_results(self):
return not self.qout.empty()
def __del__(self):
- for prc in self.prcs:
- prc.terminate()
- logger.info("terminated %i lemmatizer processes" % self.num_workers)
+ try:
+ for prc in self.prcs:
+ prc.terminate()
+ logger.info("terminated %i lemmatizer processes" % self.num_workers)
+ except:
+ # ignore errors at interpreter tear-down
+ pass
lemmatizer = Lemmatizer()
#endif HAS_PATTERN
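
Why the guard matters: `__del__` can run during interpreter shutdown, when module globals (including logging and multiprocessing internals) may already have been cleared, so even `prc.terminate()` or the logging call can raise. A minimal standalone sketch of the same pattern; the class and function names here are illustrative, not gensim's:

    import multiprocessing
    import time

    def _idle():
        time.sleep(60)  # stand-in for a real worker loop

    class Pool(object):
        def __init__(self, num_workers=2):
            self.prcs = [multiprocessing.Process(target=_idle) for _ in range(num_workers)]
            for prc in self.prcs:
                prc.daemon = True
                prc.start()

        def __del__(self):
            try:
                for prc in self.prcs:
                    prc.terminate()
            except:  # bare except, mirroring the commit: at tear-down even
                pass  # name lookups inside this block may fail

    if __name__ == '__main__':
        pool = Pool()
        # on exit, __del__ runs during tear-down; without the try/except
        # a spurious traceback could be printed even though nothing is wrong
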
