
Merge remote branch 'quesada/master' into dedan

Conflicts:
	.gitignore
	CHANGELOG.txt
	MANIFEST.in
	README.txt
	TODO.txt
	docs/_sources/apiref.txt
	docs/_sources/dist_lda.txt
	docs/_sources/dist_lsi.txt
	docs/_sources/distributed.txt
	docs/_sources/index.txt
	docs/_sources/install.txt
	docs/_sources/intro.txt
	docs/_sources/models/models.txt
	docs/_sources/tut1.txt
	docs/_sources/tut2.txt
	docs/_sources/tut3.txt
	docs/_sources/tutorial.txt
	docs/_sources/utils.txt
	docs/_sources/wiki.txt
	docs/_static/default.css
	docs/apiref.html
	docs/corpora/bleicorpus.html
	docs/corpora/corpora.html
	docs/corpora/dictionary.html
	docs/corpora/dmlcorpus.html
	docs/corpora/lowcorpus.html
	docs/corpora/mmcorpus.html
	docs/corpora/svmlightcorpus.html
	docs/corpora/wikicorpus.html
	docs/dist_lda.html
	docs/dist_lsi.html
	docs/distributed.html
	docs/genindex.html
	docs/index.html
	docs/install.html
	docs/interfaces.html
	docs/intro.html
	docs/matutils.html
	docs/models/lda_dispatcher.html
	docs/models/lda_worker.html
	docs/models/ldamodel.html
	docs/models/lsi_dispatcher.html
	docs/models/lsi_worker.html
	docs/models/lsimodel.html
	docs/models/models.html
	docs/models/rpmodel.html
	docs/models/tfidfmodel.html
	docs/modindex.html
	docs/objects.inv
	docs/py-modindex.html
	docs/search.html
	docs/searchindex.js
	docs/similarities/docsim.html
	docs/src/_static/default.css
	docs/src/_templates/indexsidebar.html
	docs/src/_templates/layout.html
	docs/src/apiref.rst
	docs/src/conf.py
	docs/src/dist_lda.rst
	docs/src/dist_lsi.rst
	docs/src/distributed.rst
	docs/src/index.rst
	docs/src/install.rst
	docs/src/intro.rst
	docs/src/models/models.rst
	docs/src/tut1.rst
	docs/src/tut2.rst
	docs/src/tut3.rst
	docs/src/tutorial.rst
	docs/src/utils.rst
	docs/src/wiki.rst
	docs/tut1.html
	docs/tut2.html
	docs/tut3.html
	docs/tutorial.html
	docs/utils.html
	docs/wiki.html
	ez_setup.py
	setup.py
	src/gensim/__init__.py
	src/gensim/corpora/__init__.py
	src/gensim/corpora/bleicorpus.py
	src/gensim/corpora/dictionary.py
	src/gensim/corpora/dmlcorpus.py
	src/gensim/corpora/lowcorpus.py
	src/gensim/corpora/mmcorpus.py
	src/gensim/corpora/sources.py
	src/gensim/corpora/svmlightcorpus.py
	src/gensim/corpora/wikicorpus.py
	src/gensim/dmlcz/gensim_build.py
	src/gensim/dmlcz/gensim_genmodel.py
	src/gensim/dmlcz/gensim_xml.py
	src/gensim/dmlcz/runall.sh
	src/gensim/interfaces.py
	src/gensim/matutils.py
	src/gensim/models/__init__.py
	src/gensim/models/lda_dispatcher.py
	src/gensim/models/lda_worker.py
	src/gensim/models/ldamodel.py
	src/gensim/models/lsi_dispatcher.py
	src/gensim/models/lsi_worker.py
	src/gensim/models/lsimodel.py
	src/gensim/models/rpmodel.py
	src/gensim/models/tfidfmodel.py
	src/gensim/similarities/docsim.py
	src/gensim/test/test_corpora.py
	src/gensim/test/test_models.py
	src/gensim/test/testcorpus.low
	src/gensim/test/testcorpus.mm
	src/gensim/test/testcorpus.svmlight
	src/gensim/utils.py
2 parents 74c3aa1 + 5e150aa commit d96f1003e11d2906c4cb798ed6040b5532cd3be0 @piskvorky committed Mar 12, 2011
Showing with 33,099 additions and 47 deletions.
  1. +7 −0 .gitignore
  2. +10 −0 MANIFEST.in
  3. +60 −0 README.git.txt
  4. +8 −0 parsing/__init__.py
  5. +375 −0 parsing/porter.py
  6. +96 −0 parsing/preprocessing.py
  7. +175 −0 parsing/tfidf.py
  8. +16 −16 src/gensim/corpora/dictionary.py
  9. BIN src/gensim/corpora/head500.noblanks.cor.bz2
  10. +2 −0 src/gensim/corpora/tst.svmlight
  11. BIN src/gensim/corpora/tst.svmlight.index.db
  12. +168 −0 src/gensim/corpora/wikiExternalParsingCorpus.py
  13. +163 −0 src/gensim/dmlcz/geteval.py
  14. +1 −0 src/gensim/dmlcz/geteval_topicintrusion.py
  15. +163 −0 src/gensim/dmlcz/geteval_topicintrusion.py.old
  16. +1 −0 src/gensim/dmlcz/geteval_wordintrusion.py
  17. +163 −0 src/gensim/dmlcz/geteval_wordintrusion.py.old
  18. +330 −0 src/gensim/models/cossim_compare.py
  19. +54 −0 src/gensim/nosy.py
  20. +8 −0 src/gensim/parsing/__init__.py
  21. +375 −0 src/gensim/parsing/porter.py
  22. +110 −0 src/gensim/parsing/preprocessing.py
  23. +175 −0 src/gensim/parsing/tfidf.py
  24. +1 −0 src/gensim/results
  25. BIN src/gensim/test/head500.noblanks.cor.bz2
  26. +5 −0 src/gensim/test/miIslita.cor
  27. +0 −31 src/gensim/test/test_corpora.py
  28. +110 −0 src/gensim/test/test_corpora_dictionary.py
  29. +120 −0 src/gensim/test/test_corpora_textcorpus.py
  30. +250 −0 src/gensim/test/test_data/head500.noblanks.cor
  31. BIN src/gensim/test/test_data/head500.noblanks.cor_tfidf.model
  32. +29,722 −0 src/gensim/test/test_data/head500.noblanks.cor_wordids.txt
  33. +6 −0 src/gensim/test/test_data/para2para_text1.txt
  34. +7 −0 src/gensim/test/test_data/para2para_text2.txt
  35. +80 −0 src/gensim/test/test_miislita.py
  36. +153 −0 src/gensim/test/test_parsing.py
  37. +103 −0 src/gensim/test/test_utils.py
  38. +8 −0 src/gensim/test/testcorpus.low
  39. +33 −0 src/gensim/test/testcorpus.mm
  40. +13 −0 src/gensim/test/testcorpus.svmlight
  41. +28 −0 src/gensim/utils.py
7 .gitignore
@@ -41,3 +41,10 @@ Thumbs.db
.pydevproject
.settings/
docs/src/_build/
+gensim.egg-info
+*,cover
+.idea
+*.dict
+*.index
+.coverage
+data
10 MANIFEST.in
@@ -1,7 +1,17 @@
+<<<<<<< HEAD
recursive-include docs *
recursive-include src/gensim/test testcorpus*
recursive-include src *.sh
prune docs/src*
include COPYING
include COPYING.LESSER
include ez_setup.py
+=======
+recursive-include docs *
+recursive-include src/gensim/test testcorpus*
+recursive-include src *.sh
+prune docs/src*
+include COPYING
+include COPYING.LESSER
+include ez_setup.py
+>>>>>>> quesada/master
60 README.git.txt
@@ -0,0 +1,60 @@
+This is my working version of gensim. I keep it synchronized with the upstream
+svn repository at assembla.
+I have added some functional tests and utility functions to it. But the main
+reason I'm using the library is to replicate Explicit Semantic Analysis (ESA;
+Gabrilovich & Markovitch, 2006, 2007b, 2009).
+
+For other implementations try:
+C#: http://www.srcco.de/v/wikipedia-esa
+java: the Airhead Research library. However, the lack of sparse matrix support
+in java linear algebra libraries makes java a poor choice.
+
+Currently (as of 27 Aug 2010), gensim can parse wikipedia from xml wiki dumps quite efficiently.
+However, our ESA code uses a different parsing that we produced earlier
+(following the method section of the paper).
+
+Here we use a parsing from March 2008.
+
+Our parsings have three advantages:
+1- They consider centrality measures, and this is not currently easy to do with
+ the xml dumps directly
+2-
+3- We did an unsupervised named entity recognition (NER) parsing using openNLP.
+This is parallelized on 8 cores using java code, see ri.larkc.eu:8087/tools.
+We could have used
+
+NOTE:
+Because the example corpora are big, the repository ignores the data folder. Our
+parsing is available online at: (TODO)
+Download it and place it under: (TODO)
+
+folder structure:
+
+/acme
+ contains my working scripts
+
+/data/corpora
+ contains corpora.
+
+/parsing
+ tfidf/preprocessing/porter in /parsing adapted from Mathieu Blondel:
+ git clone http://www.mblondel.org/code/tfidf.git
+
+how to replicate the paper
+--------------------------
+The code is in /acme/lee-wiki
+
+First you need to create the tfidf space.
+There's a flag for this: set createCorpus = True.
+The corpus creation takes about 1 hr, with profuse logging.
+This is faster than parsing the corpus from xml (about 16 hrs) because we do not
+do any xml filtering, stopword removal etc (it's already done on the .cor file).
+
+Once the sparse matrix is on disk, it's faster to read the serialized objects than to
+parse the corpus again.
+
+References:
+------------
+E. Gabrilovich and S. Markovitch (2009). "Wikipedia-based Semantic Interpretation
+for Natural Language Processing", Journal of Artificial Intelligence Research,
+Volume 34, pages 443-498. doi:10.1613/jair.2669
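
To illustrate the "read the serialized objects" step above: once the tfidf space has been serialized, gensim's Matrix Market reader can load it back directly. A minimal sketch; the path below is hypothetical and stands in for the (TODO) locations above:

    from gensim.corpora import MmCorpus

    # loading the previously serialized sparse matrix is much faster than
    # re-creating the tfidf space from the raw .cor file
    mm = MmCorpus('/data/corpora/wiki_tfidf.mm')  # hypothetical output path
    for docno, doc in enumerate(mm):
        pass  # each doc is a sparse list of (termId, weight) 2-tuples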
8 parsing/__init__.py
@@ -0,0 +1,8 @@
+"""
+This package contains functions to preprocess raw text
+"""
+
+# bring model classes directly into package namespace, to save some typing
+from porter import PorterStemmer
+from tfidf import tfidf
+from preprocessing import *
375 parsing/porter.py
@@ -0,0 +1,375 @@
+#!/usr/bin/env python
+
+"""Porter Stemming Algorithm
+This is the Porter stemming algorithm, ported to Python from the
+version coded up in ANSI C by the author. It may be be regarded
+as canonical, in that it follows the algorithm presented in
+
+Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+no. 3, pp 130-137,
+
+only differing from it at the points marked --DEPARTURE-- below.
+
+See also http://www.tartarus.org/~martin/PorterStemmer
+
+The algorithm as described in the paper could be exactly replicated
+by adjusting the points of DEPARTURE, but this is barely necessary,
+because (a) the points of DEPARTURE are definitely improvements, and
+(b) no encoding of the Porter stemmer I have seen is anything like
+as exact as this version, even with the points of DEPARTURE!
+
+Vivake Gupta (v@nano.com)
+
+Release 1: January 2001
+
+Further adjustments by Santiago Bruno (bananabruno@gmail.com)
+to allow word input not restricted to one word per line, leading
+to:
+
+release 2: July 2008
+"""
+
+import sys
+
+class PorterStemmer:
+
+ def __init__(self):
+ """The main part of the stemming algorithm starts here.
+ b is a buffer holding a word to be stemmed. The letters are in b[k0],
+ b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
+ readjusted downwards as the stemming progresses. Zero termination is
+ not in fact used in the algorithm.
+
+ Note that only lower case sequences are stemmed. Forcing to lower case
+ should be done before stem(...) is called.
+ """
+
+ self.b = "" # buffer for word to be stemmed
+ self.k = 0
+ self.k0 = 0
+ self.j = 0 # j is a general offset into the string
+
+ def cons(self, i):
+ """cons(i) is TRUE <=> b[i] is a consonant."""
+ if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
+ return 0
+ if self.b[i] == 'y':
+ if i == self.k0:
+ return 1
+ else:
+ return (not self.cons(i - 1))
+ return 1
+
+ def m(self):
+ """m() measures the number of consonant sequences between k0 and j.
+ if c is a consonant sequence and v a vowel sequence, and <..>
+ indicates arbitrary presence,
+
+ <c><v> gives 0
+ <c>vc<v> gives 1
+ <c>vcvc<v> gives 2
+ <c>vcvcvc<v> gives 3
+ ....
+ """
+ n = 0
+ i = self.k0
+ while 1:
+ if i > self.j:
+ return n
+ if not self.cons(i):
+ break
+ i = i + 1
+ i = i + 1
+ while 1:
+ while 1:
+ if i > self.j:
+ return n
+ if self.cons(i):
+ break
+ i = i + 1
+ i = i + 1
+ n = n + 1
+ while 1:
+ if i > self.j:
+ return n
+ if not self.cons(i):
+ break
+ i = i + 1
+ i = i + 1
+
+ def vowelinstem(self):
+ """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
+ for i in range(self.k0, self.j + 1):
+ if not self.cons(i):
+ return 1
+ return 0
+
+ def doublec(self, j):
+ """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
+ if j < (self.k0 + 1):
+ return 0
+ if (self.b[j] != self.b[j-1]):
+ return 0
+ return self.cons(j)
+
+ def cvc(self, i):
+ """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
+ and also if the second c is not w,x or y. this is used when trying to
+ restore an e at the end of a short word, e.g.
+
+ cav(e), lov(e), hop(e), crim(e), but
+ snow, box, tray.
+ """
+ if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
+ return 0
+ ch = self.b[i]
+ if ch == 'w' or ch == 'x' or ch == 'y':
+ return 0
+ return 1
+
+ def ends(self, s):
+ """ends(s) is TRUE <=> k0,...k ends with the string s."""
+ length = len(s)
+ if s[length - 1] != self.b[self.k]: # tiny speed-up
+ return 0
+ if length > (self.k - self.k0 + 1):
+ return 0
+ if self.b[self.k-length+1:self.k+1] != s:
+ return 0
+ self.j = self.k - length
+ return 1
+
+ def setto(self, s):
+ """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
+ length = len(s)
+ self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
+ self.k = self.j + length
+
+ def r(self, s):
+ """r(s) is used further down."""
+ if self.m() > 0:
+ self.setto(s)
+
+ def step1ab(self):
+ """step1ab() gets rid of plurals and -ed or -ing. e.g.
+
+ caresses -> caress
+ ponies -> poni
+ ties -> ti
+ caress -> caress
+ cats -> cat
+
+ feed -> feed
+ agreed -> agree
+ disabled -> disable
+
+ matting -> mat
+ mating -> mate
+ meeting -> meet
+ milling -> mill
+ messing -> mess
+
+ meetings -> meet
+ """
+ if self.b[self.k] == 's':
+ if self.ends("sses"):
+ self.k = self.k - 2
+ elif self.ends("ies"):
+ self.setto("i")
+ elif self.b[self.k - 1] != 's':
+ self.k = self.k - 1
+ if self.ends("eed"):
+ if self.m() > 0:
+ self.k = self.k - 1
+ elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
+ self.k = self.j
+ if self.ends("at"): self.setto("ate")
+ elif self.ends("bl"): self.setto("ble")
+ elif self.ends("iz"): self.setto("ize")
+ elif self.doublec(self.k):
+ self.k = self.k - 1
+ ch = self.b[self.k]
+ if ch == 'l' or ch == 's' or ch == 'z':
+ self.k = self.k + 1
+ elif (self.m() == 1 and self.cvc(self.k)):
+ self.setto("e")
+
+ def step1c(self):
+ """step1c() turns terminal y to i when there is another vowel in the stem."""
+ if (self.ends("y") and self.vowelinstem()):
+ self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
+
+ def step2(self):
+ """step2() maps double suffices to single ones.
+ so -ization ( = -ize plus -ation) maps to -ize etc. note that the
+ string before the suffix must give m() > 0.
+ """
+ if self.b[self.k - 1] == 'a':
+ if self.ends("ational"): self.r("ate")
+ elif self.ends("tional"): self.r("tion")
+ elif self.b[self.k - 1] == 'c':
+ if self.ends("enci"): self.r("ence")
+ elif self.ends("anci"): self.r("ance")
+ elif self.b[self.k - 1] == 'e':
+ if self.ends("izer"): self.r("ize")
+ elif self.b[self.k - 1] == 'l':
+ if self.ends("bli"): self.r("ble") # --DEPARTURE--
+ # To match the published algorithm, replace this phrase with
+ # if self.ends("abli"): self.r("able")
+ elif self.ends("alli"): self.r("al")
+ elif self.ends("entli"): self.r("ent")
+ elif self.ends("eli"): self.r("e")
+ elif self.ends("ousli"): self.r("ous")
+ elif self.b[self.k - 1] == 'o':
+ if self.ends("ization"): self.r("ize")
+ elif self.ends("ation"): self.r("ate")
+ elif self.ends("ator"): self.r("ate")
+ elif self.b[self.k - 1] == 's':
+ if self.ends("alism"): self.r("al")
+ elif self.ends("iveness"): self.r("ive")
+ elif self.ends("fulness"): self.r("ful")
+ elif self.ends("ousness"): self.r("ous")
+ elif self.b[self.k - 1] == 't':
+ if self.ends("aliti"): self.r("al")
+ elif self.ends("iviti"): self.r("ive")
+ elif self.ends("biliti"): self.r("ble")
+ elif self.b[self.k - 1] == 'g': # --DEPARTURE--
+ if self.ends("logi"): self.r("log")
+ # To match the published algorithm, delete this phrase
+
+ def step3(self):
+ """step3() deals with -ic-, -full, -ness etc. similar strategy to step2."""
+ if self.b[self.k] == 'e':
+ if self.ends("icate"): self.r("ic")
+ elif self.ends("ative"): self.r("")
+ elif self.ends("alize"): self.r("al")
+ elif self.b[self.k] == 'i':
+ if self.ends("iciti"): self.r("ic")
+ elif self.b[self.k] == 'l':
+ if self.ends("ical"): self.r("ic")
+ elif self.ends("ful"): self.r("")
+ elif self.b[self.k] == 's':
+ if self.ends("ness"): self.r("")
+
+ def step4(self):
+ """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
+ if self.b[self.k - 1] == 'a':
+ if self.ends("al"): pass
+ else: return
+ elif self.b[self.k - 1] == 'c':
+ if self.ends("ance"): pass
+ elif self.ends("ence"): pass
+ else: return
+ elif self.b[self.k - 1] == 'e':
+ if self.ends("er"): pass
+ else: return
+ elif self.b[self.k - 1] == 'i':
+ if self.ends("ic"): pass
+ else: return
+ elif self.b[self.k - 1] == 'l':
+ if self.ends("able"): pass
+ elif self.ends("ible"): pass
+ else: return
+ elif self.b[self.k - 1] == 'n':
+ if self.ends("ant"): pass
+ elif self.ends("ement"): pass
+ elif self.ends("ment"): pass
+ elif self.ends("ent"): pass
+ else: return
+ elif self.b[self.k - 1] == 'o':
+ if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
+ elif self.ends("ou"): pass
+ # takes care of -ous
+ else: return
+ elif self.b[self.k - 1] == 's':
+ if self.ends("ism"): pass
+ else: return
+ elif self.b[self.k - 1] == 't':
+ if self.ends("ate"): pass
+ elif self.ends("iti"): pass
+ else: return
+ elif self.b[self.k - 1] == 'u':
+ if self.ends("ous"): pass
+ else: return
+ elif self.b[self.k - 1] == 'v':
+ if self.ends("ive"): pass
+ else: return
+ elif self.b[self.k - 1] == 'z':
+ if self.ends("ize"): pass
+ else: return
+ else:
+ return
+ if self.m() > 1:
+ self.k = self.j
+
+ def step5(self):
+ """step5() removes a final -e if m() > 1, and changes -ll to -l if
+ m() > 1.
+ """
+ self.j = self.k
+ if self.b[self.k] == 'e':
+ a = self.m()
+ if a > 1 or (a == 1 and not self.cvc(self.k-1)):
+ self.k = self.k - 1
+ if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
+ self.k = self.k -1
+
+ def stem(self, p, i=None, j=None):
+ """In stem(p,i,j), p is a char pointer, and the string to be stemmed
+ is from p[i] to p[j] inclusive. Typically i is zero and j is the
+ offset to the last character of a string, (p[j+1] == '\0'). The
+ stemmer adjusts the characters p[i] ... p[j] and returns the new
+ end-point of the string, k. Stemming never increases word length, so
+ i <= k <= j. To turn the stemmer into a module, declare 'stem' as
+ extern, and delete the remainder of this file.
+ """
+ # copy the parameters into statics
+ if i is None: i = 0
+ if j is None: j = len(p)-1
+ self.b = p
+ self.k = j
+ self.k0 = i
+ if self.k <= self.k0 + 1:
+ return self.b # --DEPARTURE--
+
+ # With this line, strings of length 1 or 2 don't go through the
+ # stemming process, although no mention is made of this in the
+ # published algorithm. Remove the line to match the published
+ # algorithm.
+
+ self.step1ab()
+ self.step1c()
+ self.step2()
+ self.step3()
+ self.step4()
+ self.step5()
+ return self.b[self.k0:self.k+1]
+
+ def stem_sentence(self, txt):
+ return " ".join(map(self.stem, txt.split()))
+
+ def stem_documents(self, docs):
+ return map(self.stem_sentence, docs)
+
+
+if __name__ == '__main__':
+ p = PorterStemmer()
+ if len(sys.argv) > 1:
+ for f in sys.argv[1:]:
+ infile = open(f, 'r')
+ while 1:
+ output = ''
+ word = ''
+ line = infile.readline()
+ if line == '':
+ break
+ for c in line:
+ if c.isalpha():
+ word += c.lower()
+ else:
+ if word:
+ output += p.stem(word, 0,len(word)-1)
+ word = ''
+ output += c.lower()
+ print output,
+ infile.close()
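
A minimal usage sketch of the PorterStemmer class added above (the example words come from the step1ab docstring; lowercasing is the caller's responsibility):

    from gensim.parsing.porter import PorterStemmer

    p = PorterStemmer()
    print p.stem("meetings")                # -> "meet"
    print p.stem("caresses")                # -> "caress"
    print p.stem_sentence("mating cats")    # -> "mate cat"
    print p.stem_documents(["milling feed", "ponies"])
    # -> ["mill feed", "poni"]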
96 parsing/preprocessing.py
@@ -0,0 +1,96 @@
+import re
+import string
+import glob
+
+basepath = "/home/quesada/coding/gensim/trunk/"
+from gensim.parsing.porter import PorterStemmer
+
+def strip_punctuation(s):
+ return re.sub("([%s]+)" % string.punctuation, " ", s)
+
+def strip_punctuation2(s):
+ return s.translate(string.maketrans("",""), string.punctuation)
+
+def strip_tags(s):
+ # assumes s is already lowercase
+ return re.sub(r"<([^>]+)>", "", s)
+
+def strip_short(s, minsize=3):
+ return " ".join([e for e in s.split() if len(e) >= minsize])
+
+def strip_numeric(s):
+ return re.sub(r"[0-9]+", "", s)
+
+def strip_non_alphanum(s):
+ # assumes s is already lowercase
+ return re.sub(r"[^a-z0-9\ ]", " ", s)
+
+def strip_multiple_whitespaces(s):
+ return re.sub(r"(\s|\\n|\\r|\\t)+", " ", s)
+ #return s
+
+def split_alphanum(s):
+ s = re.sub(r"([a-z]+)([0-9]+)", r"\1 \2", s)
+ return re.sub(r"([0-9]+)([a-z]+)", r"\1 \2", s)
+
+def stem_text(s):
+ """
+ Given a text, return the same text after running the Porter stemmer on it.
+ Assumes all text is already lowercase.
+ """
+ output = ''
+ p = PorterStemmer()
+ for word in s.split():
+ output += p.stem(word, 0, len(word)-1) +" "
+ return output
+
+# improved list from Stone, Denis, Kwantes (2010)
+STOPWORDS = """
+a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be
+became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can
+cannot cant co computer con could couldnt cry de describe
+detail did do doesn done down due during
+each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen
+fify fill find fire first five for former formerly forty found four from front full further get give go
+had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie
+if in inc indeed interest into is it its itself keep last latter latterly least less ltd
+just
+kg km
+made many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely
+neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off
+often on once one only onto or other others otherwise our ours ourselves out over own part per
+perhaps please put rather re
+quite
+rather really regarding
+same see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten
+than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under
+until up unless upon us used using
+various very very via
+was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you
+your yours yourself yourselves
+"""
+
+STOPWORDS = dict((w,1) for w in STOPWORDS.strip().replace("\n", " ").split())
+
+def remove_stopwords(s):
+ return " ".join([w for w in s.split() if w not in STOPWORDS])
+
+DEFAULT_FILTERS = [str.lower, strip_tags, strip_punctuation,
+strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text]
+
+def preprocess_string(s, filters=DEFAULT_FILTERS):
+ for f in filters:
+ s = f(s)
+ return s.split()
+
+def preprocess_documents(docs):
+ return map(preprocess_string, docs)
+
+def read_file(path):
+ f = open(path)
+ ret = f.read()
+ return ret
+
+def read_files(pattern):
+ return map(read_file, glob.glob(pattern))
+
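
A short usage sketch of the preprocessing pipeline defined above (the input string is illustrative; the commented output shows roughly what the default filter chain produces):

    from gensim.parsing.preprocessing import preprocess_string, preprocess_documents

    doc = "<i>Hello</i> <b>World 9</b>! Processing documents is fun."
    print preprocess_string(doc)
    # lowercases, strips tags/punctuation/digits/stopwords/short words, then stems:
    # roughly ['hello', 'world', 'process', 'document', 'fun']

    print preprocess_documents(["first document", "second document"])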
175 parsing/tfidf.py
@@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+
+from math import log
+
+import numpy as np
+import scipy.sparse as sp
+
+def tokenize(text):
+ return text.split()
+
+def tc(dataset, tokenizer=tokenize):
+ """ Term counts. This function creates the vocab (type-to-id dict)
+ and also docs, the list of per-document term-count dicts.
+ Note that this is not the bag-of-words (bow) format:
+ here each bag contains the actual word, not its id.
+ The ids are assigned a posteriori.
+
+ input: a dataset (array of texts)
+
+ If we wanted compatibility with gensim, this function
+ could easily output the bow..
+ """
+ vocab = {}
+ docs = []
+
+ for doc in dataset:
+ d = {} # token => count
+
+ for term in tokenizer(doc):
+ vocab[term] = 1
+ d[term] = d.get(term, 0) + 1
+
+ docs.append(d)
+
+ sorted_terms = sorted(vocab.keys())
+ vocab = dict([(t, i) for i, t in enumerate(sorted_terms)])
+
+ return docs, vocab
+
+
+
+def bow_from_tc(term_counts, vocab):
+ """
+ returns: a bow (bag of words). A bow is a list of (corpusId, localFq) tuples.
+ """
+ bow = []
+ for word, count in term_counts.iteritems():
+ bow.append(( vocab[word], count))
+ return bow
+
+def tf_from_tc(term_counts):
+ """ Normalized local FQ (dividing by sum, not vector norms)
+ """
+ docs = []
+
+ for doc in term_counts:
+ d = {}
+ length = sum(doc.values())
+ for term, count in doc.items():
+ d[term] = float(count) / length
+ docs.append(d)
+
+ return docs
+
+
+def idc_from_tc(term_counts):
+ """ contextual diversity (number of docs in the corpus that the term appears in)
+ """
+ t = {}
+ for doc in term_counts:
+ for term in doc:
+ t[term] = t.get(term, 0) + 1
+ return t
+
+def idf_from_tc(term_counts):
+ """ inverse document frequency (related to contextual diversity);
+ the idf factor of the tfidf formula: log10(n_docs / idc[term])
+ """
+ n_docs = len(term_counts)
+ idf = {}
+ idc = idc_from_tc(term_counts)
+ for term in idc:
+ idf[term] = log(n_docs*1.0/(idc[term]),10)# NOTE changed to log(x,10)
+ return idf
+
+def tf_mul_idf(tf, idf):
+ docs = []
+
+ for doc in tf:
+ d = {}
+ for term in doc:
+ d[term] = doc[term] * idf[term]
+ docs.append(d)
+
+ return docs
+
+def to_vector(idf_dict, vocab):
+ ret = np.zeros(len(idf_dict))
+ for term, idx in vocab.items():
+ ret[idx] = idf_dict[term]
+ return ret
+
+def to_sparse_matrix(tfidf_dict, vocab):
+ tfm = sp.lil_matrix((len(vocab), len(tfidf_dict)), dtype=np.double)
+
+ for j, doc in enumerate(tfidf_dict):
+ for term in doc:
+ try:
+ i = vocab[term]
+ tfm[i,j] = doc[term]
+ except KeyError:
+ pass
+
+ return tfm
+
+def inverse_vocab(vocab):
+ """
+ Converts a vocab dictionary term => index to index => term
+ """
+ return dict((i,t) for t,i in vocab.items())
+
+def vocab_array(vocab):
+ """
+ Converts vocab dictionary to vocab array
+ """
+ return np.char.array(sorted(vocab.keys(),
+ lambda a,b: cmp(vocab[a],vocab[b])))
+
+def vocab_dict(vocab):
+ """
+ Converts vocab array to vocab dictionary
+ """
+ return dict((term,i) for i,term in enumerate(vocab))
+
+def replace_vocab(td, oldvocab, newvocab):
+ """
+ td: V x X term-document matrix
+ oldvocab: dictionary
+ newvocab: dictionary
+ """
+ newtd = np.zeros((len(newvocab),td.shape[1]))
+ for term in newvocab:
+ try:
+ newtd[newvocab[term]] = td[oldvocab[term]]
+ except KeyError:
+ newtd[newvocab[term]] = 0
+ return newtd
+
+class tfidf(object):
+ """
+ dataset is a list of strings
+ """
+ def __init__(self, dataset, tokenizer=tokenize):
+ self._dataset = dataset
+ self._tokenizer = tokenizer
+
+ def as_dict(self):
+ term_counts, vocab = tc(self._dataset, self._tokenizer)
+ tf = term_counts # NOTE before it was: tf_from_tc(term_counts) but
+ # tf from tc normalizes, and this doesn't replicate miislita counts
+ idf = idf_from_tc(term_counts)
+ return tf_mul_idf(tf, idf), vocab
+
+ def as_sparse_matrix(self):
+ tfidf_dict, vocab = self.as_dict()
+ return to_sparse_matrix(tfidf_dict, vocab), vocab
+
+ def as_array(self):
+ tfm, vocab = self.as_sparse_matrix()
+ return tfm.toarray(), vocab
+
+ def get_idf(self):
+ term_counts, vocab = tc(self._dataset, self._tokenizer)
+ idf = idf_from_tc(term_counts)
+ return idf, vocab
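
A hedged sketch of driving the tfidf class above end to end (the dataset and variable names are illustrative; any list of strings works):

    from gensim.parsing.tfidf import tfidf

    dataset = ["human computer interaction",
               "survey of user opinion of computer system response time",
               "trees and graph minors"]

    model = tfidf(dataset)             # uses the default whitespace tokenizer
    weights, vocab = model.as_dict()   # list of {term: tf*idf} dicts plus term->id map
    matrix, vocab = model.as_array()   # dense numpy array, shape (len(vocab), n_docs)
    idf, vocab = model.get_idf()       # raw idf values per term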
32 src/gensim/corpora/dictionary.py
@@ -11,14 +11,12 @@
Dictionaries can be created from a corpus and can later be pruned according to
document frequency (removing (un)common words via the :func:`Dictionary.filterExtremes` method),
-save/loaded from disk via :func:`Dictionary.save` and :func:`Dictionary.load` methods etc.
+save/loaded from disk (via :func:`Dictionary.save` and :func:`Dictionary.load` methods) etc.
"""
-
import logging
import itertools
-import random
from gensim import utils
@@ -46,11 +44,6 @@ def __init__(self, documents=None):
self.addDocuments(documents)
- # TODO expensive, only here for historical reasons; maybe deprecate?
- id2token = property(lambda self: dict((id, token) for token, id in self.token2id.iteritems()))
- id2word = id2token
-
-
def __len__(self):
"""
Return the number of token->id mappings in the dictionary.
@@ -70,7 +63,7 @@ def fromDocuments(documents):
def addDocuments(self, documents):
"""
Build dictionary from a collection of documents. Each document is a list
- of tokens (**tokenized and normalized** utf-8 encoded strings).
+ of tokens = **tokenized and normalized** utf-8 encoded strings.
This is only a convenience wrapper for calling `doc2bow` on each document
with `allowUpdate=True`.
@@ -86,19 +79,20 @@ def addDocuments(self, documents):
(self, self.numDocs, self.numPos))
- def doc2bow(self, document, allowUpdate=False):
+ def doc2bow(self, document, allowUpdate=False, return_missing=False):
"""
- Convert `document` (a list of words) into the bag-of-words format = list of
- `(tokenId, tokenCount)` 2-tuples. Each word is assumed to be a
+ Convert `document` (a list of words) into the bag-of-words format = list
+ of `(tokenId, tokenCount)` 2-tuples. Each word is assumed to be a
**tokenized and normalized** utf-8 encoded string.
- If `allowUpdate` is set, then also update dictionary in the process: create ids
- for new words. At the same time, update document frequencies -- for
+ If `allowUpdate` is set, then also update dictionary in the process: create
+ ids for new words. At the same time, update document frequencies -- for
each word appearing in this document, increase its `self.dfs` by one.
If `allowUpdate` is **not** set, this function is `const`, i.e. read-only.
"""
result = {}
+ missing = {}
document = sorted(document)
# construct (word, frequency) mapping. in python3 this is done simply
# using Counter(), but here i use itertools.groupby() for the job
@@ -107,6 +101,8 @@ def doc2bow(self, document, allowUpdate=False):
tokenId = self.token2id.get(wordNorm, None)
if tokenId is None:
# first time we see this token (~normalized form)
+ if return_missing:
+ missing[wordNorm] = frequency
if not allowUpdate: # if we aren't allowed to create new tokens, continue with the next unique token
continue
tokenId = len(self.token2id)
@@ -123,7 +119,12 @@ def doc2bow(self, document, allowUpdate=False):
for tokenId in result.iterkeys():
self.dfs[tokenId] = self.dfs.get(tokenId, 0) + 1
- return sorted(result.iteritems()) # return tokenIds, in ascending id order
+ # return tokenIds, in ascending id order
+ result = sorted(result.iteritems())
+ if return_missing:
+ return result, missing
+ else:
+ return result
def filterExtremes(self, noBelow=5, noAbove=0.5, keepN=None):
@@ -156,7 +157,6 @@ def filterExtremes(self, noBelow=5, noAbove=0.5, keepN=None):
self.rebuildDictionary()
logger.info("resulting dictionary: %s" % self)
-
def filterTokens(self, badIds=None, goodIds=None):
"""
Remove the selected `badIds` tokens from all dictionary mappings, or, keep
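
The `return_missing` flag added to `doc2bow` above can be exercised like this (a minimal sketch; the tokens are illustrative and must already be tokenized, lowercased strings):

    from gensim.corpora.dictionary import Dictionary

    dictionary = Dictionary([["human", "computer", "interaction"]])

    # known tokens come back as (tokenId, count) pairs; unseen tokens are
    # reported separately instead of being silently dropped
    bow, missing = dictionary.doc2bow(["human", "human", "robot"],
                                      allowUpdate=False, return_missing=True)
    print bow      # [(tokenId of "human", 2)]
    print missing  # {'robot': 1}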
BIN src/gensim/corpora/head500.noblanks.cor.bz2
Binary file not shown.
2 src/gensim/corpora/tst.svmlight
@@ -0,0 +1,2 @@
+0 2:0.5
+0 1:1.0 2:2.0
BIN src/gensim/corpora/tst.svmlight.index.db
Binary file not shown.
168 src/gensim/corpora/wikiExternalParsingCorpus.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# This is a variation on wikicorpus that processes plain text, not xml dumps
+# from wikipedia. We needed this because our parsing that replicates
+# Gabrilovich (2009) requires some link analysis that is very difficult to do
+# (maybe cannot be done) straight from the dump.
+
+
+"""
+TODO: Update this
+
+clone of wikicorpus.py
+"""
+
+import os
+import logging
+import sys
+import bz2
+
+from gensim import interfaces, matutils
+from gensim.corpora.dictionary import Dictionary
+
+
+logger = logging.getLogger('wikiExternParsingCorpus')
+logger.setLevel(logging.DEBUG)
+
+
+class WikiExternalParsingCorpus(interfaces.CorpusABC):
+ """
+ Treat an externally parsed, bz2-compressed plain-text wikipedia dump as a
+ (read-only) corpus.
+
+ The documents are extracted on-the-fly, so that the whole (massive) dump
+ can stay compressed on disk.
+
+ Just start (and study) the __main__ and you will get a demo.
+ """
+
+ def __init__(self, fname, noBelow=20, keep_words=200000, dictionary=None):
+ """
+ Initialize the corpus. This scans the corpus once, to determine its
+ vocabulary (only the first `keep_words` most frequent words that
+ appear in at least `noBelow` documents are kept).
+ """
+
+ self.fname = fname
+ if dictionary is None:
+ self.dictionary = Dictionary(self.getArticles())
+ # TODO: make filtering optional with a parameter
+ # self.dictionary.filterExtremes(noBelow=noBelow, noAbove=0.1,
+ # keepN=keep_words)
+ else:
+ self.dictionary = dictionary
+
+ def __len__(self):
+ return self.numDocs
+
+ def __iter__(self):
+ """
+ The function that defines a corpus -- iterating over the corpus yields
+ vectors, one for each document.
+ """
+ for docNo, text in enumerate(self.getArticles()):
+ yield self.dictionary.doc2bow(text, allowUpdate=False)
+
+ def saveDictionary(self, fname):
+ """
+ Store id->word mapping to a file, in format:
+ `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.
+ """
+
+ logger.info("saving dictionary mapping to %s" % fname)
+ fout = open(fname, 'w')
+ for token, tokenId in sorted(self.dictionary.token2id.iteritems()):
+ fout.write("%i\t%s\t%i\n" % (tokenId, token,
+ self.dictionary.docFreq[tokenId]))
+ fout.close()
+
+ @staticmethod
+ def loadDictionary(fname):
+ """
+ Load previously stored mapping between words and their ids.
+
+ The result can be used as the `id2word` parameter for input to
+ transformations.
+ """
+ result = {}
+ for lineNo, line in enumerate(open(fname)):
+ cols = line[:-1].split('\t')
+ if len(cols) == 2:
+ wordId, word = cols
+ elif len(cols) == 3:
+ wordId, word, docFreq = cols
+ else:
+ continue
+ # docFreq not used
+ result[int(wordId)] = word
+ return result
+
+ def saveAsText(self, fname):
+ """
+ Store the corpus to disk, in a human-readable text format.
+
+ This actually saves two files:
+
+ 1. Document-term co-occurence frequency counts (bag-of-words), as
+ a Matrix Market file `fname_bow.mm`.
+ 2. Token to integer mapping, as a text file `fname_wordids.txt`.
+
+ """
+ self.saveDictionary(fname + '_wordids.txt')
+ matutils.MmWriter.writeCorpus(fname + '_bow.mm', self,
+ progressCnt=10000)
+
+ def getArticles(self):
+ """
+ Iterate over the dump, returning text version of each article.
+
+ Each line of the (bz2-compressed) input is treated as one article and
+ yielded as its list of whitespace-separated tokens.
+ """
+ articles, intext = 0, False
+
+ for lineno, line in enumerate(bz2.BZ2File(self.fname)):
+ articles += 1
+ # split text into tokens
+ yield line.split()
+ # cache corpus length
+ self.numDocs = articles
+
+
+if __name__ == '__main__':
+ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
+ logging.root.setLevel(level=logging.INFO)
+ logging.info("running %s" % ' '.join(sys.argv))
+
+ import os
+ import tempfile
+
+ module_path = os.path.dirname(__file__)
+ corpusname = 'head500.noblanks.cor'
+ # the demo file is in the corpora folder
+ source = os.path.join(module_path, corpusname + '.bz2')
+ # save the results to tmp
+ output = os.path.join(tempfile.gettempdir(), corpusname)
+
+ # build dictionary.
+ logging.info("source: " + source)
+ wiki = WikiExternalParsingCorpus(source, keep_words=200000)
+
+ # save dictionary and bag-of-words
+ wiki.saveAsText(output)
+ del wiki
+
+ # initialize corpus reader and word->id mapping
+ from gensim.corpora import MmCorpus
+ id2token = WikiExternalParsingCorpus.loadDictionary(output + '_wordids.txt')
+ mm = MmCorpus(output + '_bow.mm')
+
+ # build tfidf
+ from gensim.models import TfidfModel
+ tfidf = TfidfModel(mm, id2word=id2token, normalize=True)
+
+ # save tfidf vectors in matrix market format
+ MmCorpus.saveCorpus(output + '_tfidf.mm', tfidf[mm], progressCnt=10000)
+
+ logging.info("finished running")
163 src/gensim/dmlcz/geteval.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Generate data for the Word Intrusion and Topic Intrusion tasks, in csv format \
+suitable for the Mechanical Turk. For a description of what this means, see
+Chang et al.: Reading Tea Leaves: How Humans Interpret Topic Models
+
+For word intrusion:
+./geteval_wordintrusion.py NUM_TASKS CONCEPT_FILE
+e.g. ./geteval_wordintrusion.py 1000 /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300 \
+1> /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300.wordintrusion
+
+For topic intrusion:
+./geteval_topicintrusion.py NUM_TASKS CONCEPT_FILE
+e.g. ./geteval_topicintrusion.py 500 /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300 \
+1> /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300.topicintrusion
+"""
+
+# The functions in this module expect that topics have been previously saved to
+# disk in a specific format, for example via docsim.saveTopics().
+
+import logging
+import sys
+import os
+import random
+
+# number of top words from the same concept, for the word intrusion task.
+# one extra word from a different concept will be added, for a set of WI_WORDS+1
+# randomly shuffled words.
+WI_WORDS = 5
+
+
+def loadConcepts(fname):
+ """
+ Load concepts (words) from a file on disk. Ignore the word weights, only store
+ the words themselves.
+
+ Return list of concepts, where each concept is a list of words. A concept's
+ id is implicitly its position in the list.
+ """
+ logging.info("loading concepts from %s" % fname)
+ concepts = []
+ for line in open(fname):
+ concept = [part.split(':')[0] for part in line.strip().split('\t')]
+ concepts.append(concept) # concept id is implicitly position within the list of concepts
+ logging.info("loaded %i concepts" % len(concepts))
+ return concepts
+
+
+class WordIntrusion(object):
+ def __init__(self, fname):
+ self.concepts = loadConcepts(fname)
+
+ def getAlienWord(self, conceptId):
+ """
+ For a given concept, choose an 'alien' word, which
+ a) is unlikely for the input concept
+ b) is likely in some other concept (called the alien concept).
+
+ Return the 2-tuple (alien concept id, alien word).
+ """
+ allWords = self.concepts[conceptId]
+ badWords = set(allWords[int(0.6 * len(allWords)) : ]) # use the bottom 40% of words for alien candidates
+
+ candidates = []
+ for alienId, concept in enumerate(self.concepts):
+ if alienId == conceptId:
+ continue
+ topAlienWords = concept[ : 10] # use 10 most significant words as alien concept representatives
+ alienOk = badWords.intersection(topAlienWords)
+ candidates.extend((alienId, alienWord) for alienWord in alienOk)
+ assert candidates, "for concept %s, there are no candidates for alien words!" % conceptId
+
+ return random.choice(candidates)
+
+ def wordIntrusion(self, numWords):
+ """
+ Generate data for a single word intrusion task instance.
+ """
+ # randomly pick the target topic and its most significant words
+ conceptId = random.randint(0, len(self.concepts) - 1)
+ words = self.concepts[conceptId][ : numWords]
+ random.shuffle(words) # shuffle the words in place
+
+ # randomly pick another word, significant in another topic, and inject it into this topic
+ alienConceptId, alienWord = self.getAlienWord(conceptId)
+ alienPos = random.randint(0, numWords) # position of the alien word, for insertion
+ words.insert(alienPos, alienWord)
+ return conceptId, alienConceptId, words, alienPos
+
+ def printProtocol(self, numInstances):
+ """
+ Print a specified number of instances for the word intrusion test.
+
+ Each instance contains:
+ 1) id of the concept tested for intrusion
+ 2) five words from this concept
+ 3) id of concept from which the alien word is taken
+ 4) one alien word
+
+ This information is represented by six words (shuffled), one position (of
+ the alien word within the six), and two concept ids, per instance.
+
+ Each instance is saved as one line in a csv file (which can be included
+ in Mechanical Turk or similar software, to be evaluated by humans). The file
+ therefore contains numInstances+1 lines; the extra first line is the csv
+ descriptor of fields.
+ """
+ fields = ['w%i' % i for i in xrange(WI_WORDS + 1)] + ['apos', 'cid', 'aid']
+ template = ','.join("%(" + field + ')s' for field in fields)
+ headerLine = template % dict(zip(fields, fields)) # first line in csv describes the fields
+ print headerLine
+ for i in xrange(numInstances):
+ cid, aid, words, apos = self.wordIntrusion(numWords = WI_WORDS)
+ w0, w1, w2, w3, w4, w5 = words # FIXME this assumes WI_WORDS==5, make more flexible
+ print template % locals()
+#endclass WordIntrusion
+
+
+def topicIntrusion(useTop = 3, fromBottom = 10):
+ method = getRandomMethod()
+ document = getRandomDocument()
+ conceptScores = getConceptScores(method, document)
+ conceptScores.sort(reverse = True) # best scores first
+ concepts = conceptScores[: useTop]
+ alienConcept = random.choice(conceptScores[-fromBottom : ])
+ alienPos = random.randint(0, useTop)
+ concepts.insert(alienPos, alienConcept)
+ return method, document, alienPos, concepts
+
+
+
+# ============= main entry point ================
+if __name__ == "__main__":
+ logging.basicConfig(level = logging.DEBUG)
+ logging.info("running %s" % " ".join(sys.argv))
+
+ program = os.path.basename(sys.argv[0])
+
+ # make sure we have enough cmd line parameters
+ if len(sys.argv) < 2:
+ print globals()["__doc__"]
+ sys.exit(1)
+
+ # parse cmd line
+ numInstances = int(sys.argv[1])
+ conceptFile = sys.argv[2]
+ if 'word' in program:
+ wi = WordIntrusion(conceptFile)
+ wi.printProtocol(numInstances)
+ elif 'topic' in program:
+ ti = TopicIntrusion(conceptFile)
+ ti.printProtocol(numInstances)
+ else:
+ print globals()["__doc__"]
+ sys.exit(1)
+
+ logging.info("finished running %s" % program)
+
1 src/gensim/dmlcz/geteval_topicintrusion.py
163 src/gensim/dmlcz/geteval_topicintrusion.py.old
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Generate data for the Word Intrusion and Topic Intrusion tasks, in csv format \
+suitable for the Mechanical Turk. For a description of what this means, see
+Chang et al.: Reading Tea Leaves: How Humans Interpret Topic Models
+
+For word intrusion:
+./geteval_wordintrusion.py NUM_TASKS CONCEPT_FILE
+e.g. ./geteval_wordintrusion.py 1000 /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300 \
+1> /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300.wordintrusion
+
+For topic intrusion:
+./geteval_topicintrusion.py NUM_TASKS CONCEPT_FILE
+e.g. ./geteval_topicintrusion.py 500 /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300 \
+1> /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300.topicintrusion
+"""
+
+# The functions in this module expect that topics have been previously saved to
+# disk in a specific format, for example via docsim.saveTopics().
+
+import logging
+import sys
+import os
+import random
+
+# number of top words from the same concept, for the word intrusion task.
+# one extra word from a different concept will be added, for a set of WI_WORDS+1
+# randomly shuffled words.
+WI_WORDS = 5
+
+
+def loadConcepts(fname):
+ """
+ Load concepts (words) from a file on disk. Ignore the word weights, only store
+ the words themselves.
+
+ Return list of concepts, where each concept is a list of words. A concept's
+ id is implicitly its position in the list.
+ """
+ logging.info("loading concepts from %s" % fname)
+ concepts = []
+ for line in open(fname):
+ concept = [part.split(':')[0] for part in line.strip().split('\t')]
+ concepts.append(concept) # concept id is implicitly position within the list of concepts
+ logging.info("loaded %i concepts" % len(concepts))
+ return concepts
+
+
+class WordIntrusion(object):
+ def __init__(self, fname):
+ self.concepts = loadConcepts(fname)
+
+ def getAlienWord(self, conceptId):
+ """
+ For a given concept, choose an 'alien' word, which
+ a) is unlikely for the input concept
+ b) is likely in some other concept (called the alien concept).
+
+ Return the 2-tuple (alien concept id, alien word).
+ """
+ allWords = self.concepts[conceptId]
+ badWords = set(allWords[int(0.6 * len(allWords)) : ]) # use the bottom 40% of words for alien candidates
+
+ candidates = []
+ for alienId, concept in enumerate(self.concepts):
+ if alienId == conceptId:
+ continue
+ topAlienWords = concept[ : 10] # use 10 most significant words as alien concept representatives
+ alienOk = badWords.intersection(topAlienWords)
+ candidates.extend((alienId, alienWord) for alienWord in alienOk)
+ assert candidates, "for concept %s, there are no candidates for alien words!" % conceptId
+
+ return random.choice(candidates)
+
+ def wordIntrusion(self, numWords):
+ """
+ Generate data for a single word intrusion task instance.
+ """
+ # randomly pick the target topic and its most significant words
+ conceptId = random.randint(0, len(self.concepts) - 1)
+ words = self.concepts[conceptId][ : numWords]
+ random.shuffle(words) # shuffle the words in place
+
+ # randomly pick another word, significant in another topic, and inject it into this topic
+ alienConceptId, alienWord = self.getAlienWord(conceptId)
+ alienPos = random.randint(0, numWords) # position of the alien word, for insertion
+ words.insert(alienPos, alienWord)
+ return conceptId, alienConceptId, words, alienPos
+
+ def printProtocol(self, numInstances):
+ """
+ Print a specified number of instances for the word intrusion test.
+
+ Each instance contains:
+ 1) id of the concept tested for intrusion
+ 2) five words from this concept
+ 3) id of concept from which the alien word is taken
+ 4) one alien word
+
+ This information is represented by six words (shuffled), one position (of
+ the alien word within the six), and two concept ids, per instance.
+
+ Each instance is saved as one line in a csv file (which can be included
+ in Mechanical Turk or similar software, to be evaluated by humans). The file
+ therefore contains numInstances+1 lines; the extra first line is the csv
+ descriptor of fields.
+ """
+ fields = ['w%i' % i for i in xrange(WI_WORDS + 1)] + ['apos', 'cid', 'aid']
+ template = ','.join("%(" + field + ')s' for field in fields)
+ headerLine = template % dict(zip(fields, fields)) # first line in csv describes the fields
+ print headerLine
+ for i in xrange(numInstances):
+ cid, aid, words, apos = self.wordIntrusion(numWords = WI_WORDS)
+ w0, w1, w2, w3, w4, w5 = words # FIXME this assumes WI_WORDS==5, make more flexible
+ print template % locals()
+#endclass WordIntrusion
+
+
+def topicIntrusion(useTop = 3, fromBottom = 10):
+ method = getRandomMethod()
+ document = getRandomDocument()
+ conceptScores = getConceptScores(method, document)
+ conceptScores.sort(reverse = True) # best scores first
+ concepts = conceptScores[: useTop]
+ alienConcept = random.choice(conceptScores[-fromBottom : ])
+ alienPos = random.randint(0, useTop)
+ concepts.insert(alienPos, alienConcept)
+ return method, document, alienPos, concepts
+
+
+
+# ============= main entry point ================
+if __name__ == "__main__":
+ logging.basicConfig(level = logging.DEBUG)
+ logging.info("running %s" % " ".join(sys.argv))
+
+ program = os.path.basename(sys.argv[0])
+
+ # make sure we have enough cmd line parameters
+ if len(sys.argv) < 2:
+ print globals()["__doc__"]
+ sys.exit(1)
+
+ # parse cmd line
+ numInstances = int(sys.argv[1])
+ conceptFile = sys.argv[2]
+ if 'word' in program:
+ wi = WordIntrusion(conceptFile)
+ wi.printProtocol(numInstances)
+ elif 'topic' in program:
+ ti = TopicIntrusion(conceptFile)
+ ti.printProtocol(numInstances)
+ else:
+ print globals()["__doc__"]
+ sys.exit(1)
+
+ logging.info("finished running %s" % program)
+
1 src/gensim/dmlcz/geteval_wordintrusion.py
163 src/gensim/dmlcz/geteval_wordintrusion.py.old
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Generate data for the Word Intrusion and Topic Intrusion tasks, in csv format \
+suitable for the Mechanical Turk. For a description of what this means, see
+Chang et al.: Reading Tea Leaves: How Humans Interpret Topic Models
+
+For word intrusion:
+./geteval_wordintrusion.py NUM_TASKS CONCEPT_FILE
+e.g. ./geteval_wordintrusion.py 1000 /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300 \
+1> /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300.wordintrusion
+
+For topic intrusion:
+./geteval_topicintrusion.py NUM_TASKS CONCEPT_FILE
+e.g. ./geteval_topicintrusion.py 500 /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300 \
+1> /Users/kofola/workspace/dml/data/results/gensim_eng.lsa_concepts300.topicintrusion
+"""
+
+# The functions in this module expect that topics have been previously saved to
+# disk in a specific format, for example via docsim.saveTopics().
+
+import logging
+import sys
+import os
+import random
+
+# number of top words from the same concept, for the word intrusion task.
+# one extra word from a different concept will be added, for a set of WI_WORDS+1
+# randomly shuffled words.
+WI_WORDS = 5
+
+
+def loadConcepts(fname):
+ """
+ Load concepts (words) from a file on disk. Ignore the word weights, only store
+ the words themselves.
+
+ Return list of concepts, where each concept is a list of words. A concept's
+ id is implicitly its position in the list.
+ """
+ logging.info("loading concepts from %s" % fname)
+ concepts = []
+ for line in open(fname):
+ concept = [part.split(':')[0] for part in line.strip().split('\t')]
+ concepts.append(concept) # concept id is implicitly position within the list of concepts
+ logging.info("loaded %i concepts" % len(concepts))
+ return concepts
+
+
+class WordIntrusion(object):
+ def __init__(self, fname):
+ self.concepts = loadConcepts(fname)
+
+ def getAlienWord(self, conceptId):
+ """
+ For a given concept, choose an 'alien' word, which
+ a) is unlikely for the input concept
+ b) is likely in some other concept (called the alien concept).
+
+ Return the 2-tuple (alien concept id, alien word).
+ """
+ allWords = self.concepts[conceptId]
+ badWords = set(allWords[int(0.6 * len(allWords)) : ]) # use the bottom 40% of words for alien candidates
+
+ candidates = []
+ for alienId, concept in enumerate(self.concepts):
+ if alienId == conceptId:
+ continue
+ topAlienWords = concept[ : 10] # use 10 most significant words as alien concept representatives
+ alienOk = badWords.intersection(topAlienWords)
+ candidates.extend((alienId, alienWord) for alienWord in alienOk)
+ assert candidates, "for concept %s, there are no candidates for alien words!" % conceptId
+
+ return random.choice(candidates)
+
+ def wordIntrusion(self, numWords):
+ """
+ Generate data for a single word intrusion task instance.
+ """
+ # randomly pick the target topic and its most significant words
+ conceptId = random.randint(0, len(self.concepts) - 1)
+ words = self.concepts[conceptId][ : numWords]
+ random.shuffle(words) # shuffle the words in place
+
+ # randomly pick another word, significant in another topic, and inject it into this topic
+ alienConceptId, alienWord = self.getAlienWord(conceptId)
+ alienPos = random.randint(0, numWords) # position of the alien word, for insertion
+ words.insert(alienPos, alienWord)
+ return conceptId, alienConceptId, words, alienPos
+
+ def printProtocol(self, numInstances):
+ """
+ Print a specified number of instances for the word intrusion test.
+
+ Each instance contains:
+ 1) id of the concept tested for intrusion
+ 2) five words from this concept
+ 3) id of concept from which the alien word is taken
+ 4) one alien word
+
+ This information is represented by six words (shuffled), one position (of
+ the alien word within the six), and two concept ids, per instance.
+
+ Each instance is saved as one line in a csv file (which can be included
+ in Mechanical Turk or similar software, to be evaluated by humans). The file
+ therefore contains numInstances+1 lines; the extra first line is the csv
+ descriptor of fields.
+ """
+ fields = ['w%i' % i for i in xrange(WI_WORDS + 1)] + ['apos', 'cid', 'aid']
+ template = ','.join("%(" + field + ')s' for field in fields)
+ headerLine = template % dict(zip(fields, fields)) # first line in csv describes the fields
+ print headerLine
+ for i in xrange(numInstances):
+ cid, aid, words, apos = self.wordIntrusion(numWords = WI_WORDS)
+ w0, w1, w2, w3, w4, w5 = words # FIXME this assumes WI_WORDS==5, make more flexible
+ print template % locals()
+#endclass WordIntrusion
+
+
+def topicIntrusion(useTop = 3, fromBottom = 10):
+ method = getRandomMethod()
+ document = getRandomDocument()
+ conceptScores = getConceptScores(method, document)
+ conceptScores.sort(reverse = True) # best scores first
+ concepts = conceptScores[: useTop]
+ alienConcept = random.choice(conceptScores[-fromBottom : ])
+ alienPos = random.randint(0, useTop)
+ concepts.insert(alienPos, alienConcept)
+ return method, document, alienPos, concepts
+
+
+
+# ============= main entry point ================
+if __name__ == "__main__":
+ logging.basicConfig(level = logging.DEBUG)
+ logging.info("running %s" % " ".join(sys.argv))
+
+ program = os.path.basename(sys.argv[0])
+
+ # make sure we have enough cmd line parameters
+ if len(sys.argv) < 2:
+ print globals()["__doc__"]
+ sys.exit(1)
+
+ # parse cmd line
+ numInstances = int(sys.argv[1])
+ conceptFile = sys.argv[2]
+ if 'word' in program:
+ wi = WordIntrusion(conceptFile)
+ wi.printProtocol(numInstances)
+ elif 'topic' in program:
+ ti = TopicIntrusion(conceptFile)
+ ti.printProtocol(numInstances)
+ else:
+ print globals()["__doc__"]
+ sys.exit(1)
+
+ logging.info("finished running %s" % program)
+
330 src/gensim/models/cossim_compare.py
@@ -0,0 +1,330 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from __future__ import with_statement
+
+import sys, os, os.path
+import cPickle, random, itertools
+
+import logging
+logger = logging.getLogger('cossim')
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
+logger.setLevel(logging.DEBUG)
+
+import numpy
+import matplotlib.pyplot as plt
+plt.ioff()
+
+from gensim import similarities
+from gensim import matutils
+
+
+
+def rmse(a1, a2):
+ assert a1.shape == a2.shape
+ diff = a1 - a2
+ return numpy.sqrt(1.0 * numpy.multiply(diff, diff).sum() / a1.size)
+
+
+def cossim(model, corpus):
+ a1 = numpy.asmatrix(numpy.zeros((len(corpus), len(corpus)), dtype = numpy.float32))
+ logger.info("creating index")
+ index = similarities.MatrixSimilarity(model[corpus], numBest = None, numFeatures = model.numTopics)
+ logger.info("computing cossims")
+ for row, sims in enumerate(index):
+ a1[row] = sims
+ if row % 1000 == 0:
+ logger.debug("created cossim of %i/%i documents" % (row + 1, len(corpus)))
+ return a1
+
+
+def cossim2(model, corpus):
+ u = model.projection.u
+ s = model.projection.s
+ p = numpy.diag(1.0 / numpy.diag(s)) * u.T
+ logger.info("constructing vt")
+ ak = numpy.asmatrix(numpy.column_stack(p * matutils.sparse2full(doc, model.numTerms).reshape(model.numTerms, 1) for doc in corpus))
+# logger.info("reconstructing rank-k ak")
+# ak = u * (s * vt)
+ logger.info("normalizing ak for cossim")
+ lens = numpy.sqrt(numpy.sum(numpy.multiply(ak, ak), axis = 0))
+ ak = ak / lens
+ logger.debug("first few lens: %s" % (lens[:10]))
+ logger.info("computing cossims")
+ result = ak.T * ak
+ return result
+
+
+
+def sdiff(s1, s2):
+ return numpy.abs(s1-s2) / (numpy.abs(s2))
+
+def diff(u1, s1, u2, s2, scale = False):
+ udiff = 1.0 - numpy.abs(numpy.diag(u1.T * u2))
+ if scale:
+ udiff = (udiff * s2) / numpy.sum(s2) # weight errors by singular values from s2
+ degs = numpy.arcsin(udiff) / numpy.pi * 180
+# print r"%.3f$^{\circ}$" % degs
+ return sdiff(s1, s2), degs
+
+
+def fig_sdiff(ns, cols, labels):
+ fig = plt.figure()
+ ax1 = fig.add_subplot(111)
+ for n, col, label in zip(ns, cols, labels):
+ ax1.plot(n, color=col, label = label)
+ ax1.set_xlabel('singular values $i$')
+ ax1.set_ylabel('relative error $r_i$')
+ ax1.legend(loc=0)
+ plt.ylim(ymin=-.01)
+ return fig
+
+
+def fig_udiff(ns, labels):
+ fig = plt.figure()
+ ax1 = fig.add_subplot(111)
+ for n, label in zip(ns, labels):
+ ax1.plot(n, label = label)
+ ax1.set_xlabel('singular vectors $i$')
+ ax1.set_ylabel('angular error $r_i$')
+ ax1.legend(loc=0)
+ return fig
+
+
+def copys():
+ """Cut out U from results, leave only S (to save bandwidth in scp)."""
+ for fname in os.listdir(Result.froot):
+ if fname.startswith('wiki_p'):
+ result = Result('', '', fname)
+ logging.info("copying %s" % fname)
+ with open(fname + '.s', 'w') as fout:
+ cPickle.dump([None, result.s], fout, protocol=-1)
+
+
+def stos(s):
+ """wall-clock times to (x hours, y minutes)"""
+ h = int(s / 3600)
+ m = s - h * 3600
+ return h, int(m/60)
+
+
+
+class Result(object):
+ froot = '/Users/kofola/svn/gensim/trunk/src'
+
+ def __init__(self, name, marker, fname):
+ if not fname.endswith('.s'): # prepend "*" for local macbook results (not asteria)
+ name = '* ' + name
+ self.name = name
+ self.fname = fname
+ self.marker = marker
+ if fname:
+ self.s = self.getS()
+ else:
+ self.s = 14 + 2 * numpy.random.rand(400) # experiment not finished yet; plot some noise
+
+ def getS(self, maxvals = 400):
+ fin = os.path.join(Result.froot, self.fname)
+ logging.info("loading run from %s" % fin)
+ obj = cPickle.load(open(fin))
+ try:
+            s = obj.projection.s # a full LSI model pickle
+        except AttributeError:
+            s = obj[1] # a [None, s] pair saved by copys()
+ return s[:maxvals]
+#endclass Result
+
+def plotS(results, truth=None, labelx='factor $i$', labely=None):
+ if labely is None:
+ if truth:
+ labely='relative error $r_i$'
+ else:
+ labely='singular value $s_i$ (log scale)'
+
+ fig = plt.figure()
+ ax1 = fig.add_subplot(111)
+ ax1.set_xlabel(labelx)
+ ax1.set_ylabel(labely)
+
+ m = 100.0
+ for pos, result in enumerate(results):
+ if truth:
+ s = abs(result.s-truth.s)/truth.s
+ else:
+ s = result.s
+ m = min(m, min(s))
+ ax1.plot(s, result.marker, label=result.name, markevery = 20)
+
+ if not truth:
+ ax1.semilogy(basey=3, subsy=[32, 64, 128, 256])
+ ax1.legend(loc=0)
+ plt.ylim(ymin=m-1.0)
+ return fig
+
+
+
+def exp1():
+ """Oversampling experiment. Factors=400, chunks=20k.
+ """
+ results = [
+ Result('P1, $l=0$ [10h36m]', '-kx', 'wiki_p1_f400_c20k_e0.pkl.s'),
+ Result('P1, $l=200$ [21h17m]', '-k*', 'wiki_p1_f600_c20k_e0.pkl.s'),
+ Result('P1, $l=400$ [32h40m]', '-k+', 'wiki_p1_f800_c20k_e0.pkl.s'),
+
+ Result('P12, $l=0$ [6h30m] ', '-rx', 'wiki_p12_f400_c20k_e100_pi2.pkl.s'), # (100,2)
+# Result('P12, $l=0$ [6h23m] (200,1) ', '-ro', 'wiki_p12_f400_c20k_e200_pi1.pkl.s'),
+# Result('P12, $l=0$ [5h31m] (100,1) ', '-rs', 'wiki_p12_f400_c20k_e100_pi1.pkl.s'),
+# Result('P12, $l=200$ (100,1) [8h37m]', '-r*', 'wiki_p12_f600_c20k_e100_pi1.pkl.s'),
+ Result('P12, $l=200$ [9h21m]', '-r*', 'wiki_p12_f600_c20k_e100_pi2.pkl.s'), # (100,2)
+# Result('P12, $l=400$ []', '-r+', ''), # pada s pameti
+
+ Result('P2, $l=0$ [2h8m]', '-gx', 'wiki_p2_f400_c20k_e0_pi0.pkl.s'),
+ Result('P2, $l=200$ [2h28m]', '-g*', 'wiki_p2_f400_c20k_e200_pi0.pkl.s'),
+ Result('P2, $l=400$ [2h54m]', '-g+', 'wiki_p2_f400_c20k_e400_pi0.pkl.s'),
+
+ Result('P2, $l=400$, $q=3$ [7h57m]', '-m+', 'wiki_p2_f400_c20k_e400_pi3.pkl.s'),
+
+ ]
+ fout = os.path.join(Result.froot, 'experiment1.eps')
+ logging.info("saving figure with %i runs to %s" % (len(results), fout))
+ plotS(results).savefig(fout)
+
+ results = [
+ Result('P2, $l=0$, $q=0$ [2h8m]', '-gx', 'wiki_p2_f400_c20k_e0_pi0.pkl.s'),
+ Result('P2, $l=200$, $q=0$ [2h28m]', '-g*', 'wiki_p2_f400_c20k_e200_pi0.pkl.s'),
+ Result('P2, $l=400$, $q=0$ [2h54m]', '-g+', 'wiki_p2_f400_c20k_e400_pi0.pkl.s'),
+
+ Result('P2, $l=0$, $q=1$ [3h6m]', '-bx', 'wiki_p2_f400_c20k_e0_pi1.pkl.s'),
+ Result('P2, $l=200$, $q=1$ [3h53m]', '-b*', 'wiki_p2_f400_c20k_e200_pi1.pkl.s'),
+ Result('P2, $l=400$, $q=1$ [4h6m]', '-b+', 'wiki_p2_f400_c20k_e400_pi1.pkl.s'),
+
+ Result('P2, $l=0$, $q=3$ [4h49m]', '-mx', 'wiki_p2_f400_c20k_e0_pi3.pkl.s'),
+ Result('P2, $l=200$, $q=3$ [5h41m]', '-m*', 'wiki_p2_f400_c20k_e200_pi3.pkl.s'),
+ Result('P2, $l=400$, $q=3$ [7h57m]', '-m+', 'wiki_p2_f400_c20k_e400_pi3.pkl.s'),
+ ]
+ fout = os.path.join(Result.froot, 'experiment1pi.eps')
+ logging.info("saving figure with %i runs to %s" % (len(results), fout))
+ plotS(results).savefig(fout)
+
+
+
+def exp2():
+ """Effects of chunk size on P1 and P12. Factors=400.
+ """
+ results = [
+ Result('P1, chunks 10k [13h14m]', '-kx', 'wiki_p1_f400_c10k_e0.pkl.s'),
+ Result('P1, chunks 20k [10h36m]', '-r*', 'wiki_p1_f400_c20k_e0.pkl.s'),
+ Result('P1, chunks 40k [9h29m]', '-g+', 'wiki_p1_f400_c40k_e0.pkl.s'),
+
+ Result('P12, chunks 10k [9h35m]', '-bx', 'wiki_p12_f400_c10k_e100_pi2.pkl.s'),
+ Result('P12, chunks 20k [6h30m]', '-m*', 'wiki_p12_f400_c20k_e100_pi2.pkl.s'),
+ Result('P12, chunks 40k [4h42m]', '-c+', 'wiki_p12_f400_c40k_e100_pi2.pkl.s'),
+
+ Result('P2, $l=400$, $q=3$ [7h57m]', '-m+', 'wiki_p2_f400_c20k_e400_pi3.pkl.s'),
+ ]
+
+ fout = os.path.join(Result.froot, 'experiment2.eps')
+ logging.info("saving figure with %i runs to %s" % (len(results), fout))
+ plotS(results).savefig(fout)
+# plotS(results, truth=results[0]).savefig(os.path.join(Result.froot, 'experiment2r.eps'))
+
+
+def exp3():
+ """P1 input order experiment. k=400, chunks of !40k!, not 20k.
+ """
+ results = [
+ Result('P1 [9h29m]', '-k|', 'wiki_p1_f400_c40k_e0.pkl.s'),
+ Result('P1, shuffled1 [10h40m]', '-b*', 'wiki_p1_f400_c40k_e0_shuffled1.pkl.s'),
+ Result('P1, shuffled2 [10h57m]', '-g+', 'wiki_p1_f400_c40k_e0_shuffled2.pkl.s'),
+ Result('P1, shuffled3 [10h9m]', '-cx', 'wiki_p1_f400_c40k_e0_shuffled3.pkl.s'),
+
+ Result('P2, $l=400$, $q=3$ [7h57m]', '-m+', 'wiki_p2_f400_c20k_e400_pi3.pkl.s'),
+ ]
+
+ fout = os.path.join(Result.froot, 'experiment3.eps')
+ logging.info("saving figure with %i runs to %s" % (len(results), fout))
+ plotS(results).savefig(fout)
+
+
+def exp4():
+ """Distributed LSI for P1, P12. k=400, c=20k.
+ """
+ results = [
+ Result('P1, 1 node [10h36m]', '-kx', 'wiki_p1_f400_c20k_e0.pkl.s'),
+ Result('P1, 2 nodes [6h0m]', '-k*', 'wiki_p1_w2_f400_c20k_e0.pkl.s'),
+ Result('P1, 4 nodes [3h18m]', '-k+', 'wiki_p1_w4_f400_c20k_e0.pkl.s'),
+
+ Result('P12, 1 node [5h30m]', '-gx', 'wiki_p12_f400_c20k_e100_pi2.pkl.s'),
+ Result('P12, 2 nodes [2h53m]', '-g*', 'wiki_p12_w2_f400_c20k_e100_pi2.pkl.s'),
+# Result('P12, 4 nodes, $l=100$ [2h29m]', '-g+', 'wiki_p12_w4_f500_c20k_e100_pi2.pkl.s'),
+ Result('P12, 3 nodes, $l=200$ [3h1m]', '-go', 'wiki_p12_w3_f600_c20k_e100_pi2.pkl.s'),
+ Result('P12, 4 nodes, [1h41m]', '-g+', 'wiki_p12_w5_f400_c20k_e100_pi2.pkl.s'),
+# Result('P12, $l=100$, 4 nodes [2h12m]', '-ro', 'wiki_p12_w5_f500_c20k_e100_pi2.pkl.s'),
+
+ Result('P2, $l=400$, $q=3$ [7h57m]', '-m+', 'wiki_p2_f400_c20k_e400_pi3.pkl.s'),
+ ]
+
+ fout = os.path.join(Result.froot, 'experiment4.eps')
+ logging.info("saving figure with %i runs to %s" % (len(results), fout))
+ plotS(results).savefig(fout)
+
+
+
+def exp():
+ """
+ Generate all experiment graphs for the NIPS paper.
+ """
+ exp1()
+ exp2()
+ exp3()
+ exp4()
+
+
+def replotECIR(data, labels, markers=['-bx', '-g*', '-r+', '-co', '-ms', '-k^', '-yv', '-b>'],
+ labelx='factor $i$', labely=None):
+ if labely is None:
+ labely='relative error $r_i$'
+
+ fig = plt.figure()
+ ax1 = fig.add_subplot(111)
+ ax1.set_xlabel(labelx)
+ ax1.set_ylabel(labely)
+
+ m = 100.0
+ for s, label, marker in itertools.izip(data, labels, markers):
+ m = min(m, min(s))
+ ax1.plot(s, marker, label=label, markevery=10)
+
+ ax1.legend(loc=0)
+ plt.ylim(ymin=m-.001)
+ return fig
+# y4 = numpy.cumsum([float(part.split(',')[1]) for part in d4.split(' ')])
+
+
+def piechart(fracs, labels, explode=None):
+ f = plt.figure(figsize=(5,5))
+ ax1 = f.add_subplot(111)
+ ax1.pie(fracs, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, pctdistance=0.7)
+ return f
+
+
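+# NOTE: typeset() and plotpie() below rely on globals such as lda, mm and ndoc being defined in an interactive session.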
+def typeset(topics, topN=10):
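+    # print a LaTeX table body: one column per topic, one row per rank, listing each topic's topN most probable words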
+ t2 = []
+ for topicid in topics:
+ topic = lda.expElogbeta[topicid]
+ topic = topic / topic.sum()
+ bestn = numpy.argsort(topic)[::-1][:topN]
+ beststr = [lda.id2word[id].replace('$', '\\$') for id in bestn]
+ t2.append(beststr)
+ for i in xrange(topN):
+ print ' & '.join('$'+bs[i]+'$' if '$' in bs[i] else bs[i] for bs in t2) + r' \\'
+
+# ndoc = lambda doc: [(old2new[id], val) for id, val in doc if id in old2new]
+
+def plotpie(docid):
+ mix = lda[ndoc(mm[docid])]
+ top = [(cat, frac) for cat, frac in mix if frac > 0.09]
+ fracs, cats = [frac for _, frac in top], [cat for cat, _ in top]
+ f = cc.piechart(fracs + [1.0-sum(fracs)], labels=["topic %i" % cat for cat in cats] + ['other'])
+ return f
54 src/gensim/nosy.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+"""
+A simple testrunner for nose (or anything else).
+
+Watch for changes in all file types specified in 'EXTENSIONS'.
+If changes, run test executable in 'EXECUTABLE', with default
+arguments 'DEFAULTARGS'.
+
+The --with-color option needs the "rudolf" nose plugin. See:
+http://pypi.python.org/pypi/rudolf/
+
+Originally by Jeff Winkler, http://jeffwinkler.net
+Forked from wkral http://github.com/wkral/Nosy
+"""
+
+import os
+import stat
+import time
+import datetime
+import sys
+import fnmatch
+
+
+EXTENSIONS = ['*.py']
+EXECUTABLE = 'nosetests test/'
+DEFAULTARGS = '--with-color -exe'  # -w tests
+
+
+def checkSum():
+ """
+ Return a long which can be used to know if any .py files have changed.
+ """
+ val = 0
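+    # sum the size and modification time of every watched file; any edit changes the total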
+ for root, dirs, files in os.walk(os.getcwd()):
+ for extension in EXTENSIONS:
+ for f in fnmatch.filter(files, extension):
+ stats = os.stat(os.path.join(root, f))
+ val += stats[stat.ST_SIZE] + stats[stat.ST_MTIME]
+ return val
+
+if __name__ == '__main__':
+ val = 0
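+    # poll once per second and rerun the test command whenever the checksum changes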
+ try:
+ while True:
+ if checkSum() != val:
+ val = checkSum()
+ os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS,
+ ' '.join(sys.argv[1:])))
+ print datetime.datetime.now().__str__()
+ print '=' * 77
+ time.sleep(1)
+ except KeyboardInterrupt:
+ print 'Goodbye'
8 src/gensim/parsing/__init__.py
@@ -0,0 +1,8 @@
+"""
+This package contains functions to preprocess raw text
+"""
+
+# bring the parsing classes and functions directly into the package namespace, to save some typing
+from porter import PorterStemmer
+from tfidf import tfidf
+from preprocessing import *
375 src/gensim/parsing/porter.py
@@ -0,0 +1,375 @@
+#!/usr/bin/env python
+
+"""Porter Stemming Algorithm
+This is the Porter stemming algorithm, ported to Python from the
+version coded up in ANSI C by the author. It may be regarded
+as canonical, in that it follows the algorithm presented in
+
+Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+no. 3, pp 130-137,
+
+only differing from it at the points marked --DEPARTURE-- below.
+
+See also http://www.tartarus.org/~martin/PorterStemmer
+
+The algorithm as described in the paper could be exactly replicated
+by adjusting the points of DEPARTURE, but this is barely necessary,
+because (a) the points of DEPARTURE are definitely improvements, and
+(b) no encoding of the Porter stemmer I have seen is anything like
+as exact as this version, even with the points of DEPARTURE!
+
+Vivake Gupta (v@nano.com)
+
+Release 1: January 2001
+
+Further adjustments by Santiago Bruno (bananabruno@gmail.com)
+to allow word input not restricted to one word per line, leading
+to:
+
+release 2: July 2008
+"""
+
+import sys
+
+class PorterStemmer:
+
+ def __init__(self):
+ """The main part of the stemming algorithm starts here.
+ b is a buffer holding a word to be stemmed. The letters are in b[k0],
+ b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
+ readjusted downwards as the stemming progresses. Zero termination is
+ not in fact used in the algorithm.
+
+ Note that only lower case sequences are stemmed. Forcing to lower case
+ should be done before stem(...) is called.
+ """
+
+ self.b = "" # buffer for word to be stemmed
+ self.k = 0
+ self.k0 = 0
+ self.j = 0 # j is a general offset into the string
+
+ def cons(self, i):
+ """cons(i) is TRUE <=> b[i] is a consonant."""
+ if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
+ return 0
+ if self.b[i] == 'y':
+ if i == self.k0:
+ return 1
+ else:
+ return (not self.cons(i - 1))
+ return 1
+
+ def m(self):
+ """m() measures the number of consonant sequences between k0 and j.
+ if c is a consonant sequence and v a vowel sequence, and <..>
+ indicates arbitrary presence,
+
+ <c><v> gives 0
+ <c>vc<v> gives 1
+ <c>vcvc<v> gives 2
+ <c>vcvcvc<v> gives 3
+ ....
+ """
+ n = 0
+ i = self.k0
+ while 1:
+ if i > self.j:
+ return n
+ if not self.cons(i):
+ break
+ i = i + 1
+ i = i + 1
+ while 1:
+ while 1:
+ if i > self.j:
+ return n
+ if self.cons(i):
+ break
+ i = i + 1
+ i = i + 1
+ n = n + 1
+ while 1:
+ if i > self.j:
+ return n
+ if not self.cons(i):
+ break
+ i = i + 1
+ i = i + 1
+
+ def vowelinstem(self):
+ """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
+ for i in range(self.k0, self.j + 1):
+ if not self.cons(i):
+ return 1
+ return 0
+
+ def doublec(self, j):
+ """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
+ if j < (self.k0 + 1):
+ return 0
+ if (self.b[j] != self.b[j-1]):
+ return 0
+ return self.cons(j)
+
+ def cvc(self, i):
+ """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
+ and also if the second c is not w,x or y. this is used when trying to
+        restore an e at the end of a short word, e.g.
+
+ cav(e), lov(e), hop(e), crim(e), but
+ snow, box, tray.
+ """
+ if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
+ return 0
+ ch = self.b[i]
+ if ch == 'w' or ch == 'x' or ch == 'y':
+ return 0
+ return 1
+
+ def ends(self, s):
+ """ends(s) is TRUE <=> k0,...k ends with the string s."""
+ length = len(s)
+ if s[length - 1] != self.b[self.k]: # tiny speed-up
+ return 0
+ if length > (self.k - self.k0 + 1):
+ return 0
+ if self.b[self.k-length+1:self.k+1] != s:
+ return 0
+ self.j = self.k - length
+ return 1
+
+ def setto(self, s):
+ """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
+ length = len(s)
+ self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
+ self.k = self.j + length
+
+ def r(self, s):
+ """r(s) is used further down."""
+ if self.m() > 0:
+ self.setto(s)
+
+ def step1ab(self):
+ """step1ab() gets rid of plurals and -ed or -ing. e.g.
+
+ caresses -> caress
+ ponies -> poni
+ ties -> ti
+ caress -> caress
+ cats -> cat
+
+ feed -> feed
+ agreed -> agree
+ disabled -> disable
+
+ matting -> mat
+ mating -> mate
+ meeting -> meet
+ milling -> mill
+ messing -> mess
+
+ meetings -> meet
+ """
+ if self.b[self.k] == 's':
+ if self.ends("sses"):
+ self.k = self.k - 2
+ elif self.ends("ies"):
+ self.setto("i")
+ elif self.b[self.k - 1] != 's':
+ self.k = self.k - 1
+ if self.ends("eed"):
+ if self.m() > 0:
+ self.k = self.k - 1
+ elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
+ self.k = self.j
+ if self.ends("at"): self.setto("ate")
+ elif self.ends("bl"): self.setto("ble")
+ elif self.ends("iz"): self.setto("ize")
+ elif self.doublec(self.k):
+ self.k = self.k - 1
+ ch = self.b[self.k]
+ if ch == 'l' or ch == 's' or ch == 'z':
+ self.k = self.k + 1
+ elif (self.m() == 1 and self.cvc(self.k)):
+ self.setto("e")
+
+ def step1c(self):
+ """step1c() turns terminal y to i when there is another vowel in the stem."""
+ if (self.ends("y") and self.vowelinstem()):
+ self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
+
+ def step2(self):
+ """step2() maps double suffices to single ones.
+ so -ization ( = -ize plus -ation) maps to -ize etc. note that the
+ string before the suffix must give m() > 0.
+ """
+ if self.b[self.k - 1] == 'a':
+ if self.ends("ational"): self.r("ate")
+ elif self.ends("tional"): self.r("tion")
+ elif self.b[self.k - 1] == 'c':
+ if self.ends("enci"): self.r("ence")
+ elif self.ends("anci"): self.r("ance")
+ elif self.b[self.k - 1] == 'e':
+ if self.ends("izer"): self.r("ize")
+ elif self.b[self.k - 1] == 'l':
+ if self.ends("bli"): self.r("ble") # --DEPARTURE--
+ # To match the published algorithm, replace this phrase with
+ # if self.ends("abli"): self.r("able")
+ elif self.ends("alli"): self.r("al")
+ elif self.ends("entli"): self.r("ent")
+ elif self.ends("eli"): self.r("e")
+ elif self.ends("ousli"): self.r("ous")
+ elif self.b[self.k - 1] == 'o':
+ if self.ends("ization"): self.r("ize")
+ elif self.ends("ation"): self.r("ate")
+ elif self.ends("ator"): self.r("ate")
+ elif self.b[self.k - 1] == 's':
+ if self.ends("alism"): self.r("al")
+ elif self.ends("iveness"): self.r("ive")
+ elif self.ends("fulness"): self.r("ful")
+ elif self.ends("ousness"): self.r("ous")
+ elif self.b[self.k - 1] == 't':
+ if self.ends("aliti"): self.r("al")
+ elif self.ends("iviti"): self.r("ive")
+ elif self.ends("biliti"): self.r("ble")
+ elif self.b[self.k - 1] == 'g': # --DEPARTURE--
+ if self.ends("logi"): self.r("log")
+ # To match the published algorithm, delete this phrase
+
+ def step3(self):
+ """step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
+ if self.b[self.k] == 'e':
+ if self.ends("icate"): self.r("ic")
+ elif self.ends("ative"): self.r("")
+ elif self.ends("alize"): self.r("al")
+ elif self.b[self.k] == 'i':
+ if self.ends("iciti"): self.r("ic")
+ elif self.b[self.k] == 'l':
+ if self.ends("ical"): self.r("ic")
+ elif self.ends("ful"): self.r("")
+ elif self.b[self.k] == 's':
+ if self.ends("ness"): self.r("")
+
+ def step4(self):
+ """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
+ if self.b[self.k - 1] == 'a':
+ if self.ends("al"): pass
+ else: return