#14 automated testing of corpus formats

git-svn-id: https://my-svn.assembla.com/svn/gensim/trunk@69 92d0401f-a546-4972-9173-107b360ed7e5
piskvorky · Mar 12, 2010 · 2e07c2d · 2e07c2d
1 parent b7e8add
commit 2e07c2d
Show file tree

Hide file tree

Showing 15 changed files with 189 additions and 33 deletions.
diff --git a/src/gensim/corpora/__init__.py b/src/gensim/corpora/__init__.py
@@ -1,5 +1,3 @@
 """
 This package contains implementations of various streaming corpus I/O format.
 """
-
-import corpora, dictionary, sources
diff --git a/src/gensim/corpora/bleicorpus.py b/src/gensim/corpora/bleicorpus.py
@@ -12,7 +12,7 @@
 
 import logging
 
-from gensim import interfaces
+from gensim import interfaces, utils
 
 
 class BleiCorpus(interfaces.CorpusABC):
@@ -60,7 +60,7 @@ def __iter__(self):
         """
         for lineNo, line in enumerate(open(self.fname)):
             parts = line.split()
-            if int(parts[0]) != len(parts) + 1:
+            if int(parts[0]) != len(parts) - 1:
                 raise ValueError("invalid format at line %i in %s" %
                                  (lineNo, self.fname))
             doc = [part.rsplit(':', 1) for part in parts[1:]]
@@ -76,20 +76,17 @@ def saveCorpus(fname, corpus, id2word = None):
         There are actually two files saved: fname and fname.vocab, where
         fname.vocab is the vocabulary file.
         """
-        if self.id2word is None:
+        if id2word is None:
             logging.info("no word id mapping provided; initializing from corpus")
-            maxId = -1
-            for document in corpus:
-                maxId = max(maxId, max([-1] + [fieldId for fieldId, _ in document]))
-            numTerms = 1 + maxId
-            id2word = dict(zip(xrange(numTerms), xrange(numTerms))) # word id mapping will be identity
+            id2word = utils.dictFromCorpus(corpus)
+            numTerms = len(id2word)
         else:
             numTerms = 1 + max([-1] + id2word.keys())
 
-        logging.info("converting corpus to Blei's LDA-C format: %s" % fname)
+        logging.info("storing corpus in Blei's LDA-C format: %s" % fname)
         fout = open(fname, 'w')
         for doc in corpus:
-            fout.write("%i %s\n" % (len(doc), ' '.join("%i:%i" % p for p in doc)))
+            fout.write("%i %s\n" % (len(doc), ' '.join("%i:%f" % p for p in doc)))
         fout.close()
 
         # write out vocabulary, in a format compatible with Blei's topics.py script

diff --git a/src/gensim/corpora/dmlcorpus.py b/src/gensim/corpora/dmlcorpus.py
@@ -28,7 +28,7 @@ class DmlConfig(object):
     sources in one place (= the self.sources attribute).
     
     Apart from glueing together sources, DmlConfig also decides where to store
-    output files and which articles to accept for the corpus (= additional filter 
+    output files and which articles to accept for the corpus (= an additional filter 
     over the sources).
     """
     def __init__(self, configId, resultDir, acceptLangs = None):

diff --git a/src/gensim/corpora/lowcorpus.py b/src/gensim/corpora/lowcorpus.py
@@ -12,7 +12,7 @@
 
 import logging
 
-from gensim import interfaces
+from gensim import interfaces, utils
 
 
 class LowCorpus(interfaces.CorpusABC):
@@ -117,17 +117,30 @@ def __iter__(self):
 
 
     @staticmethod
-    def saveCorpus(fname, corpus, id2word):
+    def saveCorpus(fname, corpus, id2word = None):
         """
         Save a corpus in the List-of-words format.
         """
+        if id2word is None:
+            logging.info("no word id mapping provided; initializing from corpus")
+            id2word = utils.dictFromCorpus(corpus)
+
+        logging.info("storing corpus in List-Of-Words format: %s" % fname)
+        truncated = 0
         fout = open(fname, 'w')
         fout.write('%i\n' % len(corpus))
         for doc in corpus:
             words = []
             for wordId, value in doc:
-                words.extend([id2word[wordId]] * int(value))
+                if abs(int(value) - value) > 1e-6:
+                    truncated += 1
+                words.extend([str(id2word[wordId])] * int(value))
             fout.write('%s\n' % ' '.join(words))
         fout.close()
+
+        if truncated:
+            logging.warning("List-of-words format can only save vectors with \
+            integer entries; %i float entries were truncated to integer value" % 
+            truncated)
 #endclass LowCorpus
 
diff --git a/src/gensim/corpora/mmcorpus.py b/src/gensim/corpora/mmcorpus.py
@@ -32,6 +32,7 @@ def saveCorpus(fname, corpus, id2word = None):
         """
         Save a corpus in the Matrix Market format.
         """
+        logging.info("storing corpus in Matrix Market format: %s" % fname)
         matutils.MmWriter.writeCorpus(fname, corpus)
 #endclass MmCorpus
 

diff --git a/src/gensim/corpora/sources.py b/src/gensim/corpora/sources.py
@@ -12,7 +12,7 @@
 same location (type of access), same way of parsing them etc.
 
 Different sources can be aggregated into a single corpus, which is what the 
-DmlCorpus class does (see the corpora module).
+DmlCorpus class does (see the dmlcorpus module).
 """
 
 import logging
@@ -63,7 +63,7 @@ def tokenize(self, content):
 
     def normalizeWord(self, word):
         raise NotImplementedError('Abstract Base Class')
-#endclass Source
+#endclass ArticleSource
 
 
 
@@ -280,7 +280,7 @@ class ArxmlivErrorHandler(xml.sax.handler.ErrorHandler):
         # byte encodings of InputSource are ignored, bad things sometimes happen 
         # in buffering of multi-byte files (such as utf8), characters get cut in 
         # the middle, resulting in invalid tokens...
-        # This is not really a problem with arxmliv xml files themselved, so ignore
+        # This is not really a problem with arxmliv xml files themselves, so ignore
         # these errors silently.
         def error(self, exception):
             pass

diff --git a/src/gensim/corpora/svmlightcorpus.py b/src/gensim/corpora/svmlightcorpus.py
@@ -55,9 +55,13 @@ def __iter__(self):
         Iterate over the corpus, returning one sparse vector at a time.
         """
         for lineNo, line in enumerate(open(self.fname)):
-            if line.startswith('#'):
-                continue
+            line = line[: line.find('#')].strip()
+            if not line:
+                continue # ignore comments and empty lines
             parts = line.split()
+            if not parts:
+                raise ValueError('invalid format at line no. %i in %s' %
+                                 (lineNo, self.fname))
             target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
             doc = [(int(p1), float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features
             yield doc
@@ -71,7 +75,7 @@ def saveCorpus(fname, corpus, id2word = None):
         logging.info("converting corpus to SVMlight format: %s" % fname)
         fout = open(fname, 'w')
         for doc in corpus:
-            fout.write("%i %s\n" % (0, ' '.join("%i:%i" % p for p in doc)))
+            fout.write("%i %s\n" % (0, ' '.join("%i:%f" % p for p in doc)))
         fout.close()
 #endclass SvmLightCorpus
 
diff --git a/src/gensim/tests/test_corpora.py b/src/gensim/tests/test_corpora.py
@@ -8,29 +8,69 @@
 Automated tests for checking corpus I/O formats (the corpora package).
 """
 
-
 import logging
+import os
+import os.path
 import unittest
+import tempfile
 
 from gensim.corpora import dmlcorpus, bleicorpus, mmcorpus, lowcorpus, svmlightcorpus
 
 
-#FIXME TODO
+def testfile():
+    # temporary data will be stored to this file
+    return os.path.join(tempfile.gettempdir(), 'gensim.tst')
 
-class TestMmCorpus(unittest.TestCase):
-    def setUp(self):
-        self.corpus = mmcorpus.MmCorpus('deerwester.mm')
+
+class CorpusTesterABC(object):
+    def __init__(self):
+        raise NotImplementedError("cannot instantiate Abstract Base Class")
+        self.corpusClass = None # to be overridden with a particular class
+        self.fileExtension = None # file 'testcorpus.fileExtension' must exist and be in the format of corpusClass
 
-    def tearDown(self):
-        pass
 
     def testLoad(self):
-        pass
+        corpus = self.corpusClass('testcorpus.' + self.fileExtension.lstrip('.'))
+        docs = list(corpus)
+        self.assertEqual(len(docs), 9) # the deerwester corpus always has nine documents, no matter what format
+
 
     def testSave(self):
-        pass
+        corpus = [[(1, 1.0)], [], [(0, 0.5), (2, 1.0)], []]
+
+        # make sure the corpus can be saved
+        self.corpusClass.saveCorpus(testfile(), corpus)
+
+        # and loaded back, resulting in exactly the same corpus
+        corpus2 = list(self.corpusClass(testfile()))
+        self.assertEqual(corpus, corpus2)
+
+        # delete the temporary file
+        os.remove(testfile())
+#endclass CorpusTesterABC
+
+
+class TestMmCorpus(unittest.TestCase, CorpusTesterABC):
+    def setUp(self):
+        self.corpusClass = mmcorpus.MmCorpus
+        self.fileExtension = '.mm'
 #endclass TestMmCorpus
 
 
+class TestSvmLightCorpus(unittest.TestCase, CorpusTesterABC):
+    def setUp(self):
+        self.corpusClass = svmlightcorpus.SvmLightCorpus
+        self.fileExtension = '.svmlight'
+#endclass TestSvmLightCorpus
+
+
+class TestBleiCorpus(unittest.TestCase, CorpusTesterABC):
+    def setUp(self):
+        self.corpusClass = bleicorpus.BleiCorpus
+        self.fileExtension = '.blei'
+#endclass TestBleiCorpus
+
+
 if __name__ == '__main__':
+    logging.basicConfig(level = logging.ERROR)
     unittest.main()
diff --git a/src/gensim/tests/test_models.py b/src/gensim/tests/test_models.py
@@ -12,14 +12,14 @@
 import logging
 import unittest
 
-from gensim.corpora import corpora
+from gensim.corpora import mmcorpus
 from gensim.models import lsimodel, ldamodel, tfidfmodel
 
 # FIXME TODO
 
 class TestLsiModel(unittest.TestCase):
     def setUp(self):
-        self.corpus = corpora.MmCorpus('deerwester.mm')
+        self.corpus = mmcorpus.MmCorpus('testcorpus.mm')
 
     def tearDown(self):
         pass
@@ -48,5 +48,20 @@ def testPersistence(self):
 #endclass TestLdaModel
 
 
+class TestRPModel(unittest.TestCase):
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        pass
+
+    def testInference(self):
+        pass
+
+    def testPersistence(self):
+        pass
+#endclass TestRPModel
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/src/gensim/tests/testcorpus.blei b/src/gensim/tests/testcorpus.blei
@@ -0,0 +1,9 @@
+3 0:1.000000 1:1.000000 2:1.000000
+6 2:1.000000 3:1.000000 4:1.000000 5:1.000000 6:1.000000 8:1.000000
+4 1:1.000000 3:1.000000 4:1.000000 7:1.000000
+3 0:1.000000 4:2.000000 7:1.000000
+3 3:1.000000 5:1.000000 6:1.000000
+1 9:1.000000
+2 9:1.000000 10:1.000000
+3 9:1.000000 10:1.000000 11:1.000000
+3 8:1.000000 10:1.000000 11:1.000000
diff --git a/src/gensim/tests/testcorpus.blei.vocab b/src/gensim/tests/testcorpus.blei.vocab
@@ -0,0 +1,12 @@
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
diff --git a/src/gensim/tests/testcorpus.low b/src/gensim/tests/testcorpus.low
@@ -0,0 +1,10 @@
+9
+0 1 2
+2 3 4 5 6 8
+1 3 4 7
+0 4 4 7
+3 5 6
+9
+9 10
+9 10 11
+8 10 11
diff --git a/src/gensim/tests/testcorpus.mm b/src/gensim/tests/testcorpus.mm
@@ -0,0 +1,30 @@
+%%matrixmarket matrix coordinate real general
+9 12 28
+1 1 1.000000
+1 2 1.000000
+1 3 1.000000
+2 3 1.000000
+2 4 1.000000
+2 5 1.000000
+2 6 1.000000
+2 7 1.000000
+2 9 1.000000
+3 2 1.000000
+3 4 1.000000
+3 5 1.000000
+3 8 1.000000
+4 1 1.000000
+4 5 2.000000
+4 8 1.000000
+5 4 1.000000
+5 6 1.000000
+5 7 1.000000
+6 10 1.000000
+7 10 1.000000
+7 11 1.000000
+8 10 1.000000
+8 11 1.000000
+8 12 1.000000
+9 9 1.000000
+9 11 1.000000
+9 12 1.000000
diff --git a/src/gensim/tests/testcorpus.svmlight b/src/gensim/tests/testcorpus.svmlight
@@ -0,0 +1,10 @@
+# deerwester corpus stored in svmlight format
+0 0:1.000000 1:1.000000 2:1.000000 # the target class is always 0, as we are not interested in supervised learning
+0 2:1.000000 3:1.000000 4:1.000000 5:1.000000 6:1.000000 8:1.000000
+0 1:1.000000 3:1.000000 4:1.000000 7:1.000000
+0 0:1.000000 4:2.000000 7:1.000000
+0 3:1.000000 5:1.000000 6:1.000000
+0 9:1.000000
+0 9:1.000000 10:1.000000
+0 9:1.000000 10:1.000000 11:1.000000
+0 8:1.000000 10:1.000000 11:1.000000
diff --git a/src/gensim/utils.py b/src/gensim/utils.py
@@ -95,3 +95,20 @@ def save(self, fname):
 
 def identity(p):
     return p
+
+
+def dictFromCorpus(corpus):
+    """
+    Scan corpus for all word ids that appear in it, then construct and return a mapping
+    which maps each wordId -> str(wordId).
+    
+    This function is used whenever words need to be displayed (as opposed to just 
+    their ids) but no wordId->word mapping was provided. The resulting mapping
+    covers every id from 0 up to the highest wordId found in the corpus.
+    """
+    maxId = -1
+    for document in corpus:
+        maxId = max(maxId, max([-1] + [fieldId for fieldId, _ in document]))
+    numTerms = 1 + maxId
+    id2word = dict((fieldId, str(fieldId)) for fieldId in xrange(numTerms))
+    return id2word