Skip to content

Commit

Permalink
#14 automated testing of corpus formats
Browse files Browse the repository at this point in the history
git-svn-id: https://my-svn.assembla.com/svn/gensim/trunk@69 92d0401f-a546-4972-9173-107b360ed7e5
  • Loading branch information
piskvorky committed Mar 12, 2010
1 parent b7e8add commit 2e07c2d
Show file tree
Hide file tree
Showing 15 changed files with 189 additions and 33 deletions.
2 changes: 0 additions & 2 deletions src/gensim/corpora/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
"""
This package contains implementations of various streaming corpus I/O formats.
"""

import corpora, dictionary, sources
17 changes: 7 additions & 10 deletions src/gensim/corpora/bleicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import logging

from gensim import interfaces
from gensim import interfaces, utils


class BleiCorpus(interfaces.CorpusABC):
Expand Down Expand Up @@ -60,7 +60,7 @@ def __iter__(self):
"""
for lineNo, line in enumerate(open(self.fname)):
parts = line.split()
if int(parts[0]) != len(parts) + 1:
if int(parts[0]) != len(parts) - 1:
raise ValueError("invalid format at line %i in %s" %
(lineNo, self.fname))
doc = [part.rsplit(':', 1) for part in parts[1:]]
Expand All @@ -76,20 +76,17 @@ def saveCorpus(fname, corpus, id2word = None):
There are actually two files saved: fname and fname.vocab, where
fname.vocab is the vocabulary file.
"""
if self.id2word is None:
if id2word is None:
logging.info("no word id mapping provided; initializing from corpus")
maxId = -1
for document in corpus:
maxId = max(maxId, max([-1] + [fieldId for fieldId, _ in document]))
numTerms = 1 + maxId
id2word = dict(zip(xrange(numTerms), xrange(numTerms))) # word id mapping will be identity
id2word = utils.dictFromCorpus(corpus)
numTerms = len(id2word)
else:
numTerms = 1 + max([-1] + id2word.keys())

logging.info("converting corpus to Blei's LDA-C format: %s" % fname)
logging.info("storing corpus in Blei's LDA-C format: %s" % fname)
fout = open(fname, 'w')
for doc in corpus:
fout.write("%i %s\n" % (len(doc), ' '.join("%i:%i" % p for p in doc)))
fout.write("%i %s\n" % (len(doc), ' '.join("%i:%f" % p for p in doc)))
fout.close()

# write out vocabulary, in a format compatible with Blei's topics.py script
Expand Down
2 changes: 1 addition & 1 deletion src/gensim/corpora/dmlcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class DmlConfig(object):
sources in one place (= the self.sources attribute).
Apart from glueing together sources, DmlConfig also decides where to store
output files and which articles to accept for the corpus (= additional filter
output files and which articles to accept for the corpus (= an additional filter
over the sources).
"""
def __init__(self, configId, resultDir, acceptLangs = None):
Expand Down
19 changes: 16 additions & 3 deletions src/gensim/corpora/lowcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import logging

from gensim import interfaces
from gensim import interfaces, utils


class LowCorpus(interfaces.CorpusABC):
Expand Down Expand Up @@ -117,17 +117,30 @@ def __iter__(self):


@staticmethod
def saveCorpus(fname, corpus, id2word):
def saveCorpus(fname, corpus, id2word = None):
"""
Save a corpus in the List-of-words format.
"""
if id2word is None:
logging.info("no word id mapping provided; initializing from corpus")
id2word = utils.dictFromCorpus(corpus)

logging.info("storing corpus in List-Of-Words format: %s" % fname)
truncated = 0
fout = open(fname, 'w')
fout.write('%i\n' % len(corpus))
for doc in corpus:
words = []
for wordId, value in doc:
words.extend([id2word[wordId]] * int(value))
if abs(int(value) - value) > 1e-6:
truncated += 1
words.extend([str(id2word[wordId])] * int(value))
fout.write('%s\n' % ' '.join(words))
fout.close()

if truncated:
logging.warning("List-of-words format can only save vectors with \
integer entries; %i float entries were truncated to integer value" %
truncated)
#endclass LowCorpus

1 change: 1 addition & 0 deletions src/gensim/corpora/mmcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def saveCorpus(fname, corpus, id2word = None):
"""
Save a corpus in the Matrix Market format.
"""
logging.info("storing corpus in Matrix Market format: %s" % fname)
matutils.MmWriter.writeCorpus(fname, corpus)
#endclass MmCorpus

Expand Down
6 changes: 3 additions & 3 deletions src/gensim/corpora/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
same location (type of access), same way of parsing them etc.
Different sources can be aggregated into a single corpus, which is what the
DmlCorpus class does (see the corpora module).
DmlCorpus class does (see the dmlcorpus module).
"""

import logging
Expand Down Expand Up @@ -63,7 +63,7 @@ def tokenize(self, content):

def normalizeWord(self, word):
raise NotImplementedError('Abstract Base Class')
#endclass Source
#endclass ArticleSource



Expand Down Expand Up @@ -280,7 +280,7 @@ class ArxmlivErrorHandler(xml.sax.handler.ErrorHandler):
# byte encodings of InputSource are ignored, bad things sometimes happen
# in buffering of multi-byte files (such as utf8), characters get cut in
# the middle, resulting in invalid tokens...
# This is not really a problem with arxmliv xml files themselved, so ignore
# This is not really a problem with arxmliv xml files themselves, so ignore
# these errors silently.
def error(self, exception):
pass
Expand Down
10 changes: 7 additions & 3 deletions src/gensim/corpora/svmlightcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,13 @@ def __iter__(self):
Iterate over the corpus, returning one sparse vector at a time.
"""
for lineNo, line in enumerate(open(self.fname)):
if line.startswith('#'):
continue
line = line[: line.find('#')].strip()
if not line:
continue # ignore comments and empty lines
parts = line.split()
if not parts:
raise ValueError('invalid format at line no. %i in %s' %
(lineNo, self.fname))
target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
doc = [(int(p1), float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features
yield doc
Expand All @@ -71,7 +75,7 @@ def saveCorpus(fname, corpus, id2word = None):
logging.info("converting corpus to SVMlight format: %s" % fname)
fout = open(fname, 'w')
for doc in corpus:
fout.write("%i %s\n" % (0, ' '.join("%i:%i" % p for p in doc)))
fout.write("%i %s\n" % (0, ' '.join("%i:%f" % p for p in doc)))
fout.close()
#endclass SvmLightCorpus

58 changes: 49 additions & 9 deletions src/gensim/tests/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,69 @@
Automated tests for checking corpus I/O formats (the corpora package).
"""


import logging
import os
import os.path
import unittest
import tempfile

from gensim.corpora import dmlcorpus, bleicorpus, mmcorpus, lowcorpus, svmlightcorpus


#FIXME TODO
def testfile():
    """
    Return the path of the scratch file that the tests store their temporary
    data in: a file called 'gensim.tst' inside the system temporary directory.
    """
    tmpDir = tempfile.gettempdir()
    return os.path.join(tmpDir, 'gensim.tst')

class TestMmCorpus(unittest.TestCase):
def setUp(self):
self.corpus = mmcorpus.MmCorpus('deerwester.mm')

class CorpusTesterABC(object):
def __init__(self):
raise NotImplementedError("cannot instantiate Abstract Base Class")
self.corpusClass = None # to be overridden with a particular class
self.fileExtension = None # file 'testcorpus.fileExtension' must exist and be in the format of corpusClass

def tearDown(self):
pass

def testLoad(self):
pass
corpus = self.corpusClass('testcorpus.' + self.fileExtension.lstrip('.'))
docs = list(corpus)
self.assertEqual(len(docs), 9) # the deerwester corpus always has nine documents, no matter what format


def testSave(self):
pass
corpus = [[(1, 1.0)], [], [(0, 0.5), (2, 1.0)], []]

# make sure the corpus can be saved
self.corpusClass.saveCorpus(testfile(), corpus)

# and loaded back, resulting in exactly the same corpus
corpus2 = list(self.corpusClass(testfile()))
self.assertEqual(corpus, corpus2)

# delete the temporary file
os.remove(testfile())
#endclass CorpusTesterABC


class TestMmCorpus(unittest.TestCase, CorpusTesterABC):
    # runs the testLoad/testSave checks inherited from CorpusTesterABC against
    # the Matrix Market corpus format
    def setUp(self):
        # testLoad reads its fixture from the file 'testcorpus.mm'
        self.fileExtension = '.mm'
        self.corpusClass = mmcorpus.MmCorpus
#endclass TestMmCorpus


class TestSvmLightCorpus(unittest.TestCase, CorpusTesterABC):
    # runs the testLoad/testSave checks inherited from CorpusTesterABC against
    # the SVMlight corpus format
    def setUp(self):
        # testLoad reads its fixture from the file 'testcorpus.svmlight'
        self.fileExtension = '.svmlight'
        self.corpusClass = svmlightcorpus.SvmLightCorpus
#endclass TestSvmLightCorpus


class TestBleiCorpus(unittest.TestCase, CorpusTesterABC):
    # runs the testLoad/testSave checks inherited from CorpusTesterABC against
    # Blei's LDA-C corpus format
    def setUp(self):
        # testLoad reads its fixture from the file 'testcorpus.blei'
        self.fileExtension = '.blei'
        self.corpusClass = bleicorpus.BleiCorpus
#endclass TestBleiCorpus


if __name__ == '__main__':
logging.basicConfig(level = logging.ERROR)
unittest.main()
19 changes: 17 additions & 2 deletions src/gensim/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@
import logging
import unittest

from gensim.corpora import corpora
from gensim.corpora import mmcorpus
from gensim.models import lsimodel, ldamodel, tfidfmodel

# FIXME TODO

class TestLsiModel(unittest.TestCase):
def setUp(self):
self.corpus = corpora.MmCorpus('deerwester.mm')
self.corpus = mmcorpus.MmCorpus('testcorpus.mm')

def tearDown(self):
pass
Expand Down Expand Up @@ -48,5 +48,20 @@ def testPersistence(self):
#endclass TestLdaModel


class TestRPModel(unittest.TestCase):
    """
    Placeholder test case: both fixture hooks and both test methods are still
    empty stubs, so every test in this class passes trivially.
    """
    def setUp(self):
        pass # no fixture to prepare yet

    def tearDown(self):
        pass # nothing to clean up yet

    def testInference(self):
        pass # stub; no assertions yet

    def testPersistence(self):
        pass # stub; no assertions yet
#endclass TestRPModel


if __name__ == '__main__':
unittest.main()
9 changes: 9 additions & 0 deletions src/gensim/tests/testcorpus.blei
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
3 0:1.000000 1:1.000000 2:1.000000
6 2:1.000000 3:1.000000 4:1.000000 5:1.000000 6:1.000000 8:1.000000
4 1:1.000000 3:1.000000 4:1.000000 7:1.000000
3 0:1.000000 4:2.000000 7:1.000000
3 3:1.000000 5:1.000000 6:1.000000
1 9:1.000000
2 9:1.000000 10:1.000000
3 9:1.000000 10:1.000000 11:1.000000
3 8:1.000000 10:1.000000 11:1.000000
12 changes: 12 additions & 0 deletions src/gensim/tests/testcorpus.blei.vocab
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
0
1
2
3
4
5
6
7
8
9
10
11
10 changes: 10 additions & 0 deletions src/gensim/tests/testcorpus.low
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
9
0 1 2
2 3 4 5 6 8
1 3 4 7
0 4 4 7
3 5 6
9
9 10
9 10 11
8 10 11
30 changes: 30 additions & 0 deletions src/gensim/tests/testcorpus.mm
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
%%matrixmarket matrix coordinate real general
9 12 28
1 1 1.000000
1 2 1.000000
1 3 1.000000
2 3 1.000000
2 4 1.000000
2 5 1.000000
2 6 1.000000
2 7 1.000000
2 9 1.000000
3 2 1.000000
3 4 1.000000
3 5 1.000000
3 8 1.000000
4 1 1.000000
4 5 2.000000
4 8 1.000000
5 4 1.000000
5 6 1.000000
5 7 1.000000
6 10 1.000000
7 10 1.000000
7 11 1.000000
8 10 1.000000
8 11 1.000000
8 12 1.000000
9 9 1.000000
9 11 1.000000
9 12 1.000000
10 changes: 10 additions & 0 deletions src/gensim/tests/testcorpus.svmlight
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# deerwester corpus stored in svmlight format
0 0:1.000000 1:1.000000 2:1.000000 # the target class is always 0, as we are not interested in supervised learning
0 2:1.000000 3:1.000000 4:1.000000 5:1.000000 6:1.000000 8:1.000000
0 1:1.000000 3:1.000000 4:1.000000 7:1.000000
0 0:1.000000 4:2.000000 7:1.000000
0 3:1.000000 5:1.000000 6:1.000000
0 9:1.000000
0 9:1.000000 10:1.000000
0 9:1.000000 10:1.000000 11:1.000000
0 8:1.000000 10:1.000000 11:1.000000
17 changes: 17 additions & 0 deletions src/gensim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,20 @@ def save(self, fname):

def identity(p):
    """
    Return the argument `p` itself, unchanged (the very same object, not a copy).
    """
    return p


def dictFromCorpus(corpus):
    """
    Scan corpus for all word ids that appear in it, then construct and return a
    mapping which maps each wordId -> str(wordId).

    This function is used whenever words need to be displayed (as opposed to just
    their ids) but no wordId->word mapping was provided.

    `corpus` is an iterable of documents, each document being an iterable of
    (wordId, value) 2-tuples. The corpus is iterated over exactly once.

    The resulting mapping is contiguous: it covers every id from 0 up to the
    highest wordId found in the corpus, including ids that never actually occur
    in any document. An empty corpus (or a corpus consisting of empty documents
    only) results in an empty mapping.
    """
    maxId = -1 # stays at -1 if no word id is ever seen => empty mapping below
    for document in corpus:
        for fieldId, _ in document:
            if fieldId > maxId:
                maxId = fieldId
    numTerms = 1 + maxId
    # `range` instead of `xrange`, so that this runs under both Python 2 and Python 3
    return dict((fieldId, str(fieldId)) for fieldId in range(numTerms))

0 comments on commit 2e07c2d

Please sign in to comment.