Skip to content

Commit

Permalink
#8 transform whole corpora (the model.apply() method)
Browse files Browse the repository at this point in the history
git-svn-id: https://my-svn.assembla.com/svn/gensim/trunk@56 92d0401f-a546-4972-9173-107b360ed7e5
  • Loading branch information
piskvorky committed Feb 26, 2010
1 parent fb22c0b commit 0d3d39d
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 43 deletions.
76 changes: 41 additions & 35 deletions gensim.py
Expand Up @@ -20,16 +20,14 @@
import docsim
import lsimodel
import ldamodel
#import tfidfmodel
import tfidfmodel


#TODO:
# 1) tfidf prepsat z matutils do samostatneho tfidfmodel.py (plus __getitem__, aby
# to byla transformace)
# 5) prevest lda modely z blei na asteria01 na LdaModel objekt (trva tydny, nepoustet
# znova...
# 6) pridat random projections
# 7) logging per module -- ne vsechno pres logging.root, zlepseni prehlednosti logu
# * prevest lda modely z blei na asteria01 na LdaModel objekt (trva tydny, nepoustet
# znova...
# * prepsat random projections
# * logging per module -- ne vsechno pres logging.root, zlepseni prehlednosti logu


SOURCE_LIST = [
Expand All @@ -42,11 +40,11 @@

# set to True to do everything EXCEPT actually writing out similar.xml files to disk.
# similar.xml files are NOT written if DRY_RUN is true.
DRY_RUN = False
DRY_RUN = True # False

# how many 'most similar' documents to store in each similar.xml?
MIN_SCORE = 0.0 # prune based on similarity score (all below MIN_SCORE are ignored)
MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to all of them (no limit).
MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to store all of them (no limit).

# internal method parameters
DIM_RP = 300 # dimensionality for the random projections
Expand Down Expand Up @@ -92,18 +90,14 @@ def getMeta(fname):
return author, title


def buildDmlCorpus(language):
config = corpora.DmlConfig('gensim_%s' % language, resultDir = RESULT_DIR, acceptLangs = [language])
for source in SOURCE_LIST:
config.addSource(source)

def buildDmlCorpus(config, language):
    """
    Build a DML corpus from the sources listed in `config`, construct its
    dictionary, and persist everything under the config's result directory.

    Returns the constructed corpora.DmlCorpus object.

    NOTE(review): `language` is unused here — the config already encodes the
    accepted languages; the parameter is kept for caller compatibility.
    """
    dml = corpora.DmlCorpus()
    dml.processConfig(config, shuffle = True)
    dml.buildDictionary()
    dml.dictionary.filterExtremes(noBelow = 5, noAbove = 0.3) # ignore too (in)frequent words

    dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their uris)
    dml.saveAsText() # save id mappings and documents as text data (matrix market format)
    return dml


Expand Down Expand Up @@ -149,29 +143,41 @@ def generateSimilar(corpus, similarities, method):
sys.exit(1)
language = sys.argv[1]

# construct the config, which holds information about sources, data file filenames etc.
config = corpora.DmlConfig('gensim_%s' % language, resultDir = RESULT_DIR, acceptLangs = [language])
for source in SOURCE_LIST:
config.addSource(source)

if 'build' in program:
buildDmlCorpus(language)
buildDmlCorpus(config, language)
elif 'genmodel' in program:
if 'tfidf' in program:
corpus = corpora.MmCorpus(dml_bow)
model = tfidfmodel.TfidfModel(corpus)
model.save(modelfname('tfidf'))
if 'lda' in program:
corpus = corpora.MmCorpus(dml_bow)
id2word = loadDictionary(dml_dict.txt)
model = ldamodel.LdaModel(corpus, id2word, numTopics = DIM_LDA)
model.save(modelfname('lda'))
elif 'lsi' in program:
if len(sys.argv) < 3:
print globals()['__doc__'] % (program)
sys.exit(1)
method = sys.argv[2].strip().lower()
try:
dml = corpora.DmlCorpus.load(config.resultFile('.pkl'))
except IOError, e:
raise IOError("no word-count corpus found at %s; you must first generate it through gensim_build.py")

id2word = corpora.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
if method == 'tfidf':
corpus = corpora.MmCorpus(config.resultFile('bow.mm'))
model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
model.save(config.resultFile('tfidfmodel.pkl'))
elif method == 'lda':
corpus = corpora.MmCorpus(config.resultFile('bow.mm'))
model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA)
model.save(config.resultFile('ldamodel%i.pkl' % DIM_LDA))
elif method == 'lsi' or method == 'lsa':
# first, transform word counts to tf-idf weights
corpus = corpora.MmCorpus(dml_bow)
model = tfidfmodel.TfidfModel(corpus)
tfidf = corpora.TopicsCorpus(model, corpus)
corpus = corpora.MmCorpus(config.resultFile('bow.mm'))
tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
# then find the transformation from tf-idf to latent space
id2word = loadDictionary(dml_dict.txt)
model = lsimodel.LsiModel(tfidf, id2word, numTopics = DIM_LSI)
model.save(modelfname('lsi'))
lsi = lsimodel.LsiModel(tfidf.apply(corpus), id2word = id2word, numTopics = DIM_LSI)
model.save(config.resultFile('lsimodel%i.pkl' % DIM_LSI))
else:
raise ValueError('unknown topic extraction method in %s' % program)
raise ValueError('unknown topic extraction method: %s' % repr(method))
elif 'gensim' in program:
corpus = corpora.DmlCorpus.load(dml.pkl)
input = corpora.MmCorpus(bow.mm)
Expand Down
28 changes: 25 additions & 3 deletions interfaces.py
Expand Up @@ -15,9 +15,9 @@ class CorpusABC(utils.SaveLoad):
See the corpora module for some example corpus implementations.
Note that although a default len() method is provided, it is very inefficient
(performs a linear scan through the corpus). Wherever the corpus size is known
in advance (or at least doesn't change so that it can be cached), the len() method
should be overridden.
(performs a linear scan through the corpus to determine its length). Wherever
the corpus size is known in advance (or at least doesn't change so that it can
be cached), the len() method should be overridden.
"""
def __iter__(self):
raise NotImplementedError('cannot instantiate abstract base class')
Expand All @@ -43,6 +43,28 @@ class TransformationABC(utils.SaveLoad):
See the tfidfmodel module for an example of a transformation.
"""
class TransformedCorpus(CorpusABC):
    """
    Lazy corpus wrapper: feeds each document of an underlying corpus
    through a one-document transformation function, on the fly, as the
    wrapper is iterated over. No transformed documents are materialized.
    """
    def __init__(self, fnc, corpus):
        self.fnc = fnc
        self.corpus = corpus

    def __len__(self):
        # one output document per input document, so delegate to the wrapped corpus
        return len(self.corpus)

    def __iter__(self):
        # transform documents lazily, one at a time
        return (self.fnc(doc) for doc in self.corpus)
#endclass TransformedCorpus

def __getitem__(self, vec):
    """
    Transform a single document `vec` from one vector space into another.

    Abstract; derived transformations must override this. The parameter was
    missing from the stub, so `transformation[doc]` on a subclass that forgot
    to override would raise a confusing TypeError (wrong arity) instead of
    this intended NotImplementedError.
    """
    raise NotImplementedError('cannot instantiate abstract base class')


def apply(self, corpus):
    """
    Apply the transformation to a whole corpus (as opposed to a single
    document) and return the result as another, lazily-evaluated corpus.

    Helper used by derived transformation classes.
    """
    # wrap this transformation's one-document mapping around the whole corpus
    docTransform = self.__getitem__
    return TransformationABC.TransformedCorpus(docTransform, corpus)
#endclass TransformationABC


7 changes: 2 additions & 5 deletions lsimodel.py
Expand Up @@ -184,12 +184,9 @@ def __getitem__(self, bow):
"""
Return topic distribution, as a list of (topic_id, topic_value) 2-tuples.
This is done by folding the input document into the latent topic space.
This is done by folding input document into the latent topic space.
"""
if isinstance(bow, numpy.ndarray): # input already a numpy array
vec = bow
else:
vec = matutils.doc2vec(bow, self.numTerms)
vec = matutils.doc2vec(bow, self.numTerms)
vec.shape = (self.numTerms, 1)
topicDist = self.projection * vec
return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
Expand Down

0 comments on commit 0d3d39d

Please sign in to comment.