Skip to content

Commit

Permalink
#8 transform whole corpora (the model.apply() method)
Browse files Browse the repository at this point in the history
git-svn-id: https://my-svn.assembla.com/svn/gensim/trunk@56 92d0401f-a546-4972-9173-107b360ed7e5
  • Loading branch information
piskvorky committed Feb 26, 2010
1 parent fb22c0b commit 0d3d39d
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 43 deletions.
76 changes: 41 additions & 35 deletions gensim.py
Expand Up @@ -20,16 +20,14 @@
import docsim
import lsimodel
import ldamodel
#import tfidfmodel
import tfidfmodel


#TODO:
# 1) tfidf prepsat z matutils do samostatneho tfidfmodel.py (plus __getitem__, aby
# to byla transformace)
# 5) prevest lda modely z blei na asteria01 na LdaModel objekt (trva tydny, nepoustet
# znova...
# 6) pridat random projections
# 7) logging per module -- ne vsechno pres logging.root, zlepseni prehlednosti logu
# * prevest lda modely z blei na asteria01 na LdaModel objekt (trva tydny, nepoustet
# znova...
# * prepsat random projections
# * logging per module -- ne vsechno pres logging.root, zlepseni prehlednosti logu


SOURCE_LIST = [
Expand All @@ -42,11 +40,11 @@

# set to True to do everything EXCEPT actually writing out similar.xml files to disk.
# similar.xml files are NOT written if DRY_RUN is true.
DRY_RUN = False
DRY_RUN = True # False

# how many 'most similar' documents to store in each similar.xml?
MIN_SCORE = 0.0 # prune based on similarity score (all below MIN_SCORE are ignored)
MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to all of them (no limit).
MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to store all of them (no limit).

# internal method parameters
DIM_RP = 300 # dimensionality for the random projections
Expand Down Expand Up @@ -92,18 +90,14 @@ def getMeta(fname):
return author, title


def buildDmlCorpus(language):
config = corpora.DmlConfig('gensim_%s' % language, resultDir = RESULT_DIR, acceptLangs = [language])
for source in SOURCE_LIST:
config.addSource(source)

def buildDmlCorpus(config, language):
    """
    Build a DML corpus from the sources listed in `config`, construct its
    dictionary, and persist everything under the config's result directory.

    Returns the constructed corpora.DmlCorpus object.

    NOTE(review): `language` is unused here — the config already encodes the
    accepted languages; the parameter is kept for caller compatibility.
    """
    dml = corpora.DmlCorpus()
    dml.processConfig(config, shuffle = True)
    dml.buildDictionary()
    dml.dictionary.filterExtremes(noBelow = 5, noAbove = 0.3) # ignore too (in)frequent words

    dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their uris)
    dml.saveAsText() # save id mappings and documents as text data (matrix market format)
    return dml


Expand Down Expand Up @@ -149,29 +143,41 @@ def generateSimilar(corpus, similarities, method):
sys.exit(1)
language = sys.argv[1]

# construct the config, which holds information about sources, data file filenames etc.
config = corpora.DmlConfig('gensim_%s' % language, resultDir = RESULT_DIR, acceptLangs = [language])
for source in SOURCE_LIST:
config.addSource(source)

if 'build' in program:
buildDmlCorpus(language)
buildDmlCorpus(config, language)
elif 'genmodel' in program:
if 'tfidf' in program:
corpus = corpora.MmCorpus(dml_bow)
model = tfidfmodel.TfidfModel(corpus)
model.save(modelfname('tfidf'))
if 'lda' in program:
corpus = corpora.MmCorpus(dml_bow)
id2word = loadDictionary(dml_dict.txt)
model = ldamodel.LdaModel(corpus, id2word, numTopics = DIM_LDA)
model.save(modelfname('lda'))
elif 'lsi' in program:
if len(sys.argv) < 3:
print globals()['__doc__'] % (program)
sys.exit(1)
method = sys.argv[2].strip().lower()
try:
dml = corpora.DmlCorpus.load(config.resultFile('.pkl'))
except IOError, e:
raise IOError("no word-count corpus found at %s; you must first generate it through gensim_build.py")

id2word = corpora.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
if method == 'tfidf':
corpus = corpora.MmCorpus(config.resultFile('bow.mm'))
model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
model.save(config.resultFile('tfidfmodel.pkl'))
elif method == 'lda':
corpus = corpora.MmCorpus(config.resultFile('bow.mm'))
model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA)
model.save(config.resultFile('ldamodel%i.pkl' % DIM_LDA))
elif method == 'lsi' or method == 'lsa':
# first, transform word counts to tf-idf weights
corpus = corpora.MmCorpus(dml_bow)
model = tfidfmodel.TfidfModel(corpus)
tfidf = corpora.TopicsCorpus(model, corpus)
corpus = corpora.MmCorpus(config.resultFile('bow.mm'))
tfidf = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
# then find the transformation from tf-idf to latent space
id2word = loadDictionary(dml_dict.txt)
model = lsimodel.LsiModel(tfidf, id2word, numTopics = DIM_LSI)
model.save(modelfname('lsi'))
lsi = lsimodel.LsiModel(tfidf.apply(corpus), id2word = id2word, numTopics = DIM_LSI)
model.save(config.resultFile('lsimodel%i.pkl' % DIM_LSI))
else:
raise ValueError('unknown topic extraction method in %s' % program)
raise ValueError('unknown topic extraction method: %s' % repr(method))
elif 'gensim' in program:
corpus = corpora.DmlCorpus.load(dml.pkl)
input = corpora.MmCorpus(bow.mm)
Expand Down
28 changes: 25 additions & 3 deletions interfaces.py
Expand Up @@ -15,9 +15,9 @@ class CorpusABC(utils.SaveLoad):
See the corpora module for some example corpus implementations.
Note that although a default len() method is provided, it is very inefficient
(performs a linear scan through the corpus). Wherever the corpus size is known
in advance (or at least doesn't change so that it can be cached), the len() method
should be overridden.
(performs a linear scan through the corpus to determine its length). Wherever
the corpus size is known in advance (or at least doesn't change so that it can
be cached), the len() method should be overridden.
"""
def __iter__(self):
raise NotImplementedError('cannot instantiate abstract base class')
Expand All @@ -43,6 +43,28 @@ class TransformationABC(utils.SaveLoad):
See the tfidfmodel module for an example of a transformation.
"""
class TransformedCorpus(CorpusABC):
    """
    Lazy corpus wrapper: feeds each document of an underlying corpus
    through a one-document transformation function, on the fly, as the
    wrapper is iterated over. No transformed documents are materialized.
    """
    def __init__(self, fnc, corpus):
        self.fnc = fnc
        self.corpus = corpus

    def __len__(self):
        # one output document per input document, so delegate to the wrapped corpus
        return len(self.corpus)

    def __iter__(self):
        # transform documents lazily, one at a time
        return (self.fnc(doc) for doc in self.corpus)
#endclass TransformedCorpus

def __getitem__(self, vec):
    """
    Transform a single document `vec` from one vector space into another.

    Abstract; derived transformations must override this. The parameter was
    missing from the stub, so `transformation[doc]` on a subclass that forgot
    to override would raise a confusing TypeError (wrong arity) instead of
    this intended NotImplementedError.
    """
    raise NotImplementedError('cannot instantiate abstract base class')


def apply(self, corpus):
    """
    Apply the transformation to a whole corpus (as opposed to a single
    document) and return the result as another, lazily-evaluated corpus.

    Helper used by derived transformation classes.
    """
    # wrap this transformation's one-document mapping around the whole corpus
    docTransform = self.__getitem__
    return TransformationABC.TransformedCorpus(docTransform, corpus)
#endclass TransformationABC


7 changes: 2 additions & 5 deletions lsimodel.py
Expand Up @@ -184,12 +184,9 @@ def __getitem__(self, bow):
"""
Return topic distribution, as a list of (topic_id, topic_value) 2-tuples.
This is done by folding the input document into the latent topic space.
This is done by folding input document into the latent topic space.
"""
if isinstance(bow, numpy.ndarray): # input already a numpy array
vec = bow
else:
vec = matutils.doc2vec(bow, self.numTerms)
vec = matutils.doc2vec(bow, self.numTerms)
vec.shape = (self.numTerms, 1)
topicDist = self.projection * vec
return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
Expand Down

0 comments on commit 0d3d39d

Please sign in to comment.