Skip to content

Commit

Permalink
Merge pull request #1398 from chinmayapancholi13/lda_lsi_wrapper_changes
Browse files Browse the repository at this point in the history
[WIP] Changes in sklearn wrappers for LDA and LSI models
  • Loading branch information
menshikh-iv committed Jun 20, 2017
2 parents 477a3a3 + 34a6d14 commit b989be7
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 121 deletions.
72 changes: 33 additions & 39 deletions gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,37 @@
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
#

"""
Scikit learn interface for gensim for easy use of gensim with scikit-learn
follows on scikit learn API conventions
"""

import numpy as np
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError

from gensim import models
from gensim import matutils
from gensim.sklearn_integration import base_sklearn_wrapper
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator


class SklLdaModel(base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator):
    """
    Scikit-learn style wrapper around gensim's LdaModel.

    The gensim model is NOT trained here: it is created lazily by
    `fit`/`partial_fit` and stored in `self.gensim_model`, so the estimator
    holds only plain hyperparameters and behaves like any sklearn estimator.
    """

    def __init__(
            self, num_topics=100, id2word=None,
            chunksize=2000, passes=1, update_every=1,
            alpha='symmetric', eta=None, decay=0.5, offset=1.0,
            eval_every=10, iterations=50, gamma_threshold=0.001,
            minimum_probability=0.01, random_state=None):
        """
        Sklearn wrapper for LDA model. All arguments mirror
        gensim.models.LdaModel and are stored verbatim; no corpus is
        accepted here (training moved to fit/partial_fit).
        """
        self.gensim_model = None  # populated by fit()/partial_fit()
        self.num_topics = num_topics
        self.id2word = id2word
        self.chunksize = chunksize
        # NOTE(review): the assignments below were collapsed in the diff view;
        # reconstructed from the signature and get_params() — confirm upstream.
        self.passes = passes
        self.update_every = update_every
        self.alpha = alpha
        self.eta = eta
        self.decay = decay
        self.offset = offset
        self.eval_every = eval_every
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold
        self.minimum_probability = minimum_probability
        self.random_state = random_state

def get_params(self, deep=True):
    """
    Return all hyperparameters as a dict (sklearn API).
    `corpus` is intentionally absent: it is no longer a constructor argument.
    """
    return {"num_topics": self.num_topics, "id2word": self.id2word, "chunksize": self.chunksize,
            "passes": self.passes, "update_every": self.update_every, "alpha": self.alpha, "eta": self.eta,
            "decay": self.decay, "offset": self.offset, "eval_every": self.eval_every, "iterations": self.iterations,
            "gamma_threshold": self.gamma_threshold, "minimum_probability": self.minimum_probability,
            "random_state": self.random_state}

def set_params(self, **parameters):
    """
    Set hyperparameters (sklearn API); delegates to the shared base wrapper.
    """
    super(SklLdaModel, self).set_params(**parameters)

def fit(self, X, y=None):
    """
    Fit the model according to the given training data.

    `X` may be a gensim corpus (iterable of BOW documents) or a scipy
    sparse term-document matrix. Builds a fresh gensim.models.LdaModel
    and stores it in self.gensim_model. Returns self.
    """
    if sparse.issparse(X):
        # Sparse sklearn-style input -> gensim streamed corpus.
        corpus = matutils.Sparse2Corpus(X)
    else:
        corpus = X

    self.gensim_model = models.LdaModel(
        corpus=corpus, num_topics=self.num_topics, id2word=self.id2word,
        chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
        alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
        eval_every=self.eval_every, iterations=self.iterations,
        gamma_threshold=self.gamma_threshold, minimum_probability=self.minimum_probability,
        random_state=self.random_state)
    return self

def transform(self, docs):
    """
    Return the topic distribution for `docs` as a (num_docs, num_topics)
    matrix, where entry [i, j] is the probability of topic j in document i.

    `docs` is in BOW format: either a list of documents, e.g.
    [[(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)]],
    or a single document, e.g. [(4, 1), (7, 1)].

    Raises NotFittedError if called before fit()/partial_fit().
    """
    if self.gensim_model is None:
        raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

    # Accept a single document by wrapping it into a one-element list.
    check = lambda x: [x] if isinstance(x[0], tuple) else x
    docs = check(docs)
    X = [[] for _ in range(0, len(docs))]

    for k, v in enumerate(docs):
        doc_topics = self.gensim_model[v]
        probs_docs = list(map(lambda x: x[1], doc_topics))
        # Pad with near-zero probabilities so every row has num_topics entries
        # (the model omits topics below its minimum_probability threshold).
        if len(probs_docs) != self.num_topics:
            probs_docs.extend([1e-12] * (self.num_topics - len(probs_docs)))
        X[k] = probs_docs
    return np.reshape(np.array(X), (len(docs), self.num_topics))

def get_topic_dist(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
    """
    Takes as an input a new document (bow).
    Returns the topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.

    NOTE(review): this delegates to self.get_document_topics, which is only
    available while the wrapper subclasses models.LdaModel (the pre-refactor
    API); the new SklLdaModel drops this method entirely.
    """
    # Thin pass-through; all filtering/thresholding happens inside
    # LdaModel.get_document_topics.
    return self.get_document_topics(
        bow, minimum_probability=minimum_probability,
        minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics)

def partial_fit(self, X):
    """
    Train the model incrementally on X (sklearn partial_fit API).

    `X` may be a gensim corpus or a scipy sparse matrix. A gensim LdaModel
    is created on the first call and updated in place on every call.
    Returns self.
    """
    if sparse.issparse(X):
        X = matutils.Sparse2Corpus(X)

    # Lazily create the underlying model on first use.
    if self.gensim_model is None:
        self.gensim_model = models.LdaModel(
            num_topics=self.num_topics, id2word=self.id2word,
            chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
            alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
            eval_every=self.eval_every, iterations=self.iterations, gamma_threshold=self.gamma_threshold,
            minimum_probability=self.minimum_probability, random_state=self.random_state)

    self.gensim_model.update(corpus=X)
    return self
51 changes: 29 additions & 22 deletions gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,33 @@
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
#

"""
Scikit learn interface for gensim for easy use of gensim with scikit-learn
Follows scikit-learn API conventions
"""

import numpy as np
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError

from gensim import models
from gensim import matutils
from gensim.sklearn_integration import base_sklearn_wrapper
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator


class SklLsiModel(base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator):
    """
    Scikit-learn style wrapper around gensim's LsiModel.

    The gensim model is created lazily by `fit`/`partial_fit` and stored in
    `self.gensim_model`; the estimator itself holds only hyperparameters.
    """

    def __init__(self, num_topics=200, id2word=None, chunksize=20000,
                 decay=1.0, onepass=True, power_iters=2, extra_samples=100):
        """
        Sklearn wrapper for LSI model. All arguments mirror
        gensim.models.LsiModel; no corpus is accepted here (training moved
        to fit/partial_fit).
        """
        self.gensim_model = None  # populated by fit()/partial_fit()
        self.num_topics = num_topics
        self.id2word = id2word
        self.chunksize = chunksize
        # NOTE(review): the two assignments below were collapsed in the diff
        # view; reconstructed from the signature and get_params() — confirm.
        self.decay = decay
        self.onepass = onepass
        self.extra_samples = extra_samples
        self.power_iters = power_iters

def get_params(self, deep=True):
    """
    Return all hyperparameters as a dict (sklearn API).
    `corpus` is intentionally absent: it is no longer a constructor argument.
    """
    return {"num_topics": self.num_topics, "id2word": self.id2word,
            "chunksize": self.chunksize, "decay": self.decay, "onepass": self.onepass,
            "extra_samples": self.extra_samples, "power_iters": self.power_iters}

def set_params(self, **parameters):
    """
    Set hyperparameters (sklearn API); delegates to the shared base wrapper.
    """
    super(SklLsiModel, self).set_params(**parameters)

def fit(self, X, y=None):
    """
    Fit the model according to the given training data.

    `X` may be a gensim corpus (iterable of BOW documents) or a scipy
    sparse term-document matrix. Builds a fresh gensim.models.LsiModel
    and stores it in self.gensim_model. Returns self.
    """
    if sparse.issparse(X):
        # Sparse sklearn-style input -> gensim streamed corpus.
        corpus = matutils.Sparse2Corpus(X)
    else:
        corpus = X

    self.gensim_model = models.LsiModel(
        corpus=corpus, num_topics=self.num_topics, id2word=self.id2word,
        chunksize=self.chunksize, decay=self.decay, onepass=self.onepass,
        power_iters=self.power_iters, extra_samples=self.extra_samples)
    return self

def transform(self, docs):
    """
    Return the topic distribution for `docs` as a (num_docs, num_topics)
    matrix, where entry [i, j] is the weight of topic j in document i.

    `docs` is in BOW format: either a list of documents, e.g.
    [[(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)]],
    or a single document, e.g. [(4, 1), (7, 1)].

    Raises NotFittedError if called before fit()/partial_fit().
    """
    if self.gensim_model is None:
        raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

    # Accept a single document by wrapping it into a one-element list.
    check = lambda x: [x] if isinstance(x[0], tuple) else x
    docs = check(docs)
    X = [[] for _ in range(0, len(docs))]
    for k, v in enumerate(docs):
        doc_topics = self.gensim_model[v]
        probs_docs = list(map(lambda x: x[1], doc_topics))
        # Pad so every row has num_topics entries. The padding tail was
        # collapsed in the diff view; value mirrors the LDA wrapper —
        # TODO(review): confirm against upstream.
        if len(probs_docs) != self.num_topics:
            probs_docs.extend([1e-12] * (self.num_topics - len(probs_docs)))
        X[k] = probs_docs
    return np.reshape(np.array(X), (len(docs), self.num_topics))
def partial_fit(self, X):
    """
    Train the model incrementally on X (sklearn partial_fit API).

    `X` may be a gensim corpus or a scipy sparse matrix. A gensim LsiModel
    is created on the first call and extended with add_documents on every
    call. Returns self.
    """
    if sparse.issparse(X):
        X = matutils.Sparse2Corpus(X)

    # Lazily create the underlying model on first use.
    if self.gensim_model is None:
        self.gensim_model = models.LsiModel(
            num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
            decay=self.decay, onepass=self.onepass, power_iters=self.power_iters,
            extra_samples=self.extra_samples)

    self.gensim_model.add_documents(corpus=X)
    return self
Loading

0 comments on commit b989be7

Please sign in to comment.