# Chapter 6: Clustering for Text Similarity

## Clustering by Document Similarity
### Partitive Clustering

#### k-means clustering
Initialize the NLTK `KMeansClusterer` with our desired number of clusters (*k*) and our preferred distance measure (`cosine_distance`), and set `avoid_empty_clusters=True` to avoid a result with clusters that contain no documents.  
Then we'll add our no-op `fit()` method and a `transform()` method that calls the internal `KMeansClusterer` model's `cluster()` method, specifying that each document should be assigned a cluster. 

In [1]:
import unicodedata

import nltk
from nltk.cluster import KMeansClusterer
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin

class KMeansClusters(BaseEstimator, TransformerMixin):
    '''
    Scikit-Learn style transformer that wraps NLTK's KMeansClusterer so
    k-means clustering can be used as a step of a Pipeline.
    '''

    def __init__(self, k=7):
        '''
        k is the number of clusters
        model is the NLTK implementation of k-means, configured with
        cosine distance and avoid_empty_clusters=True
        '''
        # Imported locally so the class does not rely on a module-level
        # `import nltk`; referencing nltk.cluster.util through the bare
        # `nltk` name raised NameError when only KMeansClusterer was imported.
        from nltk.cluster.util import cosine_distance

        self.k = k
        self.distance = cosine_distance
        self.model = KMeansClusterer(self.k, self.distance,
                                     avoid_empty_clusters=True)

    def fit(self, documents, labels=None):
        '''
        No-op: the clustering itself happens in transform().
        Returns self so the estimator can be chained.
        '''
        return self

    def transform(self, documents):
        '''
        Clusters the vectorized documents with the wrapped NLTK model and
        returns the result of cluster(..., assign_clusters=True), i.e. the
        cluster assignment for each document.
        '''
        return self.model.cluster(documents, assign_clusters=True)

Normalize and vectorize documents for our `KMeansClusters` class.  
Instead of returning a representation of documents as bags-of-words, this version of the `TextNormalizer` will perform stopword removal and lemmatization and return a string for each document.  
*(Note: most of this is from Ch4, p72)*

In [2]:
class TextNormalizer(BaseEstimator, TransformerMixin):
    """
    Transformer that turns POS-tagged documents into strings of lowercased
    lemmas with punctuation and stopwords removed.

    Each document is iterated as paragraphs -> sentences -> (token, tag)
    pairs, which is the nesting normalize() walks.
    """

    def __init__(self, language='english'):
        """
        language is the name of the NLTK stopword list to load.
        """
        # Keep the constructor argument as an attribute: BaseEstimator's
        # get_params() (used by clone() and when a Pipeline is displayed)
        # reads every __init__ parameter back off the instance.
        self.language = language
        self.stopwords = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def is_punct(self, token):
        """
        Returns True when every character of token is in a Unicode
        punctuation category (category name starting with 'P').
        """
        return all(
            unicodedata.category(char).startswith('P') for char in token
        )

    def is_stopword(self, token):
        """
        Returns True when the lowercased token is in the stopword set.
        """
        return token.lower() in self.stopwords

    def normalize(self, document):
        """
        Returns the list of lowercased lemmas for one document, skipping
        punctuation and stopword tokens.
        """
        return [
            # Lowercase before lemmatizing: the WordNet lookup is
            # case-sensitive, so a capitalised form such as "Dogs" would
            # otherwise come back unchanged instead of as "dog".
            self.lemmatize(token.lower(), tag)
            for paragraph in document
            for sentence in paragraph
            for (token, tag) in sentence
            if not self.is_punct(token) and not self.is_stopword(token)
        ]

    def lemmatize(self, token, pos_tag):
        """
        Lemmatizes token using the WordNet part of speech selected by the
        first letter of pos_tag; unknown or empty tags are treated as nouns.
        """
        # pos_tag[:1] (rather than pos_tag[0]) keeps an empty tag from
        # raising IndexError; it simply falls through to the noun default.
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(pos_tag[:1], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

    def fit(self, X, y=None):
        """
        No-op; returns self so the transformer can be used in a Pipeline.
        """
        return self

    def transform(self, documents):
        """
        Returns one space-joined string of normalized lemmas per document.
        """
        return [
            ' '.join(self.normalize(document))
            for document in documents
        ]

Vectorize documents after normalization (before clustering) with `OneHotVectorizer` class.  
Use Scikit-Learn's `CountVectorizer` with `binary=True`, which will wrap both frequency encoding and binarization.  
The `transform()` method will return a representation of each document as a one-hot vectorized array.

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer

class OneHotVectorizer(BaseEstimator, TransformerMixin):
    """
    Transformer that encodes each document as a one-hot (binary
    term-presence) array using CountVectorizer(binary=True).
    """

    def __init__(self):
        self.vectorizer = CountVectorizer(binary=True)

    def fit(self, documents, labels=None):
        """
        No-op: the vocabulary is learned inside transform().
        Returns self so the transformer can be chained.
        """
        return self

    def transform(self, documents):
        """
        Fits the vectorizer on documents and returns a list with one dense
        1-D array per document.
        """
        # The original referenced the misspelled name `documets`, which
        # raised NameError on every call.
        freqs = self.vectorizer.fit_transform(documents)
        # Each row of the sparse result is densified to a 2-D single-row
        # array; [0] extracts that row as a 1-D array.
        return [freq.toarray()[0] for freq in freqs]

Now, create a `Pipeline` inside our `main()` execution to perform *k*-means clustering.  
Initialize a `PickledCorpusReader` (Ch3, p51), specifying use of only the "news" category.  
Initialize a pipeline to streamline our custom `TextNormalizer`, `OneHotVectorizer`, and `KMeansClusters` classes. By calling `fit_transform()` on the pipeline, we perform each of the steps in sequence.

In [4]:
# pip install gensim

In [5]:
# pip install -U gensim

In [6]:
import gensim
import sklearn

In [13]:
#!/usr/bin/env python3

from reader import PickledCorpusReader
from transformers import TextNormalizer, GensimTfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.feature_extraction.text import CountVectorizer

# from gensim.sklearn_api import lsimodel, ldamodel
from gensim.models import lsimodel, ldamodel

def identity(words):
    """
    Return words unchanged.

    Passed as the tokenizer of CountVectorizer so that input which is
    already tokenized is used as-is.
    """
    return words


class SklearnTopicModels(object):
    """
    Topic modeling pipeline built from Scikit-Learn components:
    TextNormalizer -> CountVectorizer -> LDA / LSA (TruncatedSVD) / NMF.
    """

    def __init__(self, n_topics=50, estimator='LDA'):
        """
        n_topics is the desired number of topics
        To use Latent Semantic Analysis, set estimator to 'LSA',
        To use Non-Negative Matrix Factorization, set estimator to 'NMF',
        otherwise, defaults to Latent Dirichlet Allocation ('LDA').
        """
        self.n_topics = n_topics

        if estimator == 'LSA':
            self.estimator = TruncatedSVD(n_components=self.n_topics)
        elif estimator == 'NMF':
            self.estimator = NMF(n_components=self.n_topics)
        else:
            # LatentDirichletAllocation takes `n_components`, like the other
            # two estimators; the older `n_topics` keyword is no longer
            # accepted by current scikit-learn releases.
            self.estimator = LatentDirichletAllocation(
                n_components=self.n_topics
            )

        # The vectorizer step keeps its historical name 'tfidf' (it is a
        # CountVectorizer) because get_topics() looks it up under that key.
        self.model = Pipeline([
            ('norm', TextNormalizer()),
            ('tfidf', CountVectorizer(tokenizer=identity,
                                      preprocessor=None, lowercase=False)),
            ('model', self.estimator)
        ])

    def fit_transform(self, documents):
        """
        Fits the whole pipeline on documents and returns the fitted
        Pipeline (not the transformed matrix).
        """
        self.model.fit_transform(documents)

        return self.model

    def get_topics(self, n=25):
        """
        n is the number of top terms to show for each topic

        Returns a dict mapping each topic index to the list of its n
        highest-weighted terms, strongest first (fewer if the vocabulary
        has fewer than n terms).
        """
        vectorizer = self.model.named_steps['tfidf']
        model = self.model.steps[-1][1]

        # Newer scikit-learn exposes get_feature_names_out(); fall back to
        # get_feature_names() on releases that only have the older method.
        if hasattr(vectorizer, 'get_feature_names_out'):
            names = vectorizer.get_feature_names_out()
        else:
            names = vectorizer.get_feature_names()

        topics = dict()

        for idx, topic in enumerate(model.components_):
            # argsort() is ascending, so reverse it and take the first n.
            # The previous slice [:-(n - 1): -1] yielded only n - 2 terms.
            features = topic.argsort()[::-1][:n]
            topics[idx] = [names[i] for i in features]

        return topics


class GensimTopicModels(object):
    """
    Topic modeling pipeline built from Gensim components:
    TextNormalizer -> GensimTfidfVectorizer -> LDA / LSA transformer.
    """

    def __init__(self, n_topics=50, estimator='LDA'):
        """
        n_topics is the desired number of topics

        To use Latent Semantic Analysis, set estimator to 'LSA'
        otherwise defaults to Latent Dirichlet Allocation.

        Raises ImportError when the installed gensim does not provide the
        scikit-learn wrappers in gensim.sklearn_api.
        """
        # LsiTransformer / LdaTransformer are the scikit-learn wrappers from
        # gensim.sklearn_api. They are not attributes of the
        # gensim.models.lsimodel / ldamodel modules, so looking them up there
        # failed with AttributeError. Import them explicitly and fail with an
        # actionable message when this gensim release does not ship them.
        try:
            from gensim.sklearn_api import LdaTransformer, LsiTransformer
        except ImportError as err:
            raise ImportError(
                "GensimTopicModels requires the scikit-learn wrappers from "
                "gensim.sklearn_api (LsiTransformer / LdaTransformer), which "
                "the installed gensim does not provide; install a gensim 3.x "
                "release that includes gensim.sklearn_api."
            ) from err

        self.n_topics = n_topics

        if estimator == 'LSA':
            self.estimator = LsiTransformer(num_topics=self.n_topics)
        else:
            self.estimator = LdaTransformer(num_topics=self.n_topics)

        self.model = Pipeline([
            ('norm', TextNormalizer()),
            ('vect', GensimTfidfVectorizer()),
            ('model', self.estimator)
        ])

    def fit(self, documents):
        """
        Fits the whole pipeline on documents and returns the fitted Pipeline.
        """
        self.model.fit(documents)

        return self.model


if __name__ == '__main__':
    # Load the pickled corpus and grab its documents.
    corpus = PickledCorpusReader('../corpus')
    documents = corpus.docs()

    # With Sklearn: fit an NMF topic model on the documents.
    skmodel = SklearnTopicModels(estimator='NMF')
    skmodel.fit_transform(documents)

    # Print the top terms of every topic, numbering topics from 1.
    topics = skmodel.get_topics()
    for topic_id, top_terms in topics.items():
        print("Topic #{}:".format(topic_id + 1))
        print(top_terms)

    # # With Gensim
    # gmodel = GensimTopicModels(estimator='LSA')
    #
    # docs = [
    #     list(corpus.docs(fileids=fileid))[0]
    #     for fileid in corpus.fileids()
    # ]
    #
    # gmodel.fit(docs)
    #
    # # retrieve the fitted lsa model from the named steps of the pipeline
    # lsa = gmodel.model.named_steps['model'].gensim_model
    #
    # # show the topics with the token-weights for the top 10 most influential tokens:
    # print(lsa.print_topics(10))


    # # retrieve the fitted lda model from the named steps of the pipeline
    # lda = gmodel.model.named_steps['model'].gensim_model
    #
    # # show the topics with the token-weights for the top 10 most influential tokens:
    # lda.print_topics(10)

    # corpus = [
    #     gmodel.model.named_steps['vect'].lexicon.doc2bow(doc)
    #     for doc in gmodel.model.named_steps['norm'].transform(docs)
    # ]
    #
    #
    # id2token = gmodel.model.named_steps['vect'].lexicon.id2token
    #
    # for word_id, freq in next(iter(corpus)):
    #     print(id2token[word_id], freq)

    # # get the highest weighted topic for each of the documents in the corpus
    # def get_topics(vectorized_corpus, model):
    #     from operator import itemgetter
    #
    #     topics = [
    #         max(model[doc], key=itemgetter(1))[0]
    #         for doc in vectorized_corpus
    #     ]
    #
    #     return topics
    #
    # topics = get_topics(corpus,lda)
    #
    # for topic, doc in zip(topics, docs):
    #     print("Topic:{}".format(topic))
    #     print(doc)
    #
    ## retrieve the fitted vectorizer or the lexicon if needed
    # tfidf = gmodel.model.named_steps['vect'].tfidf
    # lexicon = gmodel.model.named_steps['vect'].lexicon


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
import importlib.util
import os
import sys

# */site-packages is where your current session is running its python out of
site_path = ''
for path in sys.path:
    # basename() honours the platform's path separator, whereas splitting on
    # a hard-coded '/' does not isolate the last component on Windows.
    if 'site-packages' in os.path.basename(path):
        print(path)
        site_path = path

# search to see if gensim in installed packages
# Ask the import system directly instead of listing a single directory: this
# covers every entry on sys.path and still reports a result when no
# site-packages entry was found above.
if importlib.util.find_spec('gensim') is None:
    print('package not found')
else:
    print('gensim installed')

In [None]:
# !py -0p

In [None]:
# py -3.9 -m pip install gensim

In [8]:
from gensim.sklearn_integration import lsimodel, ldamodel

ModuleNotFoundError: No module named 'gensim.sklearn_integration'

In [9]:
from gensim.models import lsimodel, ldamodel