In [2]:
import os
import nltk
import unicodedata
import numpy as np

from itertools import groupby
from operator import itemgetter

from reader import PickledCorpusReader

from nltk.corpus import wordnet as wn
from nltk.cluster import KMeansClusterer

STOPWORDS = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.WordNetLemmatizer()


def is_punct(token):
    # Is every character punctuation?
    return all(
        unicodedata.category(char).startswith('P')
        for char in token
    )


def wnpos(tag):
    # Return the WordNet POS tag from the Penn Treebank tag
    return {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(tag[0], wn.NOUN)


def normalize(document, stopwords=STOPWORDS):
    """
    Removes stopwords and punctuation, lowercases, lemmatizes
    """

    for token, tag in document:
        token = token.lower().strip()

        if is_punct(token) or (token in stopwords):
            continue

        yield lemmatizer.lemmatize(token, wnpos(tag))


class KMeansTopics(object):

    def __init__(self, corpus, k=10):
        """
        corpus is a corpus object, e.g. an HTMLCorpusReader()
        or an HTMLPickledCorpusReader() object

        k is the number of clusters
        """
        self.k = k
        self.model = None
        self.vocab = list(
            set(normalize(corpus.words(categories=['news'])))
            )

    def vectorize(self, document):
        """
        Vectorizes a document consisting of a list of part of speech
        tagged tokens using the segmentation and tokenization methods.

        One-hot encode the set of documents
        """
        features = set(normalize(document))
        return np.array([
            token in features for token in self.vocab], np.short)

    def cluster(self, corpus):
        """
        Fits the K-Means model to the given data.
        """
        cosine = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(
            self.k, distance=cosine, avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(
                corpus.words(fileid)
            ) for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))

if __name__ == '__main__':
    corpus = PickledCorpusReader('../ATAwP/corpus')

    clusterer = KMeansTopics(corpus, k=7)
    clusterer.cluster(corpus)

    # Classify documents in the new corpus by cluster affinity
    groups  = [
        (clusterer.classify(corpus.words(fileid)), fileid)
        for fileid in corpus.fileids(categories=['news'])
    ]

    # Group documents in corpus by cluster and display them
    groups.sort(key=itemgetter(0))
    for group, items in groupby(groups, key=itemgetter(0)):
        for cluster, fname in items:
            print("Cluster {}: {}".format(cluster+1,fname))


  return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
  return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))


Cluster 1: news/56d62554c1808113ffb87492.pickle
Cluster 1: news/56d6255dc1808113ffb874f0.pickle
Cluster 1: news/56d62570c1808113ffb87557.pickle
Cluster 1: news/56d625abc1808113ffb87625.pickle
Cluster 1: news/56d63a76c1808113ffb8841c.pickle
Cluster 1: news/56d63ae1c1808113ffb886b5.pickle
Cluster 1: news/56d63af0c1808113ffb88745.pickle
Cluster 1: news/56d64c7ac1808115036122b4.pickle
Cluster 1: news/56d64cf2c1808115036125f5.pickle
Cluster 1: news/56d65c2ec1808116aade2f8a.pickle
Cluster 1: news/56d65c7cc1808116aade31db.pickle
Cluster 1: news/56d67ba4c18081170340346e.pickle
Cluster 1: news/56d68aadc18081172beb51c7.pickle
Cluster 1: news/56d6c797c18081189e0f7f37.pickle
Cluster 1: news/56d6d6f4c1808118d117500e.pickle
Cluster 1: news/56d6e677c1808118f8de4438.pickle
Cluster 1: news/56d709eec180810560aeb0cd.pickle
Cluster 1: news/56d70a26c180810560aeb34b.pickle
Cluster 1: news/56d717ccc180810545e5f39e.pickle
Cluster 1: news/56d73d29c18081073f0c83e7.pickle
Cluster 1: news/56d73d2fc18081073f0c8435