# clustering de News : K-Means sur SVD

In [9]:
### Il y a quelques "perles" Python   qui facilitent les choses !!!

# from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np


# Barre de progrès des calculs
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

# Arguments éventuels de la ligne de commande
op = OptionParser()
op.add_option("--lsa",
              dest="n_components", type="int",
              help="Preprocess documents with latent semantic analysis.")
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions)"
                   " to extract from text.")
op.add_option("--verbose",
              action="store_true", dest="verbose", default=False,
              help="Print progress reports inside k-means algorithm.")

print(__doc__)
op.print_help()


def is_interactive():
    return not hasattr(sys.modules['__main__'], '__file__')
 
# Sous  Jupyter notebook enlever ces lignes
je_suis_dans_jupyter=True
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)

if not je_suis_dans_jupyter :
    #argv = [] if is_interactive() else sys.argv[1:]
    #(opts, args) = op.parse_args(argv)
    if len(args) > 0:
        op.error("this script takes no arguments.")
        sys.exit(1)
 

Automatically created module for IPython interactive environment
Usage: ipykernel_launcher.py [options]

Options:
  -h, --help            show this help message and exit
  --lsa=N_COMPONENTS    Preprocess documents with latent semantic analysis.
  --no-minibatch        Use ordinary k-means algorithm (in batch mode).
  --no-idf              Disable Inverse Document Frequency feature weighting.
  --use-hashing         Use a hashing feature vectorizer
  --n-features=N_FEATURES
                        Maximum number of features (dimensions) to extract
                        from text.
  --verbose             Print progress reports inside k-means algorithm.


### Trace :
    Automatically created module for IPython interactive environment
    Usage: ipykernel_launcher.py [options]

    Options:
      -h, --help            show this help message and exit
      --lsa=N_COMPONENTS    Preprocess documents with latent semantic analysis.
      --no-minibatch        Use ordinary k-means algorithm (in batch mode).
      --no-idf              Disable Inverse Document Frequency feature weighting.
      --use-hashing         Use a hashing feature vectorizer
      --n-features=N_FEATURES
                            Maximum number of features (dimensions) to extract
                            from text.
      --verbose             Print progress reports inside k-means algorithm.


In [10]:
# #############################################################################
# Chageons QQ catégories depuis l'ensemble d'apprentissage
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Si décommenté, on fera Toutes les catégoroes !
# categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

# Voir les imports
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extraction des features du training set avec vectorisation")
t0 = time()
if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english', alternate_sign=False,
                                   norm=None, binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False, norm='l2',
                                       binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)
X = vectorizer.fit_transform(dataset.data)

print("Fait en  %fs" % (time() - t0))
print("Info : nb samples: %d, nb features: %d" % X.shape)
print()

if opts.n_components:
    print("SVD LSA", end=' ')
    print("de ",  X.shape[1], " features à", opts.n_components, " features")
    t0 = time()
    # Les résultats sont déjà normalisés. Ce qui fait que KMeans se comportera comme 
    # spherical k-means (mieux). 
    # ATTENTION : les résultats de  LSA/SVD ne sont pas Re-normalisés
    # --->> ON DOIT LE REFAIRE
    svd = TruncatedSVD(opts.n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)

    print("Fait en %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Variance expliquée par la SVD : {}%".format(
        int(explained_variance * 100)))

    print()

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
3387 documents
4 categories

Extraction des features du training set avec vectorisation
Fait en  0.661316s
Info : nb samples: 3387, nb features: 10000



### Trace
    Loading 20 newsgroups dataset for categories:
    ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    3387 documents
    4 categories

    Extraction des features du training set avec vectorisation
    Fait en  0.675751s
    Info : nb samples: 3387, nb features: 10000

In [11]:
# #############################################################################
# clustering

if opts.minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=opts.verbose)
else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=opts.verbose)

print("Donnes creuse de Clustering avec %s" % km)
t0 = time()
km.fit(X)
print("Fait en %0.3fs" % (time() - t0))
print()

# Quelques mesures propres au clustering et Kmeans
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print(" Rand-Index ajusté : %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Coefficient Silhouette : %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()


if not opts.use_hashing:
    print("Meilleurs termes par cluster:")

    if opts.n_components:
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
    else:
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()


Donnes creuse de Clustering avec MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
                init_size=1000, max_iter=100, max_no_improvement=10,
                n_clusters=4, n_init=1, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=False)
Fait en 0.126s

Homogeneity: 0.595
Completeness: 0.604
V-measure: 0.599
 Rand-Index ajusté : 0.610
Coefficient Silhouette : 0.007

Meilleurs termes par cluster:
Cluster 0: graphics university image thanks com file files 3d ac posting
Cluster 1: god sandvik jesus kent com bible apple christian people newton
Cluster 2: space nasa henry access digex gov toronto pat alaska shuttle
Cluster 3: com keith sgi livesey article people caltech don morality think


### Trace
    Donnes creuse de Clustering avec MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
                    init_size=1000, max_iter=100, max_no_improvement=10,
                    n_clusters=4, n_init=1, random_state=None,
                    reassignment_ratio=0.01, tol=0.0, verbose=False)
    Fait en 0.073s

    Homogeneity: 0.491
    Completeness: 0.524
    V-measure: 0.507
     Rand-Index ajusté : 0.413
    Coefficient Silhouette : 0.008

    Meilleurs termes par cluster:
    Cluster 0: god com people sandvik keith don article jesus say think
    Cluster 1: image images color polygon thanks bit 24 university vga graphics
    Cluster 2: graphics com university posting host nntp 3d ac computer ca
    Cluster 3: space nasa access henry digex pat toronto gov alaska moon