In [7]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

n_features = 10000

In [8]:
# #############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
 #   'talk.religion.misc',
    'comp.graphics',
#    'sci.space',
]
# Uncomment the following to do the analysis on all the categories
# categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset "
      "using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=True)
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'comp.graphics']
1772 documents
2 categories

Extracting features from the training dataset using a sparse vectorizer
done in 0.542880s
n_samples: 1772, n_features: 10000



In [9]:
print("Performing dimensionality reduction using LSA")
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(100)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

print()


Performing dimensionality reduction using LSA
done in 0.346190s
Explained variance of the SVD step: 28%



In [None]:
opts.n_components

In [10]:
# #############################################################################
# Do the actual clustering

km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()



print("Top terms per cluster:")


original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]


terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=2, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 1596.806186725811
start iteration
done sorting
end inner loop
Iteration 1, inertia 1572.9427959007603
start iteration
done sorting
end inner loop
Iteration 2, inertia 1572.010338993808
start iteration
done sorting
end inner loop
Iteration 3, inertia 1571.9393386180273
start iteration
done sorting
end inner loop
Iteration 4, inertia 1571.9256294163342
start iteration
done sorting
end inner loop
Iteration 5, inertia 1571.9256294163342
center shift 0.000000e+00 within tolerance 9.195605e-07
done in 0.020s

Homogeneity: 0.875
Completeness: 0.873
V-measure: 0.874
Adjusted Rand-Index: 0.931
Silhouette Coefficient: 0.038

Top terms per cluster:
Cluster 0: graphics com university thanks image 3d 

In [None]:
X.shape