In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from sklearn.cluster import KMeans, MiniBatchKMeans

In [3]:
from sklearn import metrics

In [4]:
import numpy as np

# Выбираем интересующие нас тематики

In [5]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

In [6]:
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

In [7]:
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))

3387 documents
4 categories


In [8]:
labels = dataset.target
true_k = np.unique(labels).shape[0]

# Создаём векторизатор

In [10]:
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True) # False

In [11]:
X = vectorizer.fit_transform(dataset.data)

In [12]:
X.shape

(3387, 10000)

# Находим кластеры

In [13]:
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=True)

In [14]:
km.fit(X)

Initialization complete
Iteration  0, inertia 6470.985
Iteration  1, inertia 3299.081
Iteration  2, inertia 3281.147
Iteration  3, inertia 3276.337
Iteration  4, inertia 3274.419
Iteration  5, inertia 3273.454
Iteration  6, inertia 3272.831
Iteration  7, inertia 3272.398
Iteration  8, inertia 3272.061
Iteration  9, inertia 3271.719
Iteration 10, inertia 3271.464
Iteration 11, inertia 3271.223
Iteration 12, inertia 3270.979
Iteration 13, inertia 3270.759
Iteration 14, inertia 3270.568
Iteration 15, inertia 3270.355
Iteration 16, inertia 3269.997
Iteration 17, inertia 3269.659
Iteration 18, inertia 3269.415
Iteration 19, inertia 3269.235
Iteration 20, inertia 3269.144
Iteration 21, inertia 3269.087
Iteration 22, inertia 3269.034
Iteration 23, inertia 3269.008
Iteration 24, inertia 3268.992
Iteration 25, inertia 3268.980
Iteration 26, inertia 3268.963
Iteration 27, inertia 3268.947
Iteration 28, inertia 3268.934
Iteration 29, inertia 3268.905
Iteration 30, inertia 3268.889
Iteration 31, i

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=4, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)

In [15]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

Homogeneity: 0.379
Completeness: 0.453
V-measure: 0.412
Adjusted Rand-Index: 0.290
Silhouette Coefficient: 0.007


In [15]:
terms = vectorizer.get_feature_names()

In [18]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

In [27]:
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Cluster 0:
 uk
 ac
 mathew
 university
 article
 mantis
 posting
 host
 nntp
 like
Cluster 1:
 space
 nasa
 access
 article
 henry
 gov
 just
 posting
 com
 like
Cluster 2:
 com
 god
 article
 don
 people
 think
 just
 know
 say
 like
Cluster 3:
 university
 posting
 host
 nntp
 article
 graphics
 know
 like
 just
 cs
