Источник:
http://scikit-learn.org/stable/auto_examples/text/document_clustering.html

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

In [2]:
from sklearn.cluster import KMeans, MiniBatchKMeans

In [3]:
import logging
from optparse import OptionParser
import sys

In [5]:
import numpy as np
import pandas as pd
import nltk
import re
import os

In [6]:
random_seed = 42

In [7]:
from stop_words import get_stop_words
stop_words = get_stop_words('ru')
for word in ['свой', 'р', 'г', 'назир', 'назиров', 'б']:
    stop_words.append(word)

In [8]:
documents = []
filenames = []

for d, dirs, files in os.walk('lemma_texts'):
    for f in files:
        text = open(os.path.join('.', d, f), 'r', encoding='utf-8').read()
        documents.append(text.replace('\n', ' ').replace('?', ''))
        filenames.append(d.replace('lemma_texts/', '') + os.sep + f)

In [46]:
categories = {'diary': 0, 'dostoevsky': 3, 'literature': 1, 'myth': 2}
true_k = 4

In [47]:
labels = np.array([])

for cat in filenames:
    labels = np.concatenate((labels, [categories[cat.split('/')[0]]]))

In [60]:
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                                 min_df=2, stop_words=stop_words,
                                 use_idf=opts.use_idf)

In [61]:
X = vectorizer.fit_transform(documents)

In [62]:
print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 65, n_features: 10000


In [63]:
km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                        init_size=1000, batch_size=1000, verbose=opts.verbose)

In [64]:
km.fit(X)

MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=4,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=False)

In [65]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

Homogeneity: 0.575
Completeness: 0.566
V-measure: 0.571
Adjusted Rand-Index: 0.475
Silhouette Coefficient: 0.048


In [68]:
print("Top terms per cluster:")

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: мифология обряд череп моисей обычай золушка мифологический животное аттила архаический
Cluster 1: печорин лермонтов игрок онегин подпольный усадебный полина тургенев подполье иванович
Cluster 2: хрущев сша американский фильм цк жанр подлинность партия президент американец
Cluster 3: раскольников чехов тургенев мышкин рогожин сновидение барятинский фабула карамазов митя


In [72]:
km.labels_

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 1, 3, 2, 1, 2, 2, 2, 1, 3, 1,
       2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 1, 1,
       3, 3, 3, 1, 3, 3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 1, 3, 1, 3], dtype=int32)

In [70]:
labels

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,
        3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.])