In [1]:
import logging
import sys

import numpy as np

from time import time
from optparse import OptionParser
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics

### Выбираем категории новостей для последующей кластеризации

In [2]:
# Newsgroups kept for the clustering experiment: two religion-related groups
# plus one computing and one science group.
categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]

### Загружаем данные по выбранным категориям

In [3]:
# Load the 20 Newsgroups posts restricted to `categories`.
# The fixed random_state makes the shuffled document order repeatable.
dataset = fetch_20newsgroups(
    categories=categories,
    subset="all",
    random_state=42,
    shuffle=True,
)

In [4]:
# Display the first loaded document as a sanity check of the raw text
# (the preview shows that posts still contain e-mail headers and quoted replies).
dataset.data[0] # type: ignore

'From: healta@saturn.wwc.edu (Tammy R Healy)\nSubject: Re: who are we to judge, Bobby?\nLines: 38\nOrganization: Walla Walla College\nLines: 38\n\nIn article <1993Apr14.213356.22176@ultb.isc.rit.edu> snm6394@ultb.isc.rit.edu (S.N. Mozumder ) writes:\n>From: snm6394@ultb.isc.rit.edu (S.N. Mozumder )\n>Subject: Re: who are we to judge, Bobby?\n>Date: Wed, 14 Apr 1993 21:33:56 GMT\n>In article <healta.56.734556346@saturn.wwc.edu> healta@saturn.wwc.edu (TAMMY R HEALY) writes:\n>>Bobby,\n>>\n>>I would like to take the liberty to quote from a Christian writer named \n>>Ellen G. White.  I hope that what she said will help you to edit your \n>>remarks in this group in the future.\n>>\n>>"Do not set yourself as a standard.  Do not make your opinions, your views \n>>of duty, your interpretations of scripture, a criterion for others and in \n>>your heart condemn them if they do not come up to your ideal."\n>>                         Thoughts Fromthe Mount of Blessing p. 124\n>>\n>>I hope quoting 

In [5]:
# Target label of every document, as provided by the dataset loader.
labels = dataset.target  # type: ignore
# Number of distinct labels; used below as the number of clusters to fit.
true_k = len(np.unique(labels))

### Создаём tf-idf векторизатор и преобразуем набор данных

In [6]:
# TF-IDF vectorizer configuration:
#   * stop_words: use scikit-learn's built-in English stop-word list;
#   * min_df / max_df: document-frequency cut-offs for rare and very common terms;
#   * max_features: upper bound on the vocabulary size;
#   * use_idf: enable inverse-document-frequency reweighting.
vectorizer = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.5,
    max_features=10000,
    use_idf=True,
)

In [7]:
# Fit the TF-IDF vectorizer on the raw documents and, in the same call,
# transform them into the feature matrix X that is clustered below.
X = vectorizer.fit_transform(dataset.data) # type: ignore

### Устанавливаем параметры для кластеризации и запускаем обучение модели

In [8]:
# Mini-batch k-means with one cluster per known newsgroup (true_k).
# random_state is fixed so that centroid initialisation and mini-batch
# sampling are repeatable; without it every "Restart & Run All" yields a
# different clustering and a different top-terms listing.
km = MiniBatchKMeans(
    n_clusters=true_k,
    init='k-means++',
    n_init=1,          # a single initialisation run
    init_size=1000,    # number of samples used for the initialisation
    batch_size=1000,   # number of samples per mini-batch step
    verbose=True,      # log inertia for every mini-batch step
    random_state=42,   # same seed as used when loading the dataset
)

In [9]:
# Train the clustering model on the TF-IDF matrix; with verbose=True the
# estimator logs the inertia of every mini-batch step while fitting.
km.fit(X)

Init 1/1 with method k-means++
Inertia for init 1/1: 1910.1218960637032
Minibatch step 1/338: mean batch inertia: 1.909955620307733
Minibatch step 2/338: mean batch inertia: 0.9763105930806969, ewa inertia: 0.9763105930806969
Minibatch step 3/338: mean batch inertia: 0.9734525323338511, ewa inertia: 0.974623426169926
Minibatch step 4/338: mean batch inertia: 0.9714204095151264, ewa inertia: 0.9727326253111306
Minibatch step 5/338: mean batch inertia: 0.9711383880543992, ewa inertia: 0.971791517131242
Minibatch step 6/338: mean batch inertia: 0.9713605171966444, ewa inertia: 0.9715370897790592
Minibatch step 7/338: mean batch inertia: 0.9697682023532994, ewa inertia: 0.9704928823258362
Minibatch step 8/338: mean batch inertia: 0.9680811156273901, ewa inertia: 0.9690691711697287
Minibatch step 9/338: mean batch inertia: 0.9690599025923492, ewa inertia: 0.9690636997545106
Minibatch step 10/338: mean batch inertia: 0.9687430594012348, ewa inertia: 0.9688744197348672
Minibatch step 11/338: 

### Для каждого центра кластера сортируем индексы признаков по убыванию веса

In [10]:
# Row i holds the feature indices of centroid i ordered from the largest
# centroid coordinate to the smallest (ascending argsort, then reversed).
order_centroids = np.argsort(km.cluster_centers_, axis=1)[:, ::-1]

### Выводим по 10 наиболее весомых признаков (слов) для центра каждого кластера

In [12]:
# Vocabulary terms, addressable by the feature indices stored in order_centroids.
terms = vectorizer.get_feature_names_out()
for cluster_id in range(true_k):
    print("Cluster %d:" % cluster_id)
    # The first 10 indices of a row select the leading terms of that centroid.
    for top_term in terms[order_centroids[cluster_id, :10]]:
        print(' %s' % top_term)

Cluster 0:
 space
 nasa
 henry
 toronto
 gov
 alaska
 moon
 shuttle
 orbit
 zoo
Cluster 1:
 god
 com
 people
 sandvik
 keith
 jesus
 don
 article
 say
 think
Cluster 2:
 com
 access
 posting
 nntp
 host
 university
 digex
 pat
 article
 know
Cluster 3:
 graphics
 image
 files
 file
 thanks
 gif
 images
 university
 com
 ac
