# 3章　クラスタリング：関連のある文書を見つける

In [45]:
import sklearn.datasets
import nltk
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose analyzer stems every token it produces.

    Tokens come from the inherited analyzer and are then passed through the
    module-level ``english_stemmer``.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to a generator of stemmed tokens."""
        # Name this class (not TfidfVectorizer) in super(): the lookup must
        # start at the direct parent so an analyzer defined there is not skipped.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Plain term-count vectorizer used for the toy example below; every other
# parameter is left at its default (see the printed repr in the next cell).
vectorizer = CountVectorizer(min_df=1)

In [6]:
# Print the vectorizer's repr to inspect all of its parameter settings.
print(vectorizer)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [8]:
# Two toy posts used to illustrate the bag-of-words representation.
content = [
    "How to format my hard disk",
    "Hard disk format problems",
]
# Learn the vocabulary from the posts and build their term-count matrix.
X = vectorizer.fit_transform(content)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); it is kept here to match the recorded run.
vectorizer.get_feature_names()

['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']

In [9]:
# Transposed so that each row is one vocabulary term and each column one post.
print(X.toarray().transpose())

[[1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]


## 3.3 クラスタリング

In [11]:
import sklearn.datasets
# NOTE(review): hard-coded Windows-style path. In the recorded run it was
# resolved underneath the working directory and the load failed with
# "ValueError: Could not find folder". The fetch_20newsgroups cells further
# down load the data instead.
MLCOMP_DIR = r"D:\data"
# NOTE(review): load_mlcomp is no longer available in recent scikit-learn
# releases — confirm the installed version before re-running this cell.
data = sklearn.datasets.load_mlcomp("20news-18828", mlcomp_root = MLCOMP_DIR)
print(data.filenames)

ValueError: Could not find folder: /Users/shooscke/Building Machine Learning Systems with Python/D:\data

In [15]:
# NOTE(review): MLCOMP_DATASETS_HOME is never assigned in the cells shown, and
# the recorded run of this cell ends in a NameError. The fetch_20newsgroups
# cells that follow load the 20 Newsgroups training data instead.
train_data = sklearn.datasets.load_mlcomp("20news-18828", "train", mlcomp_root = MLCOMP_DATASETS_HOME)

NameError: name 'MLCOMP_DATASETS_HOME' is not defined

In [17]:
from sklearn.datasets import fetch_20newsgroups

In [19]:
# Load the "train" subset of 20 Newsgroups; in the recorded run this call
# downloaded the dataset archive.
train_data = fetch_20newsgroups(subset = "train")

Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)


In [20]:
# Load the "test" subset of 20 Newsgroups.
test_data = fetch_20newsgroups(subset = "test")

In [23]:
# Number of posts in each split: train first, then test.
print(len(train_data.filenames))
print(len(test_data.filenames))

11314
7532


In [25]:
# Restrict the corpus to a handful of newsgroups to keep the experiment small.
groups = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'sci.space',
]
# Rebinds train_data: the full training split loaded earlier is replaced by
# the category-filtered one.
train_data = fetch_20newsgroups(subset="train", categories=groups)
print(len(train_data.filenames))

3529


In [48]:
# Rebinds `vectorizer` to the stemming TF-IDF variant and vectorizes the
# filtered training posts.
vectorizer = StemmedTfidfVectorizer(
    min_df=10,
    max_df=0.5,
    stop_words='english',
)
vectorized = vectorizer.fit_transform(train_data.data)
# shape is (number of posts, size of the learned vocabulary).
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % vectorized.shape)

#samples: 3529, #features: 4712


In [53]:
# Number of clusters requested from KMeans in the next cell.
num_clusters = 50

In [54]:
from sklearn.cluster import KMeans

# Single run (n_init=1) from randomly chosen initial centers, with progress
# logged per iteration.
# NOTE(review): no random_state is passed, so the cluster labels and inertia
# values will differ from run to run.
km = KMeans(
    n_clusters=num_clusters,
    init='random',
    n_init=1,
    verbose=1,
)
km.fit(vectorized)

Initialization complete
Iteration  0, inertia 5890.112
Iteration  1, inertia 3214.751
Iteration  2, inertia 3171.228
Iteration  3, inertia 3146.766
Iteration  4, inertia 3133.403
Iteration  5, inertia 3125.185
Iteration  6, inertia 3120.057
Iteration  7, inertia 3116.460
Iteration  8, inertia 3114.682
Iteration  9, inertia 3113.155
Iteration 10, inertia 3111.616
Iteration 11, inertia 3110.478
Iteration 12, inertia 3109.760
Iteration 13, inertia 3108.818
Iteration 14, inertia 3107.582
Iteration 15, inertia 3106.658
Iteration 16, inertia 3106.182
Iteration 17, inertia 3105.360
Iteration 18, inertia 3105.121
Iteration 19, inertia 3105.017
Iteration 20, inertia 3104.804
Iteration 21, inertia 3104.593
Iteration 22, inertia 3104.331
Iteration 23, inertia 3104.164
Iteration 24, inertia 3103.763
Iteration 25, inertia 3103.428
Iteration 26, inertia 3103.259
Iteration 27, inertia 3102.909
Iteration 28, inertia 3102.687
Iteration 29, inertia 3102.651
Iteration 30, inertia 3102.615
Converged at it

KMeans(copy_x=True, init='random', max_iter=300, n_clusters=50, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=1)

In [61]:
# Cluster index assigned to each training post by the fitted model.
km.labels_

array([25, 25, 20, ..., 14, 33, 41], dtype=int32)

In [62]:
# One label per vectorized post (matches the sample count printed earlier).
km.labels_.shape

(3529,)