# Подгружаем библиотеки

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import scipy as sp
import os
import sys
from sklearn.cluster import KMeans

Создаём список данных

In [2]:
# Toy corpus: five short posts that serve as the documents to vectorize.
text = [
    (
        "This is a toy post about machine learning. "
        "Actually, it contains not much interesting stuff."
    ),
    "Imaging databases provide storage capabilities.",
    "Most imaging databases save images permanently.",
    "Imaging databases store data.",
    # The previous post repeated three times.
    (
        "Imaging databases store data. "
        "Imaging databases store data. "
        "Imaging databases store data."
    ),
]

In [3]:
vectorizer = CountVectorizer() # Create the vectorizer with its default settings
print(vectorizer.analyzer) # Check which analyzer the vectorizer is configured with

word


In [4]:
X = vectorizer.fit_transform(text) # Fit the vectorizer on the posts and get their count matrix

In [5]:
# List the distinct words of the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # The newer method returns an array; convert to a list so the printed
    # form matches what get_feature_names() used to produce.
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'save', 'storage', 'store', 'stuff', 'this', 'toy']


In [6]:
# Rows of X are posts, columns are vocabulary words.
num_samples, num_features = X.shape
print(X.shape) # Shape of the resulting count matrix

(5, 25)


In [7]:
# Dense, transposed copy of the count matrix, printed together with its type.
term_doc = X.toarray().transpose()
print(term_doc, type(term_doc))

[[1 0 0 0 0]
 [1 0 0 0 0]
 [0 1 0 0 0]
 [1 0 0 0 0]
 [0 0 0 1 3]
 [0 1 1 1 3]
 [0 0 1 0 0]
 [0 1 1 1 3]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [0 0 1 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [0 0 1 0 0]
 [1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 1 0 0 0]
 [0 0 0 1 3]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]] <class 'numpy.ndarray'>


In [8]:
# Query post that will be compared against the stored posts.
new_post =  "imaging databases"
new_post_vec = vectorizer.transform([new_post]) # Vectorize the query with the already fitted vocabulary
print(new_post_vec, type(new_post_vec)) 

  (0, 5)	1
  (0, 7)	1 <class 'scipy.sparse.csr.csr_matrix'>


In [9]:
print(new_post_vec.toarray()) # Show the query vector in dense form

[[0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [10]:
def dist_raw(v1, v2):
    """Return the Euclidean distance between two sparse vectors.

    Args:
        v1: Sparse matrix holding one vector; the difference ``v1 - v2``
            must provide ``toarray()``.
        v2: Sparse matrix of the same shape as ``v1``.

    Returns:
        The norm of ``v1 - v2`` as computed by ``scipy.linalg.norm``.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.linalg`` is reachable as an attribute.
    from scipy import linalg

    delta = v1 - v2
    return linalg.norm(delta.toarray())

In [11]:
def _unit_vector(v):
    """Scale sparse vector *v* to unit length; an all-zero vector is returned unchanged."""
    from scipy import linalg

    length = linalg.norm(v.toarray())
    # A zero norm cannot be divided by, so such a vector is left as it is.
    return v / length if length else v


def dist_norm(v1, v2):
    """Return the Euclidean distance between two length-normalized sparse vectors.

    Each vector is first scaled to unit length, so vectors that differ only
    by a positive factor are at distance 0.

    Args:
        v1: Sparse matrix holding one vector (must provide ``toarray()``).
        v2: Sparse matrix of the same shape as ``v1``.

    Returns:
        The norm of the difference of the normalized vectors. An all-zero
        vector is not scaled, so its distance to any unit vector is 1.0.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.linalg`` is reachable as an attribute.
    from scipy import linalg

    delta = _unit_vector(v1) - _unit_vector(v2)
    return linalg.norm(delta.toarray())

In [12]:
# Select the normalized distance as the active distance function.
dist = dist_norm

In [13]:
# Find the stored post closest to the new post under the selected distance.

best_dist = float("inf")  # every real distance compares smaller than this
best_i = None

for i, post in enumerate(text):
    # Skip a stored post that is textually identical to the query.
    if post == new_post:
        continue
    d = dist(X.getrow(i), new_post_vec)

    print("=== Post %i with dist=%.2f: %s" % (i, d, post))

    # Strict comparison keeps the first post among equally close ones.
    if d < best_dist:
        best_dist = d
        best_i = i

if best_i is None:
    # No post was compared; formatting None with %i would raise TypeError.
    print("No post to compare with")
else:
    print("Best post is %i with dist=%.2f" % (best_i, best_dist))

=== Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist=0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist=0.92: Most imaging databases save images permanently.
=== Post 3 with dist=0.77: Imaging databases store data.
=== Post 4 with dist=0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3 with dist=0.77


# Строим модель

In [14]:
# Build the K-means model: random initialisation, a single run, fixed seed.
km = KMeans(
    n_clusters=5,
    init='random',
    n_init=1,
    verbose=1,
    random_state=42,
)
km.fit(X)  # Train the model on the count matrix
print(km.cluster_centers_)  # Cluster centres found by the model
print(km.labels_)  # Cluster label assigned to each post

Initialization complete
Iteration 0, inertia 0.0
Converged at iteration 0: center shift 0.0 within tolerance 3.2000000000000005e-05.
[[0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 3. 3. 0. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 3. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0.
  0.]
 [1. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 1.
  1.]
 [0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0.]]
[3 0 2 4 1]
