# HDBSCAN의 간단한 예시 및 사용법

In [2]:
from sklearn.datasets import make_blobs
import pandas as pd

In [3]:
# Generate 2000 synthetic samples with 10 features each.
# random_state pins the seed so the data (and every clustering result derived
# from it) is identical on each Restart & Run All.
blobs, labels = make_blobs(n_samples=2000, n_features=10, random_state=42)
# Preview the first rows of the generated feature matrix.
pd.DataFrame(blobs).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-8.231211,-6.063429,11.613905,0.87938,2.422296,-6.773034,-7.340702,-1.951232,-3.01487,-5.01885
1,-5.853803,5.389588,1.744435,-1.668368,1.122424,-5.578011,2.086856,10.081042,5.434678,-1.825883
2,-10.57016,-0.257651,-0.412932,-2.823168,0.536926,-2.029317,7.301112,-8.63986,7.374099,0.123067
3,-8.153194,3.017862,0.181226,-2.002926,2.180609,-1.954914,9.320577,-7.803324,6.063245,0.696308
4,-8.294267,-1.554326,-1.760401,-1.014484,0.13516,-2.403443,7.460814,-7.882671,4.750843,-0.115992


In [6]:
import hdbscan

In [7]:
# Create an HDBSCAN clusterer without overriding any constructor argument.
clusterer = hdbscan.HDBSCAN()

In [9]:
# Fit the clusterer on the blob feature matrix; the value returned by fit()
# is the last expression, so it is shown as the cell output.
clusterer.fit(blobs)

HDBSCAN()

In [12]:
#HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True, gen_min_span_tree=False,
        #leaf_size=40, memory=Memory(cachedir=None), metric='euclidean',
        #min_cluster_size=5, min_samples=None, p=None)

## Cluster 결과해석
- HDBSCAN은 노이즈를 인식한다.
- 어떤 클러스터에도 속하지 않는 데이터 샘플은 노이즈로 간주되어 레이블 -1이 할당됨.
- 소프트 클러스터링을 구현하며, 각 데이터 요소에는 0.0에서 1.0 사이의 클러스터 멤버 자격 점수가 할당됨.
- <b>점수 0</b>: 클러스터에 전혀 속하지 않는 샘플(모든 노이즈 포인트), <b>점수 1</b>: 클러스터의 핵심에 있는 샘플

In [14]:
# Cluster label assigned to each sample by the fitted clusterer
clusterer.labels_

array([1, 0, 2, ..., 2, 2, 0], dtype=int64)

In [15]:
# Cluster membership score of each sample
clusterer.probabilities_

array([0.57480716, 0.7025811 , 0.76420535, ..., 0.5389692 , 0.80442683,
       0.9258222 ])

## 클러스터 생성에 설정할 수 있는 Metric option

In [17]:
# Re-cluster the same data using the Manhattan distance metric.
# fit() hands back the estimator, so construction and fitting are chained.
clusterer = hdbscan.HDBSCAN(metric='manhattan').fit(blobs)
# Show the labels produced under this metric.
clusterer.labels_

array([1, 0, 2, ..., 2, 2, 0], dtype=int64)

In [16]:
# Display the mapping from metric name strings to hdbscan distance classes,
# i.e. the names that can be passed as `metric=`.
hdbscan.dist_metrics.METRIC_MAPPING

{'euclidean': hdbscan.dist_metrics.EuclideanDistance,
 'l2': hdbscan.dist_metrics.EuclideanDistance,
 'minkowski': hdbscan.dist_metrics.MinkowskiDistance,
 'p': hdbscan.dist_metrics.MinkowskiDistance,
 'manhattan': hdbscan.dist_metrics.ManhattanDistance,
 'cityblock': hdbscan.dist_metrics.ManhattanDistance,
 'l1': hdbscan.dist_metrics.ManhattanDistance,
 'chebyshev': hdbscan.dist_metrics.ChebyshevDistance,
 'infinity': hdbscan.dist_metrics.ChebyshevDistance,
 'seuclidean': hdbscan.dist_metrics.SEuclideanDistance,
 'mahalanobis': hdbscan.dist_metrics.MahalanobisDistance,
 'wminkowski': hdbscan.dist_metrics.WMinkowskiDistance,
 'hamming': hdbscan.dist_metrics.HammingDistance,
 'canberra': hdbscan.dist_metrics.CanberraDistance,
 'braycurtis': hdbscan.dist_metrics.BrayCurtisDistance,
 'matching': hdbscan.dist_metrics.MatchingDistance,
 'jaccard': hdbscan.dist_metrics.JaccardDistance,
 'dice': hdbscan.dist_metrics.DiceDistance,
 'kulsinski': hdbscan.dist_metrics.KulsinskiDistance,
 'rogerst

## 거리행렬 (Distance matrices) - metric='precomputed'

In [18]:
from sklearn.metrics.pairwise import pairwise_distances

In [19]:
# Compute the pairwise distance matrix between all samples up front.
distance_matrix = pairwise_distances(blobs)
# metric='precomputed' makes HDBSCAN treat its input as distances rather than
# raw feature vectors; construction and fitting are chained since fit()
# returns the estimator.
clusterer = hdbscan.HDBSCAN(metric='precomputed').fit(distance_matrix)
# Show the labels obtained from the precomputed distances.
clusterer.labels_

array([0, 2, 1, ..., 1, 1, 2], dtype=int64)