<a href="https://colab.research.google.com/github/Vaycold/Python_ML/blob/main/Clustering/%234.HDBSCAN%20wtih%20iris%20dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Goal
 - Clustering의 다양한 알고리즘
 - 시각화 및 차이점

## Data load & Preprocessing

In [2]:
# importing module
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [3]:
# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns are the dataset's own feature names.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
# Reassign the column names to short snake_case identifiers.
columns_name_list = 'sepal_length sepal_width petal_length petal_width'.split()
iris_df.columns = columns_name_list
iris_df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')

In [5]:
# The same renaming can also be done with an {old name: new name} mapping.

columns_replace_dict = dict(zip(iris.feature_names, columns_name_list))
iris_df.rename(columns=columns_replace_dict, inplace=True)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Add iris.target to the frame as a new `target` column.
iris_df['target'] = iris.target

In [7]:
# Preview the first rows now that the `target` column has been added.
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [8]:
# Check which distinct target values are present.
iris_df['target'].unique()

array([0, 1, 2])

In [9]:
# Count missing values per column.
iris_df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
target          0
dtype: int64

## 데이터 시각화

In [10]:
# Show the dtype of every column.
iris_df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
target            int64
dtype: object

In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) for each column.
iris_df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [12]:
import plotly.express as px

# Scatter plot of sepal length against sepal width, with the figure size
# set in the same expression (update_layout returns the figure).
fig = px.scatter(iris_df, x='sepal_length', y='sepal_width').update_layout(
    width=600,
    height=500,
)

fig.show()


In [13]:
# Same scatter plot, with points coloured by the target column.

fig = px.scatter(
    iris_df,
    x='sepal_length',
    y='sepal_width',
    color='target',
).update_layout(width=600, height=500)
fig.show()


## Split the dataset


In [14]:
# Features: every row, every column except the last one.
X = iris_df.iloc[:, :-1]
# Labels: the last column (`target`).
y = iris_df.iloc[:, -1]
# 80/20 split. The seed is fixed so that the same rows land in train/test on
# every Restart & Run All; without it the split changes on each execution.
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

## Clustering Algorithm  - HDBSCAN clustering 
   
  - DBSCAN에 계층적(Hierarchical) 군집화를 결합한 알고리즘
  - different size, densities, noise, arbitrary shapes인 데이터에 적합
  - 계층적 구조를 반영한 cluster


In [15]:
# Generate data with a variety of distributions and sizes.
# Every generator gets its own fixed seed: the dataset (and therefore every
# clustering result computed from it) is then identical on each run, and the
# separate seeds keep the individual point sets from sharing one noise stream.

from sklearn.datasets import make_blobs, make_moons
moons, _ = make_moons(n_samples=100, noise=0.05, random_state=0)
# Blobs are spherical, so their centres can be specified explicitly.
blobs1, _ = make_blobs(n_samples=50, centers=[(-0.75, 2.25), (1.0, 2.0)], cluster_std=0.25, random_state=1)
blobs2, _ = make_blobs(n_samples=30, centers=[(-0.3, -1), (4.0, 1.5)], cluster_std=0.3, random_state=2)
blobs3, _ = make_blobs(n_samples=100, centers=[(3, -1), (4.0, 1.5)], cluster_std=0.4, random_state=3)

# Stack all point sets into one (n_points, 2) array and wrap it in a DataFrame.
hdb_data = np.vstack([moons, blobs1, blobs2, blobs3])
hdb_data_df = pd.DataFrame(hdb_data, columns=['x', 'y'])
hdb_data_df.head()

Unnamed: 0,x,y
0,0.412183,0.935172
1,0.182911,-0.046977
2,0.921653,0.365933
3,0.416868,0.823624
4,-0.397397,0.876612


In [16]:
# (rows, columns) of the combined synthetic dataset.
hdb_data_df.shape

(280, 2)

In [17]:
# Plot the raw synthetic points before any clustering is applied.
fig = px.scatter(hdb_data_df, x='x', y='y').update_layout(
    width=600,
    height=500,
    title='HDBSCAN Data ',
)
fig.show()

In [18]:
# HDBSCAN 알고리즘 
# parameters
# - min_cluster_size            : 군집화를 위한 최소한의 cluster 사이즈
# - min_samples                 : 반경 내 있어야 할 최소 data points
# - cluster_selection_epsilon   : 거리 기준, 이 기준보다 아래의 거리는 cluster 끼리 merge됨.
!pip install hdbscan
import hdbscan
hdbscan_model = hdbscan.HDBSCAN()

Collecting hdbscan
[?25l  Downloading https://files.pythonhosted.org/packages/32/bb/59a75bc5ac66a9b4f9b8f979e4545af0e98bb1ca4e6ae96b3b956b554223/hdbscan-0.8.27.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.4MB 5.7MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (PEP 517) ... [?25l[?25hdone
  Created wheel for hdbscan: filename=hdbscan-0.8.27-cp37-cp37m-linux_x86_64.whl size=2311701 sha256=b67603db85b69d6febccd14548b652038f11a02364e9eb9e8e399f834369125b
  Stored in directory: /root/.cache/pip/wheels/42/63/fb/314ad6c3b270887a3ecb588b8e5aac50b0fad38ff89bb6dff2
Successfully built hdbscan
Installing collected packages: hdbscan
Successfully installed hdbscan-0.8.27


In [19]:
# Re-create the model, this time passing min_cluster_size explicitly.
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size = 5)

In [20]:
# Fit the model on the synthetic points; the cell displays the fitted estimator.
hdbscan_model.fit(hdb_data)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
        approx_min_span_tree=True, cluster_selection_epsilon=0.0,
        cluster_selection_method='eom', core_dist_n_jobs=4,
        gen_min_span_tree=False, leaf_size=40,
        match_reference_implementation=False, memory=Memory(location=None),
        metric='euclidean', min_cluster_size=5, min_samples=None, p=None,
        prediction_data=False)

In [21]:
# Inspect the labels produced by fitting the model (first 20 points).
hdbscan_label = hdbscan_model.fit(hdb_data).labels_
hdbscan_label[:20]

array([4, 5, 4, 4, 4, 4, 4, 5, 5, 5, 5, 4, 5, 5, 5, 4, 5, 5, 5, 5])

In [22]:
set(hdbscan_label) # -1 marks outliers

{-1, 0, 1, 2, 3, 4, 5}

In [23]:
# Visualise the result.
# Labels are stored as strings before being used as the colour column.
hdb_data_df['hdbscan_label'] = pd.Series(hdbscan_label, index=hdb_data_df.index).astype(str)

fig = px.scatter(hdb_data_df, x='x', y='y', color='hdbscan_label').update_layout(
    width=600,
    height=500,
    title='HDBSCAN Data ',
)
fig.show()

In [24]:
# Compare the results while varying the parameters.

In [25]:
# 1. min_cluster_size
# Re-cluster the same data for several min_cluster_size values and plot each
# result, reporting the number of distinct labels and of outliers in the title.

for min_cluster_size in [3, 5, 7, 9, 13]:
    hdbscan_label = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=None, prediction_data=True).fit_predict(hdb_data)
    hdb_data_df['hdbscan_label'] = hdbscan_label
    hdb_data_df['hdbscan_label'] = hdb_data_df['hdbscan_label'].astype(str)

    # Count the points per label in one pass.
    unique_labels, label_counts = np.unique(hdbscan_label, return_counts=True)
    hdbscan_case_dict = dict(zip(unique_labels.tolist(), label_counts.tolist()))
    # A run may produce no outliers at all, in which case -1 is absent from the
    # dict; .get avoids the KeyError that plain indexing with [-1] would raise.
    outliers = hdbscan_case_dict.get(-1, 0)

    fig = px.scatter(hdb_data_df, x='x', y='y', color='hdbscan_label')
    fig.update_layout(width=600, height=500, title=f'min_cluster_size : {min_cluster_size} >> label 수 : {len(set(hdbscan_label))}, outlier : {outliers}')
    fig.show()



In [26]:
# 2. min_samples - roughly: how many nearby points are enough for a region to
# be treated as part of a cluster.
# Re-cluster the same data for several min_samples values and plot each result,
# reporting the number of distinct labels and of outliers in the title.

for min_samples in [3, 5, 7, 9, 13]:
    hdbscan_label = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=min_samples, prediction_data=True).fit_predict(hdb_data)
    hdb_data_df['hdbscan_label'] = hdbscan_label
    hdb_data_df['hdbscan_label'] = hdb_data_df['hdbscan_label'].astype(str)

    # Count the points per label in one pass.
    unique_labels, label_counts = np.unique(hdbscan_label, return_counts=True)
    hdbscan_case_dict = dict(zip(unique_labels.tolist(), label_counts.tolist()))
    # A run may produce no outliers at all, in which case -1 is absent from the
    # dict; .get avoids the KeyError that plain indexing with [-1] would raise.
    outliers = hdbscan_case_dict.get(-1, 0)

    fig = px.scatter(hdb_data_df, x='x', y='y', color='hdbscan_label')
    fig.update_layout(width=600, height=500, title=f'min_samples : {min_samples} >> label 수 : {len(set(hdbscan_label))}, outlier : {outliers}')
    fig.show()



In [27]:
# 3. cluster_selection_epsilon
# Re-cluster the same data for several epsilon values and plot each result.

for cluster_selection_epsilon in (0.1, 0.5, 0.7, 1.0):
    hdbscan_label = hdbscan.HDBSCAN(
        min_cluster_size=5,
        min_samples=None,
        cluster_selection_epsilon=cluster_selection_epsilon,
        prediction_data=True,
    ).fit_predict(hdb_data)
    hdb_data_df['hdbscan_label'] = pd.Series(hdbscan_label, index=hdb_data_df.index).astype(str)

    # Also report how many points were labelled as outliers (-1); 0 when none were.
    hdbscan_case_dict = {label: list(hdbscan_label).count(label) for label in set(hdbscan_label)}
    outliers = hdbscan_case_dict.get(-1, 0)

    plot_title = f'cluster_selection_epsilon : {cluster_selection_epsilon} >> label 수 : {len(set(hdbscan_label))}, outlier : {outliers}'
    fig = px.scatter(hdb_data_df, x='x', y='y', color='hdbscan_label')
    fig.update_layout(width=600, height=500, title=plot_title)
    fig.show()

## DBSCAN vs HDBSCAN

In [29]:
# Create two groups of blobs with extremely different spread:
# two tight blobs (std 0.5) and two wide, overlapping blobs (std 5).
# Seeds are fixed (and distinct) so the comparison below is reproducible.
blobs1, _ = make_blobs(n_samples=200, centers=[(-10, 5), (0, -5)], cluster_std=0.5, random_state=10)
blobs2, _ = make_blobs(n_samples=200, centers=[(30, -1), (30, 1.5)], cluster_std=5, random_state=11)
comp_data = np.vstack([blobs1, blobs2])
comp_data_df = pd.DataFrame(comp_data, columns=['x', 'y'])
comp_data_df.head()

Unnamed: 0,x,y
0,-10.014442,4.755662
1,-0.180254,-4.758106
2,-0.114337,-6.051695
3,-1.203935,-3.807017
4,-10.312572,4.848953


In [32]:
# Scatter plot of the comparison dataset before clustering.
fig = px.scatter(comp_data_df, x='x', y='y').update_layout(
    width=600,
    height=500,
    title='Data',
)
fig.show()

In [35]:
# DBSCAN: fit on the comparison data and store its labels as strings.
from sklearn.cluster import DBSCAN
dbscan_model = DBSCAN(eps=0.6, min_samples=10).fit(comp_data)
comp_data_df['dbscan_model'] = pd.Series(dbscan_model.labels_, index=comp_data_df.index).astype(str)

# HDBSCAN: fit on the same data and store its labels as strings as well.
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=5,
    min_samples=None,
    cluster_selection_epsilon=0.1,
    gen_min_span_tree=True,
)
comp_data_df['hdbscan_model'] = pd.Series(hdbscan_model.fit_predict(comp_data), index=comp_data_df.index).astype(str)

In [36]:
# Preview the frame with the DBSCAN and HDBSCAN label columns added.
comp_data_df.head()

Unnamed: 0,x,y,dbscan_model,hdbscan_model
0,-10.014442,4.755662,0,1
1,-0.180254,-4.758106,1,2
2,-0.114337,-6.051695,1,2
3,-1.203935,-3.807017,1,2
4,-10.312572,4.848953,0,1


In [37]:
# Colour lookup keyed by cluster label (as a string), used to build colour
# columns in which outliers ('-1') stand out from the real clusters.
color_dict = dict([
    ('-1', '#d8d8d8'),
    ('0', '#ff5e5b'),
    ('1', '#457b9d'),
    ('2', '#00cecb'),
    ('3', '#FFED66'),
])

In [42]:
import plotly.graph_objects as go

# color_dict only covers the labels '-1' through '3'. Either algorithm can
# return more clusters than that, and an unmapped label would become NaN,
# which is not a valid marker colour. Give such labels an explicit fallback.
fallback_color = '#6a4c93'
comp_data_df['dbscan_label_color'] = comp_data_df['dbscan_model'].map(color_dict).fillna(fallback_color)
comp_data_df['hdbscan_label_color'] = comp_data_df['hdbscan_model'].map(color_dict).fillna(fallback_color)

# Draw one scatter plot per algorithm, colouring each point by its label colour.
for label_case in ['hdbscan_label_color', 'dbscan_label_color']:
    fig = go.Figure(data=go.Scatter(
        x=comp_data_df['x'],
        y=comp_data_df['y'],
        mode='markers',
        # No showscale: a colour bar only applies to numeric colour arrays,
        # and these are explicit colour strings.
        marker=dict(color=comp_data_df[label_case]),
    ))
    fig.update_layout(width=600, height=500, title=f'{label_case} 시각화')
    fig.show()