# DenseClus 클러스터링 예제

## 1. 라이브러리 호출

In [None]:

import logging
import warnings

# Standard library
import os

# Data handling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning & clustering
from sklearn import metrics, preprocessing
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
import umap.umap_ as umap
from umap import validation
import hdbscan
from hdbscan.validity import validity_index

# Custom modules
from denseclus import DenseClus

# Hide only the FutureWarning whose message mentions `force_all_finite`;
# all other warnings keep their normal handling.
warnings.filterwarnings(
    action="ignore",
    message=".*force_all_finite.*",
    category=FutureWarning,
)

# Route warnings through the logging system instead of the default
# warnings display.
logging.captureWarnings(capture=True)


In [None]:
# Notebook-wide seaborn theme: dark grid background and a (10, 8) default
# figure size.
theme_rc = {"figure.figsize": (10, 8)}
sns.set_theme(style="darkgrid", rc=theme_rc)

## 2. 데이터 불러오기

In [None]:

# Fixed seed: applied to NumPy's global RNG here and available to later
# cells as SEED.
SEED = 42
np.random.seed(seed=SEED)

# Location of the training CSV, relative to the working directory.
data_path = './data/flat-training.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Optional preprocessing: drop columns that contain missing values, or
# work on a random subsample.
# df = df.dropna(axis=1)
# df = df.sample(n=2000, random_state=SEED)

# Preview the first rows of the loaded frame.
df.head()


## 3. 데이터 타입 확인

In [None]:
# Display the dtype pandas inferred for each column of df.
df.dtypes

## 4. DenseClus 학습

In [None]:

# Configure DenseClus with the notebook-wide seed and the "intersection"
# UMAP combine method, then fit it on the loaded DataFrame.
denseclus_kwargs = {
    "random_state": SEED,
    "umap_combine_method": "intersection",
}
clf = DenseClus(**denseclus_kwargs)
clf.fit(df)


## 5. 클러스터 통계

In [None]:

# Embedding held by the fitted mapper, and the labels returned by evaluate().
embedding = clf.mapper_.embedding_
labels = clf.evaluate()

# Boolean mask: True where the label is non-negative.
clustered = labels >= 0

# Count rows per label value and print the table ordered by label.
cnts = (
    pd.Series(labels)
    .value_counts()
    .rename_axis("cluster")
    .reset_index(name="count")
)
print(cnts.sort_values("cluster").reset_index(drop=True))


In [None]:
# Display the sorted distinct values present in labels.
np.unique(labels)

In [None]:
# Display the dimensions of the embedding array.
embedding.shape

## 6. 임베딩 시각화

In [None]:

# KDE joint plot of the first two embedding dimensions, restricted to rows
# with a non-negative label and coloured by that label.
clustered_points = embedding[clustered]
_ = sns.jointplot(
    x=clustered_points[:, 0],
    y=clustered_points[:, 1],
    hue=labels[clustered],
    kind="kde",
)
plt.show()


## 7. Condensed Tree 시각화

In [None]:

# Size the palette by the number of distinct non-negative labels, i.e. one
# colour per cluster. `clustered` is a boolean mask, so np.unique(clustered)
# yields at most two values and would request at most two colours regardless
# of how many clusters were found.
# NOTE(review): hdbscan appears to ignore a palette that is shorter than the
# number of selected clusters and draw every selection in one colour -- confirm
# against the installed hdbscan version.
n_clusters = np.unique(labels[clustered]).shape[0]

_ = clf.hdbscan_.condensed_tree_.plot(
    select_clusters=True,
    selection_palette=sns.color_palette("deep", n_clusters),
)
plt.show()


## 8. 커버리지와 DBCV 점수

In [None]:

# Coverage: number of rows with a non-negative label divided by the number
# of rows in the embedding.
n_rows = embedding.shape[0]
coverage = np.sum(clustered) / n_rows
print(f"Coverage {coverage}")

# relative_validity_ of the fitted HDBSCAN model, reported as the DBCV score.
dbcv_score = clf.hdbscan_.relative_validity_
print(f"DBCV score {dbcv_score}")


## 9. 임베딩 분포

In [None]:

# Draw one filled KDE curve per embedding dimension on the same axes.
# Iterating over the transpose yields each column of the embedding in turn.
for dimension_values in clf.mapper_.embedding_.T:
    sns.kdeplot(dimension_values, fill=True)
plt.show()


## 10. UMAP Trustworthiness

In [None]:

# Largest neighbourhood size passed to trustworthiness_vector as max_k.
K = 50

# Silence warnings (e.g. numba deprecation noise from UMAP) only while the
# trustworthiness vectors are computed. catch_warnings() restores the
# previous filter list on exit, so filters installed earlier in the notebook
# are neither overridden nor left shadowed by a catch-all "ignore"/"default"
# pair.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Trustworthiness of the numerical UMAP embedding against its input data.
    numerical_trustworthiness = validation.trustworthiness_vector(
        source=clf.numerical_umap_._raw_data,
        embedding=clf.numerical_umap_.embedding_,
        max_k=K,
    )

    # Same measurement for the categorical UMAP embedding.
    categorical_trustworthiness = validation.trustworthiness_vector(
        source=clf.categorical_umap_._raw_data,
        embedding=clf.categorical_umap_.embedding_,
        max_k=K,
    )

# Plot both curves on shared axes; legend entries follow the plotting order.
plt.plot(numerical_trustworthiness)
plt.plot(categorical_trustworthiness)
plt.ylabel("Trustworthiness score")
plt.xlabel("Value of K")
plt.title(f"Trustworthiness at {K}")
plt.legend(["numerical T", "categorical T"], loc="upper right")
plt.show()


## 11. 클러스터링 결과 저장

In [None]:

# Directory that receives the clustered output file.
SAVE_PATH = './results'

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing (no-op when it already does).
os.makedirs(SAVE_PATH, exist_ok=True)

# Attach the labels returned by evaluate() to the data as a new column.
df['segment'] = clf.evaluate()

# Write the labelled data without the DataFrame index column.
df.to_csv(os.path.join(SAVE_PATH, 'clustered_data.csv'), index=False)
