#### 라이브러리 호출

In [1]:
import logging  # to further silence deprecation warnings
import warnings

# Standard library
import os

# Data handling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning & clustering
from sklearn import metrics, preprocessing
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
import umap.umap_ as umap
from umap import validation
import hdbscan
from hdbscan.validity import validity_index

# Custom modules
from denseclus import DenseClus

# Suppress the specific FutureWarning from sklearn
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message=".*force_all_finite.*"
)

logging.captureWarnings(True)

In [2]:
# Configure the seaborn theme in one call. `sns.set(...)` re-applies the stock
# "darkgrid" style, so issuing it after `sns.set_style(...)` throws away the
# custom axes background. Passing both overrides through `rc` here keeps the
# ".9" facecolor as well as the 10x8 default figure size.
sns.set_theme(
    style="darkgrid",
    rc={"axes.facecolor": ".9", "figure.figsize": (10, 8)},
)

#### 데이터 호출

In [3]:
# Seed NumPy's global RNG so stochastic steps are as repeatable as possible.
SEED = 42
np.random.seed(SEED)

# Read the input CSV from a relative path.
data_path = './data/flat-training.csv'
df = pd.read_csv(data_path)

# Optional: drop every column that contains at least one missing value.
#df = df.dropna(axis=1)

# Optional: work on a 2000-row random subsample for faster iteration.
#df = df.sample(n=2000, random_state=SEED)

# Preview the first five rows (the default for `head`).
df.head()

Unnamed: 0,dog,cat,rabbit,deer,panda,koala,otter,hedgehog,squirrel,dolphin,...,blanket,button,whistle,marble,wagon,storybook,candle,clover,bubble,cookie
0,10,A5DB,,4.46,-2,T2,B9DE,51.8,0,1,...,A8,2,2,0,-0.76,-54,B2,0.38,0,C13
1,10,A5DB,,4.42,0,T3,027A,72.2,1,1,...,A7,16,0,0,-0.76,-48,B0,0.16,0,C15
2,43,027A,8.0,3.11,-1,T0,B9DE,44.2,1,1,...,A4,-25,10,0,-0.88,-39,B0,0.13,0,C1
3,28,63D1,,3.37,-1,T1,027A,41.0,0,1,...,A3,21,9,0,-0.85,-58,B1,0.29,1,C5
4,82,C09E,,3.07,-6,T0,B9DE,46.4,1,1,...,A7,-4,13,0,-0.85,70,B2,0.75,1,C13


In [4]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

dog            int64
cat           object
rabbit       float64
deer         float64
panda          int64
              ...   
storybook      int64
candle        object
clover       float64
bubble         int64
cookie        object
Length: 80, dtype: object

In [None]:
# Build the DenseClus model with the notebook-wide seed.
# `umap_combine_method="intersection"` presumably selects how the numerical and
# categorical UMAP models are merged -- confirm against the DenseClus docs.
clf = DenseClus(random_state=SEED, umap_combine_method="intersection")

# Fit on the whole frame, which holds numeric and object-typed columns.
clf.fit(df)

Max of 90 is greater than threshold 25
Hashing categorical features


In [None]:
# Coordinates of the fitted embedding and the per-row labels from the model.
embedding = clf.mapper_.embedding_
labels = clf.evaluate()

# Boolean mask selecting rows with a non-negative label
# (presumably negative labels mark noise, as in HDBSCAN -- confirm).
clustered = labels >= 0

# Tabulate how many rows fall under each label, then print ordered by label.
cnts = (
    pd.Series(labels)
    .value_counts()
    .rename_axis("cluster")
    .reset_index(name="count")
)
print(cnts.sort_values("cluster", ignore_index=True))

NameError: name 'clf' is not defined

In [None]:
# List the distinct label values present in `labels`.
np.unique(labels)

In [None]:
# Dimensions of the embedding array taken from `clf.mapper_`.
embedding.shape

In [None]:
# Joint KDE of the first two embedding columns, restricted to the rows selected
# by `clustered` and coloured by their label.
_ = sns.jointplot(
    x=embedding[clustered, 0],
    y=embedding[clustered, 1],
    hue=labels[clustered],
    kind="kde",
)

plt.show()

In [None]:
# Draw the HDBSCAN condensed tree and highlight the selected clusters.
#
# The selection palette needs one colour per cluster. `clustered` is a boolean
# mask, so `np.unique(clustered)` can never yield more than two values; count
# the distinct labels among the clustered rows instead.
n_clusters = np.unique(labels[clustered]).shape[0]

_ = clf.hdbscan_.condensed_tree_.plot(
    select_clusters=True,
    selection_palette=sns.color_palette("deep", n_clusters),
)

plt.show()

In [None]:
# Fraction of rows selected by the `clustered` mask (mask sum / row count).
coverage = clustered.sum() / len(embedding)

print(f"Coverage {coverage}")
print(f"DBCV score {clf.hdbscan_.relative_validity_}")

In [None]:
# One filled KDE curve per embedding column, all drawn on the same axes.
for component in clf.mapper_.embedding_.T:
    sns.kdeplot(component, fill=True)

plt.show()

In [None]:
# Upper bound passed as `max_k` to the trustworthiness computation.
K = 50

# Suppress the numba deprecation warnings from UMAP only while the scores are
# computed. The filter must go through the `warnings` module (a bare
# `filterwarnings` name is never imported), and the context manager restores the
# previous filter state on exit instead of installing a blanket "default" filter
# that would re-enable warnings silenced earlier in the notebook.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    numerical_trustworthiness = validation.trustworthiness_vector(
        source=clf.numerical_umap_._raw_data,
        embedding=clf.numerical_umap_.embedding_,
        max_k=K,
    )

    categorical_trustworthiness = validation.trustworthiness_vector(
        source=clf.categorical_umap_._raw_data,
        embedding=clf.categorical_umap_.embedding_,
        max_k=K,
    )

# Plot both score vectors on shared axes; legend order follows plot order.
plt.plot(numerical_trustworthiness)
plt.plot(categorical_trustworthiness)
plt.ylabel("Trustworthiness score")
plt.xlabel("Value of K")
plt.title(f"Trustworthiness at {K}")

plt.legend(["numerical T", "categorical T"], loc="upper right")

plt.show()

#### 클러스터링 결과 저장

In [None]:
# Directory that receives the clustering output.
SAVE_PATH = './results'

# `to_csv` does not create missing parent directories, so make sure the target
# exists; `exist_ok=True` keeps the cell safe to re-run.
os.makedirs(SAVE_PATH, exist_ok=True)

# Attach the label returned by the model to each row.
# NOTE(review): this adds a column to `df` in place, so re-running `clf.fit(df)`
# after this cell would include `segment` as an input feature.
df['segment'] = clf.evaluate()

# Save as CSV without the index column.
df.to_csv(os.path.join(SAVE_PATH, "clustered_data.csv"), index=False)