In [809]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

**1. 데이터셋 가져오기 + 데이터 분석**

In [810]:
# 데이터셋 가져오기 (Scikit-learn 의 기본 제공 데이터셋, Kaggle 의 csv 파일 등)
# csv 파일을 사용할 때도 코드 호환이 잘 되게 하기 위해 Pandas DataFrame 으로 처음부터 변환

# Scikit-learn 방식
"""
data = datasets.load_digits()
X, y = data.data, data.target
X = pd.DataFrame(X, columns=data.feature_names)
y = pd.DataFrame(y, columns=['target'])
"""

# Kaggle CSV 방식
df = pd.read_csv('CC GENERAL.csv')

In [811]:
df

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.40,0.00,95.40,0.000000,0.166667,0.000000,0.083333,0.000000,0,2,1000.0,201.802084,139.509787,0.000000,12
1,C10002,3202.467416,0.909091,0.00,0.00,0.00,6442.945483,0.000000,0.000000,0.000000,0.250000,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.000000,773.17,773.17,0.00,0.000000,1.000000,1.000000,0.000000,0.000000,0,12,7500.0,622.066742,627.284787,0.000000,12
3,C10004,1666.670542,0.636364,1499.00,1499.00,0.00,205.788017,0.083333,0.083333,0.000000,0.083333,1,1,7500.0,0.000000,,0.000000,12
4,C10005,817.714335,1.000000,16.00,16.00,0.00,0.000000,0.083333,0.083333,0.000000,0.000000,0,1,1200.0,678.334763,244.791237,0.000000,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,C19186,28.493517,1.000000,291.12,0.00,291.12,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,325.594462,48.886365,0.500000,6
8946,C19187,19.183215,1.000000,300.00,0.00,300.00,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,275.861322,,0.000000,6
8947,C19188,23.398673,0.833333,144.40,0.00,144.40,0.000000,0.833333,0.000000,0.666667,0.000000,0,5,1000.0,81.270775,82.418369,0.250000,6
8948,C19189,13.457564,0.833333,0.00,0.00,0.00,36.558778,0.000000,0.000000,0.000000,0.166667,2,0,500.0,52.549959,55.755628,0.250000,6


In [812]:
# 결측치가 있는 모든 행 제거 (여전히 데이터셋 크기가 충분하다는 가정 하에)
df = df.dropna()
target_col_name = 'PURCHASES_FREQUENCY'

X = df.drop(columns=[target_col_name, 'CUST_ID'])
y = df[[target_col_name]]
y.columns = ['target']

In [813]:
# PERCHASES_FREQUENCY column 을 4 구간으로 나누어 전처리

def mapping_to_class(x):
    if x < 0.15:
        return 'Class 0'
    elif x < 0.5:
        return 'Class 1'
    elif x < 0.85:
        return 'Class 2'
    else:
        return 'Class 3'

y['target'] = y['target'].apply(lambda x: mapping_to_class(x))

In [814]:
X

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,40.900749,0.818182,95.40,0.00,95.40,0.000000,0.000000,0.083333,0.000000,0,2,1000.0,201.802084,139.509787,0.000000,12
1,3202.467416,0.909091,0.00,0.00,0.00,6442.945483,0.000000,0.000000,0.250000,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,2495.148862,1.000000,773.17,773.17,0.00,0.000000,1.000000,0.000000,0.000000,0,12,7500.0,622.066742,627.284787,0.000000,12
4,817.714335,1.000000,16.00,16.00,0.00,0.000000,0.083333,0.000000,0.000000,0,1,1200.0,678.334763,244.791237,0.000000,12
5,1809.828751,1.000000,1333.28,0.00,1333.28,0.000000,0.000000,0.583333,0.000000,0,8,1800.0,1400.057770,2407.246035,0.000000,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8943,5.871712,0.500000,20.90,20.90,0.00,0.000000,0.166667,0.000000,0.000000,0,1,500.0,58.644883,43.473717,0.000000,6
8945,28.493517,1.000000,291.12,0.00,291.12,0.000000,0.000000,0.833333,0.000000,0,6,1000.0,325.594462,48.886365,0.500000,6
8947,23.398673,0.833333,144.40,0.00,144.40,0.000000,0.000000,0.666667,0.000000,0,5,1000.0,81.270775,82.418369,0.250000,6
8948,13.457564,0.833333,0.00,0.00,0.00,36.558778,0.000000,0.000000,0.166667,2,0,500.0,52.549959,55.755628,0.250000,6


In [815]:
# y 의 Class 를 Class 0 ~ (N - 1) 로 수정

classes = y['target'].unique()
classes.sort()
mapping = {cls: i for i, cls in enumerate(classes)}
print(mapping)

y['target'] = y['target'].apply(lambda x: mapping[x])

{'Class 0': 0, 'Class 1': 1, 'Class 2': 2, 'Class 3': 3}


In [816]:
y

Unnamed: 0,target
0,1
1,0
2,3
4,0
5,2
...,...
8943,1
8945,3
8947,2
8948,0


In [817]:
# 랜덤한 20,000 개의 데이터를 학습

if len(X) > 20000:
    num_samples = 20000
    np.random.seed(2025)
    indices = np.random.choice(X.shape[0], num_samples, replace=False)

    X = X.loc[indices]
    y = y.loc[indices]

In [818]:
# 데이터 분포 확인

y_count = y.value_counts()
y_count_percentage = y.value_counts(normalize=True)

y_distrib = pd.DataFrame({'count': y_count, 'percentage (%)': 100 * y_count_percentage})
y_distrib

Unnamed: 0_level_0,count,percentage (%)
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2715,31.438166
3,2635,30.511811
2,1816,21.028254
1,1470,17.021769


In [819]:
# 데이터셋을 먼저 표준정규분포로 표준화 -> K-Nearest-Neighbor Classifier 에서는 필수 !!

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [820]:
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,-0.744625,-0.370047,-0.429184,-0.359160,-0.354826,-0.468655,-0.686280,-0.717179,-0.681953,-0.479437,-0.517623,-0.962575,-0.543942,-0.305508,-0.537727,0.355181
1,0.764152,0.067679,-0.473208,-0.359160,-0.458839,2.568556,-0.686280,-0.926522,0.557022,0.099258,-0.597054,0.677204,0.796852,0.087689,0.212380,0.355181
2,0.426602,0.505405,-0.116413,0.099909,-0.458839,-0.468655,2.646651,-0.926522,-0.681953,-0.479437,-0.120467,0.813852,-0.399503,-0.099906,-0.537727,0.355181
3,-0.373910,0.505405,-0.465825,-0.349660,-0.458839,-0.468655,-0.408536,-0.926522,-0.681953,-0.479437,-0.557339,-0.907916,-0.380165,-0.261131,-0.537727,0.355181
4,0.099551,0.505405,0.142062,-0.359160,0.994815,-0.468655,-0.686280,0.538882,-0.681953,-0.479437,-0.279329,-0.743938,-0.132119,0.650363,-0.537727,0.355181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8631,-0.761342,-1.902089,-0.463563,-0.346751,-0.458839,-0.468655,-0.130790,-0.926522,-0.681953,-0.479437,-0.557339,-1.099224,-0.593143,-0.345988,-0.537727,-4.221800
8632,-0.750546,0.505405,-0.338865,-0.359160,-0.141436,-0.468655,-0.686280,1.166912,-0.681953,-0.479437,-0.358760,-0.962575,-0.501396,-0.343706,1.150015,-4.221800
8633,-0.752977,-0.297095,-0.406572,-0.359160,-0.301402,-0.468655,-0.686280,0.748227,-0.681953,-0.479437,-0.398476,-0.962575,-0.585367,-0.329572,0.306144,-4.221800
8634,-0.757721,-0.297095,-0.473208,-0.359160,-0.458839,-0.451422,-0.686280,-0.926522,0.144032,-0.190089,-0.597054,-1.099224,-0.595238,-0.340811,0.306144,-4.221800


In [821]:
# 각 data의 corr-coef 분석

import plotly.figure_factory as ff

# Show corr-coef chart for given dataset
#
# args
# - X         : data to visualize
# - data_name : name for corr-coef chart title

def visualize_corr(X, data_name):
    corr = X.corr()

    fig = ff.create_annotated_heatmap(
                z=corr.round(decimals=2).to_numpy(),
                x=corr.columns.tolist(),
                y=corr.index.tolist(),
                xgap=1,
                ygap=1,
                zmax=1,
                zmin=-1,
                showscale=True,
                colorscale=['#F53', '#BBC', '#57F'],
                hoverongaps=True)

    fig.update_layout(title=f'correlation of {data_name}',
                      width=1040,
                      height=650)

    fig.show()

In [822]:
# 전체 Class 의 개수

CLASS_COUNT = len(y_distrib)
print(f'CLASS_COUNT : {CLASS_COUNT}')

CLASS_COUNT : 4


In [823]:
# 입력 데이터의 Column 이 19개 이하이면 상관계수 분석

if len(X.columns) <= 19:
    XY = pd.concat([X_scaled, y], axis=1)
    visualize_corr(XY, data_name='Entire data')

**2. Training, Test 데이터로 분리**

In [824]:
# 데이터 분리

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y,
                                                    test_size=0.25,
                                                    random_state=2025)

In [825]:
X_train.shape

(6477, 16)

In [826]:
X_test.shape

(2159, 16)

In [827]:
y_train.shape

(6477, 1)

In [828]:
y_test.shape

(2159, 1)

**3. Gaussian Mixture 분류 성능 테스트**

In [829]:
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=CLASS_COUNT, random_state=2025)
gmm.fit(X_train)

In [830]:
gmm_labels_test = gmm.predict(X_test)

In [831]:
gmm_labels_test = pd.DataFrame(gmm_labels_test, columns=['GMM pred'])
print(gmm_labels_test.to_numpy().flatten()[:50])

[3 2 3 1 1 1 3 3 2 3 2 0 1 3 3 2 0 3 3 0 3 3 3 2 3 3 0 3 2 3 1 3 2 0 3 1 2
 0 0 2 1 1 2 1 0 0 2 0 3 1]


In [832]:
print(y_test.to_numpy().flatten()[:50])

[2 0 3 3 1 1 3 2 0 0 0 2 1 3 0 0 2 2 2 1 1 2 3 0 0 2 1 3 0 3 3 3 0 0 1 2 0
 1 2 0 3 1 0 2 3 3 0 1 3 3]


In [833]:
# GMM 의 label 과 y_test 의 실제 label 의 순서가 바뀌었을 수도 있음을 고려하여 성능 측정
# 단, Class 개수가 많아지면 조합이 기하급수적으로 늘어나므로 문제가 될 수 있음

from itertools import permutations
permutations = list(permutations(range(CLASS_COUNT)))

def compute_GMM_accuracy(gmm_labels_test, y_test):
    metric_results = []
    best_accuracy = 0.0

    # 성능지표 결과 산출
    for perm in permutations:
        gmm_labels_test_ = {'GMM pred': gmm_labels_test['GMM pred'].apply(lambda x: f'CLASS {perm[x]}')}
        y_test_ = {'target': y_test['target'].apply(lambda x: f'CLASS {x}')}

        pred = gmm_labels_test_['GMM pred'].tolist()
        target = y_test_['target'].tolist()
        accuracy = accuracy_score(pred, target)

        # 속도 향상을 위해, 지금까지의 Best Accuracy 보다 좋은 경우만 추가
        if accuracy >= best_accuracy:
            metric_results.append({
                        'perm': perm,
                        'acc': accuracy,
                        'macro_f1': f1_score(pred, target, average='macro'),
                        'weighted_f1': f1_score(pred, target, average='weighted')})

            best_accuracy = accuracy
            print(metric_results[-1])

    # 성능 결과를 정렬하여 best 값 도출
    metric_results.sort(key=lambda x: x['acc'], reverse=True)
    best_result = metric_results[0]

    return {'accuracy': best_result['acc'],
            'macro_f1': best_result['macro_f1'],
            'weighted_f1': best_result['weighted_f1'],
            'permutation': best_result['perm']}

In [834]:
GMM_acc = compute_GMM_accuracy(gmm_labels_test, y_test)

{'perm': (0, 1, 2, 3), 'acc': 0.24826308476146364, 'macro_f1': 0.20093405413473403, 'weighted_f1': 0.2672184825993748}
{'perm': (0, 2, 1, 3), 'acc': 0.25382121352478, 'macro_f1': 0.2045820249509675, 'weighted_f1': 0.26913793366340505}
{'perm': (1, 0, 2, 3), 'acc': 0.2667901806391848, 'macro_f1': 0.22614328227810088, 'weighted_f1': 0.2967748377420534}
{'perm': (1, 2, 0, 3), 'acc': 0.4993052339045855, 'macro_f1': 0.45661100389546655, 'weighted_f1': 0.4921974494668561}


In [835]:
print(f'GMM Accuracy : {GMM_acc}')

GMM Accuracy : {'accuracy': 0.4993052339045855, 'macro_f1': 0.45661100389546655, 'weighted_f1': 0.4921974494668561, 'permutation': (1, 2, 0, 3)}


In [836]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix

matched_perm = GMM_acc['permutation']
conf_matrix = confusion_matrix(gmm_labels_test, y_test)
conf_matrix_df = pd.DataFrame(conf_matrix,
                              index=[f'Pred Class {matched_perm[i]}' for i in range(CLASS_COUNT)],
                              columns=[f'True Class {i}' for i in range(CLASS_COUNT)])
conf_matrix_df = conf_matrix_df.sort_index()

In [837]:
# Conf_matrix_df 의 개수가 맞는지 비교를 통해 쉽게 확인하기 위함

y_test.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
3,678
0,673
2,447
1,361


In [838]:
conf_matrix_df

Unnamed: 0,True Class 0,True Class 1,True Class 2,True Class 3
Pred Class 0,477,0,0,0
Pred Class 1,82,135,125,139
Pred Class 2,37,50,62,135
Pred Class 3,77,176,260,404


**4. K Nearest Neighbors 분류 성능 테스트**

In [839]:
# k-Nearest Neighbors (kNN) 정의

from sklearn.neighbors import KNeighborsClassifier

In [840]:
# 학습 및 성능 테스트 함수

def train_and_test_knn(X_train, y_train, X_test, y_test, n_neighbors):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train.to_numpy().ravel())

    y_pred = knn.predict(X_test)

    knn_acc = accuracy_score(y_test, y_pred)
    knn_macro_f1 = f1_score(y_test, y_pred, average='macro')
    knn_weighted_f1 = f1_score(y_test, y_pred, average='weighted')

    return knn_acc, knn_macro_f1, knn_weighted_f1

In [841]:
# 성능 테스트

MAX_N_NEIGHBORS = 50

knn_accs = []
knn_macro_f1s = []
knn_weighted_f1s = []

for n_neighbors in range(1, MAX_N_NEIGHBORS + 1):
    print(f'k-NN test with n_neighbors : {n_neighbors}')

    knn_acc, knn_macro_f1, knn_weighted_f1 = train_and_test_knn(X_train, y_train, X_test, y_test, n_neighbors)

    knn_accs.append(knn_acc)
    knn_macro_f1s.append(knn_macro_f1)
    knn_weighted_f1s.append(knn_weighted_f1)

k-NN test with n_neighbors : 1
k-NN test with n_neighbors : 2
k-NN test with n_neighbors : 3
k-NN test with n_neighbors : 4
k-NN test with n_neighbors : 5
k-NN test with n_neighbors : 6
k-NN test with n_neighbors : 7
k-NN test with n_neighbors : 8
k-NN test with n_neighbors : 9
k-NN test with n_neighbors : 10
k-NN test with n_neighbors : 11
k-NN test with n_neighbors : 12
k-NN test with n_neighbors : 13
k-NN test with n_neighbors : 14
k-NN test with n_neighbors : 15
k-NN test with n_neighbors : 16
k-NN test with n_neighbors : 17
k-NN test with n_neighbors : 18
k-NN test with n_neighbors : 19
k-NN test with n_neighbors : 20
k-NN test with n_neighbors : 21
k-NN test with n_neighbors : 22
k-NN test with n_neighbors : 23
k-NN test with n_neighbors : 24
k-NN test with n_neighbors : 25
k-NN test with n_neighbors : 26
k-NN test with n_neighbors : 27
k-NN test with n_neighbors : 28
k-NN test with n_neighbors : 29
k-NN test with n_neighbors : 30
k-NN test with n_neighbors : 31
k-NN test with n_

In [842]:
# 성능지표 결과 출력

def print_metric_results(metric_results, metric_name):
    print(f'KNN {metric_name}        : {metric_results}')
    print(f'max KNN {metric_name}    : {max(metric_results) :.4f}')
    print(f'min KNN {metric_name}    : {min(metric_results) :.4f}')
    print(f'mean KNN {metric_name}   : {np.mean(metric_results) :.4f}')
    print(f'median KNN {metric_name} : {np.median(metric_results) :.4f}')

In [843]:
print_metric_results(knn_accs, 'Accuracy')

KNN Accuracy        : [0.8369615562760537, 0.8050023158869847, 0.8476146364057434, 0.8304770727188513, 0.8476146364057434, 0.8332561371005095, 0.8531727651690597, 0.8443723946271422, 0.8513200555812876, 0.8462251042149143, 0.8490041685965725, 0.8485409911996294, 0.8536359425660027, 0.8490041685965725, 0.8490041685965725, 0.8439092172301992, 0.8443723946271422, 0.8392774432607689, 0.8443723946271422, 0.841130152848541, 0.84251968503937, 0.8383510884668828, 0.8397406206577119, 0.8388142658638258, 0.8369615562760537, 0.8351088466882816, 0.8378879110699398, 0.8397406206577119, 0.8383510884668828, 0.8360352014821677, 0.8341824918943955, 0.8300138953219083, 0.8314034275127373, 0.8290875405280222, 0.825845298749421, 0.8267716535433071, 0.8272348309402501, 0.8267716535433071, 0.8300138953219083, 0.825845298749421, 0.825845298749421, 0.8239925891616489, 0.8286243631310792, 0.8267716535433071, 0.825845298749421, 0.824918943955535, 0.8281611857341362, 0.8216767021769338, 0.8221398795738768, 0.817

In [844]:
print_metric_results(knn_macro_f1s, 'Macro F1 Score')

KNN Macro F1 Score        : [0.8081416680378141, 0.7722521679183203, 0.8176564907419457, 0.7978821824467714, 0.8165112958763752, 0.7984396442186565, 0.8215319678427214, 0.8093870913622984, 0.8167456517645754, 0.8120536528437009, 0.8155542652322669, 0.8147942602877002, 0.8213027294673838, 0.8166334668471436, 0.8169477160037133, 0.8116188662613537, 0.8112239949253218, 0.8037664583478539, 0.8103173128709109, 0.8064803222197914, 0.8075924021247533, 0.8033697627581954, 0.80448928328371, 0.8042915872366834, 0.8015659104540089, 0.7997748623476978, 0.8023009988021137, 0.804267625796999, 0.8023860364375934, 0.7987965875721383, 0.7973813440373735, 0.792365254931706, 0.7945888559671146, 0.7910661211463015, 0.7865363050677514, 0.787737874039391, 0.7883583126700885, 0.78762070964445, 0.7923555498413803, 0.7872930700785512, 0.7873246653717915, 0.7843035554184427, 0.7913934264835862, 0.7882740442604683, 0.7881806165727339, 0.7858680902523811, 0.7909188830634357, 0.7820063564462336, 0.7826188631115449

In [845]:
print_metric_results(knn_weighted_f1s, 'Weighted F1 Score')

KNN Weighted F1 Score        : [0.8334094388209685, 0.8014809752908286, 0.8428438117007708, 0.8251236545348183, 0.8419398711171063, 0.8268033344399549, 0.8467148070844192, 0.8373093582188244, 0.8436024135527262, 0.8388192076147885, 0.8416797880275312, 0.8413875348097523, 0.8467032682131286, 0.8419831294353228, 0.8416082126234556, 0.8367492188757008, 0.8367144929013507, 0.8309196901114726, 0.836246796697247, 0.8330105058672768, 0.834297630782903, 0.8302222793987978, 0.8313374142491224, 0.8307079687296584, 0.8284283755749391, 0.8268135577706096, 0.8291834289444288, 0.831060215439158, 0.82933438766319, 0.8268335811368469, 0.8247306263204224, 0.8204120538013435, 0.8218116315078203, 0.8191858505829659, 0.8155025416842562, 0.8165365517284904, 0.8171230353314589, 0.8164477340798105, 0.820289326469737, 0.8158053004033488, 0.815743040839826, 0.8135282190949392, 0.8190523434261612, 0.8167099950045134, 0.815882102910524, 0.8145445118396322, 0.8183371796105128, 0.8110383163962999, 0.81148879876713

In [846]:
# 성능 시각화

import plotly.graph_objects as go

n_neighbors = np.array(range(1, MAX_N_NEIGHBORS + 1))

# 성능지표 결과 추이 시각화
def visualize_metric_trend(metric_results, metric_name):

    # 성능 추이 표시
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=n_neighbors,
                            y=np.array(metric_results) * 100.0,
                            mode='lines',
                            name=f'k-NN test dataset {metric_name}'))

    # 차트 표시
    fig.update_layout(width=800,
                      height=500,
                      xaxis_title='number of neighbors',
                      yaxis_title='Accuracy (%)',
                      title=f'K-NN Test Dataset {metric_name} by Number of Neighbors')

    fig.show()

In [847]:
visualize_metric_trend(knn_accs, 'Accuracy')

In [848]:
visualize_metric_trend(knn_macro_f1s, 'Macro F1 Score')

In [849]:
visualize_metric_trend(knn_weighted_f1s, 'Weighted F1 Score')

**5. 비지도학습 Clustering 성능 비교 : Gaussian Mixture vs. K-Means Clustering**

In [850]:
# Gaussian Mixture Model 비지도 분류

gmm_clustering = GaussianMixture(n_components=CLASS_COUNT, random_state=2025)
gmm_clustering_result = gmm_clustering.fit_predict(X_scaled)

print(gmm_clustering_result)

[3 0 3 ... 2 0 2]


In [851]:
# PCA 를 이용한 시각화

import plotly.express as px
from sklearn.decomposition import PCA

# args :
# - X_scaled          : 표준정규분포로 scale 된 입력 데이터
# - clustering_result : Clustering 결과에서의 각 데이터의 label
# - model_name        : Clustering 에 사용한 모델 이름 (차트 제목 용도)

def visualize_using_PCA(X_scaled, clustering_result, model_name):
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    X_pca = pd.DataFrame(X_pca, columns=['Principal Component 0', 'Principal Component 1'])

    fig = px.scatter(X_pca,
                    x="Principal Component 0",
                    y="Principal Component 1",
                    color=clustering_result,
                    color_continuous_scale=['#F70', '#9C0', '#09F', '#A6F'],
                    title=f"Clustering by {model_name} Model")

    fig.update_traces(marker_size=3)
    fig.update_layout(width=1200, height=900)
    fig.show()

In [852]:
# t-SNE 를 이용한 시각화

from sklearn.manifold import TSNE

# args :
# - X_scaled          : 표준정규분포로 scale 된 입력 데이터
# - clustering_result : Clustering 결과에서의 각 데이터의 label
# - model_name        : Clustering 에 사용한 모델 이름 (차트 제목 용도)

def visualize_using_tSNE(X_scaled, clustering_result, model_name):
    tsne = TSNE(n_components=2, random_state=2025)
    X_tsne = tsne.fit_transform(X_scaled)
    X_tsne = pd.DataFrame(X_tsne, columns=['t-SNE Component 0', 't-SNE Component 1'])

    fig = px.scatter(X_tsne,
                    x="t-SNE Component 0",
                    y="t-SNE Component 1",
                    color=clustering_result,
                    color_continuous_scale=['#F70', '#9C0', '#09F', '#A6F'],
                    title=f"Clustering by {model_name} Model")

    fig.update_traces(marker_size=3)
    fig.update_layout(width=1200, height=900)
    fig.show()

In [853]:
visualize_using_PCA(X_scaled,
                    clustering_result=y.to_numpy().ravel(),
                    model_name='(Original True Label)')

In [854]:
visualize_using_tSNE(X_scaled,
                     clustering_result=y.to_numpy().ravel(),
                     model_name='(Original True Label)')

In [855]:
visualize_using_PCA(X_scaled,
                    clustering_result=gmm_clustering_result,
                    model_name='Gaussian Mixture')

In [856]:
visualize_using_tSNE(X_scaled,
                     clustering_result=gmm_clustering_result,
                     model_name='Gaussian Mixture')

In [857]:
# Silhouette Score 측정
# (Elbow Score 는 Cluster 개수 변화에 따른 성능을 측정하는데, 여기서는 Cluster가 항상 3개이므로 부적합)

from sklearn.metrics import silhouette_score

silhouette = silhouette_score(X_scaled, gmm_clustering_result)
print(f'Silhouette Score : {silhouette}')

Silhouette Score : 0.05170817327647538


In [858]:
# K-means Clustering 비지도 분류

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=CLASS_COUNT, random_state=2025)
kmeans.fit(X_scaled)

In [859]:
# K-means Clustering 결과 확인

labels = kmeans.labels_
print(labels)

[3 2 3 ... 0 0 0]


In [860]:
visualize_using_PCA(X_scaled,
                    clustering_result=labels,
                    model_name='K-means Clustering')

In [861]:
visualize_using_tSNE(X_scaled,
                     clustering_result=labels,
                     model_name='K-means Clustering')

In [862]:
# Silhouette Score 측정

silhouette = silhouette_score(X_scaled, labels)
print(f'Silhouette Score : {silhouette}')

Silhouette Score : 0.24813279341313724
