In [4]:
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import warnings
import copy

# Suppress every Python warning for the rest of the session so that library
# warnings do not clutter the cell outputs.
warnings.filterwarnings("ignore")

# Read the dataset from the working directory.
df = pd.read_csv('Stars.csv')

# 'Type' is the ground-truth label used to score the clusterings below;
# every other column is kept as a candidate feature.
y = df['Type']
X = df.drop(columns=['Type'])

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.0024,0.17,16.12,Red,M,0
1,3042,0.0005,0.1542,16.6,Red,M,0
2,2600,0.0003,0.102,18.7,Red,M,0
3,2800,0.0002,0.16,16.65,Red,M,0
4,1939,0.000138,0.103,20.06,Red,M,0


In [5]:
N_CLUSTERS = 6  # number of clusters requested from both algorithms


def relabel_by_majority(cluster_ids, true_labels):
    """Replace each cluster id by the most frequent true label in that cluster.

    Cluster ids produced by an unsupervised algorithm are arbitrary, so they
    cannot be compared with the ground truth directly.  Every cluster is
    therefore mapped to the label held by the majority of its members.

    Parameters
    ----------
    cluster_ids : array-like
        Cluster assignment of every sample.
    true_labels : array-like
        Ground-truth label of every sample, in the same order.

    Returns
    -------
    numpy.ndarray
        One majority label per sample, in the input order.
    """
    # Rebuild both inputs with a fresh positional index so the cross-tabulation
    # pairs samples by position rather than by whatever index they carried.
    clusters = pd.Series(list(cluster_ids))
    truth = pd.Series(list(true_labels))
    # Rows: cluster ids, columns: true labels (sorted), cells: member counts.
    votes = pd.crosstab(clusters, truth)
    # idxmax returns the first maximum, i.e. the smallest label on a tie.
    cluster_to_label = votes.idxmax(axis=1)
    return clusters.map(cluster_to_label).to_numpy()


# Split the feature columns into numeric and categorical parts.
X_numerics = df.drop(['Type', 'Color', 'Spectral_Class'], axis=1)
X_categoricals = df.drop(['Type', 'L', 'R', 'A_M', 'Temperature'], axis=1)

# One-hot encode the categorical columns.
enc = preprocessing.OneHotEncoder()
enc.fit(X_categoricals)
X_categoricals_enc = enc.transform(X_categoricals).toarray()
# Reuse df's index so the concat below aligns row by row, and give the columns
# string names: scikit-learn >= 1.2 rejects frames whose column names mix
# integers and strings.
X_categoricals_enc = pd.DataFrame(X_categoricals_enc, index=df.index).add_prefix('cat_')

X_enc = pd.concat([X_categoricals_enc, X_numerics], axis=1)

# n_init is pinned to 10 (the historical default) because the default changed
# in scikit-learn 1.4, which would otherwise alter the result between versions.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=10).fit(X_enc)
result = kmeans.predict(X_enc)
print("Without preprocessing:")

result_corrected = relabel_by_majority(result, y)
print("K-Means accuracy: " + str(accuracy_score(y, result_corrected)))

# The distance metric is left at its default (Euclidean): the `affinity`
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4.
clustering = AgglomerativeClustering(n_clusters=N_CLUSTERS, linkage='complete')
result = clustering.fit_predict(X_enc)
result_corrected = relabel_by_majority(result, y)

print("hierarchical accuracy: " + str(accuracy_score(y, result_corrected)))

Without preprocessing:
K-Means accuracy: 0.35833333333333334
hierarchical accuracy: 0.3541666666666667


In [6]:
N_CLUSTERS = 6   # number of clusters requested from both algorithms
N_SEEDS = 100    # number of K-Means random seeds to try


def majority_vote_accuracy(cluster_ids, true_labels):
    """Score a clustering after mapping each cluster to its majority label.

    Parameters
    ----------
    cluster_ids : array-like
        Cluster assignment of every sample.
    true_labels : array-like
        Ground-truth label of every sample, in the same order.

    Returns
    -------
    tuple of (float, numpy.ndarray)
        The accuracy of the relabelled assignment and the relabelled
        assignment itself.
    """
    # Fresh positional indexes make the cross-tabulation pair samples by
    # position rather than by any index the inputs carried.
    clusters = pd.Series(list(cluster_ids))
    truth = pd.Series(list(true_labels))
    # Rows: cluster ids, columns: true labels (sorted), cells: member counts.
    # idxmax returns the first maximum, i.e. the smallest label on a tie.
    cluster_to_label = pd.crosstab(clusters, truth).idxmax(axis=1)
    relabeled = clusters.map(cluster_to_label).to_numpy()
    return accuracy_score(truth, relabeled), relabeled


# Bin every feature into at most 8 quantile bins encoded as ordinal integers.
discretizer = preprocessing.KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='quantile')
X_enc_discretized = discretizer.fit_transform(X_enc)

# K-Means depends on its random initialisation, so several seeds are tried and
# the best accuracy is kept.  NOTE: the seed is chosen using the ground-truth
# labels, so the reported K-Means figure is an optimistic best case.
max_acc = 0
max_i = 0  # seed that produced max_acc
for seed in range(N_SEEDS):
    # n_init is pinned to 10 (the historical default); the default changed in
    # scikit-learn 1.4.
    kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=seed).fit(X_enc_discretized)
    result = kmeans.predict(X_enc_discretized)

    # Accuracy is measured only once every cluster has been relabelled;
    # scoring a partially relabelled vector would mix raw cluster ids with
    # mapped labels and could inflate the maximum.
    acc, result_corrected = majority_vote_accuracy(result, y)
    if max_acc < acc:
        max_acc = acc
        max_i = seed

acc_kmeans = max_acc

# Agglomerative clustering takes no random seed, so a single fit is enough.
# The distance metric is left at its default (Euclidean): the `affinity`
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4.
clustering = AgglomerativeClustering(n_clusters=N_CLUSTERS, linkage='complete')
result = clustering.fit_predict(X_enc_discretized)
acc_hier, result_corrected = majority_vote_accuracy(result, y)
max_acc = acc_hier  # the original cell left max_acc holding this value

print("With preprocessing:")
print("K-Means accuracy: " + str(acc_kmeans))
print("hierarchical accuracy: " + str(acc_hier))

With preprocessing:
K-Means accuracy: 0.8208333333333333
hierarchical accuracy: 0.8041666666666667
