In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, confusion_matrix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [2]:
# Fetch the OpenML "mnist_784" dataset (version 1); as_frame=False asks for
# array data instead of a pandas DataFrame.
mnist = fetch_openml(
    'mnist_784',
    version=1,
    as_frame=False,
    parser='auto',
)

# Store the targets back on the dataset object as uint8 before exposing them.
mnist["target"] = mnist["target"].astype(np.uint8)

# X: feature matrix, y: uint8 targets.
X, y = mnist.data, mnist.target

In [3]:
# KMeans for 8, 9, 10, 11, 12 clusters
#
# Fits one KMeans model per cluster count on the full data set, prints and
# collects the silhouette score of each clustering, and keeps the labels of
# the 10-cluster run in `y_pred_10` for the confusion-matrix cell.
#
# NOTE: the output recorded under this cell was produced before the seed was
# added; re-run the cell to refresh it.

clusters = range(8, 13)
silhouette_scores = []

for cluster in clusters:
    # A fixed random_state makes the centroid initialisation -- and therefore
    # the labels and the scores -- identical on every Restart & Run All.
    kmeans = KMeans(n_clusters=cluster, n_init=10, random_state=42)
    y_pred = kmeans.fit_predict(X)
    if cluster == 10:
        y_pred_10 = y_pred
    # fit_predict returns the fitted model's labels, so score them directly.
    score = silhouette_score(X, y_pred)
    print(f'Cluster number: {cluster}, Silhouette score: {score}')
    silhouette_scores.append(score)

Cluster number: 8, Silhouette score: 0.07338865262913863
Cluster number: 9, Silhouette score: 0.05681022835989294
Cluster number: 10, Silhouette score: 0.058779462477890426
Cluster number: 11, Silhouette score: 0.0583425751043781
Cluster number: 12, Silhouette score: 0.058170873187227586


In [4]:
# Write the collected silhouette scores to disk ...
with open("kmeans_sil.pkl", "wb") as sink:
    pickle.dump(silhouette_scores, sink)

# ... then read the file straight back and print it to confirm the round trip.
with open("kmeans_sil.pkl", "rb") as source:
    reloaded_scores = pickle.load(source)
print(reloaded_scores)

[0.07338865262913863, 0.05681022835989294, 0.058779462477890426, 0.0583425751043781, 0.058170873187227586]


In [5]:
# Confusion matrix of the true digits `y` against the labels from the
# 10-cluster KMeans run.
conf_m = confusion_matrix(y, y_pred_10)

# For every row take the column holding the largest count, then keep each
# column index once, in ascending order.
dominant_columns = conf_m.argmax(axis=1)
max_value_indeces = sorted(set(dominant_columns))

In [6]:
# Write the sorted arg-max column indices to disk ...
with open("kmeans_argmax.pkl", "wb") as sink:
    pickle.dump(max_value_indeces, sink)

# ... then read the file straight back and print it to confirm the round trip.
with open("kmeans_argmax.pkl", "rb") as source:
    reloaded_indices = pickle.load(source)
print(reloaded_indices)

[0, 1, 2, 3, 4, 5, 6, 7]


In [7]:
# DBSCAN

# Euclidean distance from each of the first 300 samples to every other sample
# in X. Zero distances are dropped. Pairs whose indices are both below 300 are
# visited from both sides, so their distance is collected twice.
distances = []

for i in range(300):
    anchor = X[i]
    for j, other in enumerate(X):
        if j == i:
            continue  # never compare a sample with itself
        gap = np.linalg.norm(anchor - other)
        if gap == 0:
            continue  # skip samples identical to the anchor
        distances.append(gap)

# The ten smallest collected distances, in ascending order.
result_distances = sorted(distances)[:10]

In [8]:
# Write the ten smallest distances to disk ...
with open("dist.pkl", "wb") as sink:
    pickle.dump(result_distances, sink)

# ... then read the file straight back and print it to confirm the round trip.
with open("dist.pkl", "rb") as source:
    reloaded_distances = pickle.load(source)
print(reloaded_distances)

[279.26152617215286, 304.37641170103836, 317.5893575043093, 328.7658741414626, 333.4546445920344, 352.89800226127664, 355.1774204534967, 358.07401469528617, 359.64287842247063, 360.42474942767177]


In [9]:
# Base radius: the mean of the first three entries of the ascending
# `result_distances` list.
s = np.mean(result_distances[:3])

# Candidate eps values: start at s, step by 4% of s, stop before 1.1 * s.
epsilons = np.arange(s, 1.1*s, 0.04*s)

# For each eps, fit DBSCAN on the full data and count the distinct labels it
# assigned, leaving out -1 (the label DBSCAN gives to noise points).
n_unique_labels = [
    np.setdiff1d(DBSCAN(eps=eps).fit(X).labels_, [-1]).size
    for eps in epsilons
]

In [10]:
# Write the per-eps cluster counts to disk ...
with open("dbscan_len.pkl", "wb") as sink:
    pickle.dump(n_unique_labels, sink)

# ... then read the file straight back and print it to confirm the round trip.
with open("dbscan_len.pkl", "rb") as source:
    reloaded_counts = pickle.load(source)
print(reloaded_counts)

[3, 6, 21]
