In [1]:
from sklearn.datasets import fetch_openml 
import numpy as np
import pandas as pd
mnist = fetch_openml('mnist_784', version=1, as_frame=False) 
mnist.target = mnist.target.astype(np.uint8)
X = mnist["data"]
y = mnist["target"]

In [2]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [3]:
y

array([5, 0, 4, ..., 4, 5, 6], dtype=uint8)

In [12]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix

conf_matrix = None
kmeans_sil = []
for i in range(8,13):
    kmeans = KMeans(n_clusters=i)
    clusters_labels = kmeans.fit_predict(X)
    sil = silhouette_score(X, clusters_labels)
    kmeans_sil.append(sil)
    if i == 10:
        conf_matrix = confusion_matrix(y, clusters_labels)
    print(kmeans.labels_)

[0 4 7 ... 2 0 6]
[6 4 7 ... 8 6 3]
[4 6 5 ... 3 4 0]
[ 9  2 10 ...  0  8  5]
[ 4  6 10 ...  5  3  1]


In [13]:
print(kmeans_sil)

[0.07339126267893233, 0.05678937316868785, 0.05601412540818749, 0.05834598230471449, 0.05816843959627188]


In [14]:
import pickle

open_file = open("kmeans_sil.pkl", "wb")
pickle.dump(kmeans_sil, open_file)
open_file.close()

In [15]:
open_file = open("kmeans_sil.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[0.07339126267893233, 0.05678937316868785, 0.05601412540818749, 0.05834598230471449, 0.05816843959627188]


In [16]:
print(conf_matrix)

[[ 248   34    0   36  646   82 5505  292   46   14]
 [  10 3457 4364    7    8    4    0    9    9    9]
 [ 188  478  368   72  162  246   73  448 4907   48]
 [  68   91  426   48 1400   83   37 4566  254  168]
 [ 149  244  116 1818   11 2602    4    0   21 1859]
 [ 143  772  139  394 2167  228   68 2063   20  319]
 [5472  220  295    1  184  464   91   34  113    2]
 [   5  279  247 3182    3  707   14    3   39 2814]
 [  66  440  302  282 3664  152   36 1584   51  248]
 [   9   84  149 1893   55 1718   40   93    9 2908]]


In [24]:
values = []
for row in conf_matrix:
    values.append(np.argmax(row))
print(values)

[6, 2, 8, 7, 5, 4, 0, 3, 4, 9]


In [25]:
values.sort()

In [27]:
values = list(set(values))

In [28]:
print(values)

[0, 2, 3, 4, 5, 6, 7, 8, 9]


In [29]:
open_file = open("kmeans_argmax.pkl", "wb")
pickle.dump(values, open_file)
open_file.close()

In [30]:
open_file = open("kmeans_argmax.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[0, 2, 3, 4, 5, 6, 7, 8, 9]


In [36]:
from sklearn.cluster import DBSCAN
X_300 = X[:300]

In [39]:
distance_list = []
for i in range(300):
    for j in range(len(X)):
        distance = np.linalg.norm(X[i]-X[j])
        if distance != 0:
            distance_list.append(distance)

In [41]:
distance_list.sort()

In [44]:
smallest_distances = distance_list[:10]

In [45]:
print(smallest_distances)

[279.26152617215286, 304.37641170103836, 317.5893575043093, 328.7658741414626, 333.4546445920344, 352.89800226127664, 355.1774204534967, 358.07401469528617, 359.64287842247063, 360.42474942767177]


In [46]:
len(smallest_distances)

10