In [1]:
from sklearn.datasets import fetch_openml 
import numpy as np
import pandas as pd
mnist = fetch_openml('mnist_784', version=1, as_frame=False) 
mnist.target = mnist.target.astype(np.uint8)
X = mnist["data"]
y = mnist["target"]

In [2]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [3]:
y

array([5, 0, 4, ..., 4, 5, 6], dtype=uint8)

In [4]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix

conf_matrix = None
kmeans_sil = []
for i in range(8,13):
    kmeans = KMeans(n_clusters=i)
    clusters_labels = kmeans.fit_predict(X)
    sil = silhouette_score(X, clusters_labels)
    kmeans_sil.append(sil)
    if i == 10:
        conf_matrix = confusion_matrix(y, clusters_labels)
    print(kmeans.labels_)

[0 3 5 ... 4 0 2]
[0 5 3 ... 7 0 2]
[0 5 8 ... 2 1 4]
[7 6 9 ... 0 5 2]
[ 9  2  3 ... 11  0  6]


In [5]:
print(kmeans_sil)

[0.07338426761417954, 0.05679418111062125, 0.05869201052848778, 0.058345868447402464, 0.0581727846213671]


In [6]:
import pickle

open_file = open("kmeans_sil.pkl", "wb")
pickle.dump(kmeans_sil, open_file)
open_file.close()

In [7]:
open_file = open("kmeans_sil.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[0.07338426761417954, 0.05679418111062125, 0.05869201052848778, 0.058345868447402464, 0.0581727846213671]


In [8]:
print(conf_matrix)

[[  72 1265    7    2  162 5053    4  290   39    9]
 [   8    7   11 4293    7    0 3526    8    7   10]
 [ 201  246   78  423  147   57  436  323  216 4863]
 [1083  461   45  449   31   24   58 4581  193  216]
 [  17  288 2173  178  168    9  234    0 3728   29]
 [1156 1812  215  155   67   60  280 2129  432    7]
 [  14 2068    4  190 4326   71   45   38   67   53]
 [  18   12 4399  372    4   21  314    6 2094   53]
 [4115  292  193  335   51   36  330 1212  208   53]
 [  87   31 2849  261   16   51   95   87 3462   19]]


In [9]:
values = []
for row in conf_matrix:
    values.append(np.argmax(row))
print(values)

[5, 3, 9, 7, 8, 7, 4, 2, 0, 8]


In [10]:
values.sort()

In [11]:
values = list(set(values))

In [12]:
print(values)

[0, 2, 3, 4, 5, 7, 8, 9]


In [13]:
open_file = open("kmeans_argmax.pkl", "wb")
pickle.dump(values, open_file)
open_file.close()

In [14]:
open_file = open("kmeans_argmax.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[0, 2, 3, 4, 5, 7, 8, 9]


In [15]:
from sklearn.cluster import DBSCAN
X_300 = X[:300]

In [16]:
distance_list = []
for i in range(300):
    for j in range(len(X)):
        distance = np.linalg.norm(X[i]-X[j])
        if distance != 0:
            distance_list.append(distance)

In [17]:
distance_list.sort()

In [18]:
smallest_distances = distance_list[:10]

In [19]:
print(smallest_distances)

[279.26152617215286, 304.37641170103836, 317.5893575043093, 328.7658741414626, 333.4546445920344, 352.89800226127664, 355.1774204534967, 358.07401469528617, 359.64287842247063, 360.42474942767177]


In [20]:
len(smallest_distances)

10

In [21]:
open_file = open("dist.pkl", "wb")
pickle.dump(smallest_distances, open_file)
open_file.close()

In [22]:
open_file = open("dist.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[279.26152617215286, 304.37641170103836, 317.5893575043093, 328.7658741414626, 333.4546445920344, 352.89800226127664, 355.1774204534967, 358.07401469528617, 359.64287842247063, 360.42474942767177]


In [23]:
s = (smallest_distances[0] + smallest_distances[1] + smallest_distances[2])/3

In [24]:
print(s)

300.40909845916684


In [25]:
eps_1 = s
eps_2 = 1.04 * s
eps_3 = 1.08 * s
eps_list = [eps_1, eps_2, eps_3]

In [26]:
print(eps_list)

[300.40909845916684, 312.4254623975335, 324.4418263359002]


In [28]:
labels_len = []
for eps_ in eps_list:
    dbscan = DBSCAN(eps=eps_)
    pred = dbscan.fit_predict(X)
    labels = list(set(dbscan.labels_))
    labels_without_anomalies = labels
    labels_without_anomalies.remove(-1)
    print(eps_, "  ", list(set(pred)), "  ", labels, "   ", labels_without_anomalies, "\n\n")
    labels_len.append(len(labels_without_anomalies))

300.40909845916684    [0, 1, 2, -1]    [0, 1, 2]     [0, 1, 2] 


312.4254623975335    [0, 1, 2, 3, 4, 5, -1]    [0, 1, 2, 3, 4, 5]     [0, 1, 2, 3, 4, 5] 


324.4418263359002    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, -1]    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]     [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] 




In [29]:
print(labels_len)

[3, 6, 21]


In [30]:
open_file = open("dbscan_len.pkl", "wb")
pickle.dump(labels_len, open_file)
open_file.close()

In [31]:
open_file = open("dbscan_len.pkl", "rb")
loaded_list = pickle.load(open_file)
open_file.close()
print(loaded_list)

[3, 6, 21]
