In [11]:
# imports
import numpy as np
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
import warnings
warnings.filterwarnings(action="ignore", category=FutureWarning)

In [2]:
# get dataset
data = fetch_olivetti_faces(shuffle=True, random_state=42)

X = data.images.reshape(data.images.shape[0], -1)
y = data.target

print("Images shape:", X.shape)
print("Labels shape:", y.shape)

downloading Olivetti faces from https://ndownloader.figshare.com/files/5976027 to /root/scikit_learn_data
Images shape: (400, 4096)
Labels shape: (400,)


In [3]:
# split data into training 70% and temporary 30%
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# split temporary into validation 15% and test 15%
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Test set size:", X_test.shape[0])

Training set size: 280
Validation set size: 60
Test set size: 60


In [9]:
# perform k-fold cross-validation on svc classifier
classifier = SVC(kernel="linear", random_state=42)
cross_val_scores = cross_val_score(classifier, X_train, y_train, cv=5)
classifier.fit(X_train, y_train)

# predict on validation set
validation_predictions = classifier.predict(X_val)

# calculate accuracy on validation set
validation_accuracy = accuracy_score(y_val, validation_predictions)
print(validation_accuracy)

0.9166666666666666


In [5]:
best_silhouette_score = -1
best_num_clusters = None
best_kmeans = None

# try different numbers of clusters
for num_clusters in range(100, 140):
    kmeans = KMeans(n_clusters=num_clusters, random_state=42) # default similarity measure (euclidean)
    cluster_assignments = kmeans.fit_predict(X_train)
    silhouette_avg = silhouette_score(X_train, cluster_assignments)

    print("Cluster: ", num_clusters, ", Silhouette score: ", silhouette_avg)

    # save if better silhouette score
    if silhouette_avg > best_silhouette_score:
        best_silhouette_score = silhouette_avg
        best_num_clusters = num_clusters
        best_kmeans = kmeans

print("Best number of clusters:", best_num_clusters)
print("Best silhouette score:", best_silhouette_score)

# reduce dimension of dataset
X_train_reduced = best_kmeans.transform(X_train) # distance to all clusters
X_val_reduced = best_kmeans.transform(X_val)
X_test_reduced = best_kmeans.transform(X_test)
X_train_reduced.shape

Cluster:  100 , Silhouette score:  0.19350411
Cluster:  101 , Silhouette score:  0.19357832
Cluster:  102 , Silhouette score:  0.19100979
Cluster:  103 , Silhouette score:  0.19044916
Cluster:  104 , Silhouette score:  0.1956666
Cluster:  105 , Silhouette score:  0.18676892
Cluster:  106 , Silhouette score:  0.19931473
Cluster:  107 , Silhouette score:  0.19286318
Cluster:  108 , Silhouette score:  0.19494958
Cluster:  109 , Silhouette score:  0.19216657
Cluster:  110 , Silhouette score:  0.19768281
Cluster:  111 , Silhouette score:  0.19912644
Cluster:  112 , Silhouette score:  0.19240355
Cluster:  113 , Silhouette score:  0.19396624
Cluster:  114 , Silhouette score:  0.19470957
Cluster:  115 , Silhouette score:  0.19848812
Cluster:  116 , Silhouette score:  0.20251465
Cluster:  117 , Silhouette score:  0.20298488
Cluster:  118 , Silhouette score:  0.19294873
Cluster:  119 , Silhouette score:  0.20105506
Cluster:  120 , Silhouette score:  0.20040497
Cluster:  121 , Silhouette score:  

(280, 117)

In [6]:
# perform k-fold cross-validation on svc classifier
classifier = SVC(kernel="linear", random_state=42)
cross_val_scores = cross_val_score(classifier, X_train_reduced, y_train, cv=5)
classifier.fit(X_train_reduced, y_train)

# predict on validation set
validation_predictions = classifier.predict(X_val_reduced)

# calculate accuracy on validation set
validation_accuracy = accuracy_score(y_val, validation_predictions)
print(validation_accuracy)

0.8833333333333333


In [15]:
# playground to find gridsearch boundaries
dbscan = DBSCAN(eps=0.035, min_samples=10, metric="cosine")
dbscan_labels = dbscan.fit_predict(X_train)
dbscan_labels

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [8]:
# set gridsearch ranges
epsilon_values = np.arange(0.005, 0.035, 0.001)
min_samples_values = np.arange(2, 10)

best_score = -1
best_params = None

# run gridsearch
for epsilon in epsilon_values:
    for min_samples in min_samples_values:
      dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric="cosine")
      dbscan_labels = dbscan.fit_predict(X_train)

      # check if all X are assigned to more than only 1 cluster, prevent errors in silhouette_score func
      if np.unique(dbscan_labels).size > 1:
        silhouette_avg = silhouette_score(X_train, dbscan_labels)
        # save best score and params
        if silhouette_avg > best_score:
          best_score = silhouette_avg
          best_params = {"epsilon": epsilon, "min_samples": min_samples}
          print("Best params:", best_params, "Silhouette score: ", best_score)

Best params: {'epsilon': 0.005, 'min_samples': 2} Silhouette score:  -0.22310396
Best params: {'epsilon': 0.005, 'min_samples': 3} Silhouette score:  0.15079734
Best params: {'epsilon': 0.028, 'min_samples': 7} Silhouette score:  0.15117227
Best params: {'epsilon': 0.033, 'min_samples': 4} Silhouette score:  0.17967784
Best params: {'epsilon': 0.034, 'min_samples': 3} Silhouette score:  0.2039259
