In [17]:
# imports
import numpy as np
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration
from sklearn.metrics import silhouette_score, accuracy_score
from tqdm import tqdm
import warnings
warnings.filterwarnings(action="ignore", category=FutureWarning)

In [18]:
# get dataset
data = fetch_olivetti_faces(shuffle=True, random_state=42)

X = data.images.reshape(data.images.shape[0], -1)
y = data.target

print("Images shape:", X.shape)
print("Labels shape:", y.shape)

Images shape: (400, 4096)
Labels shape: (400,)


In [19]:
# split data into training 70% and temporary 30%
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# split temporary into validation 15% and test 15%
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Test set size:", X_test.shape[0])

Training set size: 280
Validation set size: 60
Test set size: 60


In [20]:
# standardize the input features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [21]:
# perform k-fold cross-validation on svc classifier
classifier = SVC(kernel="linear", random_state=42)
cross_val_scores = cross_val_score(classifier, X_train, y_train, cv=5)
classifier.fit(X_train, y_train)

# predict on validation set
y_pred = classifier.predict(X_val)

# calculate accuracy on validation set
validation_accuracy = accuracy_score(y_val, y_pred)
print("Accuracy: ", validation_accuracy)

Accuracy:  0.9666666666666667


In [22]:
def findBestNumOfAggloCluster(min_cluster, max_cluster, linkage, metric, X_train):
  best_silhouette_score = -1
  best_num_clusters = None
  best_instance = None

  # try different numbers of clusters
  for num_clusters in tqdm(range(min_cluster, max_cluster)):
      clustering = AgglomerativeClustering(n_clusters=num_clusters, linkage=linkage, metric=metric)
      cluster_assignments = clustering.fit_predict(X_train)
      silhouette_tmp = silhouette_score(X_train, cluster_assignments)

      # save if better silhouette score
      if silhouette_tmp > best_silhouette_score:
          best_silhouette_score = silhouette_tmp
          best_num_clusters = num_clusters
          best_instance = clustering

  return best_instance, best_silhouette_score, best_num_clusters

In [23]:
# metric euclidean: find best silhoutte score and number of cluster
best_instance_euclidean, best_silhouette_score_euclidean, best_num_clusters_euclidean = findBestNumOfAggloCluster(2, 200, "ward", "euclidean", X_train)
print("\nBest euclidean silhouette score:", best_silhouette_score_euclidean)
print("Best euclidean number of clusters:", best_num_clusters_euclidean)

100%|██████████| 198/198 [00:20<00:00,  9.43it/s]



Best euclidean silhouette score: 0.22770585
Best euclidean number of clusters: 108


In [24]:
# metric manhattan: find best silhoutte score and number of cluster
best_instance_manhattan, best_silhouette_score_manhattan, best_num_clusters_manhattan = findBestNumOfAggloCluster(2, 200, "complete", "manhattan", X_train)
print("\nBest manhattan silhouette score:", best_silhouette_score_manhattan)
print("Best manhattan number of clusters:", best_num_clusters_manhattan)

100%|██████████| 198/198 [00:24<00:00,  8.19it/s]


Best manhattan silhouette score: 0.19456427
Best manhattan number of clusters: 109





In [25]:
# metric cosine similairty: find best silhoutte score and number of cluster
best_instance_cosine, best_silhouette_score_cosine, best_num_clusters_cosine = findBestNumOfAggloCluster(2, 200, "complete", "cosine", X_train)
print("\nBest cosine silhouette score:", best_silhouette_score_cosine)
print("Best cosine number of clusters:", best_num_clusters_cosine)

100%|██████████| 198/198 [00:24<00:00,  8.09it/s]


Best cosine silhouette score: 0.19777095
Best cosine number of clusters: 115





In [26]:
# metric euclidean: apply feautre agglomeration for dimension reduction
feature_agglo_euclidean = FeatureAgglomeration(n_clusters=best_num_clusters_euclidean, linkage="ward", metric="euclidean")
X_train_reduced_euclidean = feature_agglo_euclidean.fit_transform(X_train)
X_val_reduced_euclidean = feature_agglo_euclidean.transform(X_val)
X_test_reduced_euclidean = feature_agglo_euclidean.transform(X_test)

# metric manhattan: apply feautre agglomeration for dimension reduction
feature_agglo_manhattan = FeatureAgglomeration(n_clusters=best_num_clusters_manhattan, linkage="complete", metric="manhattan")
X_train_reduced_manhattan = feature_agglo_manhattan.fit_transform(X_train)
X_val_reduced_manhattan = feature_agglo_manhattan.transform(X_val)
X_test_reduced_manhattan = feature_agglo_manhattan.transform(X_test)

# metric cosine similarity: apply feautre agglomeration for dimension reduction
feature_agglo_cosine = FeatureAgglomeration(n_clusters=best_num_clusters_cosine, linkage="complete", metric="cosine")
X_train_reduced_cosine = feature_agglo_cosine.fit_transform(X_train)
X_val_reduced_cosine = feature_agglo_cosine.transform(X_val)
X_test_reduced_cosine = feature_agglo_cosine.transform(X_test)

X_train_reduced_euclidean

array([[ 0.77577409,  0.44326119, -0.39645674, ..., -1.04429361,
        -1.98025452, -0.39548714],
       [ 0.7332965 ,  0.75286861,  0.76333222, ...,  0.63936276,
         0.79064318,  0.58938475],
       [ 0.84392468,  1.49966486, -0.12055025, ..., -1.98113974,
         0.49804797, -1.2020132 ],
       ...,
       [ 0.70417633,  0.21640729,  0.20746132, ..., -0.0794773 ,
        -0.2139014 , -0.21571886],
       [ 0.19462575, -0.11780064,  0.6156343 , ..., -0.26227326,
         0.35108655,  1.16140287],
       [ 0.52037043,  1.64657204,  0.26235241, ...,  0.67707406,
         0.77978669,  0.16119316]])

In [27]:
# create iterable structure for prediction
prediction_data = [[X_train_reduced_euclidean, X_val_reduced_euclidean, "Euclidean accuracy: "],
    [X_train_reduced_manhattan, X_val_reduced_manhattan, "Manhattan accuracy: "],
    [X_train_reduced_cosine, X_val_reduced_cosine, "Cosine accuracy: "]]

for data in prediction_data:
  # perform k-fold cross-validation on svc classifier
  classifier = SVC(kernel="linear", random_state=42)
  cross_val_scores = cross_val_score(classifier, data[0], y_train, cv=5)
  classifier.fit(data[0], y_train)

  # predict on validation set
  y_pred = classifier.predict(data[1])

  # calculate accuracy on validation set
  validation_accuracy = accuracy_score(y_val, y_pred)
  print(data[2], validation_accuracy)

Euclidean accuracy:  0.9333333333333333
Manhattan accuracy:  0.95
Cosine accuracy:  0.9333333333333333
