# Assignment 3: Hierarchical Clustering 

1. Retrieve and load the Olivetti faces dataset

In [1]:
from sklearn.datasets import fetch_olivetti_faces

# Load Olivetti faces dataset
data = fetch_olivetti_faces(shuffle=True, random_state=86)
X, y = data.data, data.target

2. Split the training set, a validation set, and a test set using stratified sampling to ensure that there are the same number of images per person in each set. 

In [2]:
from sklearn.model_selection import train_test_split

# 60% training, 20% validation, 20% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=86)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=86)

3. Using k-fold cross validation, train a classifier to predict which person is represented in each picture, and evaluate it on the validation set.

In [3]:
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Initialize the classifier
clf = SVC(kernel='linear', random_state=86)

# Perform k-fold cross-validation
skf = StratifiedKFold(n_splits=5)
scores = cross_val_score(clf, X_train, y_train, cv=skf)

# Train on the full training set and evaluate on the validation set
clf.fit(X_train, y_train)
val_score = clf.score(X_val, y_val)

print(f'Cross-validation scores: {scores}')
print()
print("Mean cross-validation accuracy:", np.mean(scores))
print()
print(f'Validation score: {val_score}')

Cross-validation scores: [0.91666667 0.875      0.9375     0.89583333 0.9375    ]

Mean cross-validation accuracy: 0.9125

Validation score: 0.95


4. Using either Agglomerative Hierarchical Clustering (AHC) or Divisive Hierarchical Clustering (DHC) and using the centroid-based clustering rule, reduce the dimensionality of the set by using the following similarity measures:

In [4]:
# Determine the optimal number of clusters for each similarity measure using silhouette_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import pairwise_distances

def find_optimal_clusters(X, metric):
    best_score = -1
    best_n_clusters = 2
    for n_clusters in range(2, 80):
        distance_matrix = pairwise_distances(X, metric=metric)
        clustering = AgglomerativeClustering(n_clusters=n_clusters, metric='precomputed', linkage='average')
        labels = clustering.fit_predict(distance_matrix)
        score = silhouette_score(distance_matrix, labels, metric='precomputed')
        if score > best_score:
            best_score = score
            best_n_clusters = n_clusters
    return best_n_clusters, best_score

optimal_clusters_euclidean, best_score_euclidean = find_optimal_clusters(X_train, 'euclidean')
optimal_clusters_minkowski, best_score_minkowski = find_optimal_clusters(X_train, 'minkowski')
optimal_clusters_cosine, best_score_cosine = find_optimal_clusters(X_train, 'cosine')

print(f"Optimal number of clusters (Euclidean): {optimal_clusters_euclidean} with silhouette score: {best_score_euclidean}")
print(f"Optimal number of clusters (Minkowski): {optimal_clusters_minkowski} with silhouette score: {best_score_minkowski}")
print(f"Optimal number of clusters (Cosine): {optimal_clusters_cosine} with silhouette score: {best_score_cosine}")

Optimal number of clusters (Euclidean): 77 with silhouette score: 0.1895388662815094
Optimal number of clusters (Minkowski): 77 with silhouette score: 0.18953886830575567
Optimal number of clusters (Cosine): 2 with silhouette score: 0.3219642639160156


* a) Euclidean Distance 

In [5]:
# Perform clustering and transform the data using the optimal number of clusters using the Euclidean Distance Approach:
distance_matrix_euclidean = pairwise_distances(X_train, metric='euclidean')
clustering_euclidean = AgglomerativeClustering(n_clusters=optimal_clusters_euclidean, metric='precomputed', linkage='complete')
labels_euclidean = clustering_euclidean.fit_predict(distance_matrix_euclidean)

centroids_euclidean = np.array([X_train[labels_euclidean == i].mean(axis=0) for i in range(optimal_clusters_euclidean)])
X_train_transformed_euclidean = np.array([centroids_euclidean[label] for label in labels_euclidean])

* b) Minkowski Distance

In [6]:
distance_matrix_minkowski = pairwise_distances(X_train, metric='minkowski', p=3)
clustering_minkowski = AgglomerativeClustering(n_clusters=optimal_clusters_minkowski, metric='precomputed', linkage='average')
labels_minkowski = clustering_minkowski.fit_predict(distance_matrix_minkowski)

centroids_minkowski = np.array([X_train[labels_minkowski == i].mean(axis=0) for i in range(optimal_clusters_minkowski)])
X_train_transformed_minkowski = np.array([centroids_minkowski[label] for label in labels_minkowski])

* c) Cosine Similarity [20 points]

In [7]:
distance_matrix_cosine = pairwise_distances(X_train, metric='cosine')
clustering_cosine = AgglomerativeClustering(n_clusters=optimal_clusters_cosine, metric='precomputed', linkage='average')
labels_cosine = clustering_cosine.fit_predict(distance_matrix_cosine)

centroids_cosine = np.array([X_train[labels_cosine == i].mean(axis=0) for i in range(optimal_clusters_cosine)])
X_train_transformed_cosine = np.array([centroids_cosine[label] for label in labels_cosine])

5. Discuss any discrepancies observed between 4(a), 4(b), or 4(c).
* Use the silhouette score approach to choose the number of clusters for 4(a), 4(b), and 4(c). 

6. Use the set from (4(a), 4(b), or 4(c)) to train a classifier as in (3) using k-fold cross validation.

In [8]:
# Train and evaluate the classifier using the transformed datasets:
def train_and_evaluate_classifier(X, y):
    n_splits = max(2, min(5, np.min(np.bincount(y))))
    kf = StratifiedKFold(n_splits=n_splits)
    clf_svm = SVC(kernel='poly', random_state=42)
    cv_scores = cross_val_score(clf_svm, X, y, cv=kf)
    clf_svm.fit(X, y)  # Train the classifier on the full training set
    return cv_scores, clf_svm

# Train and evaluate classifier for each transformed dataset
cv_scores_euclidean, clf_svm_euclidean = train_and_evaluate_classifier(X_train_transformed_euclidean, y_train)
cv_scores_minkowski, clf_svm_minkowski = train_and_evaluate_classifier(X_train_transformed_minkowski, y_train)
cv_scores_cosine, clf_svm_cosine = train_and_evaluate_classifier(X_train_transformed_cosine, y_train)

print(f"Cross-validation scores (Euclidean): {cv_scores_euclidean}")
print(f"Mean cross-validation score (Euclidean): {np.mean(cv_scores_euclidean)}")
print()
print(f"Cross-validation scores (Minkowski): {cv_scores_minkowski}")
print(f"Mean cross-validation score (Minkowski): {np.mean(cv_scores_minkowski)}")
print()
print(f"Cross-validation scores (Cosine): {cv_scores_cosine}")
print(f"Mean cross-validation score (Cosine): {np.mean(cv_scores_cosine)}")

Cross-validation scores (Euclidean): [0.83333333 0.6875     0.8125     0.75       0.8125    ]
Mean cross-validation score (Euclidean): 0.7791666666666667

Cross-validation scores (Minkowski): [0.64583333 0.64583333 0.6875     0.64583333 0.72916667]
Mean cross-validation score (Minkowski): 0.6708333333333333

Cross-validation scores (Cosine): [0.02083333 0.04166667 0.04166667 0.02083333 0.02083333]
Mean cross-validation score (Cosine): 0.029166666666666664


In [9]:
# Evaluation the classifier on the Validation Set transformed:

# Transform the validation set using the cluster centroids
X_val_transformed_euclidean = np.array([centroids_euclidean[label] for label in clustering_euclidean.fit_predict(pairwise_distances(X_val, metric='euclidean'))])
X_val_transformed_minkowski = np.array([centroids_minkowski[label] for label in clustering_minkowski.fit_predict(pairwise_distances(X_val, metric='minkowski', p=3))])
X_val_transformed_cosine = np.array([centroids_cosine[label] for label in clustering_cosine.fit_predict(pairwise_distances(X_val, metric='cosine'))])

# Evaluate the classifier on the transformed validation set
val_score_euclidean = clf_svm_euclidean.score(X_val_transformed_euclidean, y_val)
val_score_minkowski = clf_svm_minkowski.score(X_val_transformed_minkowski, y_val)
val_score_cosine = clf_svm_cosine.score(X_val_transformed_cosine, y_val)

print(f'Validation score (Euclidean): {val_score_euclidean}')
print(f'Validation score (Minkowski): {val_score_minkowski}')
print(f'Validation score (Cosine): {val_score_cosine}')

Validation score (Euclidean): 0.025
Validation score (Minkowski): 0.0375
Validation score (Cosine): 0.025
