In [1]:
import cv2
import numpy as np
import os

data_path = 'D:/F/Machine Learning/MSML602/Face Data for Homework/ATT'

# Accumulators: one flattened pixel vector and one zero-based class label
# per image that is successfully read from disk.
images = []
labels = []

# Files are looked up as "<subject>_<image>.png" for subjects 1..40 and
# images 1..10; anything missing or unreadable is reported and skipped.
for subject_id in range(1, 41):
    for img_id in range(1, 11):
        file_name = f"{subject_id}_{img_id}.png"
        img_path = os.path.join(data_path, file_name)

        # Guard 1: the file has to be present on disk.
        if not os.path.exists(img_path):
            print(f"Warning: {img_path} does not exist")
            continue

        # Guard 2: cv2.imread signals a failed decode by returning None.
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            print(f"Warning: Couldn't read {img_path}")
            continue

        images.append(img.flatten())   # 2D grayscale image -> 1D vector
        labels.append(subject_id - 1)  # shift subject ids so labels start at 0

# Stack the per-image vectors into arrays for the numerical code below.
images = np.array(images)
labels = np.array(labels, dtype=int)


In [2]:
# Sanity check: show the shape of the image matrix and of the label vector,
# one per line.
print(images.shape, labels.shape, sep="\n")

(400, 10304)
(400,)


In [3]:
def pca_with_covariance(data, num_components):
    """
    Reduce `data` to its leading principal components via the covariance matrix.

    Parameters:
        data (numpy.ndarray): The data matrix of shape (n_samples, n_features).
        num_components (int): The number of principal components to keep.

    Returns:
        reduced_data (numpy.ndarray): Centered data projected onto the top k
            principal components, shape (n_samples, k).
        mean_vector (numpy.ndarray): Per-feature mean of the original data.
        principal_components (numpy.ndarray): Top k principal components stored
            as columns, shape (n_features, k).

    Note:
        The sign of each principal component is arbitrary (a property of any
        eigen decomposition), so projections are defined up to a per-component
        sign flip. Distances between projected points are unaffected.
    """
    data = np.asarray(data)

    # Step 1: Compute the mean vector and center the data
    mean_vector = np.mean(data, axis=0)
    centered_data = data - mean_vector  # A = X - x̄

    # Step 2: Compute the covariance matrix (A^T A / n_samples)
    covariance_matrix = centered_data.T @ centered_data / data.shape[0]

    # Step 3: Perform eigen decomposition.
    # The covariance matrix is symmetric, so use the symmetric solver: it
    # always yields real eigenvalues/eigenvectors, whereas the general solver
    # can return complex values caused purely by floating-point noise.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # Step 4: Sort eigenvalues and eigenvectors in descending order
    sorted_indices = np.argsort(eigenvalues)[::-1]
    top_eigenvectors = eigenvectors[:, sorted_indices[:num_components]]

    # Step 5: Project the data onto the top k eigenvectors
    reduced_data = centered_data @ top_eigenvectors
    return reduced_data, mean_vector, top_eigenvectors


In [4]:
def euclidean_distance(vec1, vec2):
    """
    Return the Euclidean (L2) distance between two equally shaped vectors.

    Parameters:
        vec1, vec2 (array-like): Numeric vectors of the same shape.

    Returns:
        numpy.float64: sqrt(sum((vec1 - vec2) ** 2)).
    """
    # Cast to float before subtracting: with unsigned-integer inputs (e.g.
    # uint8 pixel vectors) the subtraction and the squaring would otherwise
    # wrap around and give a wrong distance.
    diff = np.asarray(vec1, dtype=np.float64) - np.asarray(vec2, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2))

def knn_1nn(test_image, training_images, training_labels):
    """
    Classify `test_image` with the label of its nearest training sample.

    Parameters:
        test_image (array-like): Feature vector to classify.
        training_images (array-like): Training feature vectors, one per row.
        training_labels (sequence): Label of each training row, same order.

    Returns:
        The entry of `training_labels` belonging to the training sample with
        the smallest Euclidean distance to `test_image`. If several samples
        are equally close, the one that appears first wins.

    Raises:
        ValueError: If `training_images` contains no samples.
    """
    # Work in float so unsigned-integer inputs cannot wrap around.
    train = np.asarray(training_images, dtype=np.float64)
    query = np.asarray(test_image, dtype=np.float64)
    if train.shape[0] == 0:
        raise ValueError("training_images must contain at least one sample")

    # Squared distances to every training sample at once. The square root is
    # monotonic, so it is not needed to find the closest sample. Summing over
    # every axis except the first keeps one value per training sample.
    diff = train - query
    squared_distances = np.sum(diff ** 2, axis=tuple(range(1, diff.ndim)))

    # argmin returns the first minimum, matching a stable sort's tie-break.
    return training_labels[int(np.argmin(squared_distances))]

In [5]:
def cross_validate_5_fold_pca(images, labels, num_components, random_state=None):
    """
    Estimate PCA + 1-nearest-neighbour accuracy with shuffled 5-fold cross-validation.

    For every fold, PCA is fitted on the training split only; the held-out
    split is centered with the training mean and projected onto the training
    components before being classified. Per-fold accuracy is printed.

    Parameters:
        images (numpy.ndarray): Image dataset of shape (n_samples, n_features).
        labels (numpy.ndarray): Labels corresponding to the images.
        num_components (int): Number of principal components for PCA.
        random_state (int or None): Seed for the shuffle that assigns samples
            to folds. With the default ``None`` the shuffle draws from NumPy's
            global random state (so a prior ``np.random.seed`` call still
            controls it); pass an integer for a self-contained reproducible
            split.

    Returns:
        avg_accuracy (float): Average accuracy across the 5 folds.

    Raises:
        ValueError: If there are fewer than 5 samples, since at least one fold
            would have nothing to evaluate.
    """
    num_folds = 5
    num_samples = len(images)
    if num_samples < num_folds:
        raise ValueError(
            f"Need at least {num_folds} samples for {num_folds}-fold "
            f"cross-validation, got {num_samples}"
        )

    indices = np.arange(num_samples)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        np.random.RandomState(random_state).shuffle(indices)

    accuracies = []

    # array_split spreads any remainder over the first folds, so every sample
    # is tested exactly once even when the count is not divisible by 5.
    for fold, test_indices in enumerate(np.array_split(indices, num_folds)):
        train_indices = np.setdiff1d(indices, test_indices)

        train_images, test_images = images[train_indices], images[test_indices]
        train_labels, test_labels = labels[train_indices], labels[test_indices]

        # Apply PCA to training images
        reduced_train_images, mean_image, top_eigenvectors = pca_with_covariance(
            train_images, num_components
        )

        # Center and project test images using the training statistics only
        centered_test_images = test_images - mean_image
        reduced_test_images = centered_test_images @ top_eigenvectors

        # Test each image
        correct_predictions = sum(
            knn_1nn(test_img, reduced_train_images, train_labels) == test_label
            for test_img, test_label in zip(reduced_test_images, test_labels)
        )

        # Calculate accuracy for the current fold
        fold_accuracy = correct_predictions / len(test_images)
        accuracies.append(fold_accuracy)
        print(f"Fold {fold + 1} Accuracy: {fold_accuracy * 100:.2f}%")

    # Calculate and return the average accuracy across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    return avg_accuracy


In [6]:
# Accuracy
num_components = 60

# The fold assignment is shuffled using NumPy's global random state; seed it
# so that re-running this cell reproduces the same split and accuracies.
np.random.seed(42)

avg_accuracy = cross_validate_5_fold_pca(images, labels, num_components)

# Report the component count from the variable so the message cannot go stale.
print(f"Average Accuracy with PCA ({num_components} components): {avg_accuracy * 100:.2f}%")

Fold 1 Accuracy: 93.75%
Fold 2 Accuracy: 97.50%
Fold 3 Accuracy: 93.75%
Fold 4 Accuracy: 93.75%
Fold 5 Accuracy: 97.50%
Average Accuracy with PCA (60 components): 95.25%
