In [6]:
import cv2
import numpy as np
import os

data_path = 'D:/F/Machine Learning/MSML602/Face Data for Homework/ATT'

# Accumulators: one flattened pixel vector and one zero-based label per image
images = []
labels = []

# Files are expected as "<subject>_<image>.png" for subjects 1-40, images 1-10
for subject_id in range(1, 41):
    for img_id in range(1, 11):
        file_name = f"{subject_id}_{img_id}.png"
        img_path = os.path.join(data_path, file_name)

        # Missing file: warn and move on to the next one
        if not os.path.exists(img_path):
            print(f"Warning: {img_path} does not exist")
            continue

        # cv2.imread returns None (instead of raising) when it cannot decode
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            print(f"Warning: Couldn't read {img_path}")
            continue

        images.append(img.flatten())   # 2D image -> 1D feature vector
        labels.append(subject_id - 1)  # shift subject ids so labels start at 0

# Freeze the accumulated lists into numpy arrays for later indexing
images = np.array(images)
labels = np.array(labels)


In [7]:
# Sanity check: number of loaded images and labels, one count per line
print(len(images), len(labels), sep="\n")

400
400


In [8]:
def euclidean_distance(vec1, vec2):
    """Return the Euclidean (L2) distance between two equally shaped vectors.

    Both inputs are converted to float64 before any arithmetic. Without the
    conversion, unsigned integer inputs (e.g. 8-bit pixel data) wrap around
    modulo 2**bits on both the subtraction and the squaring, which silently
    produces a wrong distance.

    Parameters
    ----------
    vec1, vec2 : array-like
        Numeric vectors of the same shape (or broadcastable shapes).

    Returns
    -------
    numpy.float64
        Square root of the sum of squared element-wise differences.
    """
    # Cast first so the difference can be negative and the square cannot overflow
    diff = np.asarray(vec1, dtype=np.float64) - np.asarray(vec2, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2))

def knn_1nn(test_image, training_images, training_labels):
    """Classify ``test_image`` with the label of its nearest training image.

    Distances are measured with ``euclidean_distance``. The training set is
    scanned once while tracking the smallest distance seen so far, so no
    intermediate list has to be built or sorted.

    Parameters
    ----------
    test_image : array-like
        Feature vector to classify.
    training_images : iterable of array-like
        Training feature vectors, each comparable with ``test_image``.
    training_labels : sequence
        Labels indexed in the same order as ``training_images``.

    Returns
    -------
    The entry of ``training_labels`` belonging to the closest training image.
    When several images are equally close, the earliest one wins.

    Raises
    ------
    ValueError
        If ``training_images`` is empty.
    """
    best_distance = None
    nearest_label = None

    for n, train_img in enumerate(training_images):
        distance = euclidean_distance(test_image, train_img)
        # Strict '<' keeps the first of several equally close neighbours
        if best_distance is None or distance < best_distance:
            best_distance = distance
            nearest_label = training_labels[n]

    if best_distance is None:
        raise ValueError("knn_1nn requires at least one training image")

    return nearest_label


In [9]:
def cross_validate_5_fold(train_images, train_labels, seed=None):
    """Estimate 1-NN accuracy with shuffled 5-fold cross-validation.

    The sample indices are shuffled once and split into five folds. Each fold
    is used as the held-out test set exactly once while the other four folds
    form the training set passed to ``knn_1nn``. The accuracy of every fold is
    printed as soon as it is known.

    When the number of samples is not a multiple of five, the folds differ in
    size by at most one so that every sample is tested exactly once.

    Parameters
    ----------
    train_images : array-like
        Samples, one per entry along the first axis.
    train_labels : array-like
        Labels aligned with ``train_images``.
    seed : int, optional
        Seed for a dedicated random generator, making the fold assignment
        reproducible. With the default ``None`` the global NumPy random state
        is used for the shuffle.

    Returns
    -------
    float
        Unweighted mean of the five per-fold accuracies (fraction in [0, 1]).

    Raises
    ------
    ValueError
        If fewer than five samples are supplied, since at least one fold
        would then be empty.
    """
    num_folds = 5
    train_images = np.asarray(train_images)
    train_labels = np.asarray(train_labels)

    num_samples = len(train_images)
    if num_samples < num_folds:
        raise ValueError(
            f"5-fold cross-validation needs at least {num_folds} samples, "
            f"got {num_samples}"
        )

    indices = np.arange(num_samples)
    if seed is None:
        np.random.shuffle(indices)
    else:
        np.random.default_rng(seed).shuffle(indices)

    # Contiguous slices of the shuffled indices; sizes differ by at most one
    folds = np.array_split(indices, num_folds)
    accuracies = []

    for fold, test_indices in enumerate(folds):
        # Everything outside the held-out fold, in shuffled order
        train_indices = np.concatenate(folds[:fold] + folds[fold + 1:])

        # Split data into training and testing sets
        inner_train_images = train_images[train_indices]
        inner_train_labels = train_labels[train_indices]
        inner_test_images = train_images[test_indices]
        inner_test_labels = train_labels[test_indices]

        correct_predictions = 0

        # Test each image in the current testing fold
        for test_img, actual_label in zip(inner_test_images, inner_test_labels):
            predicted_label = knn_1nn(test_img, inner_train_images, inner_train_labels)
            if predicted_label == actual_label:
                correct_predictions += 1

        # Calculate accuracy for the current fold
        fold_accuracy = correct_predictions / len(inner_test_images)
        accuracies.append(fold_accuracy)
        print(f"Fold {fold + 1} Accuracy: {fold_accuracy * 100:.2f}%")

    # Calculate and return the average accuracy across all folds
    avg_accuracy = sum(accuracies) / len(accuracies)
    return avg_accuracy

In [10]:
# Run the 5-fold cross-validation and report the mean accuracy as a percentage.
# NOTE(review): train_images / train_labels must already exist when this cell
# runs; presumably they are a training split created in another cell -- confirm
# they are defined after a fresh "Restart & Run All".
average_accuracy = cross_validate_5_fold(train_images, train_labels)
print(f'Average Prediction Accuracy: {average_accuracy * 100:.2f}%')


Fold 1 Accuracy: 96.88%
Fold 2 Accuracy: 98.44%
Fold 3 Accuracy: 98.44%
Fold 4 Accuracy: 93.75%
Fold 5 Accuracy: 96.88%
Average Prediction Accuracy: 96.88%
