### Partition face dataset into 8:2 training:test

### Apply PCA on training data by use of eigenvectors and eigenvalues of covaraiance matrix S= (1/N)A(A^T)

### Apply PCA using eigenvectors and eigenvalues of (1/N)(A^T)A

In [87]:
# PCA source: Week 2 slides - Manifold Learning

import numpy as np
from scipy.io import loadmat
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

np.random.seed(42)

data = loadmat('face.mat')

X = data['X'] # flat images dim (2576,520)
# l = data['l'].flatten()  # image Labels (1,520)

# Does a 8/2 split of dataset (8 images / 2 images of each person). Training data shuffled. 
def Q1_partition(data):
    X = data['X'] # flat images dim (2576,520)
    l = data['l'].flatten() 
    sets = [[], [], [], []]
    for i in range(0, 520, 10):
        im_train, im_test, l_train, l_test = train_test_split(X[:, i: i + 10].T, l[i: i + 10], test_size=0.2, random_state=42)
        sets[0].append(im_train)    
        sets[1].append(im_test)  
        sets[2].append(l_train)    
        sets[3].append(l_test)  

    for i in range(4):
        if i < 2:
             sets[i] = np.array(sets[i]).reshape(-1, 2576)
        else: 
            sets[i] = np.array(sets[i]).flatten()
    
    return sets

In [88]:
# Splits data into five equal sets of 2 images per person x 52 people = 104 
def Q2_partition(data, num_partitions):
    X = data['X'] # flat images dim (2576,520)
    t = [[] for _ in range(num_partitions)]
    y = [[] for _ in range(num_partitions)]
    
    for i in range(0, 520, 10):
        slice = X[:, i: i + 10].T
        slice_l = l[i: i + 10] 
        for j in range(num_partitions):
            t[j].append(slice[j*2:(j+1)*2])
            y[j].append(slice_l[j*2:(j+1)*2])
       
    for i in range(num_partitions-1):
        t[i] = np.array(t[i]).reshape(-1, 2576) # Also shuffle 
        # np.random.shuffle(t[i])
        y[i] = np.array(y[i]).flatten()

        indices = np.random.permutation(len(t[i]))
        t[i] = t[i][indices]
        y[i] = y[i][indices]


    t[-1] = np.array(t[-1]).reshape(-1, 2576)
    y[-1] = np.array(y[-1]) 

    return t


In [89]:
def display_image(image, label):
    # X[:, image_index]
    image_height, image_width = 46, 56 
    image = image.reshape((image_height, image_width)).T 
    # label = l[image_index]

    plt.imshow(image, cmap='gray', aspect='auto')
    plt.title(f"Face Image - Label: {label}")
    plt.axis('off')
    plt.show()
    # display_image(X_train[1:2].T, "hey")
    # display_image(t1[i:i+1].T, "Hey")

def get_sorted_eigen(M):
    eigenvalues, eigenvectors = np.linalg.eig(M)
    eigenvalues, eigenvectors = np.real(eigenvalues), np.real(eigenvectors)

    sorted_indices = np.argsort(eigenvalues)[::-1]

    sorted_eigenvalues = eigenvalues[sorted_indices]
    sorted_eigenvectors = eigenvectors[:, sorted_indices]
    return sorted_eigenvalues, sorted_eigenvectors

# S = Covariance Matrix, A = mean centred data, N = #samples
def my_PCA(batch, k = 100):
    _, mean, S = process_batch(batch)
    eigenvalues, eigenvectors = get_sorted_eigen(S)
    # k = choose_principal_components(threshold, eigenvalues)
    return eigenvalues[:k + 1], eigenvectors[:, :k + 1], mean

def choose_principal_components(threshold, eigenvalues):
    covariance_ratios = eigenvalues/np.sum(eigenvalues)
    cum_var = 0
    for k, ratio in enumerate(covariance_ratios):
        cum_var += ratio
        if cum_var >= threshold: return k 

def process_batch(batch):
    mean = np.mean(batch, axis=0)
    A = batch - mean
    N = A.shape[1]
    S = (1/N) * np.dot(A.T, A)
    #print(f'S / mean shapes: {S.shape}/{mean.shape}')
    return N, mean, S

In [90]:
t = Q2_partition(data,5)
batch = t[0]
N, mean, S = process_batch(X_train)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (52,) + inhomogeneous part.

In [62]:
def incremental_PCA(data, num_partitions = 5):
    t = Q2_partition(data, num_partitions)
    
    N1, mu1, S1 = process_batch(t[0])

    for batch in t[1:-1]:  # every batch except first (already initialized) and last (test)
        N2, mu2, S2 = process_batch(batch)
        print(batch.shape)

        N3 = N1 + N2
        mu3 = (N1*mu1 + N2*mu2)/N3
        mu12 = mu1-mu2
        term1 = (N1/N3) * S1
        term2 = (N2/N3)*S2
        term3 = (N1*N2)/(N3**2) * np.outer(mu12, mu12.T)  
        S3 =  term1 + term2 + term3 

        N1, mu1, S1 = N3, mu3, S3

In [91]:
from sklearn.metrics import accuracy_score

def get_reduced_representation(image, W, X_mean):
    centered_image = image - X_mean  # Step 1: Center the image
    Z = np.dot(centered_image, W)    # Step 2: Project onto principal components
    return Z

def reconstruct_image(Z, W, X_mean):
    X_reconstructed = np.dot(Z, W.T) + X_mean  # Project back to original space
    return X_reconstructed

# Assuming `W`, `X_mean`, and `original_classifier` are already defined
def test_reconstructed_accuracy(X_test, y_test, W, X_mean, original_classifier):
    reconstructed_images = []
    
    # Reconstruct each test image after applying PCA
    for image in X_test:
        Z = get_reduced_representation(image, W, X_mean)  # Project to PCA space
        reconstructed_image = reconstruct_image(Z, W, X_mean)  # Reconstruct
        reconstructed_images.append(reconstructed_image)
    
    # Convert list to numpy array for classifier compatibility
    reconstructed_images = np.array(reconstructed_images)
    
    # Get predictions on reconstructed images
    y_pred = original_classifier.predict(reconstructed_images)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [92]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Example function to train a classifier on the original data
def train_classifier(X_train, y_train):
    # Use a pipeline to standardize data before applying SVM
    classifier = make_pipeline(StandardScaler(), SVC(kernel='linear', random_state=42))
    classifier.fit(X_train, y_train)
    return classifier

# Example usage
original_classifier = train_classifier(X_train, l_train)


In [93]:
from sklearn.decomposition import PCA
def get_principal_components(X_train, num_components):
    pca = PCA(n_components=num_components)
    pca.fit(X_train)  # Fit PCA on the training data
    W = pca.components_.T  # Transpose to get shape [original_dim, num_components]
    return W, pca  # Return W and the trained PCA model for further use

for num_components in [50,100,200,300,400]:


    print(f'Num_comps: {num_components}')
    # They cannot possibly be the same since they have different dimensions. 
    [X_train, X_test, l_train, l_test] = Q1_partition(data)
    ##### Look here FELIX - AAT: S = (1/N) A * A^T, ATA: S = (1/N) A^T * A  
    eigenvalues_AAT, eigenvectors_AAT, mean_AAT = my_PCA(X_train, k=num_components)
    # eigenvalues_ATA, eigenvectors_ATA, mean_ATA = PCA(X_train.T)
    ##########


    # Example usage
    
    W, pca_model = get_principal_components(X_train, num_components)

    # Example usage
    accuracy = test_reconstructed_accuracy(X_test, l_test, eigenvectors_AAT, mean_AAT, original_classifier)
    print("Accuracy on reconstructed images:", accuracy)
    accuracy = test_reconstructed_accuracy(X_test, l_test, W, mean_AAT, original_classifier)
    print("Accuracy :", accuracy)



Num_comps: 50
Accuracy on reconstructed images: 0.7788461538461539
Accuracy : 0.7788461538461539
Num_comps: 100
Accuracy on reconstructed images: 0.7980769230769231
Accuracy : 0.8173076923076923
Num_comps: 200
Accuracy on reconstructed images: 0.8269230769230769
Accuracy : 0.8173076923076923
Num_comps: 300
Accuracy on reconstructed images: 0.8461538461538461
Accuracy : 0.8461538461538461
Num_comps: 400
Accuracy on reconstructed images: 0.8461538461538461
Accuracy : 0.8461538461538461


In [63]:
incremental_PCA(data)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (52,) + inhomogeneous part.