In [None]:
import sys
import os
# Make the local 'oracle' directory importable. '__file__' is passed as a string
# literal, so abspath() resolves it against the current working directory.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath('__file__')), 'oracle'))
import oracle

# Fetch the dataset from the oracle module.
# NOTE(review): the layout of the returned object is not visible here; later code
# reads data[1] as the images and data[2] as the labels -- confirm against oracle.
data = oracle.q1_fish_train_test_data(23475)

import numpy as np
import matplotlib.pyplot as plt

def filter_class_data(data, labels, class_label):
    """Return the images labelled `class_label`, one flattened row per image, scaled by 255.

    Args:
        data: Array of images; the first axis indexes the images.
        labels: Array of labels aligned with the first axis of `data`.
        class_label: Label value to keep.

    Returns:
        2-D array with one row per selected image, multiplied by 255.
    """
    selection_mask = labels == class_label
    selected = data[selection_mask]
    flattened = selected.reshape(selected.shape[0], -1)
    return 255 * flattened
    

def sample_random_subset(class_data, n):
    """Draw `n` distinct rows from `class_data` at random.

    Sampling is without replacement and uses NumPy's global random state.
    """
    row_count = class_data.shape[0]
    chosen_rows = np.random.choice(row_count, n, replace=False)
    return class_data[chosen_rows]

def compute_mean_vector(data_subset):
    """Return the per-column mean of `data_subset` (average taken over axis 0)."""
    mean_vector = np.mean(data_subset, 0)
    return mean_vector

def compute_covariance_matrix(data_subset):
    """Return the covariance matrix of `data_subset`, treating columns as variables."""
    covariance = np.cov(data_subset, rowvar=False)
    return covariance

def compute_norms(mean_vector, covariance_matrix):
    """Return (L2 norm of `mean_vector`, Frobenius norm of `covariance_matrix`)."""
    return (
        np.linalg.norm(mean_vector),  # norm() defaults to the L2 norm for vectors
        np.linalg.norm(covariance_matrix, ord='fro'),
    )

def compute_statistics_for_sample(class_data, n):
    """Draw a random sample of `n` rows and return its (mean norm, covariance norm)."""
    drawn = sample_random_subset(class_data, n)
    return compute_norms(
        compute_mean_vector(drawn),
        compute_covariance_matrix(drawn),
    )

def compute_statistics(data, labels, class_label, sample_sizes):
    """Evaluate mean-vector and covariance norms of one class at several sample sizes.

    Args:
        data: Array of images; the first axis indexes the images.
        labels: Array of labels aligned with `data`.
        class_label: Class whose images are analysed.
        sample_sizes: Iterable of sample sizes; one random sample is drawn per size.

    Returns:
        Tuple (mean_norms, cov_norms) of lists, in the order of `sample_sizes`.
    """
    class_data = filter_class_data(data, labels, class_label)

    # One (mean_norm, cov_norm) pair per requested sample size, in order.
    norm_pairs = [compute_statistics_for_sample(class_data, n) for n in sample_sizes]

    mean_norms = [pair[0] for pair in norm_pairs]
    cov_norms = [pair[1] for pair in norm_pairs]
    return mean_norms, cov_norms

# Pull the images and their labels out of the loaded dataset
train_data = np.array(data[1])
train_labels = np.array(data[2])

# Sample sizes at which the statistics are evaluated
sample_sizes = [50, 100, 500, 1000, 2000, 4000]

# Map each class label to its (mean norms, covariance norms) pair
class_norms = {
    class_label: compute_statistics(train_data, train_labels, class_label, sample_sizes)
    for class_label in range(4)
}

# Figure 1: L2 norm of the mean vector, one curve per class
plt.figure(figsize=(12, 5))
for class_label in class_norms:
    mean_norms, cov_norms = class_norms[class_label]
    plt.plot(sample_sizes, mean_norms, label=f'Class {class_label} Mean L2 Norm', marker='o', linestyle='-')
plt.xlabel('Number of Samples')
plt.ylabel('L2 Norm of Mean Vector')
plt.title('L2 Norm of Mean Vector vs Sample Size')
plt.legend()
plt.grid(True)
plt.show()

# Figure 2: Frobenius norm of the covariance matrix, one curve per class
plt.figure(figsize=(12, 5))
for class_label in class_norms:
    mean_norms, cov_norms = class_norms[class_label]
    plt.plot(sample_sizes, cov_norms, label=f'Class {class_label} Covariance Frobenius Norm', marker='o', linestyle='-')
plt.xlabel('Number of Samples')
plt.ylabel('Frobenius Norm of Covariance Matrix')
plt.title('Frobenius Norm of Covariance Matrix vs Sample Size')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # For progress tracking
from matplotlib.pylab import det
import pickle  # For saving intermediate results

# Load data
data = oracle.q1_fish_train_test_data(23475)
X = np.array(data[1])
Y = np.array(data[2])

# Flatten each image to a 1D vector (one row per image)
X = X.reshape(X.shape[0], -1)

# Number of samples drawn per class for each experiment
subset_sizes = [2500, 3500, 4000, 4500, 5000]


# Storage for results.
# The scatter-matrix store is deliberately NOT called `scatter_matrices`: a function
# with that name is defined further down and would silently overwrite the dict.
scatter_results = {}  # subset size -> list of (S_W, S_B)
objective_values = {n: [] for n in subset_sizes}


In [None]:


def sample(X, y, n):
    """Pick `n` random rows for each of the classes 0..3 and return (X_subset, y_subset).

    Rows are drawn without replacement within each class, using NumPy's global
    random state; the result is ordered class by class.
    """
    picked = [
        row
        for label in range(4)
        for row in np.random.choice(np.where(y == label)[0], n, replace=False)
    ]
    picked = np.array(picked)
    return X[picked], y[picked]

def class_means(X, y):
    """Return a dict mapping each class label 0..3 to the mean of its rows in `X`."""
    means_by_class = {}
    for label in range(4):
        members = X[y == label]
        means_by_class[label] = np.mean(members, axis=0)
    return means_by_class

def scatter_matrices(X, y, class_means):
    """Compute the within-class (S_W) and between-class (S_B) scatter matrices.

    S_W = sum_c sum_{x in class c} (x - mu_c)(x - mu_c)^T
    S_B = sum_c N_c (mu_c - mu)(mu_c - mu)^T

    Args:
        X: 2-D array, one sample per row.
        y: 1-D array of class labels aligned with the rows of `X`.
        class_means: Mapping from class label to that class's mean vector
            (a sequence indexed by class label is accepted as well).

    Returns:
        Tuple (S_W, S_B), each of shape (n_features, n_features).
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_features = X.shape[1]
    overall_mean = np.mean(X, axis=0)
    S_W = np.zeros((n_features, n_features))
    S_B = np.zeros((n_features, n_features))

    # Accept either a {label: mean} mapping or a sequence indexed by label.
    if hasattr(class_means, "items"):
        labelled_means = class_means.items()
    else:
        labelled_means = enumerate(class_means)

    for c, mean_c in labelled_means:
        X_class = X[y == c]
        if X_class.shape[0] == 0:
            # A class with no samples contributes nothing to either matrix.
            continue

        # Outer product, not matmul: matmul of two 1-D vectors is a scalar inner product.
        between = np.asarray(mean_c) - overall_mean
        S_B += X_class.shape[0] * np.outer(between, between)

        # Sum of per-sample outer products, computed as a single matrix product.
        centered = X_class - np.asarray(mean_c)
        S_W += centered.T @ centered

    return S_W, S_B


In [None]:
# Compute the scatter matrices once per subset size.
# A single loop suffices: nesting a second loop over the same sizes (with the same
# loop variable) only repeated the whole computation and discarded earlier results.
scatter_results = {}  # subset size -> list of (S_W, S_B)
for n in tqdm(subset_sizes, desc="Processing subset sizes"):
    X_subset, y_subset = sample(X, Y, n)
    means_dict = class_means(X_subset, y_subset)
    S_W, S_B = scatter_matrices(X_subset, y_subset, means_dict)
    scatter_results[n] = [(S_W, S_B)]


In [None]:
def projection(S_W, S_B):
    """Compute the FLD projection matrix and the objective function value.

    The projection consists of the eigenvectors of S_W^{-1} S_B belonging to the
    three largest eigenvalues.

    Args:
        S_W: Within-class scatter matrix, shape (d, d).
        S_B: Between-class scatter matrix, shape (d, d).

    Returns:
        Tuple (W, object_value): W is a real (d, 3) array and object_value is
        det(W^T S_B W) / det(W^T S_W W).
    """
    try:
        # Solving S_W M = S_B avoids forming the explicit inverse of S_W.
        M = np.linalg.solve(S_W, S_B)
    except np.linalg.LinAlgError:
        # Singular S_W: fall back to the pseudo-inverse instead of failing.
        M = np.linalg.pinv(S_W) @ S_B

    eigvals, eigvecs = np.linalg.eig(M)

    # eig() on a non-symmetric matrix may return complex output whose imaginary
    # parts are numerical noise; keep the real parts so sorting and plotting work.
    eigvals = np.real(eigvals)
    eigvecs = np.real(eigvecs)

    # Sort eigenvalues in descending order and select top 3 eigenvectors
    order = np.argsort(eigvals)[::-1]
    W = eigvecs[:, order[:3]]

    # Compute objective value
    object_value = np.linalg.det(W.T @ S_B @ W) / np.linalg.det(W.T @ S_W @ W)

    return W, object_value


In [None]:
# Evaluate the FLD objective for every stored (S_W, S_B) pair.
# The pairs live in the `scatter_results` dict; `scatter_matrices` is the function
# that builds them and cannot be indexed.
for n in tqdm(subset_sizes, desc="Processing W for subset sizes"):
    values_for_n = []
    for S_W, S_B in tqdm(scatter_results[n], desc=f"Computing W for n={n}", leave=False):
        W, obj_value = projection(S_W, S_B)
        values_for_n.append(obj_value)
    # Assign rather than append so re-running this cell does not duplicate entries.
    objective_values[n] = values_for_n


In [None]:
n = max(subset_sizes)  # Only visualize the largest subset

# Draw a fresh random sample to display; W itself comes from the scatter matrices
# stored for this subset size.
X_subset, y_subset = sample(X, Y, n)
# The stored pairs live in `scatter_results`; `scatter_matrices` is a function.
W, _ = projection(*scatter_results[n][0])
# Project data. Real part only: eigenvectors may carry a numerically-zero imaginary part.
Y_projected = np.real(X_subset @ W)
# 3D Scatter plot
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

for c in range(4):
    in_class = y_subset == c
    ax.scatter(Y_projected[in_class, 0],
               Y_projected[in_class, 1],
               Y_projected[in_class, 2],
               label=f'Class {c}', alpha=0.6)

ax.set_xlabel('FLD Component 1')
ax.set_ylabel('FLD Component 2')
ax.set_zlabel('FLD Component 3')
ax.set_title(f'FLD Projection in 3D (n={n})')
ax.legend()
plt.show()
