# 1. Pixel Analysis and Eigenfaces

**Project:** Image EDA (Olivetti Faces)
**Goal:** Understand images as matrices, compute the 'average face', and extract 'eigenfaces' using PCA.

---

## 1. Imports and Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_olivetti_faces
from sklearn.decomposition import PCA
import os

# Notebook-wide plotting defaults: seaborn's plain white style and a
# 10x6 inch default figure size for figures that do not set their own.
sns.set_style('white')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 2. Load Dataset
We use the **Olivetti Faces** dataset. It contains 400 grayscale images of 40 people (10 per person), each 64×64 pixels.

In [None]:
# Fetch the Olivetti faces into a project-local data directory.
print("Loading dataset...")
data_home = '../../data/raw/scikit_learn_data'
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that another process could race against.
os.makedirs(data_home, exist_ok=True)

# Shuffle with a fixed seed so the sample order is the same on every run.
olivetti = fetch_olivetti_faces(shuffle=True, random_state=42, data_home=data_home)

X = olivetti.data    # one flattened image per row
y = olivetti.target  # person label of each image
n_samples, n_features = X.shape
# Take the image height/width from the unflattened image array rather than
# hard-coding 64x64, so later reshapes always match the loaded data.
h, w = olivetti.images.shape[1:]

print(f"Dataset Shape: {X.shape}")
print(f"Features (Pixels): {n_features}")
print(f"Classes (People): {len(np.unique(y))}")

## 3. Visualize Raw Images
Let's look at the first 12 faces from the shuffled dataset.

In [None]:
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Show up to ``n_row * n_col`` images in a grid of grayscale subplots.

    Args:
        images: Sliceable sequence of images; each item is reshaped to
            ``(h, w)`` before being drawn.
        titles: Sliceable sequence of subplot titles, aligned with ``images``.
        h: Image height in pixels.
        w: Image width in pixels.
        n_row: Number of rows in the grid.
        n_col: Number of columns in the grid.

    If fewer images (or titles) than grid cells are supplied, the remaining
    cells are left empty. Items beyond the grid capacity are ignored.
    """
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    n_cells = n_row * n_col
    # zip() stops at the shorter input, so a short gallery no longer raises
    # IndexError the way indexing up to n_row * n_col did.
    pairs = zip(images[:n_cells], titles[:n_cells])
    for position, (image, title) in enumerate(pairs, start=1):
        plt.subplot(n_row, n_col, position)
        plt.imshow(image.reshape((h, w)), cmap=plt.cm.gray)
        plt.title(title, size=12)
        # Hide tick marks: pixel coordinates add nothing to a thumbnail.
        plt.xticks(())
        plt.yticks(())
    plt.show()

# Preview the first 12 samples, each titled with its person label
titles = [f"Person {person_id}" for person_id in y[:12]]
plot_gallery(X[:12], titles, h, w)

## 4. Pixel Intensity Distribution
Are the images mostly dark, light, or balanced?

In [None]:
# Histogram (with a KDE overlay) of every pixel value across all images
plt.figure(figsize=(10, 5))
sns.histplot(data=X.ravel(), bins=50, kde=True, color='gray')
plt.gca().set(
    title='Pixel Intensity Distribution (All Images)',
    xlabel='Pixel Value (0-1 scaled)',
    ylabel='Count',
)
plt.show()

## 5. The "Average Face"
What does the mean of all human faces in this dataset look like?

In [None]:
# Averaging over the sample axis gives one mean value per pixel
mean_face = X.mean(axis=0)

plt.figure(figsize=(6, 6))
plt.imshow(mean_face.reshape(h, w), cmap='gray')
face_axes = plt.gca()
face_axes.set_title('The Average Face')
face_axes.set_axis_off()
plt.show()

## 6. Eigenfaces (PCA)
We use PCA to find the "principal components" of the face space.

In [None]:
n_components = 50
print(f"Extracting the top {n_components} eigenfaces...")
# The randomized SVD solver is stochastic, so fix its seed to get the same
# eigenfaces on every run (same seed as the dataset shuffle).
pca = PCA(
    n_components=n_components,
    svd_solver='randomized',
    whiten=True,
    random_state=42,
).fit(X)

# Each principal axis is a vector in pixel space; view it as an h x w image.
eigenfaces = pca.components_.reshape((n_components, h, w))

# Show the 12 leading eigenfaces.
eigenface_titles = [f"Eigenface {i}" for i in range(12)]
plot_gallery(eigenfaces[:12], eigenface_titles, h, w)

# Cumulative share of variance captured by the first k components.
plt.figure(figsize=(10, 5))
# Plot against 1..k rather than the default 0..k-1 index so the x axis
# really is the "Number of Components" the label promises.
plt.plot(
    np.arange(1, pca.n_components_ + 1),
    np.cumsum(pca.explained_variance_ratio_),
)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs. Number of Eigenfaces')
plt.grid()
plt.show()