# Before you begin

Download the following Jupyter notebook, which is part of Jake VanderPlas's Python Data Science Handbook. It will provide exceptional support to help you complete this part of the assignment.

[Principal Component Analysis](https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/05.09-Principal-Component-Analysis.ipynb)

If you are using Google Colab, uncomment these lines to upload data files to Google Colab.

In [None]:
# from google.colab import files
# uploaded = files.upload()
# %ls

# Import libraries
Do not use any other libraries.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import kagglehub
from skimage.io import imread_collection
from skimage.color import rgb2gray
from skimage.transform import resize

Fit a principal component analysis (PCA) to the following data, then plot the principal component vectors scaled to 3 standard deviations.

In [None]:
# Seeded generator so the synthetic point cloud is identical on every run.
rng = np.random.RandomState(1)

# Mix two rows of standard-normal draws with a fixed 2x2 matrix, then
# transpose so that each of the 200 rows of X is one 2-D sample.
mixing = [[1.0, 2.0], [0.3, -0.6]]
X = np.dot(mixing, rng.randn(2, 200)).T

plt.scatter(X[:, 0], X[:, 1])
plt.axis('equal')

In [None]:
# Fit a principal component analysis (PCA) to the data above, then plot the
# principal component vectors scaled to 3 standard deviations.
# Placeholder: replace None with a fitted PCA object. The plotting code below
# reads pca.explained_variance_, pca.components_ and pca.mean_.
pca = None

# Plot the principal components
def draw_vector(v0, v1, ax=None):
    """Draw an arrow from point ``v0`` to point ``v1``.

    The arrow is produced by an annotation with empty text, so only the
    arrow itself is rendered.

    Parameters
    ----------
    v0 : sequence of float
        Tail of the arrow (passed to ``annotate`` as the text position).
    v1 : sequence of float
        Head of the arrow (passed to ``annotate`` as the annotated point).
    ax : object with an ``annotate`` method, optional
        Axes to draw on. When omitted, the current axes (``plt.gca()``)
        is used.
    """
    # Compare against None explicitly so that a caller-supplied axes is
    # never swapped for the current axes just because it evaluates falsy.
    if ax is None:
        ax = plt.gca()
    # shrinkA/shrinkB are set to 0 so the arrow is not shortened at its ends.
    arrowprops = dict(arrowstyle='->', linewidth=2,
                      shrinkA=0, shrinkB=0)
    ax.annotate('', v1, v0, arrowprops=arrowprops)

# plot data
plt.scatter(X[:, 0], X[:, 1], alpha=0.2)

# Overlay one arrow per principal component, starting at the data mean and
# reaching 3 standard deviations (sqrt of the explained variance) along it.
origin = pca.mean_
for variance, direction in zip(pca.explained_variance_, pca.components_):
    draw_vector(origin, origin + direction * 3 * np.sqrt(variance))
plt.axis('equal')

Demonstrate PCA dimensionality reduction by using PCA with 1 component to fit the data to the largest principal component.

In [None]:
# Fit PCA with only 1 component

# Transform the data to the first principal component

# Transform the data back to its original space

# Plot the results


The Fashion MNIST dataset contains labelled images of different clothing items. See examples below.

In [None]:
# Load fashion MNIST dataset
from sklearn.datasets import fetch_openml

# Request pandas output explicitly: later cells call X.values, so X must be a
# DataFrame regardless of what the 'auto' default would resolve to.
X, y = fetch_openml("Fashion-MNIST", version=1, return_X_y=True, as_frame=True)
# Cast labels to int so they can be compared with integers and used as keys
# into fashion_labels.
y = y.astype(int)
X = X / 255.0  # scale to [0, 1]
print(X.shape, y.shape)
# Human-readable name for each integer class label.
fashion_labels = {0: "T-shirt/top", 1: "Trouser", 2: "Pullover", 3: "Dress", 4: "Coat", 5: "Sandal", 6: "Shirt", 7: "Sneaker", 8: "Bag", 9: "Ankle boot"}

# Plot some examples from the dataset
imgs = X.values.reshape(-1, 28, 28)
# Create the 2x5 grid up front. Calling plt.subplots() without a grid and then
# plt.subplot() per image leaves an extra full-figure axes behind the images.
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.flat):
    # First sample in the dataset whose class label equals i.
    index = np.where(y == i)[0][0]
    ax.imshow(imgs[index], cmap='binary',
              interpolation='nearest', clim=(0, 1))
    ax.set_title(f'{fashion_labels[y[index]]} ({y[index]})')
    ax.axis('off')
plt.tight_layout()
plt.show()



Project the images down to 2 dimensions using PCA, then plot them.

In [None]:
# Project from 784 to 2 dimensions
# (placeholder: assign the 2-D PCA projection of X here)
projected = None

# Plot the projected data
# Discrete 10-colour map: one colour per class label in y.
class_cmap = plt.get_cmap('rainbow', 10)
plt.scatter(
    projected[:, 0],
    projected[:, 1],
    c=y,
    s=3,
    alpha=0.25,
    edgecolor='none',
    cmap=class_cmap,
)
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar()

Plot the change in variance captured by PCA versus the number of components.

Using the MNIST Fashion dataset, show how you can denoise noisy data using PCA. 

In [None]:
def plot_fashion(data):
    """Show up to the first 30 images of ``data`` in a 3x10 grid.

    Each image is drawn with the 'binary' colormap and colour limits fixed
    to (0, 1). When ``data`` has fewer than 30 rows, the remaining panels
    are left empty instead of raising ``IndexError``.

    Parameters
    ----------
    data : array-like
        2-D collection with one flattened 28x28 image (784 values) per row,
        e.g. a NumPy array or a pandas DataFrame.
    """
    # np.asarray makes row iteration work for DataFrames too, where data[i]
    # would otherwise be interpreted as a column lookup.
    images = np.asarray(data)
    fig, axes = plt.subplots(3, 10, figsize=(10, 3),
                             subplot_kw={'xticks': [], 'yticks': []},
                             gridspec_kw=dict(hspace=0.1, wspace=0.1))
    # zip stops at the shorter sequence, so short inputs are handled safely.
    for ax, image in zip(axes.flat, images):
        ax.imshow(image.reshape(28, 28),
                  cmap='binary', interpolation='nearest',
                  clim=(0, 1))
# Show the clean images first.
plot_fashion(X.values)

# Corrupt every pixel with Gaussian noise centred on its original value
# (standard deviation 0.25), using a seeded generator for repeatability.
rng = np.random.default_rng(100)
noisy = rng.normal(loc=X, scale=0.25)
plot_fashion(noisy)

Use PCA to fit 50% of the variance, then plot the filtered clothing images.

In [None]:
# Use PCA to fit 50% of the variance
# (placeholder: assign the PCA-reconstructed images here)
filtered = None

# Plot the original, noisy, and filtered images
for images in (X.values, noisy, filtered):
    plot_fashion(images)

The pubfig dataset is an image dataset that includes many labeled pictures of famous public figures. Compute eigenfaces for this dataset.

First, scale the images down to 64x64 and convert them to greyscale. (May take a few minutes.)

In [None]:
# Download latest version
path = kagglehub.dataset_download("kaustubhchaudhari/pubfig-dataset-256x256-jpg")
# Forward slashes are accepted as path separators on Windows as well as on
# Linux/macOS (e.g. Google Colab); hard-coded backslashes only work on Windows.
path = path + "/CelebDataProcessed"

# Load the dataset
faces = imread_collection(path + "/*/*.jpg")
print(f"Number of images: {len(faces)}")

# Downsample and vectorize the images: each one is resized to 64x64,
# converted to grayscale and flattened into a single row of `faces`.
faces = np.array([rgb2gray(resize(img, (64, 64))).ravel()
                  for img in faces])


In [None]:
# Display some of the faces
np.random.seed(100)  # fixed seed so the same random faces are shown each run
fig, axes = plt.subplots(
    3, 5,
    figsize=(10, 6),
    subplot_kw=dict(xticks=[], yticks=[]),
    gridspec_kw={'hspace': 0.1, 'wspace': 0.1},
)
for ax in axes.flat:
    # Pick a random face for every panel of the 3x5 grid.
    index = np.random.randint(len(faces))
    picture = faces[index].reshape(64, 64)
    ax.imshow(picture, cmap='bone')

Compute 150 PCA components. Project the faces down to these components, then perform the inverse to scale them back from the compressed state.

In [None]:
# Compute the components and projected faces


Plot the PCA component vectors.

Once again, plot the change in variance captured by PCA versus the number of components.

Plot a sample of the original images and their reconstruction.

In [None]:
# Plot the results
fig, ax = plt.subplots(
    2, 10,
    figsize=(10, 2.5),
    subplot_kw=dict(xticks=[], yticks=[]),
    gridspec_kw={'hspace': 0.1, 'wspace': 0.1},
)
# Top row: randomly chosen original faces; bottom row: the matching rows of
# `projected`, which must hold 64*64 values each to be reshaped for display.
for top, bottom in zip(ax[0], ax[1]):
    index = np.random.randint(len(faces))
    top.imshow(faces[index].reshape(64, 64), cmap='binary_r')
    bottom.imshow(projected[index].reshape(64, 64), cmap='binary_r')

ax[0, 0].set_ylabel('full-dim\ninput')
ax[1, 0].set_ylabel('150-dim\nreconstruction')