<a href="https://colab.research.google.com/github/PrincetonUniversity/python_machine_learning/blob/main/notebook4_dimensionality_reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seeded legacy NumPy generator; the synthetic samples are drawn from it.
rng = np.random.RandomState(seed=1)

# Qualitative colour palette shared by the figures in this notebook.
qualitiative_colors = [
    '#1b9e77',
    '#d95f02',
    '#7570b3',
    '#e7298a',
]

# Dimensionality Reduction

## Make Dimensionality Reduction Data Set

In [None]:
# Draw correlated two-dimensional Gaussian samples centred on the origin.
n_samples = 500
cov = [
    [3, 3],
    [3, 4],
]
X = rng.multivariate_normal([0, 0], cov, size=n_samples)

In [None]:
# Scatter the synthetic samples: first coordinate on x, second on y.
fig, axs = plt.subplots(
    nrows=1,
    ncols=1,
    figsize=(4., 4.),
    dpi=200,
    facecolor='white',
)
axs.scatter(
    X[:, 0],
    X[:, 1],
    s=10,
    color=qualitiative_colors[0],
    edgecolor='None',
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

## Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

### Create Principal Component Analysis object

In [None]:
# PCA estimator configured to keep two components; it is fitted in a later cell.
pca = PCA(n_components=2)

### Train the model using the training set

In [None]:
# Fit the PCA estimator to the synthetic samples X.
pca.fit(X)

### Apply coordinate transform

In [None]:
# Transform X with the fitted PCA; the result feeds the histograms plotted below.
reduced_data = pca.transform(X)

In [None]:
# Left panel: raw samples with one arrow per principal component.
# Right panel: histograms of the transformed samples, one per component.
fig, axs = plt.subplots(figsize=(8.,4.), nrows=1, ncols=2, facecolor='white', dpi=200)
axs[0].scatter(X[:, 0], X[:, 1], s=10, color = 'k', edgecolor='None')
for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
    comp = comp * var  # scale component by its variance explanation power
    axs[0].arrow(0, 0, comp[0], comp[1], width=0.1, color = qualitiative_colors[i])
axs[0].set_ylabel(r'$x_2$')
axs[0].set_xlabel(r'$x_1$')

# Derive the number of bin edges from the array that is actually histogrammed
# instead of the global `n_samples`, which a later cell rebinds to the digits
# sample count. The small +/-0.01 padding keeps the extreme values inside the
# outermost bins; both histograms share the same edges so they are comparable.
bins = np.linspace(np.min(reduced_data)-0.01, np.max(reduced_data)+0.01, reduced_data.shape[0] // 10)
axs[1].hist(reduced_data[:,0], histtype='step', bins= bins, lw=3, color = qualitiative_colors[0], label = 'Data projected onto first component')
axs[1].hist(reduced_data[:,1], histtype='step', bins= bins, lw=3, color = qualitiative_colors[1], label = 'Data projected onto second component')
axs[1].legend(loc=0, fontsize=8)
# Trailing semicolon keeps the notebook from echoing the returned Text object.
axs[1].set_xlabel(r'$\hat{x}$');

## Load Digits Data Set

In [None]:
from sklearn.datasets import load_digits

In [None]:
# Fetch the digits data set as a (features, targets) pair and summarise its size.
data, labels = load_digits(return_X_y=True)
n_samples, n_features = data.shape
n_digits = np.unique(labels).size

print(f"number of digits: {n_digits}; number of samples: {n_samples}; number of features {n_features}")

In [None]:
# Render the first ten samples as 8x8 grayscale images, titled with their labels.
fig, axs = plt.subplots(
    nrows=1,
    ncols=10,
    figsize=(30., 3.),
    dpi=200,
    facecolor='white',
)
for ax, image, label in zip(axs, data, labels):
    # Each row of `data` is a flattened image; restore the 8x8 grid for display.
    pixels = image.reshape(8, 8)
    ax.imshow(pixels, interpolation="nearest", cmap=plt.cm.gray_r)
    ax.set_title("Training: %i" % label)
    ax.set_axis_off()

## Principal Component Analysis on the Digits Data Set

### Create Principal Component Analysis object

In [None]:
# Two-component PCA for the digits data. Note that this rebinds `pca`, which
# earlier held the estimator fitted on the synthetic samples.
# random_state is pinned because, depending on the scikit-learn version, the
# default 'auto' solver may select the randomized SVD for data of this size;
# a fixed integer seed keeps the embedding identical across runs. Solvers that
# are not randomized ignore this argument.
pca = PCA(n_components=2, random_state=0)

### Train the model using the training set

In [None]:
# Fit the two-component PCA to the digits feature matrix loaded above.
pca.fit(data)

### Reduce dimensions by applying coordinate transform

In [None]:
# Transform the digits with the fitted PCA. This rebinds `reduced_data`,
# which previously held the transformed synthetic samples.
reduced_data = pca.transform(data)

In [None]:
# Report the array shapes before and after the PCA transform.
for caption, array in (("Data: \n", data), ("Reduced Data: \n", reduced_data)):
    print(caption, array.shape)

In [None]:
# Scatter the digits in the plane of the two PCA components (single colour).
fig, axs = plt.subplots(
    nrows=1,
    ncols=1,
    figsize=(4., 4.),
    dpi=200,
    facecolor='white',
)
axs.scatter(
    reduced_data[:, 0],
    reduced_data[:, 1],
    s=5,
    c=qualitiative_colors[1],
)
axs.set_ylabel(r'$\hat{x}_2$')
axs.set_xlabel(r'$\hat{x}_1$')

In [None]:
# One colour and one marker string ("$0$" ... "$9$") per digit class, so each
# point in the PCA plane is drawn with the glyph of its own label.
colors = [
    '#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99',
    '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a',
]
markers = [f"${digit}$" for digit in range(10)]
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(4., 4.), dpi=400, facecolor='white')
for i, (color, marker) in enumerate(zip(colors, markers)):
    # Boolean mask selecting the samples whose label equals the current digit.
    to_plot = labels == i
    axs.scatter(
        reduced_data[to_plot, 0],
        reduced_data[to_plot, 1],
        s=20,
        c=color,
        marker=marker,
    )
axs.set_ylabel(r'$\hat{x}_2$')
axs.set_xlabel(r'$\hat{x}_1$')

## Exercise

Try applying t-SNE (t-distributed Stochastic Neighbor Embedding) to the digits data set above, reducing it to 2 dimensions. You will need to consult the scikit-learn documentation. Does t-SNE appear to do a better job than PCA?