In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive.
# The google.colab package only exists inside Google Colab, so the import is
# guarded: on a local Jupyter kernel the mount is skipped instead of aborting
# a "Restart & Run All" with ModuleNotFoundError.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

In [None]:
import matplotlib.pyplot as plt # this module contains functions that allow to quickly generate many kinds of plots .
import mpl_toolkits.mplot3d # provides 3D plotting
from sklearn import datasets # this module includes utilities to load datasets
from sklearn.decomposition import PCA #principal component analysis
import numpy as np #library that contains multidimensional array and matrix data structures
import cv2 #real-time optimized Computer Vision library

In [None]:
# The Iris data set holds the petal and sepal measurements of three types of
# iris (Setosa, Versicolour and Virginica), stored in a 150x4
# (n_samples x n_features) numpy.ndarray.
#
# Rows are the samples; columns are Sepal Length, Sepal Width, Petal Length
# and Petal Width.
#
# datasets.load_iris() loads and returns the iris dataset (classification),
# a classic and very easy multi-class classification dataset:
#   Classes: 3
#   Samples per class: 50
#   Samples total: 150
#
# NOTE: these notes are real comments rather than bare triple-quoted strings.
# A string literal left as the last statement of a cell is an expression, so
# Jupyter would echo its escaped repr as the cell output.

# Import some data to play with.
iris = datasets.load_iris()

In [None]:
# The data is 4-dimensional (4 features); keep only the first two columns so
# that the samples can be drawn on an ordinary 2D scatter plot.
X, y = iris.data[:, :2], iris.target

# Report the shapes of the feature matrix and of the label vector.
for caption, array in (("feature matrix size", X), ("label vector size", y)):
    print(caption, array.shape)

In [None]:
# Scatter plot of the two selected features (possible because the data is 2D).

# Pad the axis limits by 0.5 on each side so no point sits on the frame.
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

plt.figure(2, figsize=(8, 6))
plt.clf()

# Plot the training points; the colour of each point encodes its class, so
# the three colours in the figure correspond to the 3 different classes.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor="k")
plt.xlabel("Sepal length")
plt.ylabel("Sepal width")

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)

# Empty tuples remove the tick marks and tick labels from both axes.
plt.xticks(())
plt.yticks(())

# Render explicitly. The note about the colours used to be a bare string
# literal at the end of the cell, which Jupyter echoed as the cell's output.
plt.show()


In [None]:
# Keep the first three features so the samples can be visualised in a
# 3-dimensional space; the labels are taken unchanged from the dataset.
X_ = iris.data[:, :3]
y = iris.target

print("feature matrix size", X_.shape)
print("label vector size", y.shape)

In [None]:
# 3D scatter plot of the first three iris features, one colour per class.
fig = plt.figure(1, figsize=(4, 3))
plt.clf()

ax = fig.add_subplot(111, projection="3d", elev=48, azim=134)
ax.set_position([0, 0, 0.95, 1])

plt.cla()

# Write each class name at the mean position of its samples; the +1.5 shift
# along the second axis moves the text away from the point cloud.
for name, label in [("Setosa", 0), ("Versicolour", 1), ("Virginica", 2)]:
    ax.text3D(
        X_[y == label, 0].mean(),
        X_[y == label, 1].mean() + 1.5,
        X_[y == label, 2].mean(),
        name,
        horizontalalignment="center",
        bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
    )

# Reorder the labels to pick the colours, but store the result in a separate
# array. Overwriting `y` here would make the cell non-idempotent: on a second
# run np.choose would reject the float array and the `y == label` masks above
# would select the wrong classes.
point_colors = np.choose(y, [1, 2, 0]).astype(float)
ax.scatter(X_[:, 0], X_[:, 1], X_[:, 2], c=point_colors, cmap=plt.cm.nipy_spectral, edgecolor="k")

# Hide the tick labels; only the axis names are shown.
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])

ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')
ax.set_zlabel('Petal length')

plt.show()



# Principal Component Analysis (PCA)

PCA is an unsupervised learning algorithm that tries to identify the subspace in which data approximately lies.

Suppose we are given a dataset $\{ x^{(i)}; i = 1, \dots, n\}$ of $n$ samples, where each $x^{(i)} \in \mathbb{R}^d$ has $d$ features ($d \ll n$). But unknown to us, two different features are almost linearly dependent, up to only small differences. We might then say that the data lies approximately on a $(d-1)$-dimensional subspace.

How can we automatically detect, and perhaps remove, this redundancy?

If, for instance, we have just two features, $x_1$ and $x_2$, that are strongly correlated (almost linearly dependent), we might say that the data actually lies along some diagonal axis (the $u$ direction). How can we automatically compute this $u$ direction, the direction on which the data approximately lies?

One way is to pose this problem as finding the unit vector $u$ so that when the data is projected onto the direction corresponding to $u$, the variance of the projected data is maximized. Intuitively, the data starts off with some amount of variance/information in it. We would like to choose a direction $u$ such that if we approximate the data as lying in the direction/subspace corresponding to $u$, as much as possible of this variance is still retained.

Consider the following dataset 

<img src="img/dataset.png" alt="Dataset Visualization" width="500">

Now, suppose we pick $u$ to correspond to the direction shown in this figure

<img src="img/first_u.png" alt="First Principal Component" width="500">

The circles denote the projections of the original data onto this line.
We see that the projected data still has a fairly large variance, and the points tend to be far from zero.
In contrast, suppose we had instead picked this other direction

<img src="img/second_u.png" alt="Second Principal Component" width="500">

The projections have a significantly smaller variance, and are much
closer to the origin.

We would like to select the direction $u$ corresponding to the first of the two figures above.
To formalize this, note that given a unit vector $u$ and a point $x$, the length of the projection of $x$ onto $u$ is given by the scalar product $x^Tu$, i.e. if $x^{(i)}$ is a point in our dataset, its projection onto $u$ lies at distance $x^{(i)^T}u$ from the origin.

Hence, to maximize the variance of the projections, we would like to choose a unit-length $u$ so as to maximize:
\begin{equation*}
\frac{1}{n} \sum_{i=1}^n ( x^{(i)^T} u )^2 = \frac{1}{n} \sum_{i=1}^n u^T x^{(i)} x^{(i)^T} u = u^T \Big( \frac{1}{n} \sum_{i=1}^n x^{(i)} x^{(i)^T} \Big) u \ .
\end{equation*}
Maximizing this, subject to $|| u ||_2 = 1$, gives the _principal eigenvector_ of $\Sigma = \frac{1}{n} \Big( \sum_{i=1}^n x^{(i)} x^{(i)^T} \Big)$, which is just the empirical covariance matrix of the data (assuming the data has been centered to have zero mean).

__To summarize__, we have found that if we wish to find a $1$-dimensional
subspace with which to approximate the data, we should choose $u$ to be the
principal eigenvector of $\Sigma$. More generally, if we wish to project our data
into a $k$-dimensional subspace ($k < d$), we should choose $u_1 , \dots , u_k$ to be the
top $k$ eigenvectors of $\Sigma$. The $u_i$'s now form a new, orthogonal basis for the
data.
Then, to represent $x^{(i)}$ in this basis, we need only compute the corresponding vector

\begin{equation}
y^{(i)} = \begin{pmatrix} 
u^T_1 x^{(i)} \\ u^T_2 x^{(i)} \\ ... \\ u^T_k x^{(i)} 
\end{pmatrix} 
\in \mathbb{R}^k \ .
\end{equation}

Thus, the vector $y^{(i)}$ gives a lower, $k$-dimensional, approximation/representation for $x^{(i)} \in \mathbb{R}^d$.

PCA is also referred to as a __dimensionality reduction__ algorithm. The vectors $u_1 , \dots , u_k$ are called the _first k_ __principal components__ of the data.

A standard application of PCA is to preprocess a dataset to reduce its
dimension before running a supervised learning algorithm with the
$x^{(i)}$'s as inputs. Apart from computational benefits, reducing the data's
dimension can also reduce the complexity of the hypothesis class considered
and help avoid overfitting.

***PCA on the IRIS dataset***

Principal component analysis (PCA) performs a linear transformation on the data so that most of the variance or information in your high-dimensional dataset is captured by the first few principal components.

The first principal component will capture the most variance, followed by the second principal component, and so on.

Each principal component is a linear combination of the original variables. Because all the principal components are orthogonal to each other, there is no redundant information.


In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import decomposition
from sklearn import datasets


# Fix NumPy's global random seed so that random values are repeatable.
np.random.seed(5)

# Reload the dataset, this time keeping all four features and the labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

print("feature matrix size", X.shape)
print("label vector size", y.shape)

In [None]:
from sklearn.decomposition import PCA #principal component analysis

# Build the PCA model and fit it on X in one step; n_components is the number
# of principal components to keep and has to be chosen here.
pca = PCA(n_components=3).fit(X)

# Apply the fitted dimensionality reduction to X.
X_red = pca.transform(X)

print(X_red.shape)
# Fraction of the variance explained by each of the selected components.
print(pca.explained_variance_ratio_)

In [None]:
# Explained variance of each component, read from the fitted model's
# explained_variance_ratio_ attribute.
exp_var_pca = pca.explained_variance_ratio_

# Running total of those ratios; it feeds the step plot that visualises how
# much variance the leading principal components explain together.
cum_sum_eigenvalues = exp_var_pca.cumsum()
print(cum_sum_eigenvalues)

In [None]:
# Explained-variance chart: one bar per principal component plus a step line
# for the cumulative total.
plt.bar(
    range(len(exp_var_pca)),
    exp_var_pca,
    alpha=0.5,
    align='center',
    label='Individual explained variance',
)
plt.step(
    range(len(cum_sum_eigenvalues)),
    cum_sum_eigenvalues,
    where='mid',
    label='Cumulative explained variance',
)

plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
# How many components do we want to keep?

pca = decomposition.PCA(n_components=3)  # number of PCA components to keep

# Fit and project the untouched iris.data rather than X. The cell overwrites
# X with its own output, so fitting on X would run PCA on already-projected
# data whenever the cell is executed again.
pca.fit(iris.data)
X = pca.transform(iris.data)
print(X.shape)

In [None]:
# Original class labels, straight from the dataset.
y_true = iris.target

# Permuted copy of the labels (0 -> 1, 1 -> 2, 2 -> 0), cast to float; it is
# used only to choose the colours in the plots.
color_for_class = [1, 2, 0]
y_vis = np.choose(y_true, color_for_class).astype(float)

print(y_vis)

In [None]:

# 3D scatter plot of the samples after the PCA projection, one colour per class.
fig = plt.figure(1, figsize=(4, 3))
plt.clf()

ax = fig.add_subplot(111, projection="3d", elev=48, azim=134)
ax.set_position([0, 0, 0.95, 1])

# Integer copy of the labels, used to build the per-class boolean masks.
y_true = y.astype(int)

# Write each class name at the mean position of its samples; the +1.5 shift
# along the second axis moves the text away from the point cloud.
for name, label in [("Setosa", 0), ("Versicolour", 1), ("Virginica", 2)]:
    ax.text3D(
        X[y_true == label, 0].mean(),
        X[y_true == label, 1].mean() + 1.5,
        X[y_true == label, 2].mean(),
        name,
        horizontalalignment="center",
        bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
    )

# Use y_vis (the permuted labels) for colouring.
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y_vis, cmap=plt.cm.nipy_spectral, edgecolor="k")

# Hide tick labels.
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])

# X holds the PCA-transformed data at this point, so the axes are principal
# components, not the original sepal/petal measurements.
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')

plt.show()

In [None]:
# Labels first: an integer copy for masking, and a permuted float copy that is
# only used to choose the colours.
y_true = y.astype(int)
y_vis = np.choose(y_true, [1, 2, 0]).astype(float)

# Fresh 3D axes on figure 1, viewed from a fixed elevation/azimuth.
fig = plt.figure(1, figsize=(4, 3))
plt.clf()
ax = fig.add_subplot(111, projection="3d", elev=48, azim=134)
ax.set_position([0, 0, 0.95, 1])

# Put every class name at the mean position of its samples, shifted by 1.5
# along the second axis.
for label, name in enumerate(("Setosa", "Versicolour", "Virginica")):
    in_class = y_true == label
    cx, cy, cz = (X[in_class, col].mean() for col in range(3))
    ax.text3D(
        cx,
        cy + 1.5,
        cz,
        name,
        horizontalalignment="center",
        bbox={"alpha": 0.5, "edgecolor": "w", "facecolor": "w"},
    )

# Draw the samples, coloured by the permuted labels.
ax.scatter(
    X[:, 0],
    X[:, 1],
    X[:, 2],
    c=y_vis,
    cmap=plt.cm.nipy_spectral,
    edgecolor="k",
)

# Blank out the tick labels on all three axes.
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])

# Name the axes after the principal components.
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')

plt.show()