## Principal Component Analysis
PCA is used for feature reduction. It is also useful for visualization, or for giving slow-running algorithms a smaller dataset to work on. It is considered *unsupervised* because there is no labelled data to train from.

Remember to standardize the data before running PCA.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import (datasets, decomposition, ensemble, 
                     metrics, model_selection, preprocessing)
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.features.pca import PCADecomposition

In [None]:
# Build a DataFrame from the iris measurements, then append the class label
# as a final 'target' column.
iris = datasets.load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
target = pd.Series(iris.target)
iris_df['target'] = target


In [None]:
# Show the assembled frame: the feature columns followed by 'target'
iris_df

In [None]:
# Standardize the 4 measurement columns (cm) and run PCA with
# n_components=None, so the result also has 4 columns.
# explained_variance_ratio_ tells us that the first principal
# component on its own gives roughly 73% of the variance.
X = iris_df.iloc[:, :4]
scaler = preprocessing.StandardScaler()
X_s = scaler.fit_transform(X)
pca4 = decomposition.PCA(n_components=None)
res4 = pca4.fit_transform(X_s)
pca4.explained_variance_ratio_

In [None]:
# 1D PCA visualization on the first 2 columns of iris:
# scatter the raw points, project them onto a single principal
# component, then map the projection back into the original 2D space
# and overlay it in green.
X = iris_df.iloc[:, :2]
xy = dict(zip('xy', X.columns))
X.plot(kind='scatter', alpha=.3, **xy)
pca1 = decomposition.PCA(n_components=1)
res = pca1.fit_transform(X)
flat = pca1.inverse_transform(res)
plt.scatter(flat[:, 0], flat[:, 1], c='g', alpha=.5)

In [None]:
# Explained variance: the fraction of the total variance captured by
# each principal component (pca1 was built with n_components=1, so
# this array has a single entry)
pca1.explained_variance_ratio_

In [None]:
# With yellowbrick: PCA reduction from 4 dimensions down to 2
X = iris_df.iloc[:, :4]
y = iris_df.iloc[:, [-1]]

# one colour letter per sample, picked by class id (0 -> r, 1 -> g, 2 -> b)
colors = ['rgb'[class_id] for class_id in y.target]

fig, ax = plt.subplots(figsize=(10, 8))
viz = PCADecomposition(color=colors)
viz.fit_transform(X)
viz.poof()

In [None]:
# Plotting with matplotlib:
# standardize X, project onto 2 principal components, and scatter the
# two components against each other using the per-sample colours.
scaler = preprocessing.StandardScaler()
X_s = scaler.fit_transform(X)
pca2 = decomposition.PCA(n_components=2)
res2 = pca2.fit_transform(X_s)
plt.scatter(x=res2[:, 0], y=res2[:, 1], c=colors)

In [None]:
# Fraction of the variance captured by each of the 2 retained components
pca2.explained_variance_ratio_

In [None]:
# How do features influence components?
# components_ has one row per principal component and one column per
# input feature.
pca2.components_

In [None]:
# Plot of how features impact components:
# the component matrix is transposed so each row of the image is one
# feature, labelled on the y axis with its column name.
feature_names = X.columns
plt.imshow(pca2.components_.T, cmap='plasma')
plt.yticks(range(len(feature_names)), feature_names)
plt.colorbar()

## Exercise: PCA

The (wheat) seed dataset has a feature-engineered column, compactness:
\begin{align}
C = \frac{4 \pi \cdot \text{area}}{\text{perimeter}^2}
\end{align}

* Run PCA on this data set to create 2 components. (Ignore variety)
* Visualize the results
* Plot how the features impact the components

The file is at ``../data/seeds_dataset.txt``


It has the following fields:

1. area A, 
2. perimeter P, 
3. compactness $C = 4 \pi A / P^2$, 
4. length of kernel, 
5. width of kernel, 
6. asymmetry coefficient 
7. length of kernel groove. 
8. variety (Kama, Rosa, Canadian)

https://archive.ics.uci.edu/ml/datasets/seeds




## Classification with PCA

In [None]:
# Random forest on all (raw, untransformed) feature columns
X = iris_df.iloc[:, :4]
y = iris_df.target
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=.3, random_state=42)
# Seed the forest as well as the split: without random_state the trees
# (and therefore the score) change from run to run.
rf1 = ensemble.RandomForestClassifier(random_state=42)
rf1.fit(X_train, y_train)
rf1.score(X_test, y_test)

In [None]:
# Random forest on PCA data.
# Split first, then fit the scaler and PCA on the training rows only, so
# no statistics from the held-out rows leak into the transform.
# The split uses the same test_size/random_state as the raw-feature
# forest, so both models are scored on the same rows.
X_raw = iris_df.iloc[:, :4]
y = iris_df.target
X_train_raw, X_test_raw, y_train, y_test = model_selection.train_test_split(
    X_raw, y, test_size=.3, random_state=42)

scaler = preprocessing.StandardScaler().fit(X_train_raw)
pca2 = decomposition.PCA(n_components=2).fit(scaler.transform(X_train_raw))

X_train = pca2.transform(scaler.transform(X_train_raw))
X_test = pca2.transform(scaler.transform(X_test_raw))

# The whole dataset pushed through the same fitted transforms: X_s is the
# scaled data and X the 2D PCA projection, as later cells expect.
X_s = scaler.transform(X_raw)
X = pca2.transform(X_s)

# Seeded so the score is reproducible from run to run
rf2 = ensemble.RandomForestClassifier(random_state=42)
rf2.fit(X_train, y_train)
rf2.score(X_test, y_test)

In [None]:
# Yellowbrick version of the confusion matrix for rf2
mapping = dict(zip(range(3), iris.target_names))  # class id -> name
fig, ax = plt.subplots(figsize=(8, 8))
cm = ConfusionMatrix(
    rf2,
    classes=iris.target_names,
    label_encoder=mapping,
)
# rf2 is already "fit", so cm.fit(X, y) is deliberately not called here
cm.score(X_test, y_test)
cm.poof()

In [None]:

def fig_with_title(ax, title, figkwargs):
    """Return a ``(fig, ax)`` pair, creating a new figure when ``ax`` is None.

    Parameters
    ----------
    ax : matplotlib Axes or None
        Axes to draw on. When None, a new figure holding a single subplot
        is created.
    title : str or None
        Title set on the axes; skipped when empty or None.
    figkwargs : dict or None
        Keyword arguments forwarded to ``plt.figure`` when a new figure is
        created. None means no extra arguments.

    Returns
    -------
    tuple
        ``(fig, ax)`` where ``fig`` is the figure that owns ``ax``.
    """
    if figkwargs is None:
        figkwargs = {}
    if ax is None:
        fig = plt.figure(**figkwargs)
        ax = fig.add_subplot(111)
    else:
        # Take the figure that owns the axes; plt.gcf() would return the
        # *current* figure, which need not be the one ``ax`` lives in.
        fig = ax.figure
    if title:
        ax.set_title(title)
    return fig, ax


def plot_confusion_matrix(clf, X, y, labels, random_state=42, annotate=True,
                          cmap=plt.cm.Blues,
                          title="Confusion Matrix", ax=None, figkwargs=None):
    """Draw the confusion matrix of ``clf`` on ``X``/``y`` as a heatmap.

    Parameters
    ----------
    clf : fitted classifier
        Object exposing ``predict(X)``.
    X : array-like
        Samples passed to ``clf.predict``.
    y : array-like
        True labels compared against the predictions.
    labels : sequence
        Tick labels for both axes, one per class.
    random_state : int
        Unused by this function; kept so existing callers keep working.
    annotate : bool
        When True, write each cell's count on top of the heatmap.
    cmap : matplotlib colormap
        Colormap for the heatmap.
    title : str
        Axes title.
    ax : matplotlib Axes or None
        Axes to draw on; a new figure is created when None.
    figkwargs : dict or None
        Forwarded to figure creation when ``ax`` is None.

    Returns
    -------
    tuple
        ``(fig, ax)`` for the drawn plot.
    """
    fig, ax = fig_with_title(ax, title, figkwargs)
    y_pred = clf.predict(X)
    cm = metrics.confusion_matrix(y, y_pred)
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    # Attach the colorbar to the axes we drew on, not whichever is current
    fig.colorbar(im, ax=ax)
    ticks = range(len(labels))
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels, rotation=45)
    ax.set_yticks(ticks)
    ax.set_yticklabels(labels)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    if annotate:
        # Rows are true labels, columns are predictions; image coordinates
        # are (x=column, y=row). Annotate on ``ax`` itself so a
        # caller-supplied axes gets the text, and use row/col names so the
        # ``y`` parameter is not shadowed.
        for row in ticks:
            for col in ticks:
                ax.annotate(str(cm[row][col]),
                            xy=(col, row),
                            ha='center', va='center', color='red',
                            fontsize=25, fontstyle='oblique')

    return fig, ax

In [None]:
# Class names used as labels for the confusion-matrix plots
iris.target_names

In [None]:
# Matplotlib confusion matrix for rf2.
# NOTE(review): X and y here are the full dataset (training and test rows),
# whereas the Yellowbrick cell scores X_test/y_test only - confirm that
# evaluating on rows the model was trained on is intended.
plot_confusion_matrix(rf2, X, y, labels=iris.target_names)

In [None]:
# Plot the 2D PCA projection of the test rows
# Color is the random forest prediction
# Marker shape is the actual species
pred = rf2.predict(X_test)
shapes = 'sox'
fig = plt.figure(figsize=(14,10))
# One scatter call per actual class, because matplotlib takes a single
# marker per call; pairing class id and marker via enumerate replaces the
# unused per-sample marker list the loop used to overwrite.
for actual, shape in enumerate(shapes):
    mask = y_test==actual
    xs = X_test[mask]
    color = ['rgb'[m] for m in pred[mask]]
    plt.scatter(xs[:,0], xs[:,1], c=color, marker=shape)

## Exercise: Classification with PCA
* Run a classification on PCA'd data
* How does it perform versus the raw data?