In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Knobs for the synthetic 2-D point cloud.
angle = np.pi/5  # rotation angle, in radians
stretch = 5      # scale factor applied to the first coordinate
m = 200          # number of points

# Seeded isotropic Gaussian cloud; stretching it along the first axis and then
# rotating it yields two features that vary together.
np.random.seed(0)
stretch_matrix = np.array([[stretch, 0], [0, 1]])
rotation_matrix = [[np.cos(angle), np.sin(angle)], [-np.sin(angle), np.cos(angle)]]
X = (np.random.randn(m, 2)/10).dot(stretch_matrix).dot(rotation_matrix)

# Scatter plot on fixed, square axis limits.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(X[:, 0], X[:, 1], "bo", alpha=0.5)
ax.axis([-1.4, 1.4, -1.4, 1.4])
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18, rotation=0)
ax.grid(True)

### Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the two features before running PCA on them.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Same scatter plot as before, now in standardized units.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(X_scaled[:, 0], X_scaled[:, 1], "bo", alpha=0.5)
ax.axis([-4, 4, -4, 4])
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18, rotation=0)
ax.grid(True)


### Covariance matrix of the scaled data (before PCA)

In [None]:
# Covariance matrix of the standardized features. X_scaled is transposed because
# np.cov treats each row as one variable by default.
np.cov(X_scaled.T)

We can see that the two features are highly correlated with each other.

### Apply PCA

In [None]:
from sklearn.decomposition import PCA

# Keep both components: this only re-expresses the points in the principal-axis
# coordinate system, no dimension is dropped yet.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter plot of the data in principal-component coordinates.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(X_pca[:, 0], X_pca[:, 1], "bo", alpha=0.5)
ax.axis([-4, 4, -4, 4])
ax.set_xlabel("$PC_1$", fontsize=18)
ax.set_ylabel("$PC_2$", fontsize=18, rotation=0)
ax.grid(True)

### Eigenvectors

In [None]:
# Principal axes of the fitted PCA: one row per component, one column per input feature.
pca.components_

### Visualizing Eigenvectors

In [None]:
# Scaled data with the two principal axes drawn on top as arrows from the origin.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(X_scaled[:, 0], X_scaled[:, 1], "bo", alpha=0.5)

# Row 0 of components_ is PC1 (red), row 1 is PC2 (green).
for row, color in ((0, 'r'), (1, 'g')):
    dx = pca.components_[row, 0]
    dy = pca.components_[row, 1]
    ax.arrow(0, 0, dx, dy, width=0.1, color=color, zorder=10)

ax.axis([-4, 4, -4, 4])
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18, rotation=0)
ax.grid(True)

### Variance along the PCs

In [None]:
# Amount of variance captured along each principal component of the fitted PCA.
pca.explained_variance_

### Let's check the covariance matrix after PCA.

In [None]:
# Covariance matrix of the PCA-transformed data (transposed so each row is one component).
np.cov(X_pca.T)

The two PCs are uncorrelated.

### Variance ratio along the PCs (i.e., importance of the "new" features)

In [None]:
# Diagonal of the covariance matrix of the transformed data, i.e. the variance
# along PC1 and PC2. The matrix is computed once and indexed twice.
cov_pca = np.cov(X_pca.T)
cov_pca[0, 0], cov_pca[1, 1]

In [None]:
# Fraction of the total variance explained by each principal component.
pca.explained_variance_ratio_

In [None]:
from sklearn.decomposition import PCA

# Reduce to a single dimension: only the first principal component is kept.
# This replaces the 2-component `pca` and `X_pca` from the cells above.
pca = PCA(n_components=1)
X_pca = pca.fit_transform(X_scaled)

# The projected points live on a line, so draw them at height zero.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(X_pca[:, 0], np.zeros_like(X_pca), "bo", alpha=0.5)

ax.axis([-4, 4, -4, 4])
ax.set_xlabel("$PC_1$", fontsize=18)
ax.grid(True)

In [None]:
from sklearn.decomposition import PCA

# Map the 1-D projection back into the original (scaled) feature space.
X_inverse = pca.inverse_transform(X_pca)

# Scatter plot of the reconstructed points in feature coordinates.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(X_inverse[:, 0], X_inverse[:, 1], "bo", alpha=0.5)

ax.axis([-4, 4, -4, 4])
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18, rotation=0)
ax.grid(True)

### Dimensionality reduction is effective in removing highly correlated features.

### Let's play with the breast cancer dataset.
This example is adapted from: https://towardsdatascience.com/how-do-you-apply-pca-to-logistic-regression-to-remove-multicollinearity-10b7f8e89f9b

In [None]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the dataset bundled with scikit-learn and wrap the feature matrix in a
# DataFrame whose columns carry the feature names.
cancer = load_breast_cancer()
df = pd.DataFrame(data=cancer["data"], columns=cancer["feature_names"])
df

In [None]:
# Target array of the breast cancer dataset; used as the labels `y` for the classifiers below.
cancer.target

### Let's plot correlation coefficients.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between all feature columns, drawn as an annotated heatmap.
feature_corr = df.corr()
fig, corr_ax = plt.subplots(figsize=(25, 20))
sns.heatmap(feature_corr, annot=True, ax=corr_ax)

### Let's make a DT model and check the performance.

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Features are the raw (unscaled) columns; labels are the dataset targets.
X = df
y = pd.Series(cancer.target)

# Seeded hold-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit a depth-limited decision tree and predict on the held-out samples.
model = DecisionTreeClassifier(max_depth=5, random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy on the test set.
print("Test score: {:.2f}".format(model.score(X_test, y_test)))

# Confusion matrix (rows: true class, columns: predicted class) as a heatmap.
cf_matrix = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues')
heatmap_ax.set_xlabel('Predicted', fontsize=12)
heatmap_ax.set_ylabel('True', fontsize=12)

### Let's check how many components we want in PCA

In [None]:
# feature scaling
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X)

# Apply PCA, keeping every component so the full variance spectrum is available.
from sklearn.decomposition import PCA

# NOTE: `model` is a PCA instance here, not the decision tree from the cell above.
model = PCA()
model.fit(X_scaled)

# Per-component and cumulative explained variance, in percent (computed once).
variance_pct = model.explained_variance_ratio_ * 100
cumulative_pct = np.cumsum(variance_pct)

print("Variances (Percentage):")
print(variance_pct)
print()

print("Cumulative Variance (Percentage):")
print(cumulative_pct)
print()

# Cumulative explained-variance plot.
# The x values start at 1 so that the point at x = k is the variance retained by
# the first k components. Plotting against the implicit 0-based index would shift
# the curve by one relative to the "Number of components" axis label.
component_counts = np.arange(1, len(cumulative_pct) + 1)
plt.plot(component_counts, cumulative_pct)
plt.xlabel("Number of components (Dimensions)")
plt.ylabel("Cumulative variance (%)")

plt.savefig('explained_variance.png',dpi=300)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize the features, then project them onto the first 7 principal components.
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=7)

# The data in the "new" low-dimensional feature space, wrapped in a DataFrame
# (one column per component).
X_pca = pd.DataFrame(pca.fit_transform(X_scaled))
X_pca

### Let's check correlation coefficients of the transformed data.

In [None]:
fig = plt.figure(figsize=(10, 8))
sns.heatmap(X_pca.corr(), annot=True)

### Another way to do this is to determine how much variance you'd like to keep.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features first.
scaled_features = StandardScaler().fit_transform(X)
X_scaled = scaled_features

# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction of the variance (here 90%).
pca = PCA(n_components=0.9)

# Low-dimensional representation, wrapped in a DataFrame for display.
projected = pca.fit_transform(X_scaled)
X_pca = pd.DataFrame(projected)
X_pca

### Fractional variance and the number of components to achieve that fractional variance.

In [None]:
# Left: the value passed to the constructor (the variance fraction requested).
# Right: the number of components PCA actually kept after fitting.
pca.n_components, pca.n_components_

### Let's make a DT model using the "PCA-ed" data.

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): X_pca comes from a scaler and PCA that were fitted on all samples
# before this split, so the test rows influenced the transform. For an unbiased
# estimate, fit the scaler/PCA on the training split only. Left as-is here.
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, random_state=0)

# Same classifier settings as the model trained on the raw features.
model = DecisionTreeClassifier(max_depth=5, random_state=0).fit(X_train_pca, y_train)
y_pred = model.predict(X_test_pca)

# Accuracy on the held-out samples.
print("Test score: {:.2f}".format(model.score(X_test_pca, y_test)))

# Confusion matrix (rows: true class, columns: predicted class) as a heatmap.
cf_matrix = confusion_matrix(y_test, y_pred)
print("\nTest confusion_matrix")
heatmap_ax = sns.heatmap(cf_matrix, annot=True, cmap='Blues')
heatmap_ax.set_xlabel('Predicted', fontsize=12)
heatmap_ax.set_ylabel('True', fontsize=12)

We got the same accuracy of 0.9 in the lower-dimensional space (7 vs. 30 features), which is great!

### Let's move to an even higher-dimensional space. We will use the MNIST dataset.

In [None]:
import numpy as np
from sklearn.datasets import fetch_openml

# Fetch MNIST from OpenML as plain arrays rather than a DataFrame (network access
# is needed on the first call).
mnist = fetch_openml(name='mnist_784', version=1, as_frame=False)

# Store the labels as unsigned 8-bit integers.
# NOTE(review): presumably they arrive as strings from OpenML; confirm.
mnist["target"] = mnist["target"].astype(np.uint8)

In [None]:
from sklearn.model_selection import train_test_split

# Keep every 10th sample so the exercises below run quickly.
X = mnist["data"][::10]
y = mnist["target"][::10]

# Seed the split so results are reproducible across kernel restarts. The other
# train/test splits in this notebook also use random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# (number of training samples, number of features per sample)
X_train.shape

### TODO: Make a kNN model using k=10 without PCA. Print out test score so that we can compare the performance.

### TODO: Now apply PCA before we make a kNN model while keeping 90% variance.

### Q: How many components did you end up keeping and how does that compare with the original dimension (i.e., 784)?

### TODO: Compare some of the original images and their inverse transformed images from the data that contain only 90% of the original information.

### Now, let's plot some of the Eigenvectors.

In [None]:
# Show selected principal axes as 28x28 images on a 2x5 grid.
# NOTE(review): assumes `pca` was fitted on the MNIST data in the TODO cells above,
# so that each component has 28*28 entries and at least 50 components exist.
panels = [
    (0, 'PC1'),
    (1, 'PC2'),
    (2, 'PC3'),
    (3, 'PC4'),
    (4, 'PC5'),
    (9, 'PC10'),
    (19, 'PC20'),
    (29, 'PC30'),
    (39, 'PC40'),
    (49, 'PC50'),
]

plt.figure(figsize=(20,8));

# Subplot positions are 1-based; each entry is (row in components_, panel title).
for position, (component_index, label) in enumerate(panels, start=1):
    plt.subplot(2, 5, position)
    plt.imshow(pca.components_[component_index].reshape(28,28))
    plt.title(label, fontsize = 14);


### Q: What does the figure tell us?

### PCA for Galaxy Zoo data

In [None]:
# If you need to install skimage, see https://scikit-image.org/docs/stable/install.html

import glob
import skimage
from skimage.transform import resize, rescale
from skimage import io
import matplotlib.pyplot as plt
import numpy as np

path = './galaxyzoo/'

# glob returns files in arbitrary order; sort so the row order of `images` and
# the previewed files are the same on every run and machine.
image_files = sorted(glob.glob(path+"*"))
if not image_files:
    # np.vstack on an empty list fails with an unhelpful message; be explicit.
    raise ValueError("No files found under " + path)

images = []

# 5x5 preview grid for the first 25 images.
fig, axes = plt.subplots(ncols= 5, nrows = 5, figsize=(50,50))

ax = axes.ravel()

for i, file in enumerate(image_files):
    img = skimage.io.imread(file)
    # Average over the last axis to get a single-channel image.
    # NOTE(review): assumes every file loads as height x width x channels;
    # a 2-D grayscale file would fail here.
    img = img.mean(axis=2)
    if i < len(ax):
        ax[i].imshow(img)
        ax[i].set_xticks([])
        ax[i].set_yticks([])
    # Downsample to 100x100 and flatten to one row vector per image.
    images.append(resize(img,(100,100)).ravel())

# One row per image, one column per pixel.
images = np.vstack(images)

images /= 255 # normalize such that each pixel is in between 0 and 1 (assumes 8-bit input images)

In [None]:
# (number of images, pixels per image): each row is one 100x100 image flattened by the loading cell.
images.shape

### TODO: Compute mean squared error when you keep 80%, 90%, and 95% of the variance. How does the error vary?

### TODO: Keep 90% of the variance and apply PCA.

### TODO: Inverse transform the PCA-ed images. Make figures showing (1) original image, (2) inverse transformed image, and (3) the difference between the two.