# *k*-NN and PCA
The intrinsic dimension of the data is important in *k*-NN.  
Here we use Principal Component Analysis (PCA) to get an estimate of the intrinsic dimension of a dataset. 

In [None]:
import numpy as np
import kNNDataLoader
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from sklearn.decomposition import PCA

Load the datasets. 

In [None]:
# Load the datasets through the project-local kNNDataLoader module.
# Name_arr is iterated later in this notebook and each entry is used as a key
# into X_dir (the data matrices handed to PCA). y_dir presumably holds the
# matching labels keyed the same way -- TODO confirm against kNNDataLoader.
Name_arr, X_dir, y_dir = kNNDataLoader.data_loader()
# Bare last expression: have the notebook display the loaded dataset names.
Name_arr

# PCA
Check the variation in the data *explained* by the first 4 PCs. 

In [None]:
# Number of leading principal components to inspect.
npc = 4
# Maps dataset name -> array with the explained-variance ratio of each of the
# first `npc` principal components.
pca_dir = {}
pca = PCA(n_components=npc)
for n in Name_arr:
    X = X_dir[n]
    # Only the fitted variance ratios are needed here, so fit the model
    # without also projecting the data onto the components.
    pca.fit(X)
    # Copy so the stored ratios never alias state of the reused estimator.
    pca_dir[n] = pca.explained_variance_ratio_.copy()
    # Report the component count from `npc` so the message stays correct
    # when the number of components is changed.
    print("{0:s}: Variance explained by {1:d} PCs: {2:.2f}".format(n, npc, pca_dir[n].sum()))



Plot the variance explained by each of PCs 1 to 4, together with the cumulative variance explained, for the `Shuttle` and `HTRU` datasets. 

In [None]:
# Bar width; the two datasets' bars are shifted by half of it to either side
# of each integer component index so they sit next to each other.
w = 0.25

x = list(range(1, npc + 1))
x1 = [i - w/2 for i in x]  # x positions for HTRU
x2 = [i + w/2 for i in x]  # x positions for Shuttle
h_col = 'blue'
s_col = 'r'

fig, ax1 = plt.subplots(figsize=(6, 4))
ax2 = ax1.twinx()  # second y-axis sharing the same x-axis, for the cumulative curves

# Bars (left axis): variance ratio explained by each individual component.
# Lines (right axis): running total over the first 1..npc components.
for name, xpos, col in (('HTRU', x1, h_col), ('Shuttle', x2, s_col)):
    ratios = pca_dir[name]
    ax1.bar(xpos, ratios, color=col, alpha=0.5, width=w)
    ax2.plot(xpos, np.cumsum(ratios), color=col, marker='x', label=name)

ax1.set_title('Variance explained by the first {0:d} PCs'.format(npc))
ax1.set_xlabel('Principal Components')
ax1.set_ylabel('Explained Variance Ratio')
ax2.set_ylabel('Cumulative Variance')

# Both axes show fractions of the total variance, so keep them on one 0-1 scale.
ax1.set_ylim(0, 1)
ax2.set_ylim(0, 1)

ax2.legend(loc='upper left')

# Component indices are integers; avoid fractional tick labels.
ax1.xaxis.set_major_locator(MaxNLocator(integer=True))

plt.show()