![WhatsApp%20Image%202020-09-logo%20at%2011.52.29%20PM.jpeg](attachment:WhatsApp%20Image%202020-09-logo%20at%2011.52.29%20PM.jpeg)

# Dimensionality Reduction and Manifold Learning
# Principal Components Analysis (PCA)

![Blog_pca_6b.png](attachment:Blog_pca_6b.png)

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
# Breast cancer dataset: load it once and reuse the same Bunch for the
# metadata (e.g. cancer.feature_names, used by the heat maps further down)
# and for the feature matrix / label vector, instead of loading it twice.
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target

# Using PCA to find the first two principal components of the breast cancer dataset

## Let's create a DataFrame so that we can visualize the data

In [37]:
# Wrap the raw feature matrix in a DataFrame; no column names are passed, so
# the columns get the default positional integer labels.
df = pd.DataFrame(data=X_cancer)

# Keep the class labels in a Series named 'target' so the name carries over
# when it is attached to the feature table.
arr2 = pd.Series(data=y_cancer, name='target')

In [38]:
# Preview the first five rows of the feature table.
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [39]:
# Display the label Series (pandas truncates the middle of long Series).
arr2

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: target, Length: 569, dtype: int64

## Adding our target to our features

In [40]:
# Append the labels as the LAST column; the scaling cell further down relies on
# that position when it slices the features with iloc[:, :-1].
df = df.assign(target=arr2)

In [41]:
# Preview the first five rows again, now including the 'target' column.
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


## As you may notice, the features are not on similar scales
## That's why we need to scale the data first
### Before applying PCA, each feature should be centered (zero mean) and scaled to unit variance

In [44]:
from sklearn.preprocessing import StandardScaler

# Every column except the last one ('target') is a feature.
features = df.iloc[:, :-1]

# Standardise each feature to zero mean and unit variance; the result is a
# NumPy array, not a DataFrame.
df_scaled = StandardScaler().fit_transform(features)
df_scaled

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

In [46]:
# Show the first five standardised rows as a table for easier reading.
pd.DataFrame(data=df_scaled).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,2.255747,...,1.88669,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015
1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,-0.868652,...,1.805927,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119
2,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,0.939685,-0.398008,...,1.51187,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391
3,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,4.910919,...,-0.281464,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501
4,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,-0.00956,-0.56245,...,1.298575,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971


## Now let's see the PCA representation

In [83]:
# From here on df_scaled is a DataFrame (re-wrapping is harmless on re-run).
df_scaled = pd.DataFrame(df_scaled)

# Fit a 2-component PCA on the standardised features ...
pca = PCA(n_components=2)
pca.fit(df_scaled)

# ... and project every sample onto those two components.
df_pca = pca.transform(df_scaled)
print(df_scaled.shape, df_pca.shape)

(569, 30) (569, 2)


In [79]:
# Display the projected data: one row per sample, one column per component.
df_pca

array([[ 9.19283683,  1.94858307],
       [ 2.3878018 , -3.76817174],
       [ 5.73389628, -1.0751738 ],
       ...,
       [ 1.25617928, -1.90229671],
       [10.37479406,  1.67201011],
       [-5.4752433 , -0.67063679]])

In [80]:
# First five samples of the 2-component projection, shown as a table.
pd.DataFrame(data=df_pca).head(5)

Unnamed: 0,0,1
0,9.192837,1.948583
1,2.387802,-3.768172
2,5.733896,-1.075174
3,7.122953,10.275589
4,3.935302,-1.948072


### Let's visualize the 2 components

In [81]:
from adspy_shared_utilities import plot_labelled_scatter

# Scatter the two components, one colour per class.
# NOTE(review): plot_labelled_scatter is an external helper; the labelling
# below assumes it leaves its axes as the current axes - confirm.
plot_labelled_scatter(df_pca, y_cancer, ['malignant', 'benign'])

# Label the current axes.
ax = plt.gca()
ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')
ax.set_title('Breast Cancer Dataset PCA (n_components = 2)');

<IPython.core.display.Javascript object>

## Now let's reduce the 30 features to 4 principal components

In [89]:
# Same standardised features, kept under a separate name for the 4-component run.
df_scaled4 = pd.DataFrame(df_scaled)

# Fit a 4-component PCA ...
pca4 = PCA(n_components=4)
pca4.fit(df_scaled4)

# ... and project every sample onto the four components.
df_pca4 = pca4.transform(df_scaled4)
print(df_scaled4.shape, df_pca4.shape)

(569, 30) (569, 4)


In [77]:
# Preview the 4-component projection. This must use df_pca4: df_pca holds the
# 2-component projection when the notebook is run top to bottom.
pd.DataFrame( df_pca4 ).head()

Unnamed: 0,0,1,2,3
0,9.192837,1.948583,-1.123166,3.633731
1,2.387802,-3.768172,-0.529293,1.118264
2,5.733896,-1.075174,-0.551748,0.912083
3,7.122953,10.275589,-3.23279,0.152547
4,3.935302,-1.948072,1.389767,2.940639


### Let's visualize 3 components of the 4 

In [75]:
# Axes3D is imported for its side effect: older matplotlib releases need it to
# make the '3d' projection available.
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot the first three columns of the 4-component projection (df_pca4).
# df_pca only has two columns, so indexing its column 2 raises IndexError.
# Points are coloured by class label so the colorbar encodes something; a named
# colormap is passed instead of plt.hot(), which returns None and silently
# changes the global default colormap.
img = ax.scatter(df_pca4[:, 0], df_pca4[:, 1], df_pca4[:, 2],
                 c=y_cancer, cmap='coolwarm')

ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')
ax.set_zlabel('Third principal component')
ax.set_title('Breast Cancer Dataset PCA (first 3 of 4 components)')

# Two classes only, so give the colorbar two named ticks (same label order as
# the 2D scatter: 0 = malignant, 1 = benign).
cbar = fig.colorbar(img, ticks=[0, 1])
cbar.ax.set_yticklabels(['malignant', 'benign'])
plt.show()

<IPython.core.display.Javascript object>

## Let's see how much each feature contributes to each component, with 2 components

In [88]:
# Heat map of the 2-component PCA: one row per principal component, one column
# per original feature, colour = that feature's weight in the component.
fig = plt.figure(figsize=(8, 4))
plt.imshow(pca.components_, interpolation = 'none', cmap = 'plasma')
feature_names = list(cancer.feature_names)
ax = plt.gca()

# One tick per cell, on the cell's left edge (x) / lower edge (y). The number
# of ticks must equal the number of labels: the previous
# np.arange(-.5, len(feature_names)) produced one tick too many, which recent
# matplotlib versions reject with a ValueError in set_xticklabels.
ax.set_xticks(np.arange(len(feature_names)) - 0.5);
ax.set_yticks(np.arange(pca.components_.shape[0]) + 0.5);
ax.set_xticklabels(feature_names, rotation=90, ha='left', fontsize=11);
ax.set_yticklabels(['First Comp.', 'Sec. Comp.'], va='bottom', fontsize=11);

# Horizontal colorbar marking only the extreme weights and zero.
plt.colorbar(orientation='horizontal', ticks=[ pca.components_.min(), 0, pca.components_.max() ], pad=0.65);

<IPython.core.display.Javascript object>

## Let's see how much each feature contributes to each component, with 4 components

In [98]:
# Heat map of the 4-component PCA: one row per principal component, one column
# per original feature, colour = that feature's weight in the component.
fig = plt.figure(figsize=(8, 4))
plt.imshow(pca4.components_, interpolation = 'none', cmap = 'plasma')
feature_names = list(cancer.feature_names)
component_names = ['First Comp.', 'Sec. Comp.', 'Third Comp.', 'Fourth Comp.']
ax = plt.gca()

# One tick per cell, on the cell's left edge (x) / lower edge (y). The tick
# count must equal the label count (recent matplotlib raises ValueError
# otherwise), and the y axis needs one labelled tick per component - the
# earlier version only placed two blank ticks for four rows.
ax.set_xticks(np.arange(len(feature_names)) - 0.5);
ax.set_yticks(np.arange(len(component_names)) + 0.5);
ax.set_xticklabels(feature_names, rotation=90, ha='left', fontsize=11);
ax.set_yticklabels(component_names, va='bottom', fontsize=11);

# Horizontal colorbar marking only the extreme weights and zero.
plt.colorbar(orientation='horizontal', ticks=[ pca4.components_.min(), 0, pca4.components_.max() ], pad=0.65);

<IPython.core.display.Javascript object>

## There are other algorithms for dimensionality reduction and manifold learning, such as:

### Multidimensional scaling (MDS)
### t-SNE

# The rest of the research is up to you: explore these methods to learn more
![That%2527s_all_Folks_tagline.jpg](attachment:That%2527s_all_Folks_tagline.jpg)