# Principal Component Analysis (PCA)

# UNsupervised

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
df = pd.read_csv(url, header=None, sep=',')

In [6]:
df.shape

(178, 14)

In [8]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [10]:
df.dtypes

0       int64
1     float64
2     float64
3     float64
4     float64
5       int64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13      int64
dtype: object

In [23]:
x = df.iloc[:,1:].values
y = df.iloc[:,0].values

In [26]:
# investigate the distrubution of class labels
unique, counts = np.unique(y, return_counts=True)  #panda use valuecounts
dict(zip(unique, counts))

{1: 59, 2: 71, 3: 48}

In [28]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, 
                                                   stratify=y, random_state=0)

In [30]:
from sklearn.preprocessing import StandardScaler
# standardize the features
sc = StandardScaler()
x_train_std = sc.fit_transform(x_train)
x_test_std = sc.transform(x_test)

In [32]:
# transform the original data into a new feature space using PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=None)

In [33]:
# transform data in the training set
x_train_pca = pca.fit_transform(x_train_std)

In [34]:
# show the eigenvalues of each principal componente in the training set
# eigenvalues
pca.explained_variance_

array([4.84274532, 2.41602459, 1.54845825, 0.96120438, 0.84166161,
       0.6620634 , 0.51828472, 0.34650377, 0.3131368 , 0.21357215,
       0.1808613 , 0.15362835, 0.10754642])

In [35]:
# display the explaiend ratio of each principal component
# explaied variance ratio = eigenvalue/total eigenvalues for each component
pca.explained_variance_ratio_

array([0.36951469, 0.18434927, 0.11815159, 0.07334252, 0.06422108,
       0.05051724, 0.03954654, 0.02643918, 0.02389319, 0.01629614,
       0.01380021, 0.01172226, 0.00820609])

In [37]:
# There are 13 principal components constructed from 13 original features
pca.components_

array([[-1.37242175e-01,  2.47243265e-01, -2.54515927e-02,
         2.06945084e-01, -1.54365821e-01, -3.93769523e-01,
        -4.17351064e-01,  3.05728961e-01, -3.06683469e-01,
         7.55406578e-02, -3.26132628e-01, -3.68610222e-01,
        -2.96696514e-01],
       [ 5.03034778e-01,  1.64871190e-01,  2.44564761e-01,
        -1.13529045e-01,  2.89745182e-01,  5.08010391e-02,
        -2.28733792e-02,  9.04888470e-02,  8.35232677e-03,
         5.49775805e-01, -2.07164328e-01, -2.49025357e-01,
         3.80229423e-01],
       [-1.37748734e-01,  9.61503863e-02,  6.77775667e-01,
         6.25040550e-01,  1.96135481e-01,  1.40310572e-01,
         1.17053859e-01,  1.31217777e-01,  3.04309008e-02,
        -7.99299713e-02,  5.30591506e-02,  1.32391030e-01,
        -7.06502178e-02],
       [-3.29610003e-03,  5.62646692e-01, -1.08977111e-01,
         3.38187002e-02, -3.67511070e-01,  2.40245127e-01,
         1.87053299e-01, -2.29262234e-02,  4.96262330e-01,
         1.06482939e-01, -3.69053747e

In [38]:
# two principal components will be used
from sklearn.decomposition import PCA
pca = PCA(n_components=2)

In [39]:
# fit and transform data
x_train_pca = pca.fit_transform(x_train_std)
x_test_pca = pca.transform(x_test_std)

In [48]:
# construct a logistic regression model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class='auto', solver='liblinear')

In [49]:
# fit the logistic regrssion model to the training data
lr.fit(x_train_pca, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [50]:
predictions = lr.predict(x_test_pca)

In [53]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions, labels=[1, 2, 3])

array([[15,  3,  0],
       [ 1, 20,  0],
       [ 0,  0, 15]], dtype=int64)

In [56]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average=None)
recall = recall_score(y_test, predictions, average=None)
f1 = f1_score(y_test, predictions, average=None)

print('performance = ')
print('- accuracy = ' + str(accuracy))
print('- precision = ' + str(precision))
print('- recall = ' + str(recall))
print('- f1 = ' + str(f1))

performance = 
- accuracy = 0.9259259259259259
- precision = [0.9375     0.86956522 1.        ]
- recall = [0.83333333 0.95238095 1.        ]
- f1 = [0.88235294 0.90909091 1.        ]


In [57]:
# micro-average precision, recall and f1
precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')

print('performance = ')
print('- precision = ' + str(precision))
print('- recall = ' + str(recall))
print('- f1 = ' + str(f1))

performance = 
- precision = 0.9259259259259259
- recall = 0.9259259259259259
- f1 = 0.9259259259259259


In [58]:
# macro-average precision, recall and f1
precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')

print('performance = ')
print('- precision = ' + str(precision))
print('- recall = ' + str(recall))
print('- f1 = ' + str(f1))

performance = 
- precision = 0.9356884057971014
- recall = 0.9285714285714285
- f1 = 0.9304812834224597


In [59]:
# test 10 feature
# ten principal components will be used
from sklearn.decomposition import PCA
pca = PCA(n_components=10)

In [60]:
# fit and transform data
x_train_pca = pca.fit_transform(x_train_std)
x_test_pca = pca.transform(x_test_std)

In [61]:
# construct a logistic regression model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class='auto', solver='liblinear')

In [62]:
# fit the logistic regrssion model to the training data
lr.fit(x_train_pca, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [63]:
predictions = lr.predict(x_test_pca)

In [64]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions, labels=[1, 2, 3])

array([[18,  0,  0],
       [ 0, 21,  0],
       [ 0,  0, 15]], dtype=int64)

In [65]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average=None)
recall = recall_score(y_test, predictions, average=None)
f1 = f1_score(y_test, predictions, average=None)

print('performance = ')
print('- accuracy = ' + str(accuracy))
print('- precision = ' + str(precision))
print('- recall = ' + str(recall))
print('- f1 = ' + str(f1))

performance = 
- accuracy = 1.0
- precision = [1. 1. 1.]
- recall = [1. 1. 1.]
- f1 = [1. 1. 1.]


In [66]:
# micro-average precision, recall and f1
precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')

print('performance = ')
print('- precision = ' + str(precision))
print('- recall = ' + str(recall))
print('- f1 = ' + str(f1))

performance = 
- precision = 1.0
- recall = 1.0
- f1 = 1.0


In [67]:
# macro-average precision, recall and f1
precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')

print('performance = ')
print('- precision = ' + str(precision))
print('- recall = ' + str(recall))
print('- f1 = ' + str(f1))

performance = 
- precision = 1.0
- recall = 1.0
- f1 = 1.0
