<center><img src="img/logo_hse_black.jpg"></center>

<h1><center>Data Analysis</center></h1>
<h2><center>Seminar: PCA. t-SNE. </center></h2>

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 6)

# PCA

Lets try to use PCA on some dataset with various different ways

In [2]:
# Load data
df_wine = pd.read_csv('data/winequality-red.csv', sep=';')

# Make classification target feature
df_wine.loc[:, 'quality_cat'] = (df_wine.quality > 5).astype(int)
df_wine = df_wine.drop('quality', axis=1)

# Get descriptive and target features
X = df_wine.iloc[:, :-1].values
y = df_wine.iloc[:, -1].values

In [4]:
X.shape

(1599, 11)

In [5]:
y

array([0, 0, 0, ..., 1, 0, 1])

## PCA via sklearn

In [3]:
from sklearn.decomposition import PCA

In [6]:
pca = PCA(n_components=5)

In [7]:
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [8]:
PC = pca.transform(X)

In [9]:
PC.shape

(1599, 5)

In [14]:
pca.components_.shape

(5, 11)

In [15]:
PC[4:]

array([[-13.22490501,  -2.02389981,  -1.12682053,  -0.39923318,
         -1.08538902],
       [ -6.93400027,  -1.38790176,  -1.12127705,  -0.56009974,
         -1.01986846],
       [ 12.03682832,  -3.61177553,  -0.67163691,  -1.00390299,
         -0.75505874],
       ...,
       [ -3.43135351,  14.27124349,  -1.75071927,   0.17465219,
          0.23627052],
       [  1.13287834,  16.31616732,  -2.13441396,  -0.23538796,
         -0.51902768],
       [ -3.8743766 ,   3.12634754,  -1.87414925,   1.7006739 ,
          0.10432974]])

In [12]:
PC1 = X.dot(pca.components_.T)

In [16]:
PC1[:4]

array([[35.49124148,  3.27633804,  7.09548962,  1.39282058,  9.86729436],
       [70.75387009,  9.7085594 ,  7.91193216,  1.59211602, 10.39557399],
       [55.87881981,  2.79877699,  7.64044185,  1.57298448, 10.39974983],
       [62.14620931,  3.3491157 , 10.85634969,  0.21799436, 10.88887765]])

In [17]:
X_ = (X - X.mean(axis=0)) / X.std(axis=0)

In [18]:
X_

array([[-0.52835961,  0.96187667, -1.39147228, ...,  1.28864292,
        -0.57920652, -0.96024611],
       [-0.29854743,  1.96744245, -1.39147228, ..., -0.7199333 ,
         0.1289504 , -0.58477711],
       [-0.29854743,  1.29706527, -1.18607043, ..., -0.33117661,
        -0.04808883, -0.58477711],
       ...,
       [-1.1603431 , -0.09955388, -0.72391627, ...,  0.70550789,
         0.54204194,  0.54162988],
       [-1.39015528,  0.65462046, -0.77526673, ...,  1.6773996 ,
         0.30598963, -0.20930812],
       [-1.33270223, -1.21684919,  1.02199944, ...,  0.51112954,
         0.01092425,  0.54162988]])

## PCA via covariance matrix 

In [19]:
from numpy.linalg import eig

In [20]:
C = X_.T.dot(X)

In [21]:
C.shape

(11, 11)

In [23]:
lmbd, A = eig(C)

In [24]:
A.shape

(11, 11)

In [26]:
lmbd.shape

(11,)

In [27]:
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [28]:
PC2 = pca.transform(X_)

In [29]:
pca.components_

array([[-6.13247431e-03,  3.84465551e-04,  1.70902595e-04,
         8.64894648e-03,  6.37307290e-05,  2.18857434e-01,
         9.75678369e-01,  3.72498542e-06, -2.68008619e-04,
         2.23381730e-04, -6.35846721e-03],
       [-2.38994985e-02, -2.00966661e-03, -3.03480788e-03,
         1.11348551e-02, -2.36654751e-04,  9.75265982e-01,
        -2.18916841e-01, -2.49998510e-05,  3.27182194e-03,
         6.18926046e-04,  1.45642451e-02],
       [ 9.53135980e-01, -2.51315387e-02,  7.37082746e-02,
         2.80913620e-01,  2.94578815e-03,  2.08968395e-02,
        -1.52685886e-03,  7.76139600e-04, -5.86305467e-02,
         1.75252442e-02, -4.85991164e-02],
       [-2.65092214e-01,  7.27374987e-03, -9.83215158e-03,
         9.43177990e-01, -6.45262673e-04, -2.11894371e-02,
        -3.97992967e-03,  5.52088341e-05,  2.05779719e-02,
        -7.19926307e-03,  1.97613982e-01],
       [ 9.81476468e-02, -4.11570558e-02,  4.14815020e-02,
        -1.76626862e-01, -9.47985067e-03, -7.86843454e-03,
  

## PCA via SVD

## Singular Value Decomposition

Each matrix $X$ of size $n \times m$ and rank $r$ can be decomposed as:
$$ X = U S V^\top ,$$
where 
* $U$ - unary matrix, consists of eigenvectors of $XX^\top$
* $V$ - unary matrix, consists of eigenvectors of $X^\top X$
* $S$ - diagonal matrix with singular values $s_i = \sqrt{\lambda_i}$

<img src='img/pca_svd.png'>

In [None]:
from numpy.linalg import svd

# Quality and number of components

Build 2 pipelines:
    * StandartScaler + LogisticRegression
    * StandartScaler + PCA + LogisticRegression

Investigate accuracy of the second pipeline w.r.t. number of components in PCA. Compare it with the first pipeline.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score

# t-SNE and Digits

In [None]:
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits

Run T-SNE on digits data. Investigate the incluence of `perplexity` parameter.