# 主成分分析
## 功能
<p>用来对多维度的数据进行降维，压缩数据或编码，多维数据可视化。</p>

## 算法
<p>数据集X（以7X5为例），通过最优化解码矩阵D使得解码后的数据与原数据X相差较小，可得出最优编码矩阵为D<sup>T</sup>。通过最优化编码损失为最小，可证明最优化的解码矩阵为X<sup>T</sup>X的特征向量B。</p>

<p>计算出X<sup>T</sup>的特征向量后，取较大的特征值对应的特征向量作为编码矩阵，例如取前两个特征向量，将7X5的数据编码成7X2的数据。</p>

In [55]:
import numpy as np

In [56]:
#示例矩阵，通过pca来对X进行编码
X = np.array([[1,2,3,4,5,6,7],[2,3,4,5,6,7,8],[3,4,5,6,7,8,9],[4,5,6,7,8,9,10],[5,6,7,8,9,10,11]]).T
X

array([[ 1,  2,  3,  4,  5],
       [ 2,  3,  4,  5,  6],
       [ 3,  4,  5,  6,  7],
       [ 4,  5,  6,  7,  8],
       [ 5,  6,  7,  8,  9],
       [ 6,  7,  8,  9, 10],
       [ 7,  8,  9, 10, 11]])

In [57]:
K = np.matmul(X.T,X)
K

array([[140, 168, 196, 224, 252],
       [168, 203, 238, 273, 308],
       [196, 238, 280, 322, 364],
       [224, 273, 322, 371, 420],
       [252, 308, 364, 420, 476]])

In [58]:
#X.T*X的特征分解，特征值A，特征向量B
A,B = np.linalg.eig(K)
A,B

(array([ 1.46330282e+03,  6.69717837e+00, -1.28864465e-13, -5.11899283e-14,
         2.80131645e-14]),
 array([[-0.30554081,  0.71178987,  0.33823955, -0.05015158, -0.02181198],
        [-0.37146435,  0.40250992, -0.7317982 ,  0.45839887, -0.1920943 ],
        [-0.4373879 ,  0.09322997,  0.17410819, -0.2490197 ,  0.6716786 ],
        [-0.50331144, -0.21604998,  0.49422002, -0.67655088, -0.67982636],
        [-0.56923498, -0.52532993, -0.27476956,  0.51732329,  0.22205405]]))

In [59]:
B

array([[-0.30554081,  0.71178987,  0.33823955, -0.05015158, -0.02181198],
       [-0.37146435,  0.40250992, -0.7317982 ,  0.45839887, -0.1920943 ],
       [-0.4373879 ,  0.09322997,  0.17410819, -0.2490197 ,  0.6716786 ],
       [-0.50331144, -0.21604998,  0.49422002, -0.67655088, -0.67982636],
       [-0.56923498, -0.52532993, -0.27476956,  0.51732329,  0.22205405]])

In [60]:
#pca编码矩阵D.T
D = B
C = np.matmul(D.T,X.T)
C

array([[-7.22005389e+00, -9.40699338e+00, -1.15939329e+01,
        -1.37808724e+01, -1.59678118e+01, -1.81547513e+01,
        -2.03416908e+01],
       [-1.69434995e+00, -1.22820011e+00, -7.62050269e-01,
        -2.95900426e-01,  1.70249417e-01,  6.36399260e-01,
         1.10254910e+00],
       [-3.99680289e-15, -3.10862447e-15, -2.22044605e-15,
        -1.77635684e-15, -4.44089210e-16,  8.88178420e-16,
         1.33226763e-15],
       [ 8.88178420e-16,  8.88178420e-16,  8.88178420e-16,
         8.88178420e-16,  8.88178420e-16,  8.88178420e-16,
         8.88178420e-16],
       [-2.66453526e-15, -8.88178420e-16, -8.88178420e-16,
        -4.44089210e-16,  1.33226763e-15,  2.22044605e-15,
         3.10862447e-15]])

In [61]:
#解码矩阵D对C进行解码
R = np.matmul(D,C)
R

array([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.],
       [ 2.,  3.,  4.,  5.,  6.,  7.,  8.],
       [ 3.,  4.,  5.,  6.,  7.,  8.,  9.],
       [ 4.,  5.,  6.,  7.,  8.,  9., 10.],
       [ 5.,  6.,  7.,  8.,  9., 10., 11.]])

In [62]:
#取前两维特征向量对数据进行编码与解码
D = B[:,0:2]
C = np.matmul(D.T,X.T)
R = np.matmul(D,C)
C,R

(array([[ -7.22005389,  -9.40699338, -11.59393287, -13.78087236,
         -15.96781185, -18.15475133, -20.34169082],
        [ -1.69434995,  -1.22820011,  -0.76205027,  -0.29590043,
           0.17024942,   0.63639926,   1.1025491 ]]),
 array([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 2.,  3.,  4.,  5.,  6.,  7.,  8.],
        [ 3.,  4.,  5.,  6.,  7.,  8.,  9.],
        [ 4.,  5.,  6.,  7.,  8.,  9., 10.],
        [ 5.,  6.,  7.,  8.,  9., 10., 11.]]))