In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris

In [11]:
iris = load_iris()
df = pd.DataFrame(iris.data,columns = iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [15]:
dataMat = df.copy()

## PCA

In [17]:
# 1.对所有样本进行中心化（所有样本属性减去属性的平均值）
meanVals = np.mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals

In [18]:
meanVals

sepal length (cm)    5.843333
sepal width (cm)     3.054000
petal length (cm)    3.758667
petal width (cm)     1.198667
dtype: float64

In [31]:
meanRemoved.shape

(150, 4)

In [37]:
n_dimension = np.shape(meanRemoved)[0]

In [45]:
# 2.计算样本的协方差矩阵 XXT
#covmat = np.cov(meanRemoved, rowvar=n_dimension)
covmat = (1/n_dimension-1)*meanRemoved.T.dot(meanRemoved)
print(covmat)

                   sepal length (cm)  sepal width (cm)  petal length (cm)  \
sepal length (cm)        -101.487211          5.811993        -188.513476   
sepal width (cm)            5.811993        -27.825849          47.615632   
petal length (cm)        -188.513476         47.615632        -460.771308   
petal width (cm)          -76.505209         17.462005        -191.873988   

                   petal width (cm)  
sepal length (cm)        -76.505209  
sepal width (cm)          17.462005  
petal length (cm)       -191.873988  
petal width (cm)         -86.201202  


In [54]:
# 3.对协方差矩阵做特征值分解，求得其特征值和特征向量，并将特征值从大到小排序，筛选出前topNfeat个
n_components = 2
eigVals, eigVects = np.linalg.eig(np.mat(covmat))
idx = np.argsort(eigVals) #对特征值从大到小排列
#eigValInd = eigValInd[:-(n_components+1):-1]    # 取前n_components大的特征值的索引
#redEigVects = eigVects[:, eigValInd]        # 取前n_components大的特征值所对应的特征向量
eigValInd = eigVals[idx][:n_dimension]
redEigVects = np.atleast_1d(eigVects[:,idx])[:,:n_dimension]

In [55]:
# 4.将数据转换到新的低维空间中
lowDDataMat = meanRemoved.dot(redEigVects)     # 降维之后的数据
reconMat = (lowDDataMat * redEigVects.T) + meanVals # 重构数据，可在原数据维度下进行对比查看
#return np.array(lowDDataMat), np.array(reconMat)

ValueError: Unable to coerce to DataFrame, shape must be (150, 4): given (4, 4)

In [51]:
lowDDataMat

Unnamed: 0,0,1
0,0.001006,-0.021512
1,0.099602,-0.203521
2,0.019305,0.024709
3,-0.075955,0.037672
4,-0.063129,0.096230
5,-0.027147,0.174326
6,-0.050100,0.264251
7,-0.046282,-0.015802
8,-0.026615,0.027335
9,-0.055891,-0.191533


In [None]:


def showData(dataMat, reconMat):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(dataMat[:, 0], dataMat[:, 1], c='green')
    ax.scatter(np.array(reconMat[:, 0]), reconMat[:, 1], c='red')
    plt.show()

def pca(dataMat, topNfeat=999999):

    # 1.对所有样本进行中心化（所有样本属性减去属性的平均值）
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals

    # 2.计算样本的协方差矩阵 XXT
    covmat = np.cov(meanRemoved, rowvar=0)
    print(covmat)

    # 3.对协方差矩阵做特征值分解，求得其特征值和特征向量，并将特征值从大到小排序，筛选出前topNfeat个
    eigVals, eigVects = np.linalg.eig(np.mat(covmat))
    eigValInd = np.argsort(eigVals)
    eigValInd = eigValInd[:-(topNfeat+1):-1]    # 取前topNfeat大的特征值的索引
    redEigVects = eigVects[:, eigValInd]        # 取前topNfeat大的特征值所对应的特征向量

    # 4.将数据转换到新的低维空间中
    lowDDataMat = meanRemoved * redEigVects     # 降维之后的数据
    reconMat = (lowDDataMat * redEigVects.T) + meanVals # 重构数据，可在原数据维度下进行对比查看
    return np.array(lowDDataMat), np.array(reconMat)


# ---------------------------- main ---------------------------- #

dataMat = loadDataSet('./data/testSet.txt')
lowDDataMat, reconMat = pca(dataMat, 1)
#showData(dataMat, lowDDataMat)
showData(dataMat, reconMat)
print(lowDDataMat)