# CH13 PCA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Dimensionality reduction
- 使数据集更容易使用
- 降低计算开销
- 去除噪声
- 使得结果容易理解

三种降维技术
- **主成分分析**（Principal Component Analysis）
- 因子分析(Factor Analysis)
- 独立成分分析(Independent Component Analysis)

## MLiA

### PCA

#### Raw data

##### Load Data

In [None]:
df = pd.read_table("./Data/CH13/testSet.txt",header=None)
df.head()

In [None]:
df.describe()

##### Visualization

In [None]:
# Plot the first two columns of the raw data against each other.
dataMat = df.values
plt.figure(figsize=(6, 7))
plt.scatter(dataMat[:, 0], dataMat[:, 1])
plt.xlim(5, 14)
plt.show()

#### PCA

##### Algorithm

In [None]:
def pca(dataMat, topNfeat=9999999):
    """Project data onto its leading principal components and reconstruct it.

    Args:
        dataMat: 2-D array-like with one sample per row and one feature per
            column.
        topNfeat: Number of leading principal components to keep. Values
            larger than the number of features keep every component.

    Returns:
        A ``(lowDDataMat, reconMat)`` tuple of ``np.matrix`` objects.
        ``lowDDataMat`` holds the centred samples expressed in the kept
        components (ordered by decreasing eigenvalue); ``reconMat`` maps
        them back into the original feature space with the mean added back.
        The sign of each component is arbitrary, as with any
        eigen-decomposition.
    """
    dataMat = np.asarray(dataMat, dtype=float)
    meanVals = np.mean(dataMat, axis=0)
    # 1. Centre the data: B = A - mean.
    meanRemoved = dataMat - meanVals
    # 2. Covariance between features (columns): C = cov(B). np.cov returns a
    #    0-d array for a single feature, so force a 2-D matrix.
    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=False))
    # 3. Eigen-decomposition. The covariance matrix is symmetric, so eigh is
    #    used: unlike eig it always yields real eigenvalues and orthonormal
    #    eigenvectors.
    eigVals, eigVects = np.linalg.eigh(covMat)
    # 4. Keep the topNfeat eigenvectors, largest eigenvalue first
    #    (argsort is ascending, hence the reversed slice).
    eigValInd = np.argsort(eigVals)[:-(topNfeat + 1):-1]
    # np.matrix is kept for the results so that `*` is a matrix product and
    # existing callers relying on `.A` continue to work.
    redEigVects = np.asmatrix(eigVects[:, eigValInd])

    # Transform the data into the reduced space, then map it back.
    lowDDataMat = np.asmatrix(meanRemoved) * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat

In [None]:
lowDMat, reconMat = pca(dataMat,1)

In [None]:
reconMat

##### Visualization

In [None]:
# Draw the original samples, then overlay the reconstructed points.
plt.figure(figsize=(6, 7))
plt.scatter(dataMat[:, 0], dataMat[:, 1])
plt.scatter(np.asarray(reconMat)[:, 0], np.asarray(reconMat)[:, 1])
plt.xlim(5, 14)
plt.show()

In [None]:
# Repeat the reduction keeping two components and plot the reconstruction.
lowDMat, reconMat = pca(dataMat, 2)
plt.figure(figsize=(6, 7))
plt.scatter(np.asarray(reconMat)[:, 0], np.asarray(reconMat)[:, 1])
plt.xlim(5, 14)
plt.show()

### EX : Secom

In [None]:
df = pd.read_csv("./Data/CH13/secom.data",sep=" ",header = None)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.fillna(dict(df.mean()),inplace=True)
df.describe()

In [None]:
dataMat = df.values

In [None]:
meanVals = np.mean(dataMat,axis=0)

In [None]:
meanRemoved = dataMat - meanVals

In [None]:
covMat = np.cov(meanRemoved, rowvar=0)

In [None]:
covMat

In [None]:
# Eigenvalues and eigenvectors of the covariance matrix.
# The matrix is symmetric, so eigh is used: it returns real eigenvalues in
# ascending order with orthonormal eigenvectors, whereas eig may return
# complex values with tiny imaginary parts. np.mat is also avoided because
# it is no longer available in NumPy 2.0.
eigVals, eigVects = np.linalg.eigh(covMat)

In [None]:
print(np.shape(eigVals),np.shape(eigVects))

In [None]:
eigVals # 特征值

In [None]:
eigVects # 特征向量

In [None]:
np.shape(eigVects)

In [None]:
# Share of the total variance carried by each principal component, largest
# first: each eigenvalue divided by the sum of all eigenvalues. np.real
# drops any zero imaginary part if the eigenvalues came back as complex.
np.sort(np.real(eigVals))[::-1] / np.real(eigVals).sum()

方差占比


## Sklearn

In [None]:
import pandas as pd

In [None]:
df = pd.read_table("./Data/CH13/testSet.txt",header=None)
df.head()

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a one-component scikit-learn PCA on the raw values.
# NOTE(review): this rebinds the name `pca`, which the "Algorithm" cell
# earlier in this notebook defines as a function. After this cell runs,
# calling `pca(dataMat, 1)` again requires re-executing that definition.
pca = PCA(n_components=1)
pca.fit(df.values)

In [None]:
lowDData  = pca.transform(df.values)

In [None]:
pca.get_covariance()

In [None]:
pca.mean_

In [None]:
pca.n_components_

In [None]:
pca.singular_values_

In [None]:
pca.explained_variance_ratio_

In [None]:
pca.explained_variance_

In [None]:
plt.scatter(df.values[:,0],df.values[:,1])
plt.show()

In [None]:
eigVals,eigVects = np.linalg.eig(pca.get_covariance())

In [None]:
eigVals

In [None]:
eigVects

In [None]:
df = pd.read_csv("./Data/CH13/secom.data",sep=" ",header = None)

In [None]:
df.fillna(0,inplace=True)

In [None]:
pca = PCA(n_components=20)
pca.fit(df.values)

In [None]:
pca.explained_variance_ratio_

In [None]:
pca.components_

In [None]:
pca.get_covariance()

### OneClass

In [None]:
from sklearn import svm
from sklearn import metrics

In [None]:
# Load the SECOM features and labels; missing measurements are filled with 0.
X = pd.read_csv("./Data/CH13/secom.data", sep=" ", header=None)
y = pd.read_csv("./Data/CH13/secom_labels.data", sep=" ", header=None)
X.fillna(0, inplace=True)

# Fit the one-class model on the rows labelled 1, then predict for every row.
clf = svm.OneClassSVM(nu=0.2, kernel="rbf", gamma=3.8)
clf.fit(X[y[0] == 1].values)
y_pred = clf.predict(X.values)
y["y_pred"] = y_pred
print(
    "F1 Score: ", metrics.f1_score(y[0], y["y_pred"]),
    "ACC:", metrics.accuracy_score(y[0], y["y_pred"]),
)

In [None]:
# Area under the ROC curve of the predictions against the true labels.
# metrics.auc expects curve coordinates (and no longer accepts `reorder`),
# so roc_auc_score(y_true, y_score) is the appropriate call here.
metrics.roc_auc_score(y[0], y["y_pred"])

In [None]:
metrics.roc_curve(y_true=y[0],y_score=y["y_pred"])

### IForest

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Fit the isolation forest on the rows labelled 1, predict for every row,
# and report the accuracy against the labels.
iF = IsolationForest(max_samples=20, random_state=0)
iF.fit(X[y[0] == 1].values)
y["y_pred"] = y_pred = iF.predict(X.values)
metrics.accuracy_score(y[0], y["y_pred"])

In [None]:
metrics.accuracy_score(y[y[0]==1][0],y[y[0]==1]["y_pred"] )

### SVC

In [None]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn releases
    from sklearn.cross_validation import train_test_split

In [None]:
# Load the SECOM features and labels; missing measurements are filled with 0.
X = pd.read_csv("./Data/CH13/secom.data", sep=" ", header=None)
y = pd.read_csv("./Data/CH13/secom_labels.data", sep=" ", header=None)
X.fillna(0, inplace=True)

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y[0].values, test_size=0.3
)

# Train an RBF-kernel SVC and score it on the held-out rows.
clf = svm.SVC(kernel="rbf", gamma=0.2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(
    "F1 Score: ", metrics.f1_score(y_test, y_pred),
    "ACC:", metrics.accuracy_score(y_test, y_pred),
)

## Other

### iris

In [None]:
from sklearn import datasets
from sklearn.decomposition import PCA
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [None]:
# 了解一下数据集
X.shape, y.shape

In [None]:
# Build the list of feature selections: every way of picking 3 of the
# 4 column indices, printed one selection per line.
from itertools import combinations
combins = list(combinations(range(4), 3))
for c in combins:
    print(list(c))

In [None]:
# Visualization: one Scatter3D series per iris class, using three of the
# four feature columns.
from pyecharts import Scatter3D, Grid
names = iris.target_names
# Change the index into `combins` to look at a different feature selection.
feas = combins[0]
target = y
scatter3D_ = Scatter3D("", width=300, height=300)
scatter3D_.use_theme("dark")
for idx, name in enumerate(names):
    # Rows belonging to class `idx`, restricted to the selected columns.
    data = X[target == idx][:, feas]
    scatter3D_.add(name, data)
scatter3D_

1. 可以尝试更改上面代码中 `combins` 的索引来选择不同的特征组合，旋转坐标轴观察：在不同组合构成的空间里，这些类别大体上是可以分开的
1. 思考两个问题：
    1. 是不是可以把数据的维度降低到3或者更少
    1. 有没有量化的指标可以代替我们可视化的这个过程。
