In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the UCI Wine data set (the file has no header row) and attach
# human-readable column names; the first column is the class label.
wine_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine_column_names = [
    'Class Label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
df_wine = pd.read_csv(wine_data_url, header=None)
df_wine.columns = wine_column_names

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Column 0 holds the class label; the remaining columns are the features.
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
# Hold out 30% of the samples for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize the features. The scaler is fitted on the training split ONLY
# and the same mean/variance is then applied to the test split. Re-fitting on
# the test split (fit_transform) would scale train and test differently and
# leak test-set statistics into preprocessing.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [4]:
# Feature extraction
# PCA: finds patterns in the data based on the correlations between features.
# It looks for the directions of maximum variance in the high-dimensional data
# and projects the data onto a new subspace spanned by the K principal
# components with the largest variance.
# Step 1 of PCA: build the covariance matrix (np.cov expects variables in rows,
# hence the transpose of the samples-by-features matrix).
cov_mat = np.cov(X_train_std.transpose())
# Step 2: eigendecomposition -> eigenvalues and eigenvectors.
eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
print('\nEigenvalues \n', eigen_vals)
# Each eigenvalue of the covariance matrix measures how much variance
# (information) its component carries; sorting them in descending order and
# keeping the first K components keeps the highest explained variance.


Eigenvalues 
 [4.8923083  2.46635032 1.42809973 1.01233462 0.84906459 0.60181514
 0.52251546 0.08414846 0.33051429 0.29595018 0.16831254 0.21432212
 0.2399553 ]


In [5]:
# Feature transformation
# Pair every eigenvalue magnitude with its eigenvector (column i of eigen_vecs).
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in range(len(eigen_vals))]
# Order the pairs from largest to smallest eigenvalue. Sort on the eigenvalue
# only: without a key, a tie between two eigenvalues makes Python compare the
# ndarray elements of the tuples, which raises ValueError (ambiguous truth value).
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)

In [6]:
# Build the projection matrix W from the eigenvectors belonging to the two
# largest eigenvalues; each eigenvector becomes one column of W.
w = np.column_stack((eigen_pairs[0][1], eigen_pairs[1][1]))
w

array([[ 0.14669811,  0.50417079],
       [-0.24224554,  0.24216889],
       [-0.02993442,  0.28698484],
       [-0.25519002, -0.06468718],
       [ 0.12079772,  0.22995385],
       [ 0.38934455,  0.09363991],
       [ 0.42326486,  0.01088622],
       [-0.30634956,  0.01870216],
       [ 0.30572219,  0.03040352],
       [-0.09869191,  0.54527081],
       [ 0.30032535, -0.27924322],
       [ 0.36821154, -0.174365  ],
       [ 0.29259713,  0.36315461]])

In [7]:
# Project the standardized training samples onto the two selected
# principal axes: X' = X W
X_train_pca = X_train_std @ w
X_train_pca

array([[ 2.59891628,  0.00484089],
       [ 0.15819134, -2.26659577],
       [-2.6372337 ,  2.66488569],
       [-2.52848449,  0.51846618],
       [ 1.70922581, -0.91719459],
       [-2.83057003,  0.41936129],
       [-2.82251879,  1.99763147],
       [ 1.36618015,  0.04639099],
       [-2.46584868, -0.07932269],
       [-2.28554906, -0.40096658],
       [ 1.14246632, -2.39587633],
       [-2.28497881, -1.09274988],
       [-2.52924945,  0.6477328 ],
       [ 0.169245  , -1.1264982 ],
       [ 2.53088166,  1.05798498],
       [-0.71596964, -2.80365836],
       [ 2.46922033, -0.15871191],
       [-0.58044574, -0.69290749],
       [ 0.54583852,  0.41042188],
       [ 3.5604963 ,  1.42561284],
       [ 1.58679826, -1.51260121],
       [ 2.54872139, -0.05280515],
       [-3.59338727,  0.88321901],
       [-1.60406659,  2.40373662],
       [ 1.48668426, -1.40863724],
       [ 0.00830468, -2.04898307],
       [-0.15646658, -2.80278355],
       [-2.39863877,  2.47524175],
       [-3.13549157,

In [8]:
# PCA
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
# Reduce the standardized features to two principal components; the
# projection is fitted on the training split and re-applied to the test split.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Fit the same logistic-regression estimator first on the full standardized
# feature set, then on the 2-D PCA projection, reporting accuracy for each.
lr = LogisticRegression()
accuracy_runs = [
    ('Training accuracy: ', 'Test accuracy: ', X_train_std, X_test_std),
    ('PCA Training accuracy: ', 'PCA Test accuracy: ', X_train_pca, X_test_pca),
]
for train_label, test_label, train_features, test_features in accuracy_runs:
    lr.fit(train_features, y_train)
    print(train_label, lr.score(train_features, y_train))
    print(test_label, lr.score(test_features, y_test))

  return f(*args, **kwds)


Training accuracy:  0.9919354838709677
Test accuracy:  1.0
PCA Training accuracy:  0.967741935483871
PCA Test accuracy:  0.9814814814814815


In [13]:
# LDA (linear discriminant analysis): can reduce overfitting in
# non-regularized models.
# LDA looks for the feature subspace that best separates the classes. PCA and
# LDA are both linear transformations; PCA is unsupervised, LDA is supervised
# (it is fitted with the class labels below).
# Steps:
#   1. standardize the d-dimensional data;
#   2. compute the d-dimensional mean vector (centroid) of every class;
#   3. build the between-class scatter matrix Sb and the within-class scatter matrix Sw;
#   4. compute the eigenvectors and eigenvalues of Sw^-1 Sb (samples of one
#      class as close together as possible, different classes as far apart as possible);
#   5. pick the K largest eigenvalues and stack their eigenvectors into a d x K matrix W;
#   6. project the samples onto the new feature subspace using W.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train_std, y_train)
X_test_lda = lda.transform(X_test_std)

lr = LogisticRegression()

# Baseline: logistic regression on all standardized features
# (fit() returns the estimator itself, so the training score can be chained).
print('Training accuracy: ', lr.fit(X_train_std, y_train).score(X_train_std, y_train))
print('Test accuracy: ', lr.score(X_test_std, y_test))

# Same estimator re-fitted on the two LDA components.
print('LDA Training accuracy: ', lr.fit(X_train_lda, y_train).score(X_train_lda, y_train))
print('LDA Test accuracy: ', lr.score(X_test_lda, y_test))

Training accuracy:  0.9919354838709677
Test accuracy:  1.0
LDA Training accuracy:  0.9919354838709677
LDA Test accuracy:  1.0




In [20]:
# Kernel PCA: use the kernel trick to map the original data into a
# higher-dimensional space, run PCA there to reduce the dimensionality, and
# then a linear classifier is enough to solve the problem.
# With very few samples the result may be poor.
from sklearn.decomposition import KernelPCA
from sklearn.datasets import make_moons

# No standardization step is applied before the PCA here; the original notes
# state that make_moons output is already standardized.
# NOTE(review): confirm that claim - this code itself does not verify it.
X, y = make_moons(n_samples=1000, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# RBF-kernel PCA fitted on the training split, then applied to the test split.
scikit_kpca = KernelPCA(n_components=2, kernel='rbf', gamma=15)
X_train_skernpca = scikit_kpca.fit_transform(X_train)
X_test_skernpca = scikit_kpca.transform(X_test)

# Baseline: the existing logistic-regression estimator `lr` (created in an
# earlier cell) re-fitted on the raw two-moons coordinates.
lr.fit(X_train, y_train)
raw_train_accuracy = lr.score(X_train, y_train)
raw_test_accuracy = lr.score(X_test, y_test)
print('Training accuracy: ', raw_train_accuracy)
print('Test accuracy: ', raw_test_accuracy)

# Same estimator re-fitted on the kernel-PCA projection.
lr.fit(X_train_skernpca, y_train)
kpca_train_accuracy = lr.score(X_train_skernpca, y_train)
kpca_test_accuracy = lr.score(X_test_skernpca, y_test)
print('kernel PCA Training accuracy: ', kpca_train_accuracy)
print('kernel PCA Test accuracy: ', kpca_test_accuracy)

Training accuracy:  0.8971428571428571
Test accuracy:  0.8666666666666667
kernel PCA Training accuracy:  0.9985714285714286
kernel PCA Test accuracy:  1.0


