# Feature Extraction

任务：采用 80% 的样本作训练集，20% 的样本作测试集，报告降至不同维数时的分类性能。

In [None]:
import numpy as np

from Animator import MyAnimator

# Locations of the tab-separated Vehicle and ORL dataset files.
VEHICLE_DATASET_DIR = 'data/vehicle.txt'
ORL_DATASET_DIR = 'data/ORLData_25.txt'

# Vehicle dataset: parsed as integers; every column but the last is treated as
# a feature and the last column as the label.
VEHICLE_DATASET = np.loadtxt(VEHICLE_DATASET_DIR, delimiter='\t', dtype=int)
VEHICLE_DATASET_FEATURES = VEHICLE_DATASET[:, :-1]
VEHICLE_DATASET_LABEL = VEHICLE_DATASET[:, -1]
VEHICLE_DATASET_NDIM = VEHICLE_DATASET_FEATURES.shape[1]  # number of feature columns

# ORL dataset: same layout (feature columns followed by one label column).
ORL_DATASET = np.loadtxt(ORL_DATASET_DIR, delimiter='\t', dtype=int)
ORL_DATASET_FEATURES = ORL_DATASET[:, :-1]
ORL_DATASET_LABEL = ORL_DATASET[:, -1]
ORL_DATASET_NDIM = ORL_DATASET_FEATURES.shape[1]  # number of feature columns

## Q1: PCA+KNN

PCA+KNN：即首先用 PCA 进行降维，然后采用最近邻分类器（1 近邻分类器）作为分类器进行分类。

### Vehicle数据集

In [None]:
import pca

# PCA + KNN on the Vehicle dataset: fit the project's PCA once on the full
# feature matrix, then record the accuracy returned by pca.pca_knn_clf for each
# target dimensionality.
print("[VEHICLE_DATASET PCA+KNN]")
vehicle_pca = pca.PCA()  # initialise PCA
vehicle_pca.fit(VEHICLE_DATASET_FEATURES)

# Dimensionalities to evaluate: 2, 4, ... (VEHICLE_DATASET_NDIM itself is excluded).
vehicle_pca_dims = range(2, VEHICLE_DATASET_NDIM, 2)

vehicle_pca_knn_clf_acc = []  # one accuracy value per evaluated dimensionality
# Start the x-axis at the first evaluated dimensionality so that point is not
# cut off (assumes MyAnimator uses xlim as the x-axis limits -- TODO confirm).
animator = MyAnimator(xlabel='n_components', xlim=[vehicle_pca_dims.start, VEHICLE_DATASET_NDIM], ylim=[0.2, 1.0],
                      legend=['test accuracy'])  # live plot

for n_component in vehicle_pca_dims:
    acc = pca.pca_knn_clf(vehicle_pca, VEHICLE_DATASET_LABEL, n_component)  # PCA+KNN classifier
    print("Number of dimensions reduced to {:^3d}, classification accuracy: {:.4f}".format(n_component, acc))

    vehicle_pca_knn_clf_acc.append(acc)
    animator.add(n_component, acc)

# animator.savefig("fig/vehicle_pca_knn_clf.pdf")

### ORL数据集

In [None]:
# PCA + KNN on the ORL dataset: fit the project's PCA once on the full feature
# matrix, then record the accuracy returned by pca.pca_knn_clf for each target
# dimensionality. Relies on `pca` having been imported by an earlier cell.
print("[ORL_DATASET PCA+KNN]")
orl_pca = pca.PCA()
orl_pca.fit(ORL_DATASET_FEATURES)

# Dimensionalities to evaluate: 10, 15, ... (ORL_DATASET_NDIM itself is excluded).
orl_pca_dims = range(10, ORL_DATASET_NDIM, 5)

orl_pca_knn_clf_acc = []  # (n_component, accuracy) pairs
# The x-axis must span the ORL sweep, not the Vehicle feature count
# (assumes MyAnimator uses xlim as the x-axis limits -- TODO confirm).
animator = MyAnimator(xlabel='n_components', xlim=[orl_pca_dims.start, ORL_DATASET_NDIM], ylim=[0.2, 1.0],
                      legend=['test accuracy'])  # live plot

for n_component in orl_pca_dims:
    acc = pca.pca_knn_clf(orl_pca, ORL_DATASET_LABEL, n_component)  # PCA+KNN classifier
    print("Number of dimensions reduced to {:^3d}, classification accuracy: {:.4f}".format(n_component, acc))
    orl_pca_knn_clf_acc.append((n_component, acc))
    animator.add(n_component, acc)

# animator.savefig("fig/orl_pca_knn_clf.pdf")

## Q2: LDA+KNN

LDA+KNN：即首先用 LDA 进行降维，然后采用最近邻分类器（1 近邻分类器）作为分类器进行分类。

### Vehicle数据集

In [None]:
import lda

# LDA + KNN on the Vehicle dataset: fit the project's LDA once on all features
# and labels, then record the accuracy returned by lda.lda_knn_clf for each
# target dimensionality.
print("[VEHICLE_DATASET LDA+KNN]")
vehicle_lda = lda.LDA()
vehicle_lda.fit(VEHICLE_DATASET_FEATURES, VEHICLE_DATASET_LABEL)

# Dimensionalities to evaluate: 1, 2, 3.
vehicle_lda_dims = range(1, 4)

vehicle_lda_knn_clf_acc = []  # (n_component, accuracy) pairs
# Make the x-axis cover exactly the evaluated dimensionalities; a lower limit
# of 3 would leave the points at 1 and 2 outside the axis
# (assumes MyAnimator uses xlim as the x-axis limits -- TODO confirm).
animator = MyAnimator(xlabel='n_components', xlim=[vehicle_lda_dims.start, vehicle_lda_dims.stop - 1],
                      ylim=[0.2, 1.0], legend=['test accuracy'])  # live plot

for n_component in vehicle_lda_dims:
    acc = lda.lda_knn_clf(vehicle_lda, n_component)  # LDA+KNN classifier
    print("Number of dimensions reduced to {:^3d}, classification accuracy: {:.4f}".format(n_component, acc))

    vehicle_lda_knn_clf_acc.append((n_component, acc))
    animator.add(n_component, acc)


### ORL数据集

In [None]:
import lda

# LDA + KNN on the ORL dataset: fit the project's LDA once on the features
# (divided by 255) and labels, then record the accuracy returned by
# lda.lda_knn_clf for each target dimensionality.
print("[ORL_DATASET LDA+KNN]")
ORL_lda = lda.LDA()
# NOTE(review): the features are scaled by 1/255 here but not in the ORL PCA
# cell -- presumably pixel intensities in 0..255; confirm the scaling is intended.
ORL_lda.fit(ORL_DATASET_FEATURES/255, ORL_DATASET_LABEL)

# Dimensionalities to evaluate: 1 through 9.
ORL_lda_dims = range(1, 10, 1)

ORL_lda_knn_clf_acc = []  # (n_component, accuracy) pairs
# Make the x-axis cover the evaluated dimensionalities rather than the Vehicle
# feature count (assumes MyAnimator uses xlim as the x-axis limits -- TODO confirm).
animator = MyAnimator(xlabel='n_components', xlim=[ORL_lda_dims.start, ORL_lda_dims.stop - 1],
                      ylim=[0.2, 1.0], legend=['test accuracy'])  # live plot

for n_component in ORL_lda_dims:
    acc = lda.lda_knn_clf(ORL_lda, n_component)  # LDA+KNN classifier
    print("Number of dimensions reduced to {:^3d}, classification accuracy: {:.4f}".format(n_component, acc))

    ORL_lda_knn_clf_acc.append((n_component, acc))
    animator.add(n_component, acc)