In [5]:
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Fetch MNIST from OpenML as plain arrays (as_frame=False) rather than a DataFrame.
mnist = fetch_openml('mnist_784', version=1, parser='auto', as_frame=False)
X = mnist['data']
# Cast the labels to 64-bit integers so they can be compared with integer digits.
y = mnist['target'].astype(np.int64)

# Report the shape of the feature matrix and of the label vector, one per line.
print(X.shape, y.shape, sep='\n')

# 挑选数据集

In [6]:
# Number of samples drawn per digit for each split.
N_TRAIN_PER_DIGIT = 1000
N_TEST_PER_DIGIT = 500

# Seeded generator so the same subset is selected on every run of the notebook.
rng = np.random.default_rng(42)

# Build a class-balanced train/test subset: for every digit, shuffle its sample
# indices, then take the first N_TRAIN_PER_DIGIT for training and the following
# N_TEST_PER_DIGIT for testing (the two slices never overlap).
train_idx, test_idx = [], []
for digit in range(10):
    # Indices of all samples labelled with the current digit, in random order.
    indices = rng.permutation(np.where(y == digit)[0])
    train_idx.append(indices[:N_TRAIN_PER_DIGIT])
    test_idx.append(indices[N_TRAIN_PER_DIGIT:N_TRAIN_PER_DIGIT + N_TEST_PER_DIGIT])

train_idx = np.concatenate(train_idx)
test_idx = np.concatenate(test_idx)

# Gather the rows in one indexing operation; rows stay grouped by digit.
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

In [7]:
# Wrap the label vectors in single-column frames so class frequencies can be counted.
y_train_df = pd.DataFrame({'class': y_train})
y_test_df = pd.DataFrame({'class': y_test})

# Bar chart of how many training samples each class has.
train_ax = y_train_df['class'].value_counts().plot.bar(colormap='Paired')
train_ax.set_xlabel('Class')
train_ax.set_ylabel('Number of samples for each category')
train_ax.set_title('Training set')
plt.savefig('knn_svm1.png')
plt.show()

In [8]:
# Bar chart of how many test samples each class has.
test_ax = y_test_df['class'].value_counts().plot.bar(colormap='Paired')
test_ax.set_xlabel('Class')
test_ax.set_ylabel('Number of samples for each category')
test_ax.set_title('Testing set')
plt.savefig('knn_svm2.png')
plt.show()

# PCA AND LDA

In [72]:
from sklearn.decomposition import PCA
# Target dimensionalities to evaluate.
dimensions = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Fit one PCA per target dimensionality and report the variance it retains.
# Every fitted model is stored so later steps can refer to a specific one
# explicitly instead of relying on whatever `pca` held when the loop ended.
pca_by_dim = {}
for k in dimensions:
    pca = PCA(n_components=k)
    pca.fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    pca_by_dim[k] = pca
    print(f"shape:{X_train_pca.shape} PCA with k={k}: Explained Variance Ratio: {np.sum(pca.explained_variance_ratio_)}")

# Show the first 10 principal components as 28x28 images.
pca_10 = PCA(n_components=10)
pca_10.fit(X_train)
X_train_pca_10 = pca_10.transform(X_train)
components_pca = pca_10.components_
print(X_train_pca_10.shape)

plt.figure(figsize=(15, 5))
for i in range(10):
    plt.subplot(2, 5, i + 1)
    plt.imshow(components_pca[i].reshape(28, 28), cmap='gray')
    plt.title(f'PCA Component {i+1}')
    plt.axis('off')
plt.savefig("pca.png")

# Per-component explained variance of the largest model fitted above.
explained_variance_ratio = pca_by_dim[max(dimensions)].explained_variance_ratio_

# Cumulative explained variance. The x values start at 1 so that each point
# lines up with the number of components it sums over (the first point is the
# variance explained by 1 component, not 0).
component_counts = np.arange(1, len(explained_variance_ratio) + 1)
plt.figure(figsize=(10, 6))
plt.plot(component_counts, np.cumsum(explained_variance_ratio), marker='o', linestyle='--', color='b')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('PCA Explained Variance Ratio')
plt.grid()
plt.savefig("explained_variance_ratio.png")
plt.show()

In [82]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# Fit LDA with 9 components (the maximum is number of classes - 1) and project both splits.
lda = LDA(n_components=9)
lda.fit(X_train, y_train)
X_train_lda = lda.transform(X_train)
X_test_lda = lda.transform(X_test)
print(f"LDA with k=9: Explained Variance Ratio: {lda.explained_variance_ratio_}")

# Transpose the scalings so that each row can be drawn as one 28x28 image.
components_lda = lda.scalings_.T

# Draw the 9 LDA components on a 2x5 grid (the last grid slot stays empty).
lda_fig = plt.figure(figsize=(15, 5))
for i in range(9):
    component_ax = lda_fig.add_subplot(2, 5, i + 1)
    component_image = components_lda[i].reshape(28, 28)
    component_ax.imshow(component_image, cmap='gray')
    component_ax.set_title(f'LDA Component {i+1}')
    component_ax.axis('off')
plt.savefig("lda.png")
plt.show()

In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier


# PCA dimensionalities to sweep: 10, 20, ..., 100.
pca_dimensions = list(range(10, 101, 10))
# LDA dimensionalities to sweep: 1..9 (the maximum is number of classes - 1).
lda_dimensions = list(range(1, 10))

# Result store with one initially empty dict per reduction method.
results = {method: {} for method in ('PCA', 'LDA')}

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Hyper-parameter grids explored by the cross-validated grid search.
knn_param_grid = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
svm_param_grid = {'C': [1, 2, 3, 4, 5, 6]}


def _evaluate_classifiers(X_tr, y_tr, X_te, y_te):
    """Tune KNN and SVM on one feature representation and score them on the test split.

    Each classifier is tuned with a 5-fold GridSearchCV over its parameter grid
    (`knn_param_grid` / `svm_param_grid`); the refitted best estimator then
    predicts the test features.

    Parameters:
        X_tr, y_tr: training features and labels used for the grid search.
        X_te, y_te: test features and labels used for the reported accuracy.

    Returns:
        dict with keys 'KNN' and 'SVM' (test accuracies) and 'KNN_params' and
        'SVM_params' (the best parameters found by each grid search).
    """
    knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=5)
    knn_grid.fit(X_tr, y_tr)
    acc_knn = accuracy_score(y_te, knn_grid.best_estimator_.predict(X_te))

    svm_grid = GridSearchCV(SVC(), svm_param_grid, cv=5)
    svm_grid.fit(X_tr, y_tr)
    acc_svm = accuracy_score(y_te, svm_grid.best_estimator_.predict(X_te))

    return {
        'KNN': acc_knn,
        'SVM': acc_svm,
        'KNN_params': knn_grid.best_params_,
        'SVM_params': svm_grid.best_params_,
    }


# PCA reduction followed by classification, one run per target dimensionality.
for k in pca_dimensions:
    pca = PCA(n_components=k)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    results['PCA'][k] = _evaluate_classifiers(X_train_pca, y_train, X_test_pca, y_test)

# LDA reduction followed by classification, one run per target dimensionality.
for k in lda_dimensions:
    lda = LDA(n_components=k)
    X_train_lda = lda.fit_transform(X_train, y_train)
    X_test_lda = lda.transform(X_test)
    results['LDA'][k] = _evaluate_classifiers(X_train_lda, y_train, X_test_lda, y_test)


In [12]:
import matplotlib.pyplot as plt

# Collect the test accuracies in the same order as the dimension lists.
pca_knn_accuracies = [results['PCA'][dim]['KNN'] for dim in pca_dimensions]
pca_svm_accuracies = [results['PCA'][dim]['SVM'] for dim in pca_dimensions]
lda_knn_accuracies = [results['LDA'][dim]['KNN'] for dim in lda_dimensions]
lda_svm_accuracies = [results['LDA'][dim]['SVM'] for dim in lda_dimensions]


def _plot_accuracy_panel(ax, dims, knn_acc, svm_acc, method):
    """Draw the KNN and SVM accuracy curves for one reduction method on `ax`."""
    ax.plot(dims, knn_acc, marker='o', linestyle='-', color='b', label=f'{method} + KNN')
    ax.plot(dims, svm_acc, marker='o', linestyle='--', color='r', label=f'{method} + SVM')
    ax.set_xlabel(f'Number of {method} Components')
    ax.set_ylabel('Accuracy')
    ax.set_title(f'{method}: KNN and SVM Classification Accuracy')
    ax.legend()
    ax.grid()


# Side-by-side comparison figure: PCA on the left, LDA on the right.
fig, (ax_pca, ax_lda) = plt.subplots(1, 2, figsize=(14, 7))
_plot_accuracy_panel(ax_pca, pca_dimensions, pca_knn_accuracies, pca_svm_accuracies, 'PCA')
_plot_accuracy_panel(ax_lda, lda_dimensions, lda_knn_accuracies, lda_svm_accuracies, 'LDA')
fig.tight_layout()
# As before, this file holds the complete two-panel figure.
fig.savefig('lda_accuracy.png')

# 'pca_accuracy.png' used to be written while the right-hand panel was still
# missing and before the layout was finalised, which left a half-empty image.
# Render the PCA curves on their own figure instead and close it afterwards so
# that only the comparison figure is displayed.
fig_pca, ax_pca_only = plt.subplots(figsize=(7, 7))
_plot_accuracy_panel(ax_pca_only, pca_dimensions, pca_knn_accuracies, pca_svm_accuracies, 'PCA')
fig_pca.tight_layout()
fig_pca.savefig('pca_accuracy.png')
plt.close(fig_pca)

plt.show()


In [13]:
# For each reduction method / classifier pair, pick the dimensionality whose
# recorded accuracy is highest (the first one wins on ties).
best_pca_dim_knn = max(results['PCA'].items(), key=lambda item: item[1]['KNN'])[0]
best_pca_dim_svm = max(results['PCA'].items(), key=lambda item: item[1]['SVM'])[0]
best_lda_dim_knn = max(results['LDA'].items(), key=lambda item: item[1]['KNN'])[0]
best_lda_dim_svm = max(results['LDA'].items(), key=lambda item: item[1]['SVM'])[0]

# Result entries of the four winning configurations.
best_pca_knn = results['PCA'][best_pca_dim_knn]
best_pca_svm = results['PCA'][best_pca_dim_svm]
best_lda_knn = results['LDA'][best_lda_dim_knn]
best_lda_svm = results['LDA'][best_lda_dim_svm]

# Print the chosen dimensions, then their accuracies, then the tuned parameters.
for caption, value in (
    ("\nBest PCA dimension for KNN:", best_pca_dim_knn),
    ("Best PCA dimension for SVM:", best_pca_dim_svm),
    ("Best LDA dimension for KNN:", best_lda_dim_knn),
    ("Best LDA dimension for SVM:", best_lda_dim_svm),
    ("\nBest KNN accuracy with PCA:", best_pca_knn['KNN']),
    ("Best SVM accuracy with PCA:", best_pca_svm['SVM']),
    ("Best KNN accuracy with LDA:", best_lda_knn['KNN']),
    ("Best SVM accuracy with LDA:", best_lda_svm['SVM']),
    ("\nBest KNN parameters with PCA:", best_pca_knn['KNN_params']),
    ("Best SVM parameters with PCA:", best_pca_svm['SVM_params']),
    ("Best KNN parameters with LDA:", best_lda_knn['KNN_params']),
    ("Best SVM parameters with LDA:", best_lda_svm['SVM_params']),
):
    print(caption, value)


In [92]:
from sklearn.cluster import KMeans
from scipy.stats import mode

def calculate_clustering_accuracy(y_true, y_pred):
    """Score a clustering by mapping each cluster to its majority true label.

    Every cluster id that occurs in `y_pred` is assigned the most frequent true
    label among its members (the smallest label wins on ties), and the result
    is the fraction of samples whose assigned label equals their true label.

    Cluster ids are taken from `y_pred` itself, so any number of clusters and
    any id values are supported, and ids that never occur are simply skipped.

    Parameters:
        y_true: 1-D array-like of ground-truth labels.
        y_pred: 1-D array-like of cluster assignments, same length as `y_true`.

    Returns:
        float: accuracy in [0, 1].

    Raises:
        ValueError: if the inputs differ in shape or are empty.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    if y_true.size == 0:
        raise ValueError("cannot compute clustering accuracy of empty input")

    # The buffer holds true labels, so it takes the dtype of y_true.
    labels = np.empty_like(y_true)
    for cluster in np.unique(y_pred):
        mask = (y_pred == cluster)
        # np.unique returns sorted values, so argmax picks the smallest label on ties.
        values, counts = np.unique(y_true[mask], return_counts=True)
        labels[mask] = values[np.argmax(counts)]
    return float(np.mean(labels == y_true))

# Clustering results, keyed by reduction method and then by dimensionality.
results_kmeans = {'PCA': {}, 'LDA': {}}

# PCA reduction followed by KMeans clustering of the training subset.
for k in pca_dimensions:
    pca = PCA(n_components=k)
    X_train_pca = pca.fit_transform(X_train)

    # Fixed seed so the clustering (and its accuracy) is repeatable across runs.
    kmeans = KMeans(n_clusters=10, random_state=42)
    y_pred = kmeans.fit_predict(X_train_pca)

    # Score the clustering against the true labels.
    accuracy = calculate_clustering_accuracy(y_train, y_pred)
    results_kmeans['PCA'][k] = {'accuracy': accuracy, 'config': kmeans.get_params()}

# LDA reduction followed by KMeans clustering. LDA is applied to a 600-component
# PCA projection; that projection does not depend on k, so it is fitted once
# here instead of once per loop iteration.
pca = PCA(n_components=600)
X_train_pca = pca.fit_transform(X_train)
for k in lda_dimensions:
    lda = LDA(n_components=k)
    X_train_lda = lda.fit_transform(X_train_pca, y_train)

    # Fixed seed so the clustering (and its accuracy) is repeatable across runs.
    kmeans = KMeans(n_clusters=10, random_state=42)
    y_pred = kmeans.fit_predict(X_train_lda)

    # Score the clustering against the true labels.
    accuracy = calculate_clustering_accuracy(y_train, y_pred)
    results_kmeans['LDA'][k] = {'accuracy': accuracy, 'config': kmeans.get_params()}
    

In [93]:
# Pull the clustering accuracies out in the same order as the dimension lists.
pca_kmeans_accuracies = [results_kmeans['PCA'][dim]['accuracy'] for dim in pca_dimensions]
lda_kmeans_accuracies = [results_kmeans['LDA'][dim]['accuracy'] for dim in lda_dimensions]

# One figure with two panels: PCA + KMeans on the left, LDA + KMeans on the right.
plt.figure(figsize=(14, 7))

pca_panel = plt.subplot(1, 2, 1)
pca_panel.plot(pca_dimensions, pca_kmeans_accuracies, marker='o', linestyle='-', color='b', label='PCA + KMeans')
pca_panel.set_xlabel('Number of PCA Components')
pca_panel.set_ylabel('Clustering Accuracy')
pca_panel.set_title('PCA: KMeans Clustering Accuracy')
pca_panel.legend()
pca_panel.grid()

lda_panel = plt.subplot(1, 2, 2)
lda_panel.plot(lda_dimensions, lda_kmeans_accuracies, marker='o', linestyle='-', color='r', label='LDA + KMeans')
lda_panel.set_xlabel('Number of LDA Components')
lda_panel.set_ylabel('Clustering Accuracy')
lda_panel.set_title('LDA: KMeans Clustering Accuracy')
lda_panel.legend()
lda_panel.grid()

plt.tight_layout()
plt.savefig('kmeans.png')
plt.show()


In [94]:
# For each reduction method, pick the dimensionality with the highest
# clustering accuracy (the first one wins on ties).
best_pca_dim_kmeans = max(results_kmeans['PCA'].items(), key=lambda item: item[1]['accuracy'])[0]
best_lda_dim_kmeans = max(results_kmeans['LDA'].items(), key=lambda item: item[1]['accuracy'])[0]

# Result entries of the two winning configurations.
best_pca_kmeans = results_kmeans['PCA'][best_pca_dim_kmeans]
best_lda_kmeans = results_kmeans['LDA'][best_lda_dim_kmeans]

# Print the chosen dimensions, then their accuracies, then the KMeans settings.
for caption, value in (
    ("\nBest PCA dimension for KMeans:", best_pca_dim_kmeans),
    ("Best LDA dimension for KMeans:", best_lda_dim_kmeans),
    ("\nBest KMeans accuracy with PCA:", best_pca_kmeans['accuracy']),
    ("Best KMeans accuracy with LDA:", best_lda_kmeans['accuracy']),
    ("\nBest KMeans configuration with PCA:", best_pca_kmeans['config']),
    ("Best KMeans configuration with LDA:", best_lda_kmeans['config']),
):
    print(caption, value)


In [95]:
import numpy as np
import plotly.graph_objs as go
from sklearn.datasets import fetch_openml
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Work directly on the balanced training subset. The notebook-level X / y (the
# full dataset) are deliberately NOT rebound here: overwriting them with the
# subset would make a later re-run of the subset-selection cell sample from the
# subset itself and leave the test split empty.

# Project the data onto 3 discriminant axes with LDA.
lda = LDA(n_components=3)
X_lda = lda.fit_transform(X_train, y_train)

# Cluster the projected points with KMeans (seeded for repeatability).
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(X_lda)

# Cluster id assigned to every sample.
cluster_labels = kmeans.labels_

# One 3-D scatter trace per cluster.
data = []
for i in range(10):
    # Compute the membership mask once and reuse it for all three coordinates.
    in_cluster = cluster_labels == i
    trace = go.Scatter3d(
        x=X_lda[in_cluster, 0],
        y=X_lda[in_cluster, 1],
        z=X_lda[in_cluster, 2],
        mode='markers',
        marker=dict(
            size=5,
            line=dict(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5
            ),
            opacity=0.8
        ),
        name=f'Cluster {i}'
    )
    data.append(trace)

layout = go.Layout(
    title='KMeans Clustering of MNIST Data (LDA Projection)',
    scene=dict(
        xaxis=dict(title='LDA Component 1'),
        yaxis=dict(title='LDA Component 2'),
        zaxis=dict(title='LDA Component 3')
    )
)

fig = go.Figure(data=data, layout=layout)
fig.show()


In [96]:
# Work directly on the balanced training subset. The notebook-level X / y (the
# full dataset) are deliberately NOT rebound here: overwriting them with the
# subset would make a later re-run of the subset-selection cell sample from the
# subset itself and leave the test split empty.

# Project the data onto 2 discriminant axes with LDA.
lda = LDA(n_components=2)
X_lda = lda.fit_transform(X_train, y_train)

# Cluster the projected points with KMeans. The seed makes the cluster
# assignment (and therefore the saved figure) repeatable, matching the other
# seeded KMeans runs in this notebook.
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(X_lda)

# Cluster id assigned to every sample.
cluster_labels = kmeans.labels_

# Scatter plot with one series per cluster.
plt.figure(figsize=(10, 8))
for i in range(10):
    in_cluster = cluster_labels == i
    plt.scatter(X_lda[in_cluster, 0], X_lda[in_cluster, 1], label=f'Cluster {i}', s=10)
plt.title('KMeans Clustering of MNIST Data (LDA Projection)')
plt.xlabel('LDA Component 1')
plt.ylabel('LDA Component 2')
plt.legend()
plt.grid(True)
plt.savefig('k9.png')
plt.show()


In [65]:

# Reload the full MNIST dataset with the same options as the first cell, so X
# and y are plain NumPy arrays again.
mnist = fetch_openml('mnist_784', version=1, parser='auto', as_frame=False)
# np.int was removed in NumPy 1.24; use the explicit 64-bit integer type, which
# also matches the dtype used when the labels were first loaded.
X, y = mnist['data'], mnist['target'].astype(np.int64)

# Reduce the data to 80 principal components. KMeans runs in that 80-dimensional
# space; only the first two components are drawn in the scatter plot below.
pca = PCA(n_components=80)
X_pca = pca.fit_transform(X)

# Cluster the reduced data with KMeans (seeded for repeatability).
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(X_pca)

# Cluster id assigned to every sample.
cluster_labels = kmeans.labels_

# Scatter plot of the first two principal components, one series per cluster.
plt.figure(figsize=(10, 8))
for i in range(10):
    in_cluster = cluster_labels == i
    plt.scatter(X_pca[in_cluster, 0], X_pca[in_cluster, 1], label=f'Cluster {i}', s=10)
plt.title('KMeans Clustering of MNIST Data (PCA Projection)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.grid(True)
plt.show()
