# Load data

In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from google.colab import drive

# Mount Google Drive so the MNIST CSV export is reachable from Colab.
drive.mount('/content/drive')
data_path = '/content/drive/My Drive/MNIST_CSV/mnist_train.csv'

# The CSV is read without a header row; column 0 is used as the digit label
# and every remaining column as a feature.
df = pd.read_csv(data_path, header=None)

# Keep only the samples labelled 3 or 4 (binary classification task).
tr_data = df[df.iloc[:, 0].isin([3, 4])]
x_train = tr_data.iloc[:, 1:]

# PCA
# Fit a 2-component PCA on the training features and project them.
# random_state pins the result when scikit-learn selects a randomized SVD
# solver, so a fresh "Restart & Run All" reproduces the same projection.
pca = PCA(n_components=2, random_state=0)
pca_result = pca.fit_transform(x_train)
pca_df = pd.DataFrame(pca_result, columns=['PCA1', 'PCA2'])

# Rebuild the labels on a fresh 0..n-1 index so they line up row-by-row with
# pca_df (tr_data keeps the original, gappy index of the filtered rows).
y_train = pd.DataFrame(tr_data.iloc[:, 0].to_numpy(), columns=['y'])
pca_train = pd.concat([pca_df, y_train], axis=1)
pca_train

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,PCA1,PCA2,y
0,-547.480203,-437.024419,4
1,552.342651,751.796546,3
2,-860.211222,446.878279,4
3,1016.061778,576.680909,3
4,1333.012510,-857.883391,3
...,...,...,...
11968,118.084979,839.593033,3
11969,-688.864411,274.727637,4
11970,1012.511661,468.047360,3
11971,602.674341,634.429101,3


In [15]:
test_data_path = '/content/drive/My Drive/MNIST_CSV/mnist_test.csv'

# Read the test CSV with pandas (no header row; column 0 is used as the label).
df_test = pd.read_csv(test_data_path, header=None)

# Keep only the samples labelled 3 or 4.
test_data = df_test[df_test.iloc[:, 0].isin([3, 4])]
X_test, y_test = test_data.iloc[:, 1:], test_data.iloc[:, 0].to_numpy()

# Project the test features with the already-fitted PCA (transform only, no
# refit) and name the two components.
pca_test = pd.DataFrame(pca.transform(X_test), columns=['PCA1', 'PCA2'])
pca_test


Unnamed: 0,PCA1,PCA2
0,-789.275435,-437.871115
1,-400.728484,-61.621818
2,483.689506,-695.368557
3,-655.878808,-108.019560
4,-526.303771,-413.752747
...,...,...
1987,-673.876968,483.627080
1988,590.689416,125.255249
1989,-932.387650,408.652659
1990,1292.079845,-252.500957


In [None]:
# Preview the training feature columns (everything after the label column)
# instead of rendering the whole frame.
tr_data.iloc[:, 1:].head()

# ML (Maximum Likelihood classifier)

In [16]:
# Per-class sample statistics of the PCA features. The last column of
# pca_train is the label `y`, so it is dropped before computing them.
class_3 = pca_train[pca_train["y"] == 3].iloc[:, :-1]
class_4 = pca_train[pca_train["y"] == 4].iloc[:, :-1]

mean_3 = class_3.mean()
mean_4 = class_4.mean()
cov_3 = class_3.cov()
cov_4 = class_4.cov()

print(f"sample mean and covariance for the training set of the class 3 is:\n{mean_3},\n{cov_3}")
# Second report is for class 4 (the message previously said "class 3").
print(f"sample mean and covariance for the training set of the class 4 is:\n{mean_4},\n{cov_4}")

sample mean and covariance for the training set of the class 3 is:
PCA1    648.650089
PCA2     33.437263
dtype: float64,
               PCA1           PCA2
PCA1  137067.836007  -25289.046961
PCA2  -25289.046961  291668.777940
sample mean and covariance for the training set of the class 3 is:
PCA1   -680.738394
PCA2    -35.091383
dtype: float64,
              PCA1           PCA2
PCA1  78627.708393  -20117.773580
PCA2 -20117.773580  165898.591442


In [17]:
# Display the projected test features as a plain NumPy array.
pca_test.to_numpy()

array([[-789.27543487, -437.87111524],
       [-400.72848435,  -61.62181838],
       [ 483.68950613, -695.36855726],
       ...,
       [-932.38764954,  408.6526589 ],
       [1292.07984475, -252.50095698],
       [-637.72125634,  707.98246664]])

In [18]:
def get_accuracy(true_y, predicted_y):
    """Print and return the fraction of predictions that equal the true labels.

    Args:
        true_y: Sized sequence of ground-truth labels.
        predicted_y: Iterable of predicted labels, one per entry of ``true_y``.

    Returns:
        Number of matching positions divided by ``len(true_y)``.

    Raises:
        ValueError: If the two inputs do not contain the same number of labels.
        ZeroDivisionError: If both inputs are empty.
    """
    # Materialise the predictions so their length can be checked; a plain
    # zip() would silently drop the unmatched tail and report a wrong score.
    predictions = list(predicted_y)
    if len(predictions) != len(true_y):
        raise ValueError(
            f"true_y has {len(true_y)} labels but predicted_y has {len(predictions)}"
        )

    correct_predictions = sum(t == p for t, p in zip(true_y, predictions))
    accuracy = correct_predictions / len(true_y)

    print(f"Accuracy: {accuracy:.8f}")
    return accuracy


def gaussian(X, mean, cov):
    """Evaluate the multivariate normal density N(mean, cov) at a point X.

    Args:
        X: 1-D array-like sample of length n.
        mean: 1-D array-like mean vector of length n (ndarray or pandas Series).
        cov: (n, n) array-like covariance matrix (ndarray or pandas DataFrame).

    Returns:
        The density value at ``X`` as a scalar.
    """
    cov = np.asarray(cov, dtype=float)
    # Take the dimension from the covariance matrix instead of assuming 2,
    # so the normalising constant is right for any feature count.
    n = cov.shape[0]
    diff = (np.asarray(X, dtype=float) - np.asarray(mean, dtype=float)).T
    return (1. / (np.sqrt((2 * np.pi)**n * np.linalg.det(cov))) *
            np.exp(-0.5 * np.dot(np.dot(diff.T, np.linalg.inv(cov)), diff)))

def ml(X):
    """Label each row of X as digit 3 or 4 by maximum likelihood.

    X is a DataFrame; it is converted to a NumPy array here and every row is
    scored under the two class-conditional Gaussians built from the
    notebook-level ``mean_3``/``cov_3`` and ``mean_4``/``cov_4``. A row is
    labelled 3 only when its class-3 likelihood is strictly larger; ties and
    everything else are labelled 4.

    Returns:
        NumPy array with one predicted label per row of X.
    """
    def classify(sample):
        likelihood_3 = gaussian(sample, mean_3, cov_3)
        likelihood_4 = gaussian(sample, mean_4, cov_4)
        return 3 if likelihood_3 > likelihood_4 else 4

    return np.array([classify(sample) for sample in X.to_numpy()])

# Classify the PCA-projected test samples with the maximum-likelihood rule,
# then score the predictions against the true test labels.
res = ml(pca_test)
print(
    "Accuracy of ML is: ",
    get_accuracy(true_y=y_test, predicted_y=res),
)

Accuracy: 0.98443775
Accuracy of ML is:  0.9844377510040161


# MAP (Maximum A Posteriori classifier)


In [19]:
def max_a_post(X, pc_3=0.58, pc_4=0.42):
    """Label each row of X as digit 3 or 4 with the maximum-a-posteriori rule.

    X is a DataFrame; it is converted to a NumPy array here. Each row's
    class-conditional Gaussian likelihood (from the notebook-level
    ``mean_3``/``cov_3`` and ``mean_4``/``cov_4``) is weighted by the class
    prior, and the row is labelled 3 only when the weighted class-3 score is
    strictly larger; otherwise it is labelled 4.

    Args:
        X: DataFrame of samples, one per row.
        pc_3: Prior probability of class 3.
        pc_4: Prior probability of class 4.

    Returns:
        NumPy array with one predicted label per row of X.

    NOTE(review): the default priors 0.58 / 0.42 are not derived from anything
    visible in this notebook -- confirm where they come from.
    """
    pred = []
    for x in X.to_numpy():
        posterior_3 = gaussian(x, mean_3, cov_3) * pc_3
        posterior_4 = gaussian(x, mean_4, cov_4) * pc_4
        pred.append(3 if posterior_3 > posterior_4 else 4)
    return np.array(pred)

# Classify the PCA-projected test samples with the MAP rule and score them
# against the true test labels. The message names MAP (it previously said ML).
res_map = max_a_post(pca_test)
print("Accuracy of MAP is: ", get_accuracy(y_test, res_map))

Accuracy: 0.98192771
Accuracy of ML is:  0.9819277108433735
