# Maximum Likelihood vs. Kernel Density Estimation (KDE)

# Load data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from google.colab import drive

# Mount Google Drive so the MNIST CSV stored there can be read from Colab.
drive.mount('/content/drive')
data_path = '/content/drive/My Drive/MNIST_CSV/mnist_train.csv'

# The CSV is read without a header row; column 0 is used as the digit label.
# Keep only the rows labelled 3 or 4 (two-class problem).
df = pd.read_csv(data_path, header=None)
tr_data = df[df.iloc[:, 0].isin([3, 4])]

# Split into labels (first column) and features (all remaining columns).
y_train, x_train = tr_data.iloc[:, 0], tr_data.iloc[:, 1:]

# PCA: project the features onto the first principal component.
# random_state is fixed so the projection is reproducible across runs even
# when scikit-learn selects a randomized SVD solver for data of this size.
pca = PCA(n_components=1, random_state=0)
pca_result = pca.fit_transform(x_train)

pca_df = pd.DataFrame(pca_result, columns=['PCA1'])
# Rebuild the labels on a fresh default index so that they line up row by row
# with pca_df (tr_data still carries the row numbers of the unfiltered CSV).
y_train = pd.DataFrame(np.array(y_train), columns=['y'])
pca_train = pd.concat([pca_df, y_train], axis=1)
pca_train

Mounted at /content/drive


Unnamed: 0,PCA1,y
0,-547.480274,4
1,552.342638,3
2,-860.211215,4
3,1016.061760,3
4,1333.012510,3
...,...,...
11968,118.084965,3
11969,-688.864378,4
11970,1012.511647,3
11971,602.674285,3


In [None]:
# Display the one-column PCA projection ('PCA1') of the training rows.
pca_df

Unnamed: 0,PCA1
0,-547.480274
1,552.342638
2,-860.211215
3,1016.061760
4,1333.012510
...,...
11968,118.084965
11969,-688.864378
11970,1012.511647
11971,602.674285


In [None]:
test_data_path = '/content/drive/My Drive/MNIST_CSV/mnist_test.csv'

# Read the header-less test CSV with pandas and keep only the rows we need:
# those whose label (column 0) is 3 or 4.
df_test = pd.read_csv(test_data_path, header=None)
test_data = df_test[df_test.iloc[:, 0].isin([3, 4])]

# Labels as a plain NumPy array; features are every column after the first.
y_test = test_data.iloc[:, 0].to_numpy()
X_test = test_data.iloc[:, 1:]

# Project the test features with the PCA that was fitted on the training set
# (transform only, no refit) and wrap the result in a one-column frame.
pca_test = pd.DataFrame(pca.transform(X_test), columns=['PCA1'])
pca_test


Unnamed: 0,PCA1
0,-789.275434
1,-400.728488
2,483.689494
3,-655.878815
4,-526.303768
...,...
1987,-673.876963
1988,590.689423
1989,-932.387658
1990,1292.079846


In [None]:
# Display the raw training features (every column except the label in column 0).
# The row index still shows positions from the unfiltered CSV.
tr_data.iloc[:,1:]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,775,776,777,778,779,780,781,782,783,784
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59964,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59975,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59978,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59980,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# ML

In [None]:
# Per-class sample mean and covariance of the PCA feature.
# "y" is the last column of pca_train, so iloc[:, :-1] keeps only the feature
# column(s) and leaves the label out of the statistics.
mean_3 = pca_train[pca_train["y"] == 3].iloc[:, :-1].mean()
mean_4 = pca_train[pca_train["y"] == 4].iloc[:, :-1].mean()
cov_3 = pca_train[pca_train["y"] == 3].iloc[:, :-1].cov()
cov_4 = pca_train[pca_train["y"] == 4].iloc[:, :-1].cov()
print(f"sample mean and covariance for the training set of the class 3 is:\n{mean_3},\n{cov_3}")
# The second report describes class 4 (it was previously labelled "class 3").
print(f"sample mean and covariance for the training set of the class 4 is:\n{mean_4},\n{cov_4}")

sample mean and covariance for the training set of the class 3 is:
PCA1    648.650088
dtype: float64,
               PCA1
PCA1  137067.850083
sample mean and covariance for the training set of the class 3 is:
PCA1   -680.738393
dtype: float64,
              PCA1
PCA1  78627.696654


In [None]:
# Display the projected test set as a NumPy array (one row per sample, one column).
pca_test.to_numpy()

array([[-789.27543366],
       [-400.72848773],
       [ 483.68949366],
       ...,
       [-932.38765765],
       [1292.07984642],
       [-637.72125211]])

In [None]:
def get_accuracy(true_y, predicted_y):
    """Print and return the fraction of predictions that match the true labels.

    Args:
        true_y: Sequence of ground-truth labels.
        predicted_y: Sequence of predicted labels, compared pairwise with
            ``true_y`` (pairing stops at the shorter of the two sequences).

    Returns:
        Number of matching pairs divided by ``len(true_y)``.
    """
    hits = 0
    for actual, guess in zip(true_y, predicted_y):
        # A boolean adds as 0 or 1, so this counts the matches.
        hits = hits + (actual == guess)
    accuracy = hits / len(true_y)

    print(f"Accuracy: {accuracy:.8f}")
    return accuracy


def gaussian(X, mean, cov):
    """Evaluate a multivariate normal density at a single point.

    Args:
        X: One sample, array-like of shape (n,).
        mean: Distribution mean, array-like of shape (n,) (a pandas Series works).
        cov: Covariance matrix, array-like of shape (n, n) (a pandas DataFrame
            works).

    Returns:
        The density N(X; mean, cov) as a Python float.
    """
    x = np.atleast_1d(np.asarray(X, dtype=float))
    mu = np.atleast_1d(np.asarray(mean, dtype=float))
    sigma = np.atleast_2d(np.asarray(cov, dtype=float))

    # The dimension comes from the covariance matrix rather than a fixed
    # constant, so the (2*pi)^(n/2) normalisation is correct for the 1-D PCA
    # feature used in this notebook as well as for higher dimensions.
    n = sigma.shape[0]
    diff = x - mu

    norm_const = 1.0 / np.sqrt((2 * np.pi) ** n * np.linalg.det(sigma))
    # Solve sigma @ z = diff instead of forming the explicit inverse.
    mahalanobis_sq = diff @ np.linalg.solve(sigma, diff)
    return float(norm_const * np.exp(-0.5 * mahalanobis_sq))

# X is a DataFrame; it is converted to a NumPy array inside this function.
def ml(X):
    """Label each row of ``X`` as 3 or 4 by comparing class-conditional Gaussians.

    Each row is scored with ``gaussian`` under the class-3 parameters
    (``mean_3``, ``cov_3``) and the class-4 parameters (``mean_4``, ``cov_4``),
    which are read from the notebook's global scope. The row is labelled 3
    only when the class-3 density is strictly larger; ties go to 4.

    Returns:
        NumPy array of predicted labels, one per row of ``X``.
    """
    labels = [
        3 if gaussian(row, mean_3, cov_3) > gaussian(row, mean_4, cov_4) else 4
        for row in X.to_numpy()
    ]
    return np.array(labels)

# Classify the projected test set with the Gaussian maximum-likelihood rule.
res = ml(pca_test)
# get_accuracy prints the accuracy itself and also returns it, so the value
# appears twice in the cell output.
print("Accuracy of ML is: ", get_accuracy(y_test, res))

Accuracy: 0.98092369
Accuracy of ML is:  0.9809236947791165


# ML with an Exponential Factor


In [None]:
def exp_func(X):
    """Evaluate ``rate * exp(-rate * X)`` with ``rate = len(X) / sum(X)``.

    NOTE(review): the rate is estimated from the same values the density is
    then evaluated on. ``ml_exp`` calls this with a single one-element row, so
    the rate reduces to ``1 / x`` there and is negative for negative inputs.
    Presumably the rate should be fitted on training data instead; confirm
    the intended model.
    """
    rate = len(X) / np.sum(X)
    density = rate * np.exp(-rate*X)
    return density

# X is a DataFrame; it is converted to a NumPy array inside this function.
def ml_exp(X):
    """Label each row as 3 or 4 using Gaussian scores scaled by ``exp_func``.

    For every row, the class-3 and class-4 Gaussian densities (global
    parameters ``mean_3``/``cov_3`` and ``mean_4``/``cov_4``) are each
    multiplied by ``exp_func(row)``. The row is labelled 3 only when the
    scaled class-3 score is strictly larger.

    NOTE(review): both scores are multiplied by the same factor, and that
    factor is negative whenever the row value is negative, which flips the
    comparison. This looks like the reason the reported accuracy is about
    0.50; confirm the intended model.

    Returns:
        NumPy array of predicted labels, one per row of ``X``.
    """
    labels = []
    for row in X.to_numpy():
        score_3 = gaussian(row, mean_3, cov_3) * exp_func(row)
        score_4 = gaussian(row, mean_4, cov_4) * exp_func(row)
        labels.append(3 if score_3 > score_4 else 4)
    return np.array(labels)

# Classify the projected test set with the exponential-scaled rule.
res_new = ml_exp(pca_test)
# get_accuracy prints the accuracy itself and also returns it, so the value
# appears twice in the cell output.
print("Accuracy of ML is: ", get_accuracy(y_test, res_new))

Accuracy: 0.50351406
Accuracy of ML is:  0.5035140562248996


In [None]:
# Number of training samples labelled 3.
len(pca_train[pca_train["y"] == 3]["PCA1"].to_numpy())

6131

In [None]:
# Print the PCA1 value of the first ten test samples, one per line.
for i in pca_test.head(10).to_numpy():
    print(i[0])

-789.27543366468
-400.72848772669136
483.68949366464307
-655.8788153126263
-526.3037676726867
-906.8479025365349
774.0350826536187
706.7224889835692
-735.9752172956594
-685.8784392562225


# KDE (Kernel Density Estimation)

In [None]:
def gaussian_kernel(x, xi, bandwidth=20):
    """Gaussian kernel centred at ``xi`` with standard deviation ``bandwidth``.

    Works element-wise, so ``xi`` may be an array of training samples.
    """
    return np.exp(-0.5 * ((x - xi) ** 2) / (bandwidth ** 2)) / (bandwidth * np.sqrt(2 * np.pi))

def kde(test, train, bandwidth=20):
    """Classify 1-D points as 3 or 4 by comparing per-class kernel density estimates.

    Args:
        test: Points to classify, array-like of shape (m,) or (m, 1).
        train: DataFrame with a feature column ``"PCA1"`` and a label column
            ``"y"`` containing the values 3 and 4.
        bandwidth: Standard deviation of the Gaussian kernel.

    Returns:
        NumPy array of m predicted labels. A point is labelled 3 only when the
        class-3 density estimate is strictly larger; ties go to 4.

    Raises:
        ValueError: If ``test`` is not 1-D / single-column, or if ``train``
            has no samples for one of the two classes.
    """
    # Use the frame passed in as `train` instead of a notebook-level global,
    # so the function classifies against whatever data the caller supplies.
    samples_3 = train.loc[train["y"] == 3, "PCA1"].to_numpy()
    samples_4 = train.loc[train["y"] == 4, "PCA1"].to_numpy()
    if len(samples_3) == 0 or len(samples_4) == 0:
        raise ValueError("train must contain samples of both class 3 and class 4")

    points = np.atleast_1d(np.asarray(test, dtype=float))
    if points.ndim > 1:
        if points.ndim != 2 or points.shape[1] != 1:
            raise ValueError("test must be 1-D or a single-column 2-D array")
        points = points[:, 0]

    def get_prob(samples, x):
        # Density estimate at x: mean of the kernels centred on every sample,
        # evaluated in one vectorised call rather than a Python loop.
        return gaussian_kernel(x, samples, bandwidth).mean()

    pred = [
        3 if get_prob(samples_3, x) > get_prob(samples_4, x) else 4
        for x in points
    ]
    return np.array(pred)

# Kernel standard deviation used for the density estimates.
bandwidth = 20
# Classify the projected test set by comparing per-class kernel density estimates.
kde_estimates = kde(pca_test.to_numpy(), pca_train, bandwidth)

# Label the result as KDE (the message previously said "ML").
print("Accuracy of KDE is: ", get_accuracy(y_test, kde_estimates))


Accuracy: 0.98092369
Accuracy of ML is:  0.9809236947791165


In [None]:
# Report the KDE accuracy again; the label now names KDE rather than ML.
print("Accuracy of KDE is: ", get_accuracy(y_test, kde_estimates))

Accuracy: 0.98092369
Accuracy of ML is:  0.9809236947791165
