## Initial Process

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


## Import CSV Data

In [None]:
# Load the training CSV into a DataFrame; the path is relative to the
# current working directory.
amnist_data = pd.read_csv("inputs/MNIST_train.csv")

In [None]:
# Show the DataFrame as the cell output.
amnist_data

In [None]:
# Display basic information about the dataset
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
amnist_data.info()
# This provides an overview of the columns, their data types, and non-null counts

# Display summary statistics of the numerical columns
print("\nSummary Statistics:")
print(amnist_data.describe())
# This shows count, mean, std, min, 25%, 50%, 75%, and max for numerical columns

# Check for missing values in each column
print("\nMissing Values:")
print(amnist_data.isnull().sum())
# This helps identify columns with missing data that may need imputation

# Display the first few rows of the dataset
print("\nFirst few rows:")
print(amnist_data.head())
# This gives a quick look at the structure and content of the data

In [None]:
# (rows, columns) of the loaded DataFrame.
amnist_data.shape

In [None]:
# Convert the whole DataFrame (every column) to a NumPy array.
X = amnist_data.to_numpy()

In [None]:
# Take the column at index 2 as the label vector.
# NOTE(review): assumes the CSV's third column holds the class labels —
# confirm against the file's header.
y = X[:,2]
y

In [None]:
# Distinct label values present in y.
set(y)

In [None]:
# Keep only the columns from index 3 onward as the feature matrix; this drops
# the first three columns, including the one read into y.
# NOTE(review): presumably columns 0-1 are index/id columns — confirm.
X = X[:, 3:]
X.shape

In [None]:
# Row at index 1, i.e. the second observation in X (indexing is zero-based).
obs_1 = X[1, :]
obs_1

In [None]:
# Largest feature value in the row at index 1.
X[1].max()

In [None]:
# Smallest feature value in the row at index 1.
X[1].min()

In [None]:
# Turn the flat feature row into a 28x28 grid; reshape only succeeds when the
# row holds exactly 28 * 28 = 784 values.
obs_1 = obs_1.reshape(28, 28)
obs_1

### plt.imshow Function Overview

The `plt.imshow()` function in Matplotlib displays array data as an image. It is especially useful for visualizing matrices, grayscale images, and color images.

#### Parameters:
- `X` (array-like; here we pass `obs_1`): The data or image you want to display. This can be a 2D array (for grayscale or scalar data) or a 3D array (for color images, where the third dimension holds the color channels, e.g. RGB or RGBA).

#### Functionality:
- `plt.imshow(obs_1)` will display the `obs_1` array as an image.
- If `obs_1` is a 2D array, `plt.imshow()` treats the values as scalar data and maps them to colors through a colormap (the default colormap is used unless `cmap` is given).
- If `obs_1` is a 3D array, it will treat the last dimension as color channels (e.g., RGB).

#### Example: <br>Since `obs_1` is a matrix representing an image, `plt.imshow(obs_1)` renders this matrix as an image within a plot.

In [None]:
# Render the reshaped 28x28 observation as an image.
plt.imshow(obs_1)

## Applying Naive Bayes 

In [None]:
from scipy.stats import multivariate_normal as mvn

class GaussNB():
    """Gaussian naive Bayes classifier.

    Every class is modelled by an axis-aligned Gaussian: one mean and one
    variance per feature, with no covariance between features.
    """

    def fit(self, X, y, epsilon=1e-3):
        """Estimate per-class feature means, variances and class priors.

        Args:
            X: Array of shape (N, D), one observation per row.
            y: Array of N class labels; labels are cast to ``int``.
            epsilon: Constant added to every variance so that a feature that
                is constant within a class still yields a usable Gaussian.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.likelihoods = dict()
        self.priors = dict()
        self.K = set(y.astype(int))

        for k in self.K:
            X_k = X[y == k]
            # Naive assumption: features are conditionally independent given
            # the class, so only per-feature variances are stored.
            self.likelihoods[k] = {
                "mean": X_k.mean(axis=0),
                "cov": X_k.var(axis=0) + epsilon,
            }
            self.priors[k] = len(X_k) / len(X)

    def predict(self, X):
        """Return the most probable class label for each row of X.

        Args:
            X: Array of shape (N, D) with the same D used in ``fit``.

        Returns:
            Integer array of length N holding the predicted labels.
        """
        X = np.asarray(X)
        # Columns are assigned by position in the sorted label list rather
        # than by the label value itself, so labels need not be 0..K-1.
        classes = sorted(self.likelihoods)
        log_post = np.zeros((X.shape[0], len(classes)))

        for col, k in enumerate(classes):
            params = self.likelihoods[k]
            # A 1-D "cov" is interpreted by SciPy as the diagonal of the
            # covariance matrix.
            log_post[:, col] = (
                mvn.logpdf(X, params["mean"], params["cov"])
                + np.log(self.priors[k])
            )

        return np.asarray(classes)[log_post.argmax(axis=1)]

In [None]:
# Fit the naive Bayes model on the full feature matrix and label vector.
gnb = GaussNB()
gnb.fit(X, y)

In [None]:
# Predict on the same X the model was fitted on (in-sample predictions).
y_hat = gnb.predict(X)
y_hat

In [None]:
def accuracy(y, y_hat):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y: True labels (array-like).
        y_hat: Predicted labels (array-like) with the same shape as ``y``.

    Returns:
        The mean of the element-wise equality between ``y`` and ``y_hat``.

    Raises:
        ValueError: If ``y`` and ``y_hat`` have different shapes.
    """
    # Convert first: comparing two plain lists with == yields a single bool
    # instead of an element-wise result.
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    if y.shape != y_hat.shape:
        # Without this check, shapes such as (N,) and (N, 1) would broadcast
        # to an N x N comparison and give a meaningless score.
        raise ValueError(
            f"shape mismatch: y has shape {y.shape}, y_hat has shape {y_hat.shape}"
        )
    return np.mean(y == y_hat)

In [None]:
# Training-set accuracy: y_hat was predicted from the same X used for fitting.
accuracy(y, y_hat)

In [None]:
# Scatter the first two feature columns, coloured by the predicted label.
# NOTE(review): only columns 0 and 1 of X are plotted — confirm these two
# features are informative enough for this plot to be meaningful.
plt.figure()
plt.scatter(X[:,0], X[:, 1], c=y_hat, alpha=0.4, s=10)

In [None]:
# Training-set accuracy of the naive Bayes predictions.
accuracy(y, y_hat)

## Applying Non-Naive Gauss-Bayes Classifier

In [None]:
class GaussBayes():
    """Gaussian Bayes classifier with a full covariance matrix per class."""

    def fit(self, X, y, epsilon=1e-3):
        """Estimate per-class means, covariance matrices and class priors.

        Args:
            X: Array of shape (N, D), one observation per row.
            y: Array of N class labels; labels are cast to ``int``.
            epsilon: Value added to the diagonal of every covariance matrix
                so that it stays invertible.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.likelihoods = dict()
        self.priors = dict()
        self.K = set(y.astype(int))

        for k in self.K:
            X_k = X[y == k, :]
            N_k, D = X_k.shape
            mu_k = X_k.mean(axis=0)
            centered = X_k - mu_k

            # Unbiased sample covariance. A class with a single observation
            # has no spread, so the denominator is clamped to 1 to avoid a
            # division by zero; the result is then just epsilon * I.
            scale = 1 / max(N_k - 1, 1)
            cov_k = scale * np.matmul(centered.T, centered) + epsilon * np.identity(D)

            self.likelihoods[k] = {"mean": mu_k, "cov": cov_k}
            self.priors[k] = len(X_k) / len(X)

    def predict(self, X):
        """Return the most probable class label for each row of X.

        Args:
            X: Array of shape (N, D) with the same D used in ``fit``.

        Returns:
            Integer array of length N holding the predicted labels.
        """
        X = np.asarray(X)
        # Columns are assigned by position in the sorted label list rather
        # than by the label value itself, so labels need not be 0..K-1.
        classes = sorted(self.likelihoods)
        log_post = np.zeros((X.shape[0], len(classes)))

        for col, k in enumerate(classes):
            params = self.likelihoods[k]
            log_post[:, col] = (
                mvn.logpdf(X, params["mean"], params["cov"])
                + np.log(self.priors[k])
            )

        return np.asarray(classes)[log_post.argmax(axis=1)]

In [None]:
# Create the full-covariance Gaussian Bayes classifier.
gaussBayes = GaussBayes()

In [None]:
# Fit on the full data set; epsilon is passed explicitly but equals the
# method's default value.
gaussBayes.fit(X, y, epsilon=1e-3)

In [None]:
# Predict on the same X the model was fitted on (in-sample predictions).
y_hat_bayes = gaussBayes.predict(X)

In [None]:
# Scatter the first two feature columns, coloured by the predicted label.
# NOTE(review): only columns 0 and 1 of X are plotted — confirm these two
# features are informative enough for this plot to be meaningful.
plt.figure(figsize=(10,7))
plt.scatter(X[:,0], X[:,1], c=y_hat_bayes, alpha=0.4, s=8)

In [None]:
# Training-set accuracy of the full-covariance model's predictions.
accuracy(y, y_hat_bayes)

## Applying K-Nearest Neighbours Classifier

In [None]:
class KNNClassifier():
    """Distance-weighted k-nearest-neighbours classifier."""

    def fit(self, X, y):
        """Store the training data; no model is built at fit time.

        Args:
            X: Training observations, one per row.
            y: Non-negative class labels, one per training row.
        """
        self.X = X
        self.y = y

    def predict(self, X, K, epsilon=1e-3):
        """Predict a label for each row of X by weighted vote of K neighbours.

        Each of the K nearest training rows votes for its label with weight
        ``1 / sqrt(squared_distance + epsilon)``.

        Args:
            X: Query observations, one per row.
            K: Number of nearest neighbours that vote; must be at least 1.
            epsilon: Added under the square root so that a query identical to
                a training row does not cause a division by zero.

        Returns:
            Float array with one predicted label per query row.

        Raises:
            ValueError: If ``K`` is smaller than 1.
        """
        if K < 1:
            raise ValueError("K must be a positive integer")

        train = np.asarray(self.X)
        # bincount needs integer labels; cast once instead of per query.
        labels = np.asarray(self.y).astype(int)

        y_hat = np.zeros(len(X))

        for i, row in enumerate(X):
            # Subtract in floating point: with unsigned integer data (e.g.
            # uint8 pixels) the difference would wrap around and corrupt the
            # distances.
            query = np.asarray(row, dtype=float)
            dist2 = np.sum((train - query) ** 2, axis=1)
            # Indices of the K nearest rows; a stable sort makes ties resolve
            # deterministically in favour of the earlier training row.
            idxt = np.argsort(dist2, kind="stable")[:K]
            gamma_k = 1 / np.sqrt(dist2[idxt] + epsilon)

            y_hat[i] = np.bincount(labels[idxt], weights=gamma_k).argmax()

        return y_hat

In [None]:
# "Fitting" only stores X and y on the classifier for use at predict time.
knn = KNNClassifier()
knn.fit(X, y)

In [None]:
# WARNING: very slow. predict() loops over every row of X and, for each one,
# computes the squared distance to every stored training row and sorts all of
# those distances. Because the query set here is the full training set, the
# work grows with the square of the number of rows.
K = 2
y_hat_knn = knn.predict(X, K)

In [None]:
# Scatter the first two feature columns, coloured by the KNN prediction.
# NOTE(review): only columns 0 and 1 of X are plotted — confirm these two
# features are informative enough for this plot to be meaningful.
plt.figure()
plt.scatter(X[:,0], X[:, 1], c=y_hat_knn, s=10, alpha=0.4)