In [None]:
from IPython.display import clear_output

# Download the datasets

In [None]:
# Dataset 1a: download three files with gdown, one file ID per call
# (presumably the train/validation/test splits hosted on Google Drive — TODO confirm).
! gdown 1u_VR07Kee92JrhAGq3VeXFR28uoZgxCX
! gdown 1IvZk4IvzHVnEWGFqKZoea_OoqEEagLPs
! gdown 1UItAFItujkbAo_RMouBnzBOPZ2306J7K

# Wipe the download progress output from the cell.
clear_output()

In [None]:
# Dataset 1b: download three files with gdown, one file ID per call
# (presumably the train/validation/test splits hosted on Google Drive — TODO confirm).
! gdown 1iAPQ4tZIN1b7p3InunX5KbFd_8xMczVP
! gdown 1BJekqgyr8tf_q_c3RQyPSpZNUwh5Ojhs
! gdown 1BGG5CgFE3WClWVQPj4NJe_4jcSJ5PatO

# Wipe the download progress output from the cell.
clear_output()

In [None]:
# Dataset 2: download six files with gdown, one file ID per call
# (presumably a data file and a label file for each of the train/validation/test
# splits — TODO confirm).
! gdown 18nytesvrVSgyEApS9HqDBmop6vp5Rx-s
! gdown 1aHIU8LzMreWJyn6roXFFwUA9IIs4Rrmy
! gdown 1W0pGwuBlXZ8dnoZhvf8rOJD1zaG_8Htb
! gdown 1G5yg9ZF9Wtx5JiIVANISlwdgUBC_d5iP
! gdown 1ppBq_NSdtbMO6OGCi0I9mXJd5kGH6n_F
! gdown 1QmHYtmKFPLL-3TxMWKa5DI4bHcAq6e5A

# Wipe the download progress output from the cell.
clear_output()

# Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.linalg import cholesky
from scipy import stats
from scipy.spatial.distance import cdist

# K-Nearest Neighbours

In [None]:
class KNN:
    """Class-wise K-nearest-neighbour classifier with built-in evaluation.

    For a query point, the distance to the K-th nearest training sample is
    computed separately inside every class; the class with the smallest such
    radius is predicted.

    Class labels are assumed to be the integers ``0 .. num_classes - 1``: they
    are used both as keys of the per-class dictionaries and as row/column
    indices of the confusion matrices.
    """

    def __init__(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """Store the three data splits and group their samples by class label.

        Args:
            X_train, X_val, X_test: 2-D arrays, one sample per row.
            y_train, y_val, y_test: 1-D integer label arrays aligned with the
                rows of the matching feature array.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.X_test = X_test
        self.y_test = y_test

        self.num_features = self.X_train.shape[1]

        # Filled in by init_classes().
        self.num_classes = None
        self.train_classes = None
        self.val_classes = None
        self.test_classes = None

        self.init_classes()

    def init_classes(self):
        """Split every data set into a ``{label: samples}`` dictionary.

        The set of labels is taken from the training labels only.
        """
        unique_classes = np.unique(self.y_train)

        self.num_classes = unique_classes.shape[0]

        self.train_classes = {label: self.X_train[self.y_train == label] for label in unique_classes}
        self.val_classes = {label: self.X_val[self.y_val == label] for label in unique_classes}
        self.test_classes = {label: self.X_test[self.y_test == label] for label in unique_classes}

    def train(self, K_list):
        """Evaluate the classifier for every K in ``K_list`` and save the results.

        For each K this writes, into the current working directory:
          * a decision-region plot (only when the input is 2-dimensional),
          * one confusion-matrix CSV per split (training, validation, testing).
        After the loop the per-K accuracies are written to "KNN Accuracy.csv".

        Args:
            K_list: iterable of positive integers, the neighbourhood sizes.
        """
        classification_accuracy = {
            "K" : [],
            "Training Accuracy" : [],
            "Validation Accuracy" : [],
            "Testing Accuracy" : [],
        }

        for K in K_list:
            # plot the decision boundary if it is 2-dimensional input
            if self.num_features == 2:
                self._plot_decision_boundary(K)

            training_accuracy = self._evaluate_split(
                K, self.X_train, self.y_train, f"KNN_Training_Confusion_Matrix(K={K}).csv")
            validation_accuracy = self._evaluate_split(
                K, self.X_val, self.y_val, f"KNN_Validation_Confusion_Matrix(K={K}).csv")
            testing_accuracy = self._evaluate_split(
                K, self.X_test, self.y_test, f"KNN_Testing Confusion_Matrix(K={K}).csv")

            classification_accuracy["K"].append(K)
            classification_accuracy["Training Accuracy"].append(training_accuracy)
            classification_accuracy["Validation Accuracy"].append(validation_accuracy)
            classification_accuracy["Testing Accuracy"].append(testing_accuracy)

        accuracy_df = pd.DataFrame(classification_accuracy)
        accuracy_df.to_csv("KNN Accuracy.csv", index=False)

    def _plot_decision_boundary(self, K):
        """Save the decision regions for ``K`` over a 500x500 grid (2-D inputs only)."""
        x_min, x_max = self.X_train[:, 0].min() - 1, self.X_train[:, 0].max() + 1
        y_min, y_max = self.X_train[:, 1].min() - 1, self.X_train[:, 1].max() + 1

        X1, X2 = np.meshgrid(np.linspace(x_min, x_max, 500), np.linspace(y_min, y_max, 500))
        grid = np.c_[X1.ravel(), X2.ravel()]

        Z = np.array([self.predict(K, point) for point in grid])
        Z = Z.reshape(X1.shape)

        plt.figure(figsize=(10, 10))
        plt.contourf(X1, X2, Z, alpha=0.4)

        for i in range(self.num_classes):
            plt.scatter(self.train_classes[i][:, 0], self.train_classes[i][:, 1], s=20, edgecolor="k", label=f"Training Points- Class {i+1}")

        plt.title(f"KNN Decision Boundary for K = {K}")
        plt.xlabel("X1")
        plt.ylabel("X2")
        plt.legend()
        plt.savefig(f"KNN with K = {K}.jpeg", dpi=300, format="jpeg")
        plt.close()

    def _evaluate_split(self, K, X, y, filename):
        """Save the confusion matrix of one split to ``filename`` and return its accuracy.

        Rows of the matrix are true labels, columns are predicted labels.
        """
        confusion_matrix = np.zeros((self.num_classes, self.num_classes), dtype="int64")
        for sample, label in zip(X, y):
            confusion_matrix[label, self.predict(K, sample)] += 1

        accuracy = np.trace(confusion_matrix) / np.sum(confusion_matrix)
        np.savetxt(filename, confusion_matrix, delimiter=",")
        return accuracy

    def predict(self, K, origin):
        """Return the class whose K-th nearest training sample is closest to ``origin``.

        Args:
            K: positive integer neighbourhood size. If a class holds fewer than
                K samples, the distance to its farthest sample is used.
            origin: 1-D feature vector of the query point.

        Returns:
            The predicted class index (an int in ``range(num_classes)``).

        Raises:
            ValueError: if ``K`` is smaller than 1.
        """
        if K < 1:
            raise ValueError("K must be a positive integer")

        prediction = None
        # Start from infinity so that arbitrarily distant queries still get a class
        # (a finite sentinel would leave the prediction as None for them).
        prediction_radius = np.inf

        for cls in range(self.num_classes):
            distances = np.linalg.norm(self.train_classes[cls] - origin, axis=1)

            # Only the K-th smallest distance is needed, so a partial
            # selection replaces a full sort of all distances.
            kth = min(K, distances.shape[0]) - 1
            kth_radius = np.partition(distances, kth)[kth]

            if prediction_radius > kth_radius:
                prediction = cls
                prediction_radius = kth_radius

        return prediction



# Naive Bayes Classifier

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from scipy.stats import multivariate_normal

class NaiveBayesClassifier:
    """Gaussian naive Bayes classifier with diagonal covariance matrices.

    Every class is modelled by a Gaussian whose covariance matrix is diagonal
    (features treated as independent). With ``covariance_flag="same"`` a single
    diagonal matrix, the unweighted average of the per-class variances, is
    shared by all classes; with any other value each class gets its own.

    Class labels are assumed to be the integers ``0 .. num_classes - 1``.
    """

    def __init__(self, X_train, y_train, X_val, y_val, X_test, y_test, covariance_flag="normal"):
        """Store the data splits and fit the model on the training split.

        Args:
            X_train, X_val, X_test: 2-D arrays, one sample per row.
            y_train, y_val, y_test: 1-D integer label arrays.
            covariance_flag: "same" for one shared diagonal covariance,
                anything else (default "normal") for one per class.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.X_test = X_test
        self.y_test = y_test

        self.num_classes = np.unique(y_train).shape[0]
        self.num_features = X_train.shape[1]

        # Flags
        self.covariance_flag = covariance_flag

        # Parameters
        self.class_priors = None
        self.class_means = None
        self.class_covariance = None

        self.train()

    def train(self):
        """Estimate class priors, class means and the diagonal covariance(s)."""
        self.class_priors = [np.mean(self.y_train == i) for i in range(self.num_classes)]

        self.class_means = [np.mean(self.X_train[self.y_train == i], axis=0) for i in range(self.num_classes)]

        if self.covariance_flag == "same":
            # Compute diagonal covariance matrix assuming independence of features
            feature_variances = [np.var(self.X_train[self.y_train == i], axis=0) for i in range(self.num_classes)]
            self.class_covariance = np.diag(np.mean(feature_variances, axis=0))
        else:
            # Compute diagonal covariance matrices for each class
            self.class_covariance = [np.diag(np.var(self.X_train[self.y_train == i], axis=0)) for i in range(self.num_classes)]

    def predict(self, X):
        """Return the class with the largest log-posterior for every row of ``X``.

        Args:
            X: 2-D array of samples. For a single row the result is a scalar
                instead of a length-1 array, because the underlying log-density
                call returns a scalar in that case.
        """
        class_scores = []

        for i in range(self.num_classes):
            mean = self.class_means[i]
            cov = self.class_covariance if self.covariance_flag == "same" else self.class_covariance[i]

            class_scores.append(multivariate_normal.logpdf(X, mean=mean, cov=cov) + np.log(self.class_priors[i]))

        return np.argmax(class_scores, axis=0)

    def evaluate(self):
        """Refit the model, score all three splits and save the results.

        Writes the accuracy table and the test accuracy to the working
        directory (see save_results) and, for 2-D inputs, a decision-region plot.

        Returns:
            ``(accuracy_table, training_accuracy, validation_accuracy,
            testing_accuracy)`` where ``accuracy_table`` is a one-row DataFrame.
        """
        # Train the model
        self.train()

        # Evaluate accuracy
        training_accuracy = self.evaluate_accuracy(self.X_train, self.y_train)
        validation_accuracy = self.evaluate_accuracy(self.X_val, self.y_val)
        testing_accuracy = self.evaluate_accuracy(self.X_test, self.y_test)

        # Table of classification accuracies. Built in one step because
        # DataFrame.append no longer exists in pandas >= 2.0.
        accuracy_table = pd.DataFrame(
            [{
                "Training Accuracy": training_accuracy,
                "Validation Accuracy": validation_accuracy,
                "Testing Accuracy": testing_accuracy,
            }],
            columns=["Training Accuracy", "Validation Accuracy", "Testing Accuracy"],
        )

        # Generate decision region plot
        if self.num_features == 2:
            self.plot_decision_regions()

        # Save results
        self.save_results(accuracy_table, testing_accuracy)

        return accuracy_table, training_accuracy, validation_accuracy, testing_accuracy

    def evaluate_accuracy(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``."""
        predictions = self.predict(X)
        accuracy = np.mean(predictions == y)
        return accuracy

    def gaussian_pdf(self, x, mean, covariance):
        """Gaussian density of a single point ``x``, without the ``(2*pi)**(-n/2)`` factor.

        Only used for drawing level curves, where the constant factor does not
        change the shape of the contours.
        """
        return (1 / np.sqrt(np.linalg.det(covariance))) * np.exp(-0.5 * np.dot(np.dot((x - mean).T, np.linalg.inv(covariance)), (x - mean)))

    def generate_confusion_matrix(self, X, y):
        """Return the confusion matrix (rows: true label, columns: predicted label).

        The matrix is always ``num_classes x num_classes`` as learned from the
        training labels, so a split that happens to miss a class neither
        shrinks the matrix nor causes an out-of-range index.
        """
        confusion_matrix = np.zeros((self.num_classes, self.num_classes), dtype="int64")

        predictions = np.atleast_1d(self.predict(X))

        for true_label, predicted_label in zip(y, predictions):
            confusion_matrix[true_label, predicted_label] += 1

        return confusion_matrix

    def plot_decision_regions(self):
        """Save a plot of decision regions, per-class level curves and training points.

        Does nothing unless the input is 2-dimensional.
        """
        if self.num_features != 2:
            return

        x_min, x_max = self.X_train[:, 0].min() - 5, self.X_train[:, 0].max() + 5
        y_min, y_max = self.X_train[:, 1].min() - 5, self.X_train[:, 1].max() + 5

        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
        grid = np.c_[xx.ravel(), yy.ravel()]
        # predict() is vectorised over rows, so the whole grid is classified in one call.
        regions = np.asarray(self.predict(grid)).reshape(xx.shape)
        plt.figure(figsize=(10, 8))
        plt.contourf(xx, yy, regions, alpha=0.4)

        for i in range(self.num_classes):
            X1, X2 = np.meshgrid(np.linspace(self.class_means[i][0] - 10, self.class_means[i][0] + 10, 100),
                                np.linspace(self.class_means[i][1] - 10, self.class_means[i][1] + 10, 100))
            positions = np.vstack([X1.ravel(), X2.ravel()])
            covariance = self.class_covariance if self.covariance_flag == "same" else self.class_covariance[i]
            density = np.array([self.gaussian_pdf(x, self.class_means[i], covariance) for x in positions.T])
            density = density.reshape(X1.shape)

            # Plot the Gaussian PDF as level contours
            plt.contour(X1, X2, density, colors="grey", levels=20, alpha=0.7, linewidths=0.6)

            # Plot training points
            plt.scatter(self.X_train[self.y_train == i][:, 0], self.X_train[self.y_train == i][:, 1],
                        s=20, edgecolor="k", label=f"Training Points - Class {i + 1}")

        plt.xlabel("Feature 1")
        plt.ylabel("Feature 2")
        plt.title(f"Naive Bayes Decision Region - {self.covariance_flag}")

        # Ensure equal aspect ratio
        plt.axis('equal')

        plt.legend()
        plt.savefig(f"Naive_Bayes_Decision_Region_{self.covariance_flag}.jpeg", dpi=300, format="jpeg")
        plt.close()

    def save_results(self, accuracy_table, test_accuracy):
        """Write the accuracy table (CSV) and the test accuracy (text) to the working directory."""
        accuracy_table.to_csv(f"NaiveBayes_Accuracy_Table_{self.covariance_flag}.csv", index=False)
        np.savetxt(f"NaiveBayes_Test_Accuracy_{self.covariance_flag}.txt", np.array([test_accuracy]), delimiter=",")


# Hypersphere Parzen Bayes Classifier

In [None]:
class HypersphereParzenBayesClassifier:
    """Parzen-window classifier using a hypersphere of radius ``window_size``.

    A query point is assigned the most frequent training label among the
    training samples that lie strictly inside the hypersphere centred on it.
    Labels are assumed to be non-negative integers ``0 .. num_classes - 1``.
    """

    def __init__(self, X_train, y_train, X_val, y_val, X_test, y_test, window_size):
        """Store the data splits and the initial window radius.

        Args:
            X_train, X_val, X_test: 2-D arrays, one sample per row.
            y_train, y_val, y_test: 1-D integer label arrays.
            window_size: hypersphere radius; evaluate() overwrites it.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.X_test = X_test
        self.y_test = y_test
        self.window_size = window_size

        self.num_classes = np.unique(y_train).shape[0]
        self.num_features = X_train.shape[1]

    def predict(self, X):
        """Return the predicted label of every row of ``X`` as an array.

        Points with no training sample inside the hypersphere are assigned
        class 0.
        """
        predictions = []

        # One query at a time keeps memory bounded by the training-set size,
        # which matters for the dense plotting grid.
        for point in X:
            # Distances from this point to all training points
            distances = cdist([point], self.X_train)[0]
            inside = distances < self.window_size

            if not inside.any():
                # No points within hypersphere, assign label based on the first class
                prediction = 0
            else:
                # Majority class within hypersphere (ties go to the smaller label)
                prediction = np.argmax(np.bincount(self.y_train[inside]))

            predictions.append(prediction)

        return np.array(predictions)

    def evaluate_accuracy(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``."""
        predictions = self.predict(X)
        accuracy = np.mean(predictions == y)
        return accuracy

    def train(self):
        """Compute class priors, class centres and per-class radii.

        These attributes are not read by predict() or evaluate().
        """
        self.class_priors = [np.mean(self.y_train == i) for i in range(self.num_classes)]
        self.class_centers = [np.mean(self.X_train[self.y_train == i], axis=0) for i in range(self.num_classes)]
        self.hypersphere_radii = [self.calculate_hypersphere_radius(self.X_train, self.y_train, i) for i in range(self.num_classes)]

    def calculate_hypersphere_radius(self, X, y, class_label):
        """Largest distance below ``window_size`` between any sample of ``X`` and any sample of the class.

        Returns 0 when the class has no samples.
        """
        class_data = X[y == class_label]
        max_distance = 0

        for i in range(len(class_data)):
            distances = cdist(X, [class_data[i]])
            within_hypersphere = distances[distances < self.window_size]

            if len(within_hypersphere) > 0:
                max_distance = max(max_distance, np.max(within_hypersphere))

        return max_distance

    def generate_confusion_matrix(self, X, y):
        """Return the confusion matrix (rows: true label, columns: predicted label)."""
        confusion_matrix = np.zeros((self.num_classes, self.num_classes), dtype="int64")

        predictions = self.predict(X)

        for true_label, predicted_label in zip(y, predictions):
            confusion_matrix[int(true_label), int(predicted_label)] += 1

        return confusion_matrix

    def plot_decision_regions(self, window_size, highlight_best=False):
        """Save a decision-region plot for the current ``self.window_size`` (2-D inputs only).

        Args:
            window_size: value shown in the title and file name; the regions
                themselves are computed with ``self.window_size``.
            highlight_best: when True a legend is added and the file name
                marks the plot as the best model.
        """
        if self.num_features == 2:
            x_min, x_max = self.X_train[:, 0].min() - 1, self.X_train[:, 0].max() + 1
            y_min, y_max = self.X_train[:, 1].min() - 1, self.X_train[:, 1].max() + 1

            xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
            Z = self.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)

            plt.contourf(xx, yy, Z, alpha=0.4)
            plt.scatter(self.X_train[:, 0], self.X_train[:, 1], c=self.y_train, edgecolors='k', marker='o', s=50, linewidth=1, label='Training Data')
            plt.xlabel("Feature 1")
            plt.ylabel("Feature 2")
            plt.title(f"HypersphereParzenBayes Decision Region Plot (Window Size = {window_size})")

            if highlight_best:
                plt.legend()

            if (not highlight_best):
                plt.savefig(f"Hypersphere Parzen Bayes with window size = {window_size}.jpeg", dpi=300, format="jpeg")
            else:
                plt.savefig(f"Hypersphere Parzen Bayes Best Model with window size = {window_size}.jpeg", dpi=300, format="jpeg")

            plt.close()

    def save_results(self, accuracy_table, test_accuracy, train_conf_matrix, val_conf_matrix, test_conf_matrix):
        """Write the accuracy table, test accuracy and confusion matrices to the working directory."""
        accuracy_table.to_csv("HypersphereParzenBayes_Accuracy_Table.csv", index=False)
        np.savetxt("HypersphereParzenBayes_Test_Accuracy.txt", np.array([test_accuracy]), delimiter=",")
        np.savetxt("HypersphereParzenBayes_Training_Confusion_Matrix.csv", train_conf_matrix, delimiter=",")
        np.savetxt("HypersphereParzenBayes_Validation_Confusion_Matrix.csv", val_conf_matrix, delimiter=",")
        np.savetxt("HypersphereParzenBayes_Test_Confusion_Matrix.csv", test_conf_matrix, delimiter=",")

    def evaluate(self, w_list):
        """Score every window size, keep the best on validation data and save the results.

        Args:
            w_list: non-empty iterable of window radii to try.

        Returns:
            ``(accuracy_table, best_test_accuracy, train_conf_matrix,
            val_conf_matrix, test_conf_matrix)``; the last four refer to the
            window size with the highest validation accuracy (first one wins
            on ties). ``self.window_size`` is left at that best value.

        Raises:
            ValueError: if ``w_list`` is empty.
        """
        w_list = list(w_list)
        if not w_list:
            raise ValueError("w_list must contain at least one window size")

        rows = []

        # -inf guarantees that a window is selected even if every validation
        # accuracy is 0 (otherwise best_window_size would remain None).
        best_accuracy = -np.inf
        best_window_size = None

        for window_size in w_list:
            self.window_size = window_size

            training_accuracy = self.evaluate_accuracy(self.X_train, self.y_train)
            validation_accuracy = self.evaluate_accuracy(self.X_val, self.y_val)
            test_accuracy = self.evaluate_accuracy(self.X_test, self.y_test)

            rows.append({
                "Window Size": window_size,
                "Training Accuracy": training_accuracy,
                "Validation Accuracy": validation_accuracy,
                "Test Accuracy": test_accuracy
            })

            # Check if this configuration is the best so far
            if validation_accuracy > best_accuracy:
                best_accuracy = validation_accuracy
                best_window_size = window_size

            # Plot decision region for the current window size
            self.plot_decision_regions(window_size, highlight_best=False)

        # Table of classification accuracies, built once from the collected rows
        # (DataFrame.append no longer exists in pandas >= 2.0).
        accuracy_table = pd.DataFrame(
            rows, columns=["Window Size", "Training Accuracy", "Validation Accuracy", "Test Accuracy"])

        # Set the best window size
        self.window_size = best_window_size

        # Test accuracy for the best window size
        best_test_accuracy = self.evaluate_accuracy(self.X_test, self.y_test)

        # Confusion matrices for the best window size
        train_conf_matrix = self.generate_confusion_matrix(self.X_train, self.y_train)
        val_conf_matrix = self.generate_confusion_matrix(self.X_val, self.y_val)
        test_conf_matrix = self.generate_confusion_matrix(self.X_test, self.y_test)

        # Decision region plots for the best window size
        self.plot_decision_regions(best_window_size, highlight_best=True)

        # Save results
        self.save_results(accuracy_table, best_test_accuracy, train_conf_matrix, val_conf_matrix, test_conf_matrix)

        return accuracy_table, best_test_accuracy, train_conf_matrix, val_conf_matrix, test_conf_matrix


# Example usage:
# hypersphere_parzen_classifier = HypersphereParzenBayesClassifier(X_train, y_train, X_val, y_val, X_test, y_test, window_size=0.1)
# accuracy_table, test_accuracy, train_conf_matrix, val_conf_matrix, test_conf_matrix = hypersphere_parzen_classifier.evaluate(w_list=[0.1, 0.5, 1])


In [None]:
import os

# Create the images/sheets/texts output directories for every dataset.
# exist_ok=True keeps the cell idempotent: re-running it (or Restart & Run All
# on a machine where the folders already exist) no longer raises FileExistsError.
for _dataset_dir in ("DataSet1a", "DataSet1b", "DataSet2"):
    for _output_kind in ("images", "sheets", "texts"):
        os.makedirs(f"Shaun/{_dataset_dir}/{_output_kind}", exist_ok=True)


In [None]:
# Read the three CSV splits of this dataset from the working directory.
train_df, val_df, test_df = (
    pd.read_csv("./Train-20.csv"),
    pd.read_csv("./Val-20.csv"),
    pd.read_csv("./Test-20.csv"),
)

# For every split: the two input columns are the features, "output" (cast to int) is the label.
train_data, train_label = train_df[["input1", "input2"]], train_df["output"].astype(int)
val_data, val_label = val_df[["input1", "input2"]], val_df["output"].astype(int)
test_data, test_label = test_df[["input1", "input2"]], test_df["output"].astype(int)

In [None]:
# Convert the pandas objects to NumPy: features as (n_samples, 2) arrays, labels as plain arrays.
X_train, y_train = train_data.to_numpy().reshape(-1, 2), train_label.to_numpy()
X_val, y_val = val_data.to_numpy().reshape(-1, 2), val_label.to_numpy()
X_test, y_test = test_data.to_numpy().reshape(-1, 2), test_label.to_numpy()

In [None]:
# Evaluate the KNN classifier for each neighbourhood size; train() writes its
# plots and CSV files into the current working directory.
K_list = [1, 7, 15]
classifier = KNN(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
)
classifier.train(K_list)

In [None]:
# Naive Bayes with a separate diagonal covariance per class ("normal" flag);
# evaluate() returns the accuracies and also writes the result files.
nb_classifier = NaiveBayesClassifier(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    covariance_flag="normal",
)
nb_classifier.evaluate()

In [None]:
# Naive Bayes with one diagonal covariance shared by all classes ("same" flag);
# evaluate() returns the accuracies and also writes the result files.
nb_classifier = NaiveBayesClassifier(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    covariance_flag="same",
)
nb_classifier.evaluate()

In [None]:
# Sweep the Parzen window radius. The window_size given to the constructor is
# only a starting value: evaluate() overwrites it for every entry of w_list.
hypersphere_parzen_classifier = HypersphereParzenBayesClassifier(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    window_size=0.1,
)
w_list = [0.01, 0.1, 0.5, 1, 1.5, 2, 5, 10]
hypersphere_parzen_classifier.evaluate(w_list)

In [None]:
# Read the three CSV splits of this dataset from the working directory.
train_df, val_df, test_df = (
    pd.read_csv("./Train-10.csv"),
    pd.read_csv("./Val-10.csv"),
    pd.read_csv("./Test-10.csv"),
)

# For every split: columns x1/x2 are the features, "label" (cast to int) is the class.
train_data, train_label = train_df[["x1", "x2"]], train_df["label"].astype(int)
val_data, val_label = val_df[["x1", "x2"]], val_df["label"].astype(int)
test_data, test_label = test_df[["x1", "x2"]], test_df["label"].astype(int)

In [None]:
# Convert the pandas objects to NumPy: features as (n_samples, 2) arrays, labels as plain arrays.
X_train, y_train = train_data.to_numpy().reshape(-1, 2), train_label.to_numpy()
X_val, y_val = val_data.to_numpy().reshape(-1, 2), val_label.to_numpy()
X_test, y_test = test_data.to_numpy().reshape(-1, 2), test_label.to_numpy()

In [None]:
# Evaluate the KNN classifier for each neighbourhood size.
# NOTE(review): train() saves under fixed file names in the working directory,
# so this run overwrites the KNN outputs of any earlier dataset.
K_list = [1, 7, 15]
classifier = KNN(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
)
classifier.train(K_list)

In [None]:
# Naive Bayes with a separate diagonal covariance per class ("normal" flag);
# evaluate() returns the accuracies and also writes the result files.
nb_classifier = NaiveBayesClassifier(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    covariance_flag="normal",
)
nb_classifier.evaluate()

In [None]:
# Sweep the Parzen window radius. The window_size given to the constructor is
# only a starting value: evaluate() overwrites it for every entry of w_list.
hypersphere_parzen_classifier = HypersphereParzenBayesClassifier(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    window_size=0.1,
)
w_list = [0.01, 0.1, 0.5, 1, 1.5, 2, 5, 10]
hypersphere_parzen_classifier.evaluate(w_list)

In [None]:
# This dataset ships features and labels in separate, header-less CSV files.
train_data, train_label = (
    pd.read_csv("./train_data.csv", header=None),
    pd.read_csv("./train_label.csv", header=None),
)
val_data, val_label = (
    pd.read_csv("./val_data.csv", header=None),
    pd.read_csv("./val_label.csv", header=None),
)
test_data, test_label = (
    pd.read_csv("./test_data.csv", header=None),
    pd.read_csv("./test_label.csv", header=None),
)

In [None]:
# Convert to NumPy: features as (n_samples, 81) arrays, labels flattened to 1-D int64 arrays.
X_train = train_data.to_numpy().reshape(-1, 81)
X_val = val_data.to_numpy().reshape(-1, 81)
X_test = test_data.to_numpy().reshape(-1, 81)

y_train = train_label.to_numpy(dtype="int64").reshape(-1)
y_val = val_label.to_numpy(dtype="int64").reshape(-1)
y_test = test_label.to_numpy(dtype="int64").reshape(-1)

In [None]:
# Evaluate the KNN classifier for each neighbourhood size.
# NOTE(review): train() saves under fixed file names in the working directory,
# so this run overwrites the KNN outputs of any earlier dataset.
K_list = [1, 7, 15]
classifier = KNN(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
)
classifier.train(K_list)

In [None]:
# Naive Bayes with a separate diagonal covariance per class ("normal" flag);
# evaluate() returns the accuracies and also writes the result files.
nb_classifier = NaiveBayesClassifier(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    covariance_flag="normal",
)
nb_classifier.evaluate()

In [None]:
# Sweep the Parzen window radius. The window_size given to the constructor is
# only a starting value: evaluate() overwrites it for every entry of w_list.
hypersphere_parzen_classifier = HypersphereParzenBayesClassifier(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    window_size=0.1,
)
w_list = [0.01, 0.1, 0.5, 1, 1.5, 2, 5, 10]
hypersphere_parzen_classifier.evaluate(w_list)

# Bayes Classifier

In [None]:
class BayesClassifier:
    def __init__(self, X_train, y_train, X_val, y_val, X_test, y_test, method, covariance_type = None):
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.X_test = X_test
        self.y_test = y_test

        self.num_features = self.X_train.shape[1]

        # common
        self.num_classes = None
        self.train_classes = None
        self.val_classes = None
        self.test_classes = None

        # gaussian and gmm
        self.mean = None
        self.covariance = None

        self.prior_probabilities = None

        # gmm
        self.num_components = None
        self.covariance_type = covariance_type

        self.responsibilities = None
        self.component_probabilities = None
        self.component_means = None
        self.component_covariances = None

        self.best_num_components = None
        self.best_responsibilities = None
        self.best_component_probabilities = None
        self.best_component_means = None
        self.best_component_covariances = None

        self.init_classes()

        if(method == "knn"):
            pass
        elif(method == "gaussian" or method == "gmm"):
            self.init_gaussian()


    def gaussian_pdf(self, x, mean, covariance):
        return (1 / np.sqrt(np.linalg.det(covariance))) * np.exp(-0.5 * np.dot(np.dot((x - mean).T, np.linalg.inv(covariance)), (x - mean)))


    def gaussian_pdf_vectorized(self, x, mean, covariance):
        n = len(mean)
        x_mean = x - mean
        inv_cov = np.linalg.inv(covariance)
        normalization = 1.0 / np.sqrt((2 * np.pi) ** n * np.linalg.det(covariance))
        exponent = -0.5 * np.sum(x_mean.dot(inv_cov) * x_mean, axis=1)
        pdf = normalization * np.exp(exponent)
        return pdf


    def init_classes(self):
        unique_classes = np.unique(self.y_train)

        self.num_classes = unique_classes.shape[0]

        self.train_classes = {label: self.X_train[self.y_train == label] for label in unique_classes}
        self.val_classes = {label: self.X_val[self.y_val == label] for label in unique_classes}
        self.test_classes = {label: self.X_test[self.y_test == label] for label in unique_classes}

        self.prior_probabilities = {label: (self.train_classes[label].shape[0] / self.X_train.shape[0]) for label in unique_classes}

    def init_gaussian(self):
        for i in range(self.num_classes):
            self.mean = {label: np.mean(self.train_classes[label], axis=0) for label in self.train_classes}
            self.covariance = {label: np.cov(self.train_classes[label].T) for label in self.train_classes}


    def init_gmm(self):
        """Fit one Gaussian mixture per class: K-means initialisation followed by EM.

        Reads ``self.num_components`` (must be set before calling),
        ``self.covariance_type`` and ``self.train_classes``. Writes
        ``self.component_means``, ``self.component_covariances``,
        ``self.component_probabilities`` (each ``{class: {component: value}}``)
        and ``self.responsibilities`` (``{class: (n_samples, num_components) array}``).

        The initial centroids are drawn with ``np.random.choice``, so results
        depend on the global NumPy random state.
        """
        # K-means clustering to identify num_components clusters in each class
        max_iters = 200

        # components[class][cluster] -> samples of that class assigned to the cluster
        components = {}
        for i in range(self.num_classes):
            components[i] = {}

        for i in range(self.num_classes):
            n = self.train_classes[i].shape[0]
            # Start from num_components distinct training samples of this class
            centroids = self.train_classes[i][np.random.choice(n, self.num_components, replace=False)]

            for _ in range(max_iters):
                # Assign each data point to the nearest cluster
                distances = np.linalg.norm(self.train_classes[i][:, np.newaxis] - centroids, axis=2)
                labels = np.argmin(distances, axis=1)

                # Update cluster centroids (an empty cluster keeps its previous centroid)
                new_centroids = np.array([self.train_classes[i][labels == k].mean(axis=0) if np.sum(labels == k) > 0 else centroids[k] for k in range(self.num_components)])

                # Check for convergence (exact equality of all centroids)
                if np.all(new_centroids == centroids):
                    break

                centroids = new_centroids

            # assign the samples to the clusters based on nearest centroid
            distances = np.linalg.norm(self.train_classes[i][:, np.newaxis] - centroids, axis=2)
            labels = np.argmin(distances, axis=1)
            for sample, label in zip(self.train_classes[i], labels):
                if(label not in components[i]):
                    components[i][label] = []
                components[i][label].append(sample)

            # Clusters that received no sample never get a key in components[i]
            for label in components[i]:
                components[i][label] = np.vstack(components[i][label])

        # initialize the parameters of the GMM
        self.component_means = {}
        self.component_covariances = {}
        self.component_probabilities = {}

        for i in range(self.num_classes):
            self.component_means[i] = {}
            self.component_covariances[i] = {}
            self.component_probabilities[i] = {}

            for label in components[i]:
                self.component_means[i][label] = np.mean(components[i][label], axis=0)
                # NOTE(review): a cluster holding a single sample presumably yields
                # a NaN covariance here — confirm and guard if that can occur.
                self.component_covariances[i][label] = np.cov(components[i][label].T)
                if(self.covariance_type == "diagonal"):
                    # Keep only the variances; drop all cross-covariances
                    self.component_covariances[i][label] = np.diag(np.diag(self.component_covariances[i][label]))
                # Mixing weight = share of the class's samples in this cluster
                self.component_probabilities[i][label] = components[i][label].shape[0] / self.train_classes[i].shape[0]

        self.responsibilities = {}
        for i in range(self.num_classes):
            self.responsibilities[i] = np.zeros((self.train_classes[i].shape[0], self.num_components))

        # EM algorithm: always runs the full max_iters iterations (no convergence test)
        max_iters = 200
        for _ in range(max_iters):
            # E-step: responsibility = weight * density, normalised over the components of each sample
            for i in range(self.num_classes):
                for label in components[i]:
                    self.responsibilities[i][:, label] = self.component_probabilities[i][label] * self.gaussian_pdf_vectorized(self.train_classes[i], self.component_means[i][label], self.component_covariances[i][label])
                self.responsibilities[i] /= np.sum(self.responsibilities[i], axis=1, keepdims=True)

            # M-step: responsibility-weighted mean, covariance and mixing weight of every component
            for i in range(self.num_classes):
                for label in components[i]:
                    self.component_means[i][label] = np.sum(self.responsibilities[i][:, label][:, np.newaxis] * self.train_classes[i], axis=0) / np.sum(self.responsibilities[i][:, label])
                    self.component_covariances[i][label] = np.dot((self.responsibilities[i][:, label][:, np.newaxis] * (self.train_classes[i] - self.component_means[i][label])).T, (self.train_classes[i] - self.component_means[i][label])) / np.sum(self.responsibilities[i][:, label])
                    if(self.covariance_type == "diagonal"):
                        self.component_covariances[i][label] = np.diag(np.diag(self.component_covariances[i][label]))
                    self.component_probabilities[i][label] = np.sum(self.responsibilities[i][:, label]) / self.train_classes[i].shape[0]


    def KNN_predict(self, K, origin):
        prediction = None
        prediction_radius = 1e9

        for cls in range(self.num_classes):
            distances = np.linalg.norm(self.train_classes[cls] - origin, axis=1)
            K_nearest_neighbours = np.argsort(distances)[:K]

            smallest_radius = distances[K_nearest_neighbours[-1]]
            if(prediction_radius > smallest_radius):
                prediction = cls
                prediction_radius = smallest_radius

        return prediction


    def KNN(self, K_list):
        classification_accuracy = {
            "K" : [],
            "Training Accuracy" : [],
            "Validation Accuracy" : [],
            "Testing Accuracy" : [],
        }

        for K in K_list:
            # plot the decision boundary if it is 2-dimensional input
            if(self.num_features == 2):
                x_min, x_max = self.X_train[:, 0].min() - 1, self.X_train[:, 0].max() + 1
                y_min, y_max = self.X_train[:, 1].min() - 1, self.X_train[:, 1].max() + 1

                X1, X2 = np.meshgrid(np.linspace(x_min, x_max, 500), np.linspace(y_min, y_max, 500))
                X = np.c_[X1.ravel(), X2.ravel()]

                Z = np.array([self.KNN_predict(K, x) for x in X])
                Z = Z.reshape(X1.shape)

                plt.figure(figsize=(10, 10))
                plt.contourf(X1, X2, Z, alpha=0.4)

                for i in range(self.num_classes):
                    plt.scatter(self.train_classes[i][:, 0], self.train_classes[i][:, 1], s=20, edgecolor="k", label=f"Training Points- Class {i+1}")

                plt.title(f"KNN Decision Boundary for K = {K}")
                plt.xlabel("X1")
                plt.ylabel("X2")
                plt.legend()

                plt.savefig(f"Bayes Classifier using KNN with K = {K}.jpeg", dpi=300, format="jpeg")
                plt.close()

            # generate confusion matrix and calculate the accuracy for training, validation and testing data and store them in a csv file
            confusion_matrix = np.zeros((self.num_classes, self.num_classes), dtype="int64")
            for X, y in zip(self.X_train, self.y_train):
                prediction = self.KNN_predict(K, X)
                confusion_matrix[y, prediction] += 1

            training_accuracy = np.trace(confusion_matrix) / np.sum(confusion_matrix)
            np.savetxt(f"KNN_Training_Confusion_Matrix(K={K}).csv", confusion_matrix, delimiter=",")

            confusion_matrix = np.zeros((self.num_classes, self.num_classes), dtype="int64")
            for X, y in zip(self.X_val, self.y_val):
                prediction = self.KNN_predict(K, X)
                confusion_matrix[y, prediction] += 1

            validation_accuracy = np.trace(confusion_matrix) / np.sum(confusion_matrix)
            np.savetxt(f"KNN_Validation_Confusion_Matrix(K={K}).csv", confusion_matrix, delimiter=",")

            confusion_matrix = np.zeros((self.num_classes, self.num_classes), dtype="int64")
            for X, y in zip(self.X_test, self.y_test):
                prediction = self.KNN_predict(K, X)
                confusion_matrix[y, prediction] += 1

            testing_accuracy = np.trace(confusion_matrix) / np.sum(confusion_matrix)
            np.savetxt(f"KNN_Testing Confusion_Matrix(K={K}).csv", confusion_matrix, delimiter=",")

            classification_accuracy["K"].append(K)
            classification_accuracy["Training Accuracy"].append(training_accuracy)
            classification_accuracy["Validation Accuracy"].append(validation_accuracy)
            classification_accuracy["Testing Accuracy"].append(testing_accuracy)

        accuracy_df = pd.DataFrame(classification_accuracy)
        accuracy_df.to_csv("KNN Accuracy.csv", index=False)


    def gaussian_predict(self, x):
        """Return the index of the class with the largest prior-weighted density at ``x``.

        For every class index ``k`` in ``range(self.num_classes)`` the score is
        ``self.prior_probabilities[k] * self.gaussian_pdf(x, self.mean[k], self.covariance[k])``.
        The result is ``np.argmax`` over those scores, i.e. the first maximum on ties.
        """
        n_classes = self.num_classes
        scores = np.zeros(n_classes)

        for k in range(n_classes):
            prior = self.prior_probabilities[k]
            likelihood = self.gaussian_pdf(x, self.mean[k], self.covariance[k])
            scores[k] = prior * likelihood

        return np.argmax(scores)


    def gaussian(self):
        """Evaluate the single-Gaussian classifier and write its artefacts to disk.

        When the input has exactly two features, a figure with the decision
        regions, the per-class density level curves and the training points is
        saved as "Bayes Classifier using Gaussian.png".

        For each split (training, validation, testing) a confusion matrix is
        written to CSV, and the three accuracies are stored in
        "Gaussian Accuracy.csv".  Confusion matrices are indexed as
        ``[true label, predicted label]``, so labels are used directly as
        array indices.
        """
        def evaluate_split(samples, labels, csv_name):
            # Tally predictions against the true labels, persist the matrix and
            # return the share of samples that landed on the diagonal.
            counts = np.zeros((self.num_classes, self.num_classes), dtype="int64")
            for sample, label in zip(samples, labels):
                counts[label, self.gaussian_predict(sample)] += 1
            accuracy = np.trace(counts) / np.sum(counts)
            np.savetxt(csv_name, counts, delimiter=",")
            return accuracy

        # decision regions and level curves are only drawn for 2-dimensional input
        if self.num_features == 2:
            first, second = self.X_train[:, 0], self.X_train[:, 1]
            # evaluation grid: the training range padded by 1 on every side
            grid_1, grid_2 = np.meshgrid(
                np.linspace(first.min() - 1, first.max() + 1, 500),
                np.linspace(second.min() - 1, second.max() + 1, 500),
            )
            grid_points = np.c_[grid_1.ravel(), grid_2.ravel()]
            regions = np.array([self.gaussian_predict(point) for point in grid_points]).reshape(grid_1.shape)

            plt.figure(figsize=(10, 10))
            plt.contourf(grid_1, grid_2, regions, alpha=0.4, cmap=plt.cm.RdYlBu)

            # level curves of each class density on a 20 x 20 window centred on the class mean
            for i in range(self.num_classes):
                centre = self.mean[i]
                win_1, win_2 = np.meshgrid(
                    np.linspace(centre[0] - 10, centre[0] + 10, 100),
                    np.linspace(centre[1] - 10, centre[1] + 10, 100),
                )
                density = np.array([
                    self.gaussian_pdf(point, self.mean[i], self.covariance[i])
                    for point in np.c_[win_1.ravel(), win_2.ravel()]
                ]).reshape(win_1.shape)
                plt.contour(win_1, win_2, density, colors="grey", levels=20, alpha=0.7, linewidths=0.6)

            for i in range(self.num_classes):
                members = self.train_classes[i]
                plt.scatter(members[:, 0], members[:, 1], s=20, edgecolor="k", label=f"Training Points- Class {i+1}")

            plt.title(f"Gaussian Decision Boundary and Level Curves")
            plt.xlabel("X1")
            plt.ylabel("X2")
            plt.legend()

            plt.savefig(f"Bayes Classifier using Gaussian.png", dpi=300, format="png")
            plt.close()

        # confusion matrix + accuracy per split, in the order training, validation, testing
        training_accuracy = evaluate_split(self.X_train, self.y_train, "Gaussian_Training_Confusion_Matrix.csv")
        validation_accuracy = evaluate_split(self.X_val, self.y_val, "Gaussian_Validation_Confusion_Matrix.csv")
        testing_accuracy = evaluate_split(self.X_test, self.y_test, "Gaussian_Testing Confusion_Matrix.csv")

        accuracy_df = pd.DataFrame({
            "Training Accuracy" : [training_accuracy],
            "Validation Accuracy" : [validation_accuracy],
            "Testing Accuracy" : [testing_accuracy],
        })
        accuracy_df.to_csv("Gaussian Accuracy.csv", index=False)


    def gmm_predict(self, x):
        """Return the index of the class whose mixture density is largest at ``x``.

        The score of class ``cls`` is the sum, over every component label of
        ``self.component_means[cls]``, of
        ``self.component_probabilities[cls][label]`` times the Gaussian density
        of that component at ``x``.  ``np.argmax`` picks the winning class
        (the first maximum on ties).
        """
        # NOTE(review): unlike gaussian_predict, no class prior is applied here.
        # Confirm whether component_probabilities already account for it.
        scores = np.zeros(self.num_classes)

        for cls in range(self.num_classes):
            means = self.component_means[cls]
            for label in means:
                weight = self.component_probabilities[cls][label]
                density = self.gaussian_pdf(x, means[label], self.component_covariances[cls][label])
                scores[cls] += weight * density

        return np.argmax(scores)


    def gmm(self, Q_list):
        """Evaluate the GMM-based classifier for every component count in ``Q_list``.

        For each ``Q`` this sets ``self.num_components = Q`` and calls
        ``self.init_gmm()``, then:

        * for 2-feature inputs, saves a figure with the decision regions, the
          level curves of every mixture component and the training points to
          "Bayes Classifier using GMM(Q={Q}).png";
        * writes the training / validation / testing confusion matrices
          (indexed ``[true label, predicted label]``) to
          "GMM_<Split>_Confusion_Matrix(Q={Q}).csv";
        * records the three accuracies.

        After the loop the accuracies for all ``Q`` are written to
        "GMM Accuracy.csv".  The model with the highest validation accuracy
        (the first one on ties) is stored in the ``self.best_*`` attributes.

        Args:
            Q_list: iterable of component counts to try.
        """
        import copy  # function-scope import; only needed for the best-model snapshot

        def evaluate_split(samples, labels, csv_name):
            # Tally predictions against the true labels, persist the matrix and
            # return the share of samples that landed on the diagonal.
            counts = np.zeros((self.num_classes, self.num_classes), dtype="int64")
            for sample, label in zip(samples, labels):
                counts[label, self.gmm_predict(sample)] += 1
            accuracy = np.trace(counts) / np.sum(counts)
            np.savetxt(csv_name, counts, delimiter=",")
            return accuracy

        classification_accuracy = {
            "Number of Components" : [],
            "Training Accuracy" : [],
            "Validation Accuracy" : [],
            "Testing Accuracy" : [],
        }

        # Start below every attainable accuracy: with an initial value of 0 and a
        # strict comparison, a run whose validation accuracies are all 0 would
        # never populate the best_* attributes.
        best_val_accuracy = -np.inf
        for Q in Q_list:
            self.num_components = Q
            self.init_gmm()

            # plot the decision boundary and level curves if it is 2-dimensional input
            if self.num_features == 2:
                first, second = self.X_train[:, 0], self.X_train[:, 1]
                # evaluation grid: the training range padded by 1 on every side
                grid_1, grid_2 = np.meshgrid(
                    np.linspace(first.min() - 1, first.max() + 1, 500),
                    np.linspace(second.min() - 1, second.max() + 1, 500),
                )
                grid_points = np.c_[grid_1.ravel(), grid_2.ravel()]
                regions = np.array([self.gmm_predict(point) for point in grid_points]).reshape(grid_1.shape)

                plt.figure(figsize=(10, 10))
                plt.contourf(grid_1, grid_2, regions, alpha=0.4, cmap=plt.cm.RdYlBu)

                # level curves of every component on a 20 x 20 window centred on its mean
                for i in range(self.num_classes):
                    for label in self.component_means[i]:
                        centre = self.component_means[i][label]
                        win_1, win_2 = np.meshgrid(
                            np.linspace(centre[0] - 10, centre[0] + 10, 100),
                            np.linspace(centre[1] - 10, centre[1] + 10, 100),
                        )
                        density = np.array([
                            self.gaussian_pdf(point, centre, self.component_covariances[i][label])
                            for point in np.c_[win_1.ravel(), win_2.ravel()]
                        ]).reshape(win_1.shape)
                        plt.contour(win_1, win_2, density, colors="grey", levels=20, alpha=0.7, linewidths=0.6)

                for i in range(self.num_classes):
                    members = self.train_classes[i]
                    plt.scatter(members[:, 0], members[:, 1], s=20, edgecolor="k", label=f"Training Points- Class {i+1}")

                plt.title(f"GMM Decision Boundary and Level Curves\nQ={Q}")
                plt.xlabel("X1")
                plt.ylabel("X2")
                plt.legend()

                plt.savefig(f"Bayes Classifier using GMM(Q={Q}).png", dpi=300, format="png")
                plt.close()

            # confusion matrix + accuracy per split, in the order training, validation, testing
            training_accuracy = evaluate_split(self.X_train, self.y_train, f"GMM_Training_Confusion_Matrix(Q={Q}).csv")
            validation_accuracy = evaluate_split(self.X_val, self.y_val, f"GMM_Validation_Confusion_Matrix(Q={Q}).csv")
            testing_accuracy = evaluate_split(self.X_test, self.y_test, f"GMM_Testing_Confusion_Matrix(Q={Q}).csv")

            if best_val_accuracy < validation_accuracy:
                best_val_accuracy = validation_accuracy
                self.best_num_components = self.num_components
                # Snapshot (deep copy) instead of aliasing: if init_gmm() updates
                # these containers in place for the next Q, an alias would end up
                # describing a different model than the one selected here.
                self.best_responsibilities = copy.deepcopy(self.responsibilities)
                self.best_component_probabilities = copy.deepcopy(self.component_probabilities)
                self.best_component_means = copy.deepcopy(self.component_means)
                self.best_component_covariances = copy.deepcopy(self.component_covariances)

            classification_accuracy["Number of Components"].append(Q)
            classification_accuracy["Training Accuracy"].append(training_accuracy)
            classification_accuracy["Validation Accuracy"].append(validation_accuracy)
            classification_accuracy["Testing Accuracy"].append(testing_accuracy)

        accuracy_df = pd.DataFrame(classification_accuracy)
        accuracy_df.to_csv("GMM Accuracy.csv", index=False)

# Dataset 1a

In [None]:
# Dataset 1a: read each split, then separate the two input columns from the
# class label (cast to int).
train_df = pd.read_csv("./Train-20.csv")
train_data, train_label = train_df[["input1", "input2"]], train_df["output"].astype(int)

val_df = pd.read_csv("./Val-20.csv")
val_data, val_label = val_df[["input1", "input2"]], val_df["output"].astype(int)

test_df = pd.read_csv("./Test-20.csv")
test_data, test_label = test_df[["input1", "input2"]], test_df["output"].astype(int)

In [None]:
# Convert every split to NumPy: features reshaped to two columns, labels taken as-is.
X_train, y_train = train_data.to_numpy().reshape(-1, 2), train_label.to_numpy()
X_val, y_val = val_data.to_numpy().reshape(-1, 2), val_label.to_numpy()
X_test, y_test = test_data.to_numpy().reshape(-1, 2), test_label.to_numpy()

## Bayes Classifier using KNNs for estimation of class-conditional probability

In [None]:
# Build the classifier from the current train/val/test splits.  The trailing "knn"
# presumably selects the KNN-based density estimate (constructor not shown here; confirm).
classifier = BayesClassifier(X_train, y_train, X_val, y_val, X_test, y_test, "knn")

In [None]:
# Neighbourhood sizes K to try; the whole list is handed to classifier.KNN.
K_list = [10, 20]
classifier.KNN(K_list)

# Dataset 1b

In [3]:
# Dataset 1b: read each split, then separate the two input columns from the
# class label (cast to int).
train_df = pd.read_csv("./Train-10.csv")
train_data, train_label = train_df[["x1", "x2"]], train_df["label"].astype(int)

val_df = pd.read_csv("./Val-10.csv")
val_data, val_label = val_df[["x1", "x2"]], val_df["label"].astype(int)

test_df = pd.read_csv("./Test-10.csv")
test_data, test_label = test_df[["x1", "x2"]], test_df["label"].astype(int)

FileNotFoundError: [Errno 2] No such file or directory: './Train-10.csv'

In [None]:
# Convert every split to NumPy: features reshaped to two columns, labels taken as-is.
X_train, y_train = train_data.to_numpy().reshape(-1, 2), train_label.to_numpy()
X_val, y_val = val_data.to_numpy().reshape(-1, 2), val_label.to_numpy()
X_test, y_test = test_data.to_numpy().reshape(-1, 2), test_label.to_numpy()

## Bayes Classifier using KNNs for estimation of class-conditional probability

In [None]:
# Build the classifier from the current train/val/test splits.  The trailing "knn"
# presumably selects the KNN-based density estimate (constructor not shown here; confirm).
classifier = BayesClassifier(X_train, y_train, X_val, y_val, X_test, y_test, "knn")

In [None]:
# Neighbourhood sizes K to try; the whole list is handed to classifier.KNN.
K_list = [10, 20]
classifier.KNN(K_list)

## Bayes Classifier with Gaussian Distribution for All Classes

In [None]:
# Rebuild the classifier on the same splits.  The trailing "gaussian" presumably selects
# one Gaussian per class (constructor not shown here; confirm).
classifier = BayesClassifier(X_train, y_train, X_val, y_val, X_test, y_test, "gaussian")

In [None]:
# Writes the Gaussian_* confusion-matrix CSVs and "Gaussian Accuracy.csv" to the working
# directory; for 2-feature data it also saves "Bayes Classifier using Gaussian.png".
classifier.gaussian()

## Bayes Classifier with GMM with diagonal covariance matrix

In [None]:
# The trailing "gmm", "diagonal" presumably select the mixture model with diagonal
# covariance matrices (constructor not shown here; confirm).
classifier = BayesClassifier(X_train, y_train, X_val, y_val, X_test, y_test, "gmm", "diagonal")

In [None]:
# Component counts Q to try.  gmm() re-initialises the mixtures for each Q, writes per-Q
# confusion matrices, and stores all accuracies in "GMM Accuracy.csv".
Q_list = [2, 4, 5, 8, 10, 12]
classifier.gmm(Q_list)

## Bayes Classifier with GMM with full covariance matrix

In [None]:
# Positional arguments follow the other GMM cells in this notebook: mode, then covariance
# type.  The component count is not passed here; classifier.gmm(Q_list) sets
# num_components itself for every Q it evaluates.
classifier = BayesClassifier(X_train, y_train, X_val, y_val, X_test, y_test, "gmm", "full")

In [None]:
# Component counts Q to try.  gmm() re-initialises the mixtures for each Q, writes per-Q
# confusion matrices, and stores all accuracies in "GMM Accuracy.csv".
Q_list = [2, 4, 5, 8, 10, 12]
classifier.gmm(Q_list)

# Dataset 2

In [None]:
# Dataset 2: features and labels live in separate, header-less CSV files per split.
train_data, train_label = (
    pd.read_csv("./train_data.csv", header=None),
    pd.read_csv("./train_label.csv", header=None),
)
val_data, val_label = (
    pd.read_csv("./val_data.csv", header=None),
    pd.read_csv("./val_label.csv", header=None),
)
test_data, test_label = (
    pd.read_csv("./test_data.csv", header=None),
    pd.read_csv("./test_label.csv", header=None),
)

In [None]:
# Convert every split to NumPy: features reshaped to 81 columns, labels cast to
# int64 and flattened to one dimension.
X_train = train_data.to_numpy().reshape(-1, 81)
X_val = val_data.to_numpy().reshape(-1, 81)
X_test = test_data.to_numpy().reshape(-1, 81)

y_train = train_label.to_numpy(dtype="int64").reshape(-1)
y_val = val_label.to_numpy(dtype="int64").reshape(-1)
y_test = test_label.to_numpy(dtype="int64").reshape(-1)

In [None]:
# Build the classifier from the current train/val/test splits.  The trailing "knn"
# presumably selects the KNN-based density estimate (constructor not shown here; confirm).
classifier = BayesClassifier(X_train, y_train, X_val, y_val, X_test, y_test, "knn")

In [None]:
# Neighbourhood sizes K to try; the whole list is handed to classifier.KNN.
K_list = [10, 20]
classifier.KNN(K_list)

## Bayes Classifier with Gaussian Distribution for All Classes


In [None]:
# Rebuild the classifier on the same splits.  The trailing "gaussian" presumably selects
# one Gaussian per class (constructor not shown here; confirm).
classifier = BayesClassifier(X_train, y_train, X_val, y_val, X_test, y_test, "gaussian")

In [None]:
# Writes the Gaussian_* confusion-matrix CSVs and "Gaussian Accuracy.csv" to the working
# directory.  The decision-boundary figure is only produced for 2-feature data.
classifier.gaussian()

## Bayes Classifier with GMM with diagonal covariance matrix

In [None]:
# The trailing "gmm", "diagonal" presumably select the mixture model with diagonal
# covariance matrices (constructor not shown here; confirm).
classifier = BayesClassifier(X_train, y_train, X_val, y_val, X_test, y_test, "gmm", "diagonal")

In [None]:
# Component counts Q to try.  gmm() re-initialises the mixtures for each Q, writes per-Q
# confusion matrices, and stores all accuracies in "GMM Accuracy.csv".
Q_list = [2, 4, 5, 8, 10, 12]
classifier.gmm(Q_list)

## Bayes Classifier with GMM with full covariance matrix

In [None]:
# The trailing "gmm", "full" presumably select the mixture model with full
# covariance matrices (constructor not shown here; confirm).
classifier = BayesClassifier(X_train, y_train, X_val, y_val, X_test, y_test, "gmm", "full")

In [None]:
# Component counts Q to try.  gmm() re-initialises the mixtures for each Q, writes per-Q
# confusion matrices, and stores all accuracies in "GMM Accuracy.csv".
Q_list = [2, 4, 5, 8, 10, 12]
classifier.gmm(Q_list)