<a href="https://colab.research.google.com/github/PCBZ/CS6140/blob/main/HW4/HW4_Problem5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import shutil
import urllib.request
import zipfile
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

def fetch_data():
    """Download the polluted spam dataset and load it with NumPy.

    Any existing ``spam_data`` directory is removed first, so every call
    re-downloads and re-extracts the archive from scratch. The downloaded
    zip file is deleted once its contents have been extracted.

    Returns:
        tuple: ``(train_features, train_labels, test_features, test_labels)``,
        each the result of ``np.loadtxt`` on the corresponding text file.
    """
    source_url = "https://www.khoury.northeastern.edu/home/vip/teach/MLcourse/data/spam_polluted.zip"
    work_dir = 'spam_data'
    archive = 'spam_data/spam_polluted.zip'
    extracted_dir = 'spam_data/spam_polluted/'

    # Start from an empty working directory so stale files never leak in.
    if os.path.exists(work_dir):
        shutil.rmtree(work_dir)
    os.makedirs(work_dir, exist_ok=True)

    # Fetch the archive, unpack it next to itself, then discard the zip.
    urllib.request.urlretrieve(source_url, archive)
    with zipfile.ZipFile(archive, 'r') as bundle:
        bundle.extractall(work_dir)
    os.remove(archive)

    # Load the four text files in the same order they are returned.
    file_names = (
        'train_feature.txt',
        'train_label.txt',
        'test_feature.txt',
        'test_label.txt',
    )
    train_x, train_y, test_x, test_y = (
        np.loadtxt(extracted_dir + file_name) for file_name in file_names
    )
    return train_x, train_y, test_x, test_y

def train_naive_bayes_model(X_train, y_train, X_test, y_test):
    """Fit Gaussian Naive Bayes on the raw features and print its accuracy.

    Args:
        X_train: Training feature matrix passed to ``GaussianNB.fit``.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Prints the training and test accuracy as percentages; returns nothing.
    """
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)

    # Score the training split first, then the held-out split.
    train_accuracy, test_accuracy = [
        accuracy_score(labels, classifier.predict(features))
        for features, labels in ((X_train, y_train), (X_test, y_test))
    ]

    print("\nPure Gaussian NB:")
    print(f"Training Accuracy: {train_accuracy: .2%}")
    print(f"Test Accuracy: {test_accuracy: .2%}")

def apply_pca_train(X_train, y_train, X_test, y_test, n_components=100):
    """Project the data with scikit-learn PCA, then fit Gaussian Naive Bayes.

    The PCA projection is fitted on the training features only and the same
    projection is then applied to the test features, so no test information
    is used when choosing the components.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        n_components: Value forwarded to ``PCA(n_components=...)``. Defaults
            to 100, the value this function previously hard-coded; exposed
            so data with fewer features can use a smaller projection.

    Prints the training and test accuracy as percentages; returns nothing.
    """
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    model = GaussianNB()
    model.fit(X_train_pca, y_train)

    train_pred = model.predict(X_train_pca)
    test_pred = model.predict(X_test_pca)

    train_accuracy = accuracy_score(y_train, train_pred)
    test_accuracy = accuracy_score(y_test, test_pred)

    print("\nGaussian NB with PCA:")
    print(f"Training Accuracy: {train_accuracy: .2%}")
    print(f"Test Accuracy: {test_accuracy: .2%}")

class CustomPCA:
    """Minimal PCA computed from the SVD of the mean-centered data.

    The interface mirrors the subset of scikit-learn's ``PCA`` used in this
    file: ``fit``, ``transform`` and ``fit_transform``.

    Args:
        n_components: Number of leading components to keep. ``None`` keeps
            every component returned by the SVD. If it exceeds the number
            of available components, all of them are kept.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        # Per-feature mean of the data given to fit(); None until fitted.
        self.mean_ = None
        # Rows are the leading right singular vectors; None until fitted.
        self.components_ = None

    def fit(self, X):
        """Learn the feature means and principal directions from ``X``.

        Args:
            X: 2-D array-like whose rows are samples and columns features.

        Returns:
            CustomPCA: ``self``, so calls can be chained.

        Raises:
            ValueError: If ``n_components`` is set and smaller than 1. A
                negative value would otherwise slice ``V[:-k]`` and silently
                keep the wrong set of components.
        """
        if self.n_components is not None and self.n_components < 1:
            raise ValueError(
                f"n_components must be >= 1 or None, got {self.n_components!r}"
            )

        self.mean_ = np.mean(X, axis=0)

        X_centered = X - self.mean_

        # Economy-size SVD: rows of Vt are the right singular vectors,
        # returned by NumPy in order of decreasing singular value.
        _, _, Vt = np.linalg.svd(X_centered, full_matrices=False)

        self.components_ = Vt[:self.n_components]
        return self

    def transform(self, X):
        """Project ``X`` onto the components learned by ``fit``.

        Args:
            X: 2-D array-like with the same number of columns as the data
                passed to ``fit``.

        Returns:
            numpy.ndarray: ``(X - mean_) @ components_.T``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.mean_ is None or self.components_ is None:
            raise RuntimeError("CustomPCA must be fitted before calling transform")

        X_centered = X - self.mean_
        return np.dot(X_centered, self.components_.T)

    def fit_transform(self, X):
        """Fit on ``X`` and return the projection of ``X`` itself."""
        return self.fit(X).transform(X)

def apply_custom_pca_train(X_train, y_train, X_test, y_test, n_components=100):
    """Project the data with ``CustomPCA``, then fit Gaussian Naive Bayes.

    The projection is fitted on the training features only and the same
    projection is then applied to the test features.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        n_components: Number of components passed to ``CustomPCA``. Defaults
            to 100, the value this function previously hard-coded.

    Prints the training and test accuracy as percentages; returns nothing.
    """
    pca = CustomPCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    model = GaussianNB()
    model.fit(X_train_pca, y_train)

    train_pred = model.predict(X_train_pca)
    test_pred = model.predict(X_test_pca)

    train_accuracy = accuracy_score(y_train, train_pred)
    test_accuracy = accuracy_score(y_test, test_pred)

    print("\nGaussian NB with Custom PCA:")
    print(f"Training Accuracy: {train_accuracy: .2%}")
    print(f"Test Accuracy: {test_accuracy: .2%}")


if __name__ == "__main__":
    # Download and load the dataset once, then run every experiment on it.
    train_features, train_labels, test_features, test_labels = fetch_data()

    # Order matters for the printed report: raw NB, sklearn PCA, custom PCA.
    for experiment in (
        train_naive_bayes_model,
        apply_pca_train,
        apply_custom_pca_train,
    ):
        experiment(train_features, train_labels, test_features, test_labels)




Pure Gaussian NB:
Training Accuracy:  59.86%
Test Accuracy:  60.30%

Gaussian NB with PCA:
Training Accuracy:  73.96%
Test Accuracy:  73.10%

Gaussian NB with Custom PCA:
Training Accuracy:  74.08%
Test Accuracy:  73.54%
