<a href="https://colab.research.google.com/github/PCBZ/CS6140/blob/main/HW5/HW5_problem3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%pip install ucimlrepo

from sklearn.metrics.pairwise import pairwise_distances, rbf_kernel, polynomial_kernel
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo

import requests
import zipfile
import os
import numpy as np

from tqdm import tqdm
import gc

class KNNClassifier:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def _calculate_distances(self, X):
        if self.distance_metric == 'euclidean' or self.distance_metric == 'cosine':
            distances = pairwise_distances(X, self.X_train, metric=self.distance_metric)
        elif self.distance_metric == 'rbf':
            gamma = 1.0 / self.X_train.shape[1]
            similarities = rbf_kernel(X, self.X_train, gamma=gamma)
            distances = np.sqrt(2*(1 - similarities))
        elif self.distance_metric == 'poly':
            similarities = polynomial_kernel(X, self.X_train, degree=2)
            similarities = (similarities - similarities.min()) / (similarities.max() - similarities.min() + 1e-8)
            distances = np.sqrt(2*(1 - similarities))
        else:
            raise ValueError(f"Invalid distance metric: {self.distance_metric}")
        return distances

    def predict(self, X):

        predictions = []

        all_test_distances = self._calculate_distances(X)

        for i in range(len(X)):
            test_distances = all_test_distances[i]

            k_nearest_indices = np.argsort(test_distances)[:self.k]
            k_nearest_labels = self.y_train[k_nearest_indices]

            vote_counts = Counter(k_nearest_labels)
            predicted_label = vote_counts.most_common(1)[0][0]
            predictions.append(predicted_label)

        return np.array(predictions)

class BatchKNNClassifier(KNNClassifier):
    """
    KNN Classifier with batch processing.
    """
    def __init__(self, k=3, distance_metric='euclidean', batch_size=1000):
        super().__init__(k, distance_metric)
        self.batch_size = batch_size

    def _calculate_distances(self, X):
        if self.distance_metric == 'euclidean' or self.distance_metric == 'cosine':
            distances = pairwise_distances(X, self.X_train, metric=self.distance_metric)
        elif self.distance_metric == 'rbf':
            gamma = 1.0 / self.X_train.shape[1]
            similarities = rbf_kernel(X, self.X_train, gamma=gamma)
            distances = np.sqrt(2*(1 - similarities))
        elif self.distance_metric == 'poly':
            similarities = polynomial_kernel(X, self.X_train, degree=2)
            similarities = (similarities - similarities.min()) / (similarities.max() - similarities.min() + 1e-8)
            distances = np.sqrt(2*(1 - similarities))
        else:
            raise ValueError(f"Invalid distance metric: {self.distance_metric}")
        return distances

    def predict(self, X):
        """
        Predict labels for a batch of samples.
        """
        X = np.array(X)
        all_predictions = []

        # The number of batches
        n_batches = (len(X) + self.batch_size - 1) // self.batch_size

        batch_progress = tqdm(total=n_batches, desc="Progress", position=0)

        for batch_idx in range(n_batches):
            # Get current batch data
            start_idx, end_idx = batch_idx * self.batch_size, min((batch_idx + 1) * self.batch_size, len(X))
            X_batch = X[start_idx:end_idx]

            batch_progress.set_postfix_str(f"Batch {batch_idx+1}/{n_batches} ({len(X_batch)} samples)")

            # Batch distance calculation
            batch_distances = self._calculate_distances(X_batch)

            batch_predictions = []

            # Predict for each sample in the batch
            for i in range(len(X_batch)):
                test_distances = batch_distances[i]
                k_nearest_indices = np.argsort(test_distances)[:self.k]
                k_nearest_labels = self.y_train[k_nearest_indices]

                vote_counts = Counter(k_nearest_labels)
                predicted_label = vote_counts.most_common(1)[0][0]
                batch_predictions.append(predicted_label)

            # Add batch prediction results
            all_predictions.extend(batch_predictions)

            # Memory cleanup
            del batch_distances, batch_predictions
            gc.collect()

            batch_progress.update(1)

        batch_progress.close()

        return np.array(all_predictions)


def fetch_spambase_data():
    """
    Fetch Spambase dataset from UCI repository
    """
    spambase = fetch_ucirepo(id=94)

    # Extract features and targets
    X = spambase.data.features.values
    y = spambase.data.targets.values.ravel()

    return X, y

def preprocess_data(X, y):
    """
    Preprocess the data
    """
    # Covert labels to 1/-1
    y = np.where(y == 0, -1, 1)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Standardization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

def fetch_digit_dataset():
    zip_url = "https://www.khoury.northeastern.edu/home/vip/teach/MLcourse/data/mnist_haar_bingyu.zip"
    zip_filename = "mnist_haar_bingyu.zip"
    extracted_folder = "mnist_haar_bingyu"
    response = requests.get(zip_url)
    with open(zip_filename, 'wb') as f:
        f.write(response.content)

    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(".")

    train_images = np.loadtxt(os.path.join(extracted_folder, "training_image.txt"), delimiter=',')
    train_labels = np.loadtxt(os.path.join(extracted_folder, "training_label.txt"), dtype=int, delimiter=',')
    test_images = np.loadtxt(os.path.join(extracted_folder, "testing_image.txt"), delimiter=',')
    test_labels = np.loadtxt(os.path.join(extracted_folder, "testing_label.txt"), dtype=int, delimiter=',')

    return train_images, test_images, train_labels, test_labels

def train_and_evaluate_spambase(X_train, X_test, y_train, y_test):
    """
    Train and evaluate the model on Spambase dataset
    """
    print("="*50)
    print("Spambase Evaluation")
    print("="*50)

    for k in [1, 3, 7]:
        model = KNNClassifier(k=k, distance_metric='euclidean')
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)

        print(f"k={k}: Train Accuracy={train_accuracy:.2%}, Test Accuracy={test_accuracy:.2%}")

def train_and_evaluate_digit(X_train, X_test, y_train, y_test):
    """
    Train and evaluate the model on Digit dataset
    """

    results = {}

    for k in [1, 3, 7]:
        for metric in ['cosine', 'rbf', 'poly']:
            model = BatchKNNClassifier(k=k, distance_metric=metric)
            model.fit(X_train, y_train)

            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            train_accuracy = accuracy_score(y_train, y_train_pred)
            test_accuracy = accuracy_score(y_test, y_test_pred)

            results[f"k={k}, metric={metric}"] = (train_accuracy, test_accuracy)

    print("\n" + "="*50)
    print("Digit Evaluation")
    print("="*50)

    for result, (train_acc, test_acc) in results.items():
        print(f"{result}: Train Accuracy={train_acc:.2%}, Test Accuracy={test_acc:.2%}")



if __name__ == "__main__":
    # Spambase
    X, y = fetch_spambase_data()
    X_train, X_test, y_train, y_test = preprocess_data(X, y)

    train_and_evaluate_spambase(X_train, X_test, y_train, y_test)

    # Digit
    X_train, X_test, y_train, y_test = fetch_digit_dataset()

    train_and_evaluate_digit(X_train, X_test, y_train, y_test)


Spambase Evaluation
k=1: Train Accuracy=99.97%, Test Accuracy=92.07%
k=3: Train Accuracy=95.24%, Test Accuracy=90.23%
k=7: Train Accuracy=92.66%, Test Accuracy=90.23%


Progress: 100%|██████████| 60/60 [04:04<00:00,  4.08s/it, Batch 60/60 (1000 samples)]
Progress: 100%|██████████| 10/10 [00:39<00:00,  3.98s/it, Batch 10/10 (1000 samples)]
Progress: 100%|██████████| 60/60 [02:33<00:00,  2.57s/it, Batch 60/60 (1000 samples)]
Progress: 100%|██████████| 10/10 [00:25<00:00,  2.53s/it, Batch 10/10 (1000 samples)]
Progress: 100%|██████████| 60/60 [04:26<00:00,  4.45s/it, Batch 60/60 (1000 samples)]
Progress: 100%|██████████| 10/10 [00:44<00:00,  4.41s/it, Batch 10/10 (1000 samples)]
Progress: 100%|██████████| 60/60 [03:58<00:00,  3.97s/it, Batch 60/60 (1000 samples)]
Progress: 100%|██████████| 10/10 [00:39<00:00,  3.95s/it, Batch 10/10 (1000 samples)]
Progress: 100%|██████████| 60/60 [02:31<00:00,  2.53s/it, Batch 60/60 (1000 samples)]
Progress: 100%|██████████| 10/10 [00:25<00:00,  2.52s/it, Batch 10/10 (1000 samples)]
Progress: 100%|██████████| 60/60 [04:26<00:00,  4.44s/it, Batch 60/60 (1000 samples)]
Progress: 100%|██████████| 10/10 [00:43<00:00,  4.35s/


Digit Evaluation
k=1, metric=cosine: Train Accuracy=100.00%, Test Accuracy=94.59%
k=1, metric=rbf: Train Accuracy=100.00%, Test Accuracy=94.57%
k=1, metric=poly: Train Accuracy=58.83%, Test Accuracy=58.01%
k=3, metric=cosine: Train Accuracy=97.59%, Test Accuracy=94.86%
k=3, metric=rbf: Train Accuracy=97.73%, Test Accuracy=95.10%
k=3, metric=poly: Train Accuracy=61.69%, Test Accuracy=61.31%
k=7, metric=cosine: Train Accuracy=96.07%, Test Accuracy=94.78%
k=7, metric=rbf: Train Accuracy=96.29%, Test Accuracy=94.91%
k=7, metric=poly: Train Accuracy=62.54%, Test Accuracy=61.61%





In [1]:
%pip install ucimlrepo

from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo
import requests
import zipfile
import os
import gc


class FixedWindowKNNClassifier:
    """
    Fixed Window KNN Classifier
    """
    def __init__(self, radius=1.0, distance_metric='euclidean'):
        self.radius = radius
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def _calculate_distances(self, X):
        return pairwise_distances(X, self.X_train, metric=self.distance_metric)

    def predict(self, X):
        all_predictions = []

        distances = self._calculate_distances(X)

        for i in range(len(X)):
            test_distances = distances[i]

            # Filter distances within the radius
            within_radius_indices = np.where(test_distances <= self.radius)[0]

            if len(within_radius_indices) == 0:
                closest_idx = np.argmin(test_distances)
                predicted_label = self.y_train[closest_idx]
            else:
                within_radius_labels = self.y_train[within_radius_indices]
                vote_counts = Counter(within_radius_labels)
                predicted_label = vote_counts.most_common(1)[0][0]

            all_predictions.append(predicted_label)

        return np.array(all_predictions)

class BatchFixedWindowKNNClassifier(FixedWindowKNNClassifier):
    """
    Fixed Window KNN Classifier with batch processing.
    """
    def __init__(self, radius=1.0, distance_metric='euclidean', batch_size=1000):
        super().__init__(radius, distance_metric)
        self.batch_size = batch_size

    def _predict_single_sample(self, test_distances):
        within_radius_indices = np.where(test_distances <= self.radius)[0]

        if len(within_radius_indices) == 0:
            closest_idx = np.argmin(test_distances)
            predicted_label = self.y_train[closest_idx]
        else:
            within_radius_labels = self.y_train[within_radius_indices]
            vote_counts = Counter(within_radius_labels)
            predicted_label = vote_counts.most_common(1)[0][0]

        return predicted_label

    def predict(self, X):
        all_predictions = []

        n_batches = (len(X) + self.batch_size - 1) // self.batch_size

        batch_progress = tqdm(total=n_batches, desc="Progress", position=0)

        for i in range(n_batches):
            start_idx = i * self.batch_size
            end_idx = min((i + 1) * self.batch_size, len(X))
            X_batch = X[start_idx:end_idx]

            batch_progress.set_postfix_str(f"Batch {i+1}/{n_batches} ({len(X_batch)} samples)")

            batch_distances = self._calculate_distances(X_batch)

            batch_predictions = []
            batch_neighbor_counts = []

            for j in range(len(X_batch)):
                test_distances = batch_distances[j]
                predicted_label = self._predict_single_sample(test_distances)
                batch_predictions.append(predicted_label)

            batch_progress.update(1)

            all_predictions.extend(batch_predictions)

            del batch_distances, batch_predictions
            gc.collect()

        batch_progress.close()

        return np.array(all_predictions)


def find_best_radius(X_train, y_train, X_test, y_test, distance_metric, radius_values):

    best_radius = radius_values[0]
    best_accuracy = 0

    for radius in radius_values:
        model = FixedWindowKNNClassifier(radius=radius, distance_metric=distance_metric)
        model.fit(X_train, y_train)

        y_test_pred = model.predict(X_test)
        current_accuracy = accuracy_score(y_test, y_test_pred)

        if current_accuracy > best_accuracy:
            best_radius = radius
            best_accuracy = current_accuracy

    return best_radius, best_accuracy

def fetch_spambase_data():
    """
    Fetch Spambase dataset from UCI repository
    """
    spambase = fetch_ucirepo(id=94)

    # Extract features and targets
    X = spambase.data.features.values
    y = spambase.data.targets.values.ravel()

    return X, y

def preprocess_data(X, y):
    """
    Preprocess the data
    """
    # Covert labels to 1/-1
    y = np.where(y == 0, -1, 1)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Standardization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

def fetch_digit_dataset():
    zip_url = "https://www.khoury.northeastern.edu/home/vip/teach/MLcourse/data/mnist_haar_bingyu.zip"
    zip_filename = "mnist_haar_bingyu.zip"
    extracted_folder = "mnist_haar_bingyu"
    response = requests.get(zip_url)
    with open(zip_filename, 'wb') as f:
        f.write(response.content)

    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(".")

    train_images = np.loadtxt(os.path.join(extracted_folder, "training_image.txt"), delimiter=',')
    train_labels = np.loadtxt(os.path.join(extracted_folder, "training_label.txt"), dtype=int, delimiter=',')
    test_images = np.loadtxt(os.path.join(extracted_folder, "testing_image.txt"), delimiter=',')
    test_labels = np.loadtxt(os.path.join(extracted_folder, "testing_label.txt"), dtype=int, delimiter=',')

    return train_images, test_images, train_labels, test_labels

def train_and_evaluate_spambase_fixed_window(X_train, X_test, y_train, y_test):
    print("="*50)
    print("Spambase Evaluation")
    print("="*50)

    best_radius, best_accuracy = find_best_radius(X_train, y_train, X_test, y_test, 'euclidean', [0.01, 0.02, 0.03, 0.04, 0.5, 1, 1.5, 2.0, 2.5])

    model = FixedWindowKNNClassifier(radius=best_radius, distance_metric='euclidean')
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print(f"\nBest Radius: {best_radius}")
    print(f"Train Accuracy: {train_accuracy:.2%}")
    print(f"Test Accuracy: {test_accuracy:.2%}")

def train_and_evaluate_digits_fixed_window(X_train, X_test, y_train, y_test):
    print("\n" + "="*50)
    print("Digit Evaluation")
    print("="*50)

    best_radius, best_accuracy = find_best_radius(X_train, y_train, X_test, y_test, distance_metric='cosine', radius_values=[0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.15, 0.2, 0.25])

    model = BatchFixedWindowKNNClassifier(radius=best_radius, distance_metric='cosine')
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print(f"Best Radius: {best_radius}")
    print(f"Train Accuracy: {train_accuracy:.2%}")
    print(f"Test Accuracy: {test_accuracy:.2%}")

if __name__ == "__main__":
    X, y = fetch_spambase_data()
    X_train, X_test, y_train, y_test = preprocess_data(X, y)

    train_and_evaluate_spambase_fixed_window(X_train, X_test, y_train, y_test)

    X_train, X_test, y_train, y_test = fetch_digit_dataset()

    train_and_evaluate_digits_fixed_window(X_train, X_test, y_train, y_test)






Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Spambase Evaluation

Best Radius: 0.5
Train Accuracy: 99.57%
Test Accuracy: 94.35%

Digit Evaluation


Progress: 100%|██████████| 60/60 [01:36<00:00,  1.60s/it, Batch 60/60 (1000 samples)]
Progress: 100%|██████████| 10/10 [00:17<00:00,  1.71s/it, Batch 10/10 (1000 samples)]

Best Radius: 0.01
Train Accuracy: 99.94%
Test Accuracy: 94.62%





In [2]:
%pip install ucimlrepo

from sklearn.metrics.pairwise import rbf_kernel
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo

class KernelDensityBayesClassifier:
    def __init__(self, bandwidth=1.0):
        self.bandwidth = bandwidth

    def fit(self, X, y):
        self.class_data_ = {}
        self.class_counts_ = {}
        self.class_priors_ = {}

        self.classes = np.unique(y)
        n_totals = len(y)

        for class_label in self.classes:
            class_mask = (y == class_label)
            self.class_data_[class_label] = X[class_mask]
            self.class_counts_[class_label] = np.sum(class_mask)
            self.class_priors_[class_label] = self.class_counts_[class_label] / n_totals

    def _estimate_class_density(self, X, class_label):
        """
        Estimate probability density P(z|C) for specified class
        Using Gaussian kernel: P(z|C) = (1/m_C) * Σ K((z - x_i) / h)
        """
        X_class = self.class_data_[class_label]
        m_c = self.class_counts_[class_label]
        kernel_similarity = rbf_kernel(X, X_class, gamma=1.0 / (2 * self.bandwidth ** 2))
        return (1 / m_c) * np.sum(kernel_similarity, axis=1)


    def predict_proba(self, X):
        """
        Predict class probabilities
        Using Bayes' theorem: P(C|z) = P(C) * P(z|C) / P(z)
        """
        n_samples = len(X)
        n_classes = len(self.classes)
        class_probabilities = np.zeros((n_samples, n_classes))

        for i, class_label in enumerate(self.classes):
            prior = self.class_priors_[class_label]
            likelihood = self._estimate_class_density(X, class_label)
            class_probabilities[:, i] = prior * likelihood


        total_probabilities = np.sum(class_probabilities, axis=1, keepdims=True)
        total_probabilities = np.where(total_probabilities == 0, 1e-10, total_probabilities)


        return class_probabilities / total_probabilities

    def predict(self, X):
        """
        Predict class labels
        """
        class_probabilities = self.predict_proba(X)
        predict_indices = np.argmax(class_probabilities, axis=1)
        return self.classes[np.argmax(class_probabilities, axis=1)]

def fetch_spambase_data():
    """
    Fetch Spambase dataset from UCI repository
    """
    spambase = fetch_ucirepo(id=94)
    X = spambase.data.features.values
    y = spambase.data.targets.values.ravel()
    return X, y


def preprocess_data(X, y):
    """
    Preprocess the data
    """
    # Convert labels to 1/-1
    y = np.where(y == 0, -1, 1)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardization
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

def train_and_evaluate_spambase_kde(X_train, X_test, y_train, y_test):
    print("="*50)
    print("Spambase Evaluation")
    print("="*50)

    model = KernelDensityBayesClassifier(bandwidth=1.0)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print(f"Train Accuracy: {train_accuracy:.2%}")
    print(f"Test Accuracy: {test_accuracy:.2%}")

if __name__ == "__main__":
    X, y = fetch_spambase_data()
    X_train, X_test, y_train, y_test = preprocess_data(X, y)

    train_and_evaluate_spambase_kde(X_train, X_test, y_train, y_test)

Spambase Evaluation
Train Accuracy: 96.39%
Test Accuracy: 90.45%
