In [10]:
#Importing libraries
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.datasets import mnist

class KNN:
    """Brute-force k-nearest-neighbours classifier based on cosine distance.

    Training only stores the data; all work happens in ``predict``, which
    compares query rows against every stored training row in chunks.

    Args:
        n_neighbors: Number of nearest training rows that vote on each query.
        chunk_size: Number of query rows whose distance matrix is computed
            at a time, bounding the size of the intermediate matrix.
        weighted: If true, each neighbour's vote is scaled by the inverse of
            its cosine distance; otherwise every neighbour counts once.
    """

    def __init__(self, n_neighbors=3, chunk_size=1000, weighted=False):
        self.n_neighbors = n_neighbors
        self.chunk_size = chunk_size
        self.weighted = weighted

    def fit(self, X, y):
        """Store the training data and build an integer encoding of the labels.

        Args:
            X: Training features, one row per sample.
            y: One-dimensional sequence of labels, one per row of ``X``.
                Any values ``np.unique`` can sort are accepted (not only
                non-negative integers).

        Returns:
            The fitted instance, so calls can be chained.

        Raises:
            ValueError: If ``y`` is not one-dimensional or its length does
                not match the number of rows in ``X``.
        """
        y = np.asarray(y)
        if y.ndim != 1:
            raise ValueError(f"y must be one-dimensional, got shape {y.shape}")
        n_train = X.shape[0] if hasattr(X, "shape") else len(X)
        if n_train != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {n_train} != {y.shape[0]}"
            )

        self.X_train = X
        self.y_train = y
        # Votes are counted over positions in the sorted class list, so the
        # labels themselves never have to be valid np.bincount input.
        self.classes_, self._y_encoded = np.unique(y, return_inverse=True)
        return self

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: Query features, one row per sample; must expose ``shape`` and
                support row slicing.

        Returns:
            Array of predicted labels with the dtype of the training labels.
            When several classes receive the same vote total, the one that
            sorts first wins.

        Raises:
            AttributeError: If ``fit`` has not been called.
            ValueError: If ``n_neighbors`` or ``chunk_size`` is below 1.
        """
        if not hasattr(self, "classes_"):
            raise AttributeError("KNN instance is not fitted yet; call fit() first.")
        if self.n_neighbors < 1:
            raise ValueError(f"n_neighbors must be >= 1, got {self.n_neighbors}")
        if self.chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {self.chunk_size}")

        n_samples = X.shape[0]
        n_classes = self.classes_.shape[0]
        encoded = np.empty(n_samples, dtype=np.intp)

        # Processing data in chunks to save memory
        for start in range(0, n_samples, self.chunk_size):
            end = min(start + self.chunk_size, n_samples)
            distances = cosine_distances(X[start:end], self.X_train)
            neighbors_indices = np.argsort(distances, axis=1)[:, :self.n_neighbors]
            neighbors_labels = self._y_encoded[neighbors_indices]

            # Give each query row its own block of n_classes bins so a single
            # bincount call tallies the votes of the whole chunk.
            n_rows = neighbors_labels.shape[0]
            row_offsets = np.arange(n_rows)[:, None] * n_classes
            flat_bins = (row_offsets + neighbors_labels).ravel()

            if self.weighted:
                neighbors_distances = np.take_along_axis(
                    distances, neighbors_indices, axis=1
                )
                # The small constant avoids division by zero for exact matches.
                weights = (1 / (neighbors_distances + 1e-9)).ravel()
            else:
                weights = None

            votes = np.bincount(
                flat_bins, weights=weights, minlength=n_rows * n_classes
            ).reshape(n_rows, n_classes)
            # argmax returns the first maximum, i.e. the lowest class position.
            encoded[start:end] = votes.argmax(axis=1)

        return self.classes_[encoded]

def load_mnist_data():
    """Load MNIST through Keras and return it as one flattened dataset.

    Each image is reshaped to a single feature row and divided by 255.0.
    The train and test portions returned by ``mnist.load_data()`` are then
    stacked (train first) so the caller can make its own split.

    Returns:
        Tuple ``(X, y)``: the 2-D feature array and the 1-D label array.
    """
    train_part, test_part = mnist.load_data()

    feature_parts = []
    label_parts = []
    for images, labels in (train_part, test_part):
        # One row per image, pixel values scaled by 1/255.
        feature_parts.append(images.reshape(images.shape[0], -1) / 255.0)
        label_parts.append(labels)

    X = np.concatenate(feature_parts, axis=0)
    y = np.concatenate(label_parts)
    return X, y

def load_20ng_data():
    """Load 20 Newsgroups and turn it into dense, standardized features.

    Pipeline: TF-IDF (at most 10000 terms) -> TruncatedSVD to 800 components
    -> StandardScaler. Labels are passed through LabelEncoder.

    Returns:
        Tuple ``(X, y)`` of the standardized feature array and encoded
        labels, or ``(None, None)`` if any step raises; the error is printed
        rather than propagated so the caller can skip this dataset.
    """
    try:
        ng = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
        vectorizer = TfidfVectorizer(max_features=10000, min_df=3, max_df=0.85, stop_words='english')
        # Keep the TF-IDF matrix sparse: TruncatedSVD accepts sparse input,
        # so densifying the full document-term matrix only wastes memory.
        X = vectorizer.fit_transform(ng.data)

        # Dimensionality reduction
        svd = TruncatedSVD(n_components=800, random_state=42)
        X_reduced = svd.fit_transform(X)

        # Normalize the data to ensure equal feature importance
        X_normalized = StandardScaler().fit_transform(X_reduced)

        y = LabelEncoder().fit_transform(ng.target)
        return X_normalized, y
    except Exception as e:
        print(f"Error fetching 20 Newsgroups dataset: {e}")
        return None, None

def train_and_evaluate_knn(X, y, n_neighbors=5, weighted=True):
    """Fit a KNN classifier on an 80/10/10 split and print its accuracies.

    Args:
        X: Feature matrix, or None if loading failed.
        y: Label vector, or None if loading failed.
        n_neighbors: Neighbour count passed to ``KNN``.
        weighted: Whether ``KNN`` uses distance-weighted voting.

    Returns:
        None. Accuracies for the training, validation and testing subsets
        are printed; if either input is None a notice is printed instead.
    """
    if X is None or y is None:
        print("Dataset not loaded properly. Skipping evaluation.")
        return

    # 80% for training; the remaining 20% is halved into validation and test.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )

    classifier = KNN(n_neighbors=n_neighbors, weighted=weighted)
    classifier.fit(X_train, y_train)

    # Predict all three subsets first, then score them in the same order.
    predicted = [classifier.predict(features) for features in (X_train, X_val, X_test)]
    train_acc, val_acc, test_acc = (
        accuracy_score(truth, guess)
        for truth, guess in zip((y_train, y_val, y_test), predicted)
    )

    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Validation Accuracy: {val_acc:.4f}")
    print(f"Testing Accuracy: {test_acc:.4f}")

# Script entry point: evaluate the KNN classifier on both datasets in turn.
if __name__ == "__main__":
    # MNIST Dataset Evaluation: 3 neighbours, unweighted majority vote.
    print("Evaluating KNN on MNIST...")
    mnist_X, mnist_y = load_mnist_data()
    train_and_evaluate_knn(mnist_X, mnist_y, n_neighbors=3, weighted=False)

    # 20 Newsgroups Dataset Evaluation: 5 neighbours, distance-weighted vote.
    # load_20ng_data() returns (None, None) on failure, in which case
    # train_and_evaluate_knn prints a notice and skips the evaluation.
    print("\nEvaluating KNN on 20 Newsgroups...")
    ng_X, ng_y = load_20ng_data()
    train_and_evaluate_knn(ng_X, ng_y, n_neighbors=5, weighted=True)


Evaluating KNN on MNIST...
Training Accuracy: 0.9873
Validation Accuracy: 0.9754
Testing Accuracy: 0.9777

Evaluating KNN on 20 Newsgroups...
Training Accuracy: 0.9698
Validation Accuracy: 0.5660
Testing Accuracy: 0.5729
