# In [None]:  (Jupyter cell marker left over from the notebook export)
# Import necessary libraries
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier, KDTree, NearestNeighbors
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Step 1: Implementing KNN Variants with Varying Dataset Size and Number of Dimensions

def _majority_vote(train_labels, neighbor_idx):
    """Return the most frequent training label for each row of neighbour indices.

    ``train_labels`` must be a NumPy array of non-negative integer class
    labels (a requirement of ``np.bincount``). Ties go to the smallest label
    because ``np.argmax`` returns the first maximum.
    """
    return np.array(
        [np.argmax(np.bincount(votes)) for votes in train_labels[neighbor_idx]]
    )


def measure_knn_performance(X_train, y_train, X_test, y_test, k):
    """Benchmark three k-nearest-neighbour classifiers on one train/test split.

    Variants measured:

    * ``"Naive"``   -- ``KNeighborsClassifier(algorithm='brute')``.
    * ``"KD-Tree"`` -- a ``KDTree`` index followed by a majority vote over the
      ``k`` returned neighbours.
    * ``"LSH"``     -- NOTE: despite the label, this is
      ``NearestNeighbors(algorithm='auto')``, i.e. an exact search whose
      backend is chosen by scikit-learn. No locality-sensitive hashing is
      performed. The key is kept so existing consumers of the result dict
      keep working.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels; must be non-negative integers because the
            manual majority vote relies on ``np.bincount``.
        X_test: Test feature matrix.
        y_test: Ground-truth labels for ``X_test``.
        k: Number of neighbours to consult.

    Returns:
        dict mapping each variant name to a dict with the keys
        ``"Training Time"`` and ``"Testing Time"`` (seconds, measured with
        ``time.perf_counter``) and ``"Accuracy"``.
    """
    # Neighbour indices are positional, so index a plain ndarray: a pandas
    # Series would be looked up by label and a list cannot be fancy-indexed.
    train_labels = np.asarray(y_train)
    # perf_counter is monotonic and high resolution; time.time() is neither.
    timer = time.perf_counter

    # Naive KNN
    tic = timer()
    naive_knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
    naive_knn.fit(X_train, y_train)
    naive_train_time = timer() - tic
    tic = timer()
    naive_y_pred = naive_knn.predict(X_test)
    naive_test_time = timer() - tic
    # Scoring happens outside the timed window so "Testing Time" reflects
    # prediction cost only.
    naive_accuracy = accuracy_score(y_test, naive_y_pred)

    # KD-Tree KNN
    tic = timer()
    kdtree = KDTree(X_train, leaf_size=30, metric='euclidean')
    kdtree_train_time = timer() - tic
    tic = timer()
    _, ind = kdtree.query(X_test, k=k)
    kdtree_y_pred = _majority_vote(train_labels, ind)
    kdtree_test_time = timer() - tic
    kdtree_accuracy = accuracy_score(y_test, kdtree_y_pred)

    # "LSH" KNN (see the NOTE in the docstring: exact search, not hashing)
    tic = timer()
    lsh = NearestNeighbors(n_neighbors=k, algorithm='auto', metric='euclidean')
    lsh.fit(X_train)
    lsh_train_time = timer() - tic
    tic = timer()
    _, ind = lsh.kneighbors(X_test)
    lsh_y_pred = _majority_vote(train_labels, ind)
    lsh_test_time = timer() - tic
    lsh_accuracy = accuracy_score(y_test, lsh_y_pred)

    return {
        "Naive": {
            "Training Time": naive_train_time,
            "Testing Time": naive_test_time,
            "Accuracy": naive_accuracy,
        },
        "KD-Tree": {
            "Training Time": kdtree_train_time,
            "Testing Time": kdtree_test_time,
            "Accuracy": kdtree_accuracy,
        },
        "LSH": {
            "Training Time": lsh_train_time,
            "Testing Time": lsh_test_time,
            "Accuracy": lsh_accuracy,
        },
    }

def vary_dataset_size_and_dimensions(
    dimensions=(2, 5, 10),
    dataset_sizes=(1000, 5000, 10000),
    k=5,
):
    """Run the KNN benchmark over a grid of feature counts and sample counts.

    For every ``(D, N)`` pair a synthetic binary classification problem is
    generated with ``make_classification`` (fixed ``random_state=42``), split
    80/20 into train/test, and passed to ``measure_knn_performance``.

    Args:
        dimensions: Iterable of feature counts ``D``. NOTE(review):
            with ``make_classification``'s default of two clusters per class,
            ``D`` presumably needs to be at least 2 -- confirm before passing
            smaller values.
        dataset_sizes: Iterable of total sample counts ``N`` (before the
            train/test split).
        k: Number of neighbours handed to ``measure_knn_performance``.

    Returns:
        dict keyed by ``(D, N)`` whose values are the per-method result dicts
        returned by ``measure_knn_performance``.
    """
    results = {}

    for D in dimensions:
        # The feature layout depends only on D, so compute it once per
        # dimension: at most two informative features, the remainder redundant,
        # and no repeated features.
        informative_features = min(D, 2)
        redundant_features = max(0, D - informative_features)

        for N in dataset_sizes:
            X, y = make_classification(
                n_samples=N,
                n_features=D,
                n_classes=2,
                n_informative=informative_features,
                n_redundant=redundant_features,
                n_repeated=0,
                random_state=42,
            )
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )
            results[(D, N)] = measure_knn_performance(
                X_train, y_train, X_test, y_test, k
            )

    return results

# Run the experiments
results = vary_dataset_size_and_dimensions()

# Print the results: one header line per (dimensions, dataset size)
# configuration, then one line per KNN variant, then a blank separator line.
for (n_dims, n_samples), per_method in results.items():
    print(f"Dimensions: {n_dims}, Dataset Size: {n_samples}")
    for method, metrics in per_method.items():
        print(
            f"{method} - Training Time: {metrics['Training Time']:.4f} seconds, "
            f"Testing Time: {metrics['Testing Time']:.4f} seconds, "
            f"Accuracy: {metrics['Accuracy']:.4f}"
        )
    print()
# Plot the results
# Derive the grid from the (D, N) keys actually present in `results` rather
# than repeating the experiment's constants here, so the figure always matches
# whatever configurations were run.
dimensions = sorted({D for D, _ in results})  # Number of dimensions
dataset_sizes = sorted({N for _, N in results})  # Dataset sizes

# squeeze=False keeps `axes` two-dimensional even for a single row or column,
# so the axes[i, j] indexing below is always valid.
fig, axes = plt.subplots(
    len(dimensions), len(dataset_sizes), figsize=(15, 10), squeeze=False
)
for i, D in enumerate(dimensions):
    for j, N in enumerate(dataset_sizes):
        # Get the results for this configuration
        result = results[(D, N)]
        methods = list(result)
        accuracies = [result[method]['Accuracy'] for method in methods]

        # One bar per KNN variant; accuracy is a fraction, so pin the y-axis
        # to [0, 1] to keep the panels comparable.
        ax = axes[i, j]
        ax.bar(methods, accuracies)
        ax.set_title(f"D={D}, N={N}")
        ax.set_ylim(0, 1)
        ax.set_ylabel("Accuracy")
        ax.set_xlabel("Method")

plt.tight_layout()
plt.show()
