In [31]:
!pip install ucimlrepo



In [32]:
from ucimlrepo import fetch_ucirepo

import numpy as np
import pandas as pd
from collections import Counter
from scipy.stats import ttest_rel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

#Performing k-fold cross-validation by splitting the dataset from scratch.
def performing_k_number_of_folds(X, y, k, random_state=None):
    if random_state is not None:
        np.random.seed(random_state)
    number_of_samples = len(X)
    indices_of_samples = np.arange(number_of_samples)
    np.random.shuffle(indices_of_samples)

    sizes_of_folds = np.full(k, number_of_samples // k, dtype=int)
    sizes_of_folds[: number_of_samples % k] += 1

    array_of_folds = []
    current = 0
    for fold_size in sizes_of_folds:
        start, stop = current, current + fold_size
        test_idx = indices_of_samples[start:stop]
        train_idx = np.concatenate((indices_of_samples[:start], indices_of_samples[stop:]))
        array_of_folds.append((train_idx, test_idx))
        current = stop
    return array_of_folds






In [33]:

# Dataset Loaders + Preprocessing
def prepare_dataset(dataset, scale_data=False):

    X = dataset.data.features
    y = dataset.data.targets

    # Converting the dataset into a DataFrame to make column-based operations easier.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    else:
        X = X.copy()

    if isinstance(y, (pd.Series, pd.DataFrame)):
        y = y.values.ravel()



    # Removing any rows where either input features (X) or target labels (y) contain missing values.
    X["__target__"] = y
    X.dropna(axis=0, inplace=True)
    y = X["__target__"].values
    X.drop(columns=["__target__"], inplace=True)

    # Treating features with 'object' or 'category' data types as categorical variables.
    cat_cols = [col for col in X.columns
                if X[col].dtype == "object" or str(X[col].dtype) == "category"]

    # Applying one-hot encoding to convert categorical features into a numerical format.
    X = pd.get_dummies(X, columns=cat_cols, drop_first=False)

    # Standardizing or normalizing the data to ensure consistent scaling.
    if scale_data:
        scaler = StandardScaler()
        X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    # Encoding the target variable (y) if it’s a non-numeric type, making it suitable for modeling.
    if y.dtype.kind not in ('i', 'f'):
        le = LabelEncoder()
        y = le.fit_transform(y)

    X = X.values.astype(float)
    y = y.astype(int)

    return X, y


In [34]:

def load_breast_cancer_data(scale_data=False):
    dataset = fetch_ucirepo(id=14)
    return prepare_dataset(dataset, scale_data=scale_data)


def load_hayes_roth_data(scale_data=False):
    dataset = fetch_ucirepo(id=44)
    return prepare_dataset(dataset, scale_data=scale_data)


def load_car_eval_data(scale_data=False):
    dataset = fetch_ucirepo(id=19)
    return prepare_dataset(dataset, scale_data=scale_data)


In [35]:
# KNN Variant Implementations (from scratch)

# Simple KNN
class BasicKNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, a, b):
        return np.sqrt(np.sum((a - b)**2))

    def _predict_single(self, x):
        # Calculating distances between the query point and all data points.
        distances = np.array([self._euclidean_distance(x, x_train)
                              for x_train in self.X_train])
        # Identifying the k-nearest neighbors based on computed distances.
        k_indices = distances.argsort()[:self.k]
        # Assigning a label based on the majority vote among the nearest neighbors.
        k_labels = self.y_train[k_indices]
        return Counter(k_labels).most_common(1)[0][0]

    def predict(self, X):
        return np.array([self._predict_single(x) for x in X])



In [36]:

# Kernel KNN
class KernelBasedKNN:
    def __init__(self, k=3, kernel=None, bandwidth=1.0):
        self.k = k
        self.bandwidth = bandwidth
        if kernel is None:
            self.kernel = lambda d, h: np.exp(-(d**2)/(2*(h**2)))
        else:
            self.kernel = kernel

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, a, b):
        return np.sqrt(np.sum((a - b)**2))

    def predict(self, X):
        preds = []
        for x in X:
            distances = np.array([self._euclidean_distance(x, x_train)
                                  for x_train in self.X_train])
            k_indices = distances.argsort()[:self.k]
            k_distances = distances[k_indices]

            #Applying kernel functions to compute weights for neighbors based on distance.
            weights = self.kernel(k_distances, self.bandwidth)

            # Using a weighted voting mechanism, where closer points have a higher influence.
            vote_scores = {}
            for idx, w in zip(k_indices, weights):
                label = self.y_train[idx]
                vote_scores[label] = vote_scores.get(label, 0) + w

            # Selecting the label with the highest accumulated weight as the final prediction.
            pred_label = max(vote_scores.items(), key=lambda item: item[1])[0]
            preds.append(pred_label)
        return np.array(preds)



In [37]:

# Evaluation Functions
def test_basic_knn(X, y, k_neighbors=3, num_folds=10):
    folds = perform_k_fold_split(X, y, num_folds, random_state=42)
    accs_own, accs_sk = [], []
    for train_idx, test_idx in folds:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Our BasicKNN
        knn_own = BasicKNN(k=k_neighbors)
        knn_own.fit(X_train, y_train)
        pred_own = knn_own.predict(X_test)
        accs_own.append(np.mean(pred_own == y_test))

        # Scikit-learn KNN
        knn_sk = KNeighborsClassifier(n_neighbors=k_neighbors)
        knn_sk.fit(X_train, y_train)
        pred_sk = knn_sk.predict(X_test)
        accs_sk.append(np.mean(pred_sk == y_test))

    return np.array(accs_own), np.array(accs_sk)



In [38]:

def test_kernel_knn(X, y, k_neighbors=3, num_folds=10, bandwidth=1.0):
    folds = perform_k_fold_split(X, y, num_folds, random_state=42)
    accs_own, accs_sk = [], []

    # matching Gaussian kernel for scikit-learn
    def sk_kernel(dist_array):
        return np.exp(-(dist_array**2) / (2 * bandwidth**2))

    for train_idx, test_idx in folds:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Our KernelBasedKNN
        knn_own = KernelBasedKNN(k=k_neighbors, bandwidth=bandwidth)
        knn_own.fit(X_train, y_train)
        pred_own = knn_own.predict(X_test)
        accs_own.append(np.mean(pred_own == y_test))

        # Scikit-learn KNN with custom kernel weights
        knn_sk = KNeighborsClassifier(n_neighbors=k_neighbors, weights=sk_kernel)
        knn_sk.fit(X_train, y_train)
        pred_sk = knn_sk.predict(X_test)
        accs_sk.append(np.mean(pred_sk == y_test))

    return np.array(accs_own), np.array(accs_sk)






In [42]:

# Performing 10-fold cross-validation to evaluate model performance.
# Measuring accuracy and using a paired t-test for statistical comparisons.

def execute_single_label_tests(dataset_name, loader, scale_data=True, k_values=[1,3,5,7,9]):
    X, y = loader(scale_data=scale_data)

    records = []
    for variant in ["BasicKNN", "KernelBasedKNN"]:
        for k in k_values:
            if variant == "BasicKNN":
                acc_own, acc_sk = test_basic_knn(X, y, k_neighbors=k, num_folds=10)
            else:
                acc_own, acc_sk = test_kernel_knn(
                    X, y, k_neighbors=k, num_folds=10, bandwidth=1.0
                )

            mean_own, std_own = np.mean(acc_own), np.std(acc_own)
            mean_sk, std_sk = np.mean(acc_sk), np.std(acc_sk)
            t_stat, p_val = ttest_rel(acc_own, acc_sk)
            interpretation = (
                "Significant difference (p<0.05)."
                if p_val < 0.05 else
                "Not significant (p>=0.05)."
            )

            records.append({
                "Dataset_name": dataset_name,
                "Variants": variant,
                "k": k,
                "Mean": mean_own,
                "Std": std_own,
                "SKlearn Mean": mean_sk,
                "SKlearn Std": std_sk,
                "t-stat": t_stat,
                "p-value": p_val,
                "Interpretation": interpretation
            })

    df = pd.DataFrame(records)
    print(f"\n{dataset_name} (Single-label) Evaluation Summary")
    print(df.round(4))





In [43]:

if __name__ == "__main__":
    datasets = {
        "Breast Cancer": load_breast_cancer_data,
        "Hayes-Roth": load_hayes_roth_data,
        "Car Evaluation": load_car_eval_data
    }

    SCALE_DATA = True
    K_VALUES = [1, 3, 5, 7, 9]

    for name, loader in datasets.items():
        execute_single_label_tests(name, loader, scale_data=SCALE_DATA, k_values=K_VALUES)




Breast Cancer (Single-label) Evaluation Summary
    Dataset_name        Variants  k    Mean     Std  SKlearn Mean  \
0  Breast Cancer        BasicKNN  1  0.6538  0.0967        0.6611   
1  Breast Cancer        BasicKNN  3  0.6825  0.0787        0.6825   
2  Breast Cancer        BasicKNN  5  0.7001  0.0618        0.7001   
3  Breast Cancer        BasicKNN  7  0.7147  0.0862        0.7184   
4  Breast Cancer        BasicKNN  9  0.7292  0.0651        0.7329   
5  Breast Cancer  KernelBasedKNN  1  0.6538  0.0967        0.6611   
6  Breast Cancer  KernelBasedKNN  3  0.6754  0.0781        0.6754   
7  Breast Cancer  KernelBasedKNN  5  0.6717  0.0684        0.6717   
8  Breast Cancer  KernelBasedKNN  7  0.6828  0.0735        0.6828   
9  Breast Cancer  KernelBasedKNN  9  0.6828  0.0735        0.6828   

   SKlearn Std  t-stat  p-value              Interpretation  
0       0.0907 -1.0101   0.3388  Not significant (p>=0.05).  
1       0.0787     NaN      NaN  Not significant (p>=0.05).  
2    