In [1]:
import csv
import random
import math
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from scipy import stats

class KNN_ALogo:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    ``fit_data`` only stores the training data; every prediction scans all
    stored samples, computes their distance to the query and lets the ``k``
    closest ones vote on the label.
    """

    def __init__(self, k=3):
        """Store ``k``, the number of neighbours that vote on a prediction."""
        self.k = k

    def fit_data(self, X_train, y_train):
        """Remember the training samples and their labels (no copy is made).

        ``X_train`` is a sequence of feature sequences and ``y_train`` the
        sequence of labels at the same positions.
        """
        self.X_train = X_train
        self.y_train = y_train

    def predict_data(self, X_test):
        """Return a list holding one predicted label per sample in ``X_test``."""
        return [self._predict_acc(x) for x in X_test]

    def _predict_acc(self, x):
        """Predict the label of a single sample ``x`` by majority vote.

        Raises ``ValueError`` when there is no training sample to vote.
        """
        distances = [self._Euclidean_Distance(x, x_train) for x_train in self.X_train]
        # sorted() is stable, so equidistant samples keep their training order.
        k_indices = sorted(range(len(distances)), key=distances.__getitem__)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # Vote over the distance-ordered list instead of a set: max() returns
        # the first maximal element, so a tie goes to the label of the nearest
        # neighbour. Iterating a set made ties depend on hash ordering, which
        # varies between interpreter runs for string labels.
        return max(k_nearest_labels, key=k_nearest_labels.count)

    def _Euclidean_Distance(self, x1, x2):
        """Return the Euclidean distance between ``x1`` and ``x2``.

        Only positions where both values are ``int`` or ``float`` contribute;
        any other position is skipped. ``x2`` must be at least as long as
        ``x1`` (an ``IndexError`` is raised otherwise).
        """
        squared_sum = 0
        for i, a in enumerate(x1):
            b = x2[i]
            if isinstance(a, (int, float)) and isinstance(b, (int, float)):
                squared_sum += (a - b) ** 2
        return math.sqrt(squared_sum)


def k_fold_cross_validation_data(X, y, k=10, k_value=3):
    """Evaluate ``KNN_ALogo`` with k-fold cross-validation.

    The data is split, in its given order, into ``k`` contiguous folds of
    ``len(X) // k`` samples. Each fold is used once as the test set while all
    remaining samples form the training set. The trailing ``len(X) % k``
    samples are never tested but are part of every training set. No shuffling
    is performed.

    Args:
        X: Sequence of feature sequences.
        y: Sequence of labels, aligned with ``X``.
        k: Number of folds.
        k_value: Number of neighbours passed to ``KNN_ALogo``.

    Returns:
        A list with the accuracy (fraction of correct predictions) of each fold.

    Raises:
        ValueError: If ``k`` is smaller than 1 or there are fewer samples
            than folds, which would leave every test fold empty.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    n_samples = len(X)
    fold_size = n_samples // k
    if fold_size == 0:
        raise ValueError(
            f"cannot split {n_samples} samples into {k} non-empty folds"
        )

    accuracies = []
    for i in range(k):
        start = i * fold_size
        stop = start + fold_size
        test_indices = range(start, stop)
        # Compare against the fold bounds instead of searching a list, so the
        # split is linear rather than quadratic in the number of samples.
        train_indices = [idx for idx in range(n_samples) if not start <= idx < stop]
        X_train = [X[idx] for idx in train_indices]
        y_train = [y[idx] for idx in train_indices]
        X_test = [X[idx] for idx in test_indices]
        y_test = [y[idx] for idx in test_indices]

        knn = KNN_ALogo(k=k_value)
        knn.fit_data(X_train, y_train)
        y_pred = knn.predict_data(X_test)
        correct = sum(1 for p, t in zip(y_pred, y_test) if p == t)
        accuracies.append(correct / len(y_test))
    return accuracies


# Driver script: for every dataset, compare the hand-written KNN (evaluated
# with contiguous 10-fold cross-validation) against scikit-learn's
# KNeighborsClassifier (evaluated on 10 random 90/10 splits).
datasets = ['hayes.csv', 'cancer.csv', 'car.csv']

for dataset in datasets:
    print(f"\nDataset: {dataset}")

    # newline='' is what the csv module asks for so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(dataset, 'r', newline='') as file:
        reader = csv.reader(file)
        header = next(reader)  # first row holds column names, not data
        # Blank lines come back as empty rows; they carry no label, so skip them.
        data = [row for row in reader if row]

    # Encode features: numeric text becomes a float, anything else gets an
    # integer code. The code table is shared by all columns of the dataset.
    X = []
    y = []
    non_numeric_map = {}
    non_numeric_count = 0
    for row in data:
        X_row = []
        for val in row[:-1]:
            # float() also accepts decimals and negative numbers, which
            # str.isdigit() rejected and therefore mis-encoded as categories.
            try:
                number = float(val)
            except ValueError:
                number = None
            # Keep "nan"/"inf" categorical so they cannot poison distances.
            if number is not None and math.isfinite(number):
                X_row.append(number)
            else:
                if val not in non_numeric_map:
                    non_numeric_map[val] = non_numeric_count
                    non_numeric_count += 1
                X_row.append(non_numeric_map[val])
        X.append(X_row)
        y.append(row[-1])  # last column is the class label

    custom_knn_accuracy = k_fold_cross_validation_data(X, y)
    custom_knn_mean_accuracy = sum(custom_knn_accuracy) / len(custom_knn_accuracy)
    print("KNN Mean Accuracy:", custom_knn_mean_accuracy)

    # Reference implementation: 10 independent random splits, 10% held out.
    sklearn_knn_accuracies = []
    for _ in range(10):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        sklearn_knn = KNeighborsClassifier(n_neighbors=3)
        sklearn_knn.fit(X_train, y_train)
        sklearn_knn_accuracies.append(sklearn_knn.score(X_test, y_test))
    Sklearn_knn_mean_accuracy = sum(sklearn_knn_accuracies) / len(sklearn_knn_accuracies)
    print("Scikit-learn KNN Mean Accuracy:", Sklearn_knn_mean_accuracy)

    # NOTE(review): ttest_rel is a *paired* test, but the two accuracy lists
    # come from different splits (contiguous folds vs. random splits), so the
    # i-th entries are not matched pairs. Either evaluate both models on the
    # same folds or switch to an independent-samples test — confirm intent.
    _, p_value = stats.ttest_rel(custom_knn_accuracy, sklearn_knn_accuracies)
    print("P-value for Hypothesis Testing:", p_value)
    if p_value < 0.05:
        print("Significantly difference in Accuracy Between Custom KNN and Scikit-learn KNN.")
    else:
        print("No significant Difference in Accuracy Between Custom KNN and Scikit-learn KNN.")



Dataset: hayes.csv
KNN Mean Accuracy: 0.3923076923076923
Scikit-learn KNN Mean Accuracy: 0.3857142857142856
P-value for Hypothesis Testing: 0.8995619220478899
No significant Difference in Accuracy Between Custom KNN and Scikit-learn KNN.

Dataset: cancer.csv
KNN Mean Accuracy: 0.6964285714285714
Scikit-learn KNN Mean Accuracy: 0.7034482758620689
P-value for Hypothesis Testing: 0.9232835343990964
No significant Difference in Accuracy Between Custom KNN and Scikit-learn KNN.

Dataset: car.csv
KNN Mean Accuracy: 0.7220930232558138
Scikit-learn KNN Mean Accuracy: 0.8225433526011561
P-value for Hypothesis Testing: 0.014862918468520408
Minor difference in Accuracy Between Custom KNN and Scikit-learn KNN.
