<a href="https://colab.research.google.com/github/Shreyash606/KNN-/blob/main/KNN(Hayes-Roth).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
from csv import reader
from math import sqrt
from random import randrange
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
from scipy import stats
from decimal import Decimal
from google.colab import files

In [3]:
# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

In [4]:
# Calculate Minkowski distance
def getMinkowskiDistance(vector1, vector2, k):
    distance = 0.0
    for i in range(len(vector1)-1):
        distance += pow(abs(vector1[i]-vector2[i]), k)

    root = 1 / (float(k))
    return round (Decimal(distance) ** Decimal(root), 5)


In [5]:
# Calculate the Euclidean distance between two vectors
def euclidean_distance(vector1, vector2):
    distance = 0.0
    for i in range(len(vector1) - 1):  # Exclude the label column
        distance += (int(vector1[i]) - int(vector2[i])) ** 2
    return sqrt(distance)


In [6]:
# KNN Algorithm with 10-fold cross-validation
def customKNNAlgorithm(train, test, num_neighbors):
    predictions = list()
    for row in test:
        output = predict_classification(train, row, num_neighbors)
        predictions.append(output)
    return predictions


In [7]:
def scikitKNN(train_set,test_set, num_neighbors_sklearn):
    # Train and evaluate scikit-learn k-NN
    X = np.array([row[:-1] for row in train_set])
    y = np.array([row[-1] for row in train_set])
    # Initialize k-NN classifier for scikit-learn
    knn_classifier = KNeighborsClassifier(n_neighbors=num_neighbors_sklearn)
    knn_classifier.fit(X, y)
    X_test_sklearn = np.array([row[:-1] for row in test_set])
    predictions_sklearn = knn_classifier.predict(X_test_sklearn)
    return predictions_sklearn


In [8]:
# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    # distances = [(train_row, euclidean_distance(test_row, train_row)) for train_row in train]
    distances = [(train_row, getMinkowskiDistance(test_row, train_row ,2 )) for train_row in train]
    distances.sort(key=lambda x: x[1])
    neighbors = [row for row, _ in distances[:num_neighbors]]
    return neighbors


In [9]:
# Make a prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction


In [10]:
# function to make n_folds in the dataset
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split


In [11]:
# Calculate accuracy percentage
def getAccuracy(actual, predicted):
    n_correct_predictions = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            n_correct_predictions += 1
    return (n_correct_predictions * 100.0)/ float(len(actual))


In [15]:
# Main function
if __name__ == "__main__":

    filename = 'hayes-roth.data'

     # Number of neighbours for knn
    n_neighbours = 5

    # Number of splits for k-fold cross-validation
    n_folds = 10

    dataset_name = (filename.split("/")[-1]).split(".")[0]
    print("#################################################################################")
    print(dataset_name.upper())
    print("#################################################################################")

    # Load the dataset
    dataset = load_csv(filename)

    # Removing irrelevant atrributes name and hobby.
    for row in dataset:
        del row[:2]

    # Convert string columns to integers
    for i in range(len(dataset)):
        for j in range(len(dataset[0])):
            dataset[i][j] = int(dataset[i][j])



    print("Number of neighbours in KNN : ", n_neighbours)
    print("Number of k folds for K fold cross validation : ", n_folds)

    accuracy_scores_custom = []
    accuracy_scores_sklearn = []

    # Initialize k-fold cross-validation
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None

        #Get results from Custom KNN algorithm
        predicted_custom = customKNNAlgorithm(train_set, test_set, n_neighbours)

        #Get results from Scikitlearn's KNN algorithm
        predicted_sklearn = scikitKNN(train_set, test_set, n_neighbours)

        actual = [row[-1] for row in fold]
        accuracy_custom = getAccuracy(actual, predicted_custom)
        accuracy_sklearn = getAccuracy(actual,predicted_sklearn)
        accuracy_scores_custom.append(accuracy_custom)
        accuracy_scores_sklearn.append(accuracy_sklearn)


    # Calculate and print the average accuracy across folds for custom k-NN
    average_accuracy_custom = np.mean(accuracy_scores_custom)
    print("\nCustom k-NN Accuracy: ")
    print(accuracy_scores_custom)
    print("Custom k-NN Average Accuracy: {:.2f}\n".format(average_accuracy_custom))

    # Calculate and print the average accuracy across folds for scikit-learn k-NN
    average_accuracy_sklearn = np.mean(accuracy_scores_sklearn)
    print("Scikit-learn k-NN Accuracy: ")
    print(accuracy_scores_sklearn)
    print("Scikit-learn k-NN Average Accuracy: {:.2f}\n".format(average_accuracy_sklearn))
    print("#################################################################################")
    print("HYPOTHESIS TESTING")

    # Perform a paired t-test
    t_val, p_val = stats.ttest_rel(accuracy_scores_custom,accuracy_scores_sklearn)
    print("T-value : " , t_val)
    print("P-value : " , p_val)
    print("alpha : 0.05")
    print("#################################################################################")

    # Set the significance level (alpha) for your test
    alpha = 0.05

    # Check if the p-value is less than alpha
    if p_val < alpha:
        print('Null hypothesis REJECTED: There is a significant difference.')
    else:
        print('Null hypothesis ACCEPTED: There is no significant difference.')

    print("#################################################################################\n")


#################################################################################
HAYES-ROTH
#################################################################################
Number of neighbours in KNN :  5
Number of k folds for K fold cross validation :  10

Custom k-NN Accuracy: 
[76.92307692307692, 92.3076923076923, 92.3076923076923, 69.23076923076923, 76.92307692307692, 69.23076923076923, 76.92307692307692, 84.61538461538461, 84.61538461538461, 84.61538461538461]
Custom k-NN Average Accuracy: 80.77

Scikit-learn k-NN Accuracy: 
[69.23076923076923, 92.3076923076923, 92.3076923076923, 61.53846153846154, 84.61538461538461, 53.84615384615385, 61.53846153846154, 84.61538461538461, 100.0, 76.92307692307692]
Scikit-learn k-NN Average Accuracy: 77.69

#################################################################################
HYPOTHESIS TESTING
T-value :  0.9999999999999994
P-value :  0.3434363961379139
alpha : 0.05
###################################################################