In [31]:
from csv import reader
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.model_selection import KFold
from random import seed
from random import randrange
import copy

In [32]:
dataset = pd.read_csv("./hayes+roth/hayes-roth.data.csv")
dataset_test = pd.read_csv("./hayes+roth/hayes-roth.test.csv")
dataset = dataset.mask(dataset == '')
dataset_test = dataset_test.mask(dataset_test == '')

In [33]:
# Splitting dataset
X_train = dataset.iloc[ : , :]
y_train = dataset.iloc[ : , 5]
X_test = dataset_test.iloc[ : , :]
y_test = dataset_test.iloc[ : , 5]

X_train = X_train.values.tolist()
X_test = X_test.values.tolist()

## Full code for Hayes Roth Dataset

In [39]:
# PLEASE REFER: Details regarding Hayes Roth Dataset and how it is used here is in 'Doc file'

# Step 1: Find distance between new_observation and all the existing data points.
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(2, len(row2)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

# Step 2: Use the distance to sort the distance list and fetch top k neigbhors(data points)
def get_neighbors(X_train, test_row, k):
    distance = list()
    data = []
    
    for i in X_train:
        dist = euclidean_distance(test_row, i)
        distance.append(dist)
        data.append(i)
        
    distance = np.array(distance)
    data = np.array(data, dtype=object)
    # Find the index of min distance and sorting accordingly
    index_dist = distance.argsort()
    # We arrange our data according to index
    data = data[index_dist]
    # slicing k neighbors from data[]
    neighbors = data[:k]
    return neighbors

# Step 3: Based on the k-neighbors vote the maximum class from the list, majority class count is the class of new observation.
def predict_classification(train, test_row, k):
    neighbors = get_neighbors(train, test_row, k)
    classes = []
    for i in neighbors:
        classes.append(i[-1])
    prediction = max(classes, key=classes.count)
    return prediction

new_observation_row = [55, 3, 4, 3, 2]
# Large k-value
large_k = int(sqrt(len(X_train)))
if large_k%2==1 : large_k+=1

# Calculate the distances between new_observation_row and each row in X_train
distances = []
for i in range(len(X_train)):
    distance = euclidean_distance(new_observation_row, X_train[i])
    distances.append(distance)

# Find the indices of the k nearest neighbors
nearest_indices = np.argsort(distances)[:5]

# Print the calculated distances for the new observation row
print("Euclidean Distances:")
for idx in range(len(distances)):
    print(f"Row {idx}: {distances[idx]}")

# Print the indices of the k nearest neighbors
print("\nIndices of the Nearest Neighbors:")
print(nearest_indices)

# Print the classes of the nearest neighbors for verification
print("\nClasses of the Nearest Neighbors:")
for idx in nearest_indices:
    print(f"Row {idx}: {X_train[idx]} - Class: {y_train[idx]}")

prediction = predict_classification(X_train, new_observation_row, 5)
prediction_large_k = predict_classification(X_train, new_observation_row, large_k)
print('************************ FINAL PREDICTION OUTPUT ************************')
print('New Observation:', new_observation_row)
print('Predicted class(k=5):', prediction)
print('Predicted class(large_k=sqrt(dataset_size)):', prediction_large_k)

Euclidean Distances:
Row 0: 3.605551275463989
Row 1: 3.0
Row 2: 3.3166247903554
Row 3: 1.0
Row 4: 3.605551275463989
Row 5: 3.0
Row 6: 3.0
Row 7: 2.23606797749979
Row 8: 3.0
Row 9: 3.0
Row 10: 3.0
Row 11: 2.8284271247461903
Row 12: 3.3166247903554
Row 13: 3.1622776601683795
Row 14: 2.8284271247461903
Row 15: 3.1622776601683795
Row 16: 2.8284271247461903
Row 17: 3.0
Row 18: 1.7320508075688772
Row 19: 3.0
Row 20: 2.449489742783178
Row 21: 3.0
Row 22: 3.3166247903554
Row 23: 3.0
Row 24: 1.7320508075688772
Row 25: 2.449489742783178
Row 26: 3.0
Row 27: 3.1622776601683795
Row 28: 3.0
Row 29: 3.0
Row 30: 2.23606797749979
Row 31: 3.0
Row 32: 3.0
Row 33: 3.605551275463989
Row 34: 1.0
Row 35: 3.1622776601683795
Row 36: 3.1622776601683795
Row 37: 2.449489742783178
Row 38: 2.8284271247461903
Row 39: 0.0
Row 40: 2.449489742783178
Row 41: 2.8284271247461903
Row 42: 3.0
Row 43: 3.0
Row 44: 3.3166247903554
Row 45: 3.0
Row 46: 3.605551275463989
Row 47: 3.605551275463989
Row 48: 2.449489742783178
Row 49:

## Using 10-Fold Cross Validation Accuracy

In [35]:
seed(20)
n_folds = 10
k = 3

# For large value of k=sqrt(n) -> most practical value for large datasets
large_k = int(sqrt(len(X_train)))
if large_k%2==1 : large_k+=1

def cross_validation_split(dataset, n_folds):
    dataset_split = []
    dataset_copy = copy.deepcopy(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

def evaluate_algorithm(dataset, n_folds, k):
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    scores_large_k= []
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = []
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predictions = []
        predictions_large_k = []
        for row in test_set:
            output = predict_classification(train_set, row, k)
            output_large_k = predict_classification(train_set, row, large_k)
            predictions.append(output)
            predictions_large_k.append(output_large_k)
        actual = [row[-1] for row in fold]
        correct = sum(1 for i in range(len(actual)) if actual[i] == predictions[i])
        correct_large_k = sum(1 for i in range(len(actual)) if actual[i] == predictions_large_k[i])
        accuracy = (correct / float(len(actual))) * 100.0
        accuracy_large_k = (correct_large_k / float(len(actual))) * 100.0
        scores.append(accuracy)
        scores_large_k.append(accuracy_large_k)
    return [scores, scores_large_k]

# Splitting dataset
dataset = list(zip(X_train, y_train))
dataset_test = list(zip(X_test, y_test))

# Evaluating the algorithm using cross-validation
scores, scores_large_k = evaluate_algorithm(dataset, n_folds, k)

print('************************ CROSS-VALIDATION RESULTS ************************\n')
print('Accuracy Scores(k=3):', scores, "\n")
print('Accuracy Scores (Large k=sqrt(n)):', scores_large_k, "\n")
print('Average Accuracy(k=3): {:.4f}'.format(np.mean(scores)))
print('Average Accuracy(k=sqrt(n)): {:.4f}'.format(np.mean(scores_large_k)))

************************ CROSS-VALIDATION RESULTS ************************

Accuracy Scores(k=3): [38.46153846153847, 38.46153846153847, 23.076923076923077, 38.46153846153847, 46.15384615384615, 61.53846153846154, 38.46153846153847, 23.076923076923077, 30.76923076923077, 61.53846153846154] 

Accuracy Scores (Large k=sqrt(n)): [53.84615384615385, 23.076923076923077, 61.53846153846154, 46.15384615384615, 30.76923076923077, 23.076923076923077, 38.46153846153847, 53.84615384615385, 30.76923076923077, 15.384615384615385] 

Average Accuracy(k=3): 40.0000
Average Accuracy(k=sqrt(n)): 37.6923


## Measuring Training and Testing dataset accuracy

In [36]:
train_correct_predictions = 0
for i in range(len(y_train)):
    prediction = predict_classification(X_train, X_train[i], 3)
    if prediction == y_train[i]:
        train_correct_predictions += 1

train_accuracy = (train_correct_predictions / len(y_train)) * 100

test_correct_predictions = 0
for i in range(len(y_test)):
    prediction = predict_classification(X_test, X_test[i], 3)
    if prediction == y_test[i]:
        test_correct_predictions += 1

test_accuracy = (test_correct_predictions / len(y_test)) * 100

# Print accuracies
print(f'Training Accuracy: {train_accuracy:.5f}%')
print(f'Testing Accuracy: {test_accuracy:.5f}%')

Training Accuracy: 87.12121%
Testing Accuracy: 78.57143%


## References

1. https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/
2. Youtube Link: https://www.youtube.com/watch?v=4HKqjENq9OU&t=319s
3. nan: https://sparkbyexamples.com/pandas/pandas-replace-blank-values-with-nan/?expand_article=1
4. https://www.linkedin.com/pulse/k-nearest-neighbor-algorithm-from-scratchwithout-library-ritika-yadav/
5. K-Fold implementation: https://www.datacamp.com/tutorial/k-nearest-neighbor-classification-scikit-learn
6. Choosing righ k-value(sqrt(len(dataset)): https://stackoverflow.com/questions/11568897/value-of-k-in-k-nearest-neighbor-algorithm