In [79]:
from csv import reader
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.model_selection import KFold
from random import seed
from random import randrange
import copy

In [68]:
dataset = pd.read_csv("./hayes+roth/hayes-roth.data.csv")
dataset_test = pd.read_csv("./hayes+roth/hayes-roth.test.csv")
dataset = dataset.mask(dataset == '')
dataset_test = dataset_test.mask(dataset_test == '')

In [69]:
# Splitting dataset
X_train = dataset.iloc[ : , :]
y_train = dataset.iloc[ : , 5]
X_test = dataset_test.iloc[ : , :]
y_test = dataset_test.iloc[ : , 5]

X_train = X_train.values.tolist()
X_test = X_test.values.tolist()

## Full code for Hayes Roth Dataset

In [86]:
# PLEASE REFER: Details regarding Hayes Roth Dataset and how it is used here is in 'Doc file'

def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(2, len(row2)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

def get_neighbors(X_train, test_row, k):
    distance = list()
    data = []
    
    for i in X_train:
        dist = euclidean_distance(test_row, i)
        distance.append(dist)
        data.append(i)
        
    distance = np.array(distance)
    data = np.array(data, dtype=object)
    # Find the index of min distance and sorting accordingly
    index_dist = distance.argsort()
    # We arrange our data according to index
    data = data[index_dist]
    # slicing k neighbors from data[]
    neighbors = data[:k]
    return neighbors

def predict_classification(train, test_row, k):
    neighbors = get_neighbors(train, test_row, k)
    classes = []
    for i in neighbors:
        classes.append(i[-1])
    prediction = max(classes, key=classes.count)
    return prediction

new_observation_row = [55, 3, 1, 3, 2]
# Large k-value
large_k = int(sqrt(len(X_train)))
if large_k%2==1 : large_k+=1

# Calculate the distances between new_observation_row and each row in X_train
distances = []
for i in range(len(X_train)):
    distance = euclidean_distance(new_observation_row, X_train[i])
    distances.append(distance)

# Find the indices of the k nearest neighbors
nearest_indices = np.argsort(distances)[:5]

# Print the calculated distances for the new observation row
print("Euclidean Distances:")
for idx in range(len(distances)):
    print(f"Row {idx}: {distances[idx]}")

# Print the indices of the k nearest neighbors
print("\nIndices of the Nearest Neighbors:")
print(nearest_indices)

# Print the classes of the nearest neighbors for verification
print("\nClasses of the Nearest Neighbors:")
for idx in nearest_indices:
    print(f"Row {idx}: {X_train[idx]} - Class: {y_train[idx]}")

prediction = predict_classification(X_train, new_observation_row, 5)
print('************************ FINAL PREDICTION OUTPUT ************************')
print('New Observation:', new_observation_row)
print('Predicted class:', prediction)

Euclidean Distances:
Row 0: 2.0
Row 1: 0.0
Row 2: 1.4142135623730951
Row 3: 3.1622776601683795
Row 4: 2.0
Row 5: 0.0
Row 6: 0.0
Row 7: 3.7416573867739413
Row 8: 2.449489742783178
Row 9: 2.449489742783178
Row 10: 2.449489742783178
Row 11: 2.23606797749979
Row 12: 1.4142135623730951
Row 13: 1.0
Row 14: 4.123105625617661
Row 15: 1.0
Row 16: 2.23606797749979
Row 17: 2.449489742783178
Row 18: 2.449489742783178
Row 19: 0.0
Row 20: 1.7320508075688772
Row 21: 2.449489742783178
Row 22: 1.4142135623730951
Row 23: 2.449489742783178
Row 24: 2.449489742783178
Row 25: 1.7320508075688772
Row 26: 2.449489742783178
Row 27: 1.0
Row 28: 0.0
Row 29: 2.449489742783178
Row 30: 3.7416573867739413
Row 31: 2.449489742783178
Row 32: 2.449489742783178
Row 33: 2.0
Row 34: 3.1622776601683795
Row 35: 1.0
Row 36: 1.0
Row 37: 1.7320508075688772
Row 38: 2.23606797749979
Row 39: 3.0
Row 40: 1.7320508075688772
Row 41: 2.23606797749979
Row 42: 2.449489742783178
Row 43: 2.449489742783178
Row 44: 1.4142135623730951
Row 45:

## Using 10-Fold Cross Validation Accuracy

In [91]:
seed(20)
n_folds = 10
k = 3

def cross_validation_split(dataset, n_folds):
    dataset_split = []
    dataset_copy = copy.deepcopy(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

def evaluate_algorithm(dataset, n_folds, k):
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = []
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None  # remove the class label for testing
        predictions = []
        for row in test_set:
            output = predict_classification(train_set, row, k)
            predictions.append(output)
        actual = [row[-1] for row in fold]
        correct = sum(1 for i in range(len(actual)) if actual[i] == predictions[i])
        accuracy = (correct / float(len(actual))) * 100.0
        scores.append(accuracy)
    return scores

# Splitting dataset
dataset = list(zip(X_train, y_train))
dataset_test = list(zip(X_test, y_test))

# Evaluating the algorithm using cross-validation
scores = evaluate_algorithm(dataset, n_folds, k)

print('************************ CROSS-VALIDATION RESULTS ************************')
print('Accuracy Scores:', scores)
print('Average Accuracy: {:.4f}'.format(np.mean(scores)))

************************ CROSS-VALIDATION RESULTS ************************
Accuracy Scores: [38.46153846153847, 38.46153846153847, 23.076923076923077, 38.46153846153847, 46.15384615384615, 61.53846153846154, 38.46153846153847, 23.076923076923077, 30.76923076923077, 61.53846153846154]
Average Accuracy: 40.0000


## Measuring Training and Testing dataset accuracy

In [48]:
train_correct_predictions = 0
for i in range(len(y_train)):
    prediction = predict_classification(X_train, X_train[i], 3)
    if prediction == y_train[i]:
        train_correct_predictions += 1

train_accuracy = (train_correct_predictions / len(y_train)) * 100

test_correct_predictions = 0
for i in range(len(y_test)):
    prediction = predict_classification(X_test, X_test[i], 3)
    if prediction == y_test[i]:
        test_correct_predictions += 1

test_accuracy = (test_correct_predictions / len(y_test)) * 100

# Print accuracies
print(f'Training Accuracy: {train_accuracy:.5f}%')
print(f'Testing Accuracy: {test_accuracy:.5f}%')

Training Accuracy: 87.12121%
Testing Accuracy: 78.57143%


In [None]:
# # Step 1: Calculate euclidean distance
# def euclidean_distance(row1, row2):
#     distance = 0.0
#     for i in range(2, len(row1)-1):
#         distance += (row1[i] - row2[i])**2
#     return sqrt(distance)


# sample = [
#     [92, 2, 1, 1, 2, 1],
#     [10, 2, 1, 3, 2, 2],
#     [83, 3, 1, 4, 1, 3],
#     [61, 2, 4, 2, 2, 3],
#     [107, 1, 1, 3, 4, 3]
# ]

# row1 = sample[0]
# for i in range(len(sample)):
#     row2 = sample[i]
#     distance = euclidean_distance(row1, row2)
#     print(f"Euclidean Distance between {row1} and {row2}: {distance}")


In [None]:
# # Step 2: Finding neighbors

# def get_neighbors(X_train, test_row, k):
#     distance = list()
#     data = []
#     for i in X_train:
#         dist = euclidean_distance(test_row, i)
#         distance.append(dist)
#         data.append(i)
#     distance = np.array(distance)
#     data = np.array(data)
#     # Find the index of min distance and sorting accordingly
#     index_dist = distance.argsort()
#     # We arrange our data according to index
#     data = data[index_dist]
#     # slicing k neighbors from data[]
#     neighbors = data[:k]
    
#     return neighbors

# dataset = [
#     [92, 2, 1, 1, 2, 1],
#     [10, 2, 1, 3, 2, 2],
#     [83, 3, 1, 4, 1, 3],
#     [61, 2, 4, 2, 2, 3],
#     [107, 1, 1, 3, 4, 3]
# ]
# # Choosing k: Source: Youtube link in references, and remaining details in Word file
# k = int(sqrt(len(dataset)))
# if k%2==1 : k+=1
# print('k:', k)
# neighbors = get_neighbors(dataset, dataset[0], k)
# for neighbor in neighbors:
#  print(neighbor)

In [None]:
# # Step 3: Making our prediction

# def predict_classification(train, test_row, k):
#     neighbors = get_neighbors(train, test_row, k)
#     classes = []
#     for i in neighbors:
#         classes.append(i[-1])
#     prediction = max(classes, key=classes.count)
#     return prediction

# dataset = [
#     [92, 2, 1, 1, 2, 1],
#     [10, 2, 1, 3, 2, 2],
#     [83, 3, 1, 4, 1, 3],
#     [61, 2, 4, 2, 2, 3],
#     [107, 1, 1, 3, 4, 3]
# ]
# prediction = predict_classification(dataset, dataset[0], 2)
# print('Predicted class:', prediction)

## References

1. https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/
2. Youtube Link: 
3. nan: https://sparkbyexamples.com/pandas/pandas-replace-blank-values-with-nan/?expand_article=1
4. https://www.linkedin.com/pulse/k-nearest-neighbor-algorithm-from-scratchwithout-library-ritika-yadav/
5. K-Fold implementation: https://www.datacamp.com/tutorial/k-nearest-neighbor-classification-scikit-learn