In [22]:
from csv import reader
import pandas as pd
import numpy as np
from numpy import array
from math import sqrt
import random
import copy
from random import randrange

In [23]:
dataset = pd.read_csv("./car+evaluation/car.data.csv", header=None)
dataset = dataset.mask(dataset == '')
print(dataset.head(5))
dataset = dataset.values.tolist()

       0      1  2  3      4     5      6
0  vhigh  vhigh  2  2  small   low  unacc
1  vhigh  vhigh  2  2  small   med  unacc
2  vhigh  vhigh  2  2  small  high  unacc
3  vhigh  vhigh  2  2    med   low  unacc
4  vhigh  vhigh  2  2    med   med  unacc


In [24]:
# Converting STRING values to unique int values for each column:
# buying:   vhigh, high, med, low. -> (0,2,3,1)
# maint:    vhigh, high, med, low. -> (0,2,3,1)
# doors:    2, 3, 4, 5more -> (1, 0, 2, 3)
# persons:  2, 4, more -> (1, 2, 0)
# lug_boot: small, med, big -> (0, 2, 1)
# safety:   low, med, high -> (2, 0, 1)
# class: good, vgood, acc, unacc -> (0, 1 ,2, 3) respectively
# Each of above column will

def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = {value: index for index, value in enumerate(unique)}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

for i in range(len(dataset[0])):
    str_column_to_int(dataset, i)

print("First 5 rows are Mapping:")
for i in range(5):
    print(dataset[i])


First 5 rows are Mapping:
[0, 0, 1, 0, 0, 2, 2]
[0, 0, 1, 0, 0, 1, 2]
[0, 0, 1, 0, 0, 0, 2]
[0, 0, 1, 0, 2, 2, 2]
[0, 0, 1, 0, 2, 1, 2]


## Full Scratch KNN model: For Car_Evaluation dataset

In [25]:
# PLEASE REFER: Details regarding Hayes Roth Dataset and how it is used here is in 'Doc file'

def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row2)-2):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

def get_neighbors(X_train, test_row, k):
    distance = list()
    data = []
    for i in X_train:
        dist = euclidean_distance(test_row, i)
        distance.append(dist)
        data.append(i)
        
    distance = np.array(distance)
    data = np.array(data)
    # Find the index of min distance and sorting accordingly
    index_dist = distance.argsort()
    # We arrange our data according to index
    data = data[index_dist]
    # slicing k neighbors from data[]
    neighbors = data[:k]
    return neighbors

def predict_classification(train, test_row, k):
    neighbors = get_neighbors(train, test_row, k)
    classes = []
    for i in neighbors:
        classes.append(i[-1])
    prediction = max(set(classes), key=classes.count)
    return prediction

# Converting STRING values to unique int values for each column:
# buying:   vhigh, high, med, low. -> (0|2,3,1)
# maint:    vhigh, high, med, low. -> (0,2,3,1)
# doors:    2, 3, 4, 5more -> (1, 0, 2, 3)
# persons:  2, 4, more -> (1, 2, 0)
# lug_boot: small, med, big -> (0, 2, 1)
# safety:   low, med, high -> (2, 0, 1)
new_observation_row = [3, 0, 3, 2, 2, 0]

# Large k-value(optional) -> replace it with 
large_k = int(sqrt(len(dataset)))
if large_k%2==1 : large_k+=1

prediction = predict_classification(dataset, new_observation_row, 3)
prediction_large_k = predict_classification(dataset, new_observation_row, large_k)

print('************************ FINAL PREDICTION OUTPUT ************************')
print('New Observation:', new_observation_row)

class_mapping = {0: 'good', 1: 'vgood', 2: 'acc', 3: 'unacc'}
predicted_class = class_mapping[prediction]
print('Predicted class(k=3):', predicted_class)

predicted_class_large_k = class_mapping[prediction_large_k]
print('Predicted class(k=sqrt(dataset_size)):', predicted_class_large_k)

************************ FINAL PREDICTION OUTPUT ************************
New Observation: [3, 0, 3, 2, 2, 0]
Predicted class(k=3): good
Predicted class(k=sqrt(dataset_size)): good


## 10 Fold Accuracy for Car_Evaluation(scratch model)

In [27]:
random.seed(20)
n_folds = 10
def cross_validation_split(dataset, n_folds):
    dataset_split = []
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split


def evaluate_algorithm(dataset, n_folds, k):
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    scores_large_k = []
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = []
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None  # remove the class label for testing
        predictions = []
        predictions_large_k = []
        for row in test_set:
            output = predict_classification(train_set, row, k)
            output_large_k = predict_classification(train_set, row, large_k)
            predictions.append(output)
            predictions_large_k.append(output_large_k)
        actual = [row[-1] for row in fold]
        correct = sum(1 for i in range(len(actual)) if actual[i] == predictions[i])
        correct_large_k = sum(1 for i in range(len(actual)) if actual[i] == predictions_large_k[i])
        accuracy = (correct / float(len(actual))) * 100.0
        accuracy_large_k = (correct_large_k / float(len(actual))) * 100.0
        scores.append(accuracy)
        scores_large_k.append(accuracy_large_k)
    return [scores, scores_large_k]
    
k = 3
scores, scores_large_k = evaluate_algorithm(dataset, n_folds, k)

print('************************ CROSS-VALIDATION RESULTS ************************')
print('Accuracy Scores(k=3):', scores, "\n")
print('Accuracy Scores(k=sqrt(dataset_size)):', scores_large_k, "\n")
print('Average Accuracy(k=3):', np.mean(scores))
print('Average Accuracy(k=sqrt(dataset_size)):', np.mean(scores_large_k))

************************ CROSS-VALIDATION RESULTS ************************
Accuracy Scores(k=3): [65.69767441860465, 61.04651162790697, 67.44186046511628, 66.27906976744185, 61.627906976744185, 64.53488372093024, 66.86046511627907, 65.69767441860465, 55.81395348837209, 63.95348837209303] 

Accuracy Scores(k=sqrt(dataset_size)): [69.18604651162791, 69.76744186046511, 72.09302325581395, 70.93023255813954, 70.93023255813954, 70.34883720930233, 75.0, 73.25581395348837, 63.372093023255815, 68.6046511627907] 

Average Accuracy(k=3): 63.895348837209305
Average Accuracy(k=sqrt(dataset_size)): 70.34883720930233
