In [27]:
from csv import reader
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import random

In [28]:
dataset = pd.read_csv("./car+evaluation/car.data.csv", header=None)
dataset = dataset.mask(dataset == '')
dataset = dataset.values.tolist()

In [29]:
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = {value: index for index, value in enumerate(unique)}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

for i in range(len(dataset[0])):
    str_column_to_int(dataset, i)

print("First 5 rows are Mapping:")
for i in range(5):
    print(dataset[i])


First 5 rows are Mapping:
[3, 3, 2, 1, 0, 1, 0]
[3, 3, 2, 1, 0, 0, 0]
[3, 3, 2, 1, 0, 2, 0]
[3, 3, 2, 1, 1, 1, 0]
[3, 3, 2, 1, 1, 0, 0]


In [36]:
random.seed(20)
# Extract features and target variable
X = np.array([row[:-1] for row in dataset])
y = np.array([row[-1] for row in dataset])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Large k value
large_k = int(sqrt(len(dataset)))
if large_k%2==1 : large_k+=1
    
# Initialize the KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn_large_k = KNeighborsClassifier(n_neighbors=large_k)

n_splits = 10

kf = KFold(n_splits=n_splits, shuffle=True)
cv_scores = cross_val_score(knn, X_scaled, y, cv=kf)
cv_scores_large_k = cross_val_score(knn_large_k, X_scaled, y, cv=kf)

print("Accuracy Scores for 10 Folds (k=3):", ", ".join(map(str, cv_scores)), "\n")
print("Accuracy Scores for 10 Folds(k=sqrt(len(dataset))):", ",".join(map(str, cv_scores_large_k*100)), "\n")

average_accuracy = cv_scores.mean()
average_accuracy_large_k = cv_scores_large_k.mean()

print("\nAverage Accuracy (Percentage)(k=3): {:.4f}".format(average_accuracy * 100))
print("Average Accuracy (Percentage)(k=sqrt(dataset_size)): {:.4f}".format(average_accuracy_large_k * 100))

Accuracy Scores for 10 Folds (k=3): 0.9075144508670521, 0.8728323699421965, 0.884393063583815, 0.8670520231213873, 0.8786127167630058, 0.8728323699421965, 0.9248554913294798, 0.861271676300578, 0.877906976744186, 0.8895348837209303 

Accuracy Scores for 10 Folds(k=sqrt(len(dataset))): 79.1907514450867,67.63005780346822,78.61271676300578,72.25433526011561,76.30057803468208,65.3179190751445,74.56647398843931,64.16184971098265,69.18604651162791,75.5813953488372 


Average Accuracy (Percentage)(k=3): 88.3681
Average Accuracy (Percentage)(k=sqrt(dataset_size)): 72.2802
