In [3]:
import pandas as pd
import numpy as np
import math
import operator
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import datasets, neighbors

In [4]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'target']
dataset = pd.read_csv('car.csv', names= col_names)

In [5]:
dataset.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [6]:
dataset.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


In [7]:
dataset.shape

(1728, 7)

In [8]:
#kiểm tra số lượng giá trị
dataset['target'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: target, dtype: int64

In [9]:
dataset.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'target'], dtype='object')

In [10]:
# Kiểu dữ liệu
dataset.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
target      object
dtype: object

In [11]:
buying_label =  { ni: n for n, ni in enumerate(set(dataset['buying']))}
maint_label =  { ni: n for n, ni in enumerate(set(dataset['maint']))}
doors_label =  { ni: n for n, ni in enumerate(set(dataset['doors']))}
persons_label =  { ni: n for n, ni in enumerate(set(dataset['persons']))}
lug_boot_label =  { ni: n for n, ni in enumerate(set(dataset['lug_boot']))}
safety_label =  { ni: n for n, ni in enumerate(set(dataset['safety']))}
target_label =  { ni: n for n, ni in enumerate(set(dataset['target']))}

In [12]:
buying_label

{'low': 0, 'med': 1, 'vhigh': 2, 'high': 3}

In [13]:
dataset['buying'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [14]:
df = dataset

In [15]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [16]:
df['buying'] = df['buying'].map(buying_label)
df['maint'] = df['maint'].map(maint_label)
df['doors'] = df['doors'].map(doors_label)
df['persons'] = df['persons'].map(persons_label)
df['lug_boot'] = df['lug_boot'].map(lug_boot_label)
df['safety'] = df['safety'].map(safety_label)
df['target'] = df['target'].map(target_label)

In [17]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target
0,2,2,3,2,0,1,2
1,2,2,3,2,0,0,2
2,2,2,3,2,0,2,2
3,2,2,3,2,1,1,2
4,2,2,3,2,1,0,2


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [19]:
X = df.iloc[:, 0:-1].to_numpy()
y = df['target'].to_numpy()

randIndex = np.arange(X.shape[0])
np.random.shuffle(randIndex)

X = X[randIndex]
y = y[randIndex]

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.1)

In [21]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
def calculate_distance(p1,p2):
    dimension = len(p1)
    distance = 0

    for i in range(dimension):
        distance += (p1[i] - p2[i])*(p1[i] - p2[i])

    return math.sqrt(distance)

def get_k_neighbors(training_X, label_y, point, k):
    distances = []
    neighbors = []
    for i in range(len(training_X)):
        distance = calculate_distance(training_X[i], point)
        distances.append((distance, label_y[i]))

    distances.sort(key=operator.itemgetter(0)) # sort by distance

    for i in range(k):
        neighbors.append(distances[i][1])

    return neighbors

def highest_votes(labels):
    labels_count = [0,0,0,0,0]
    for label in labels:
        labels_count[label] += 1

    max_count = max(labels_count)
    return labels_count.index(max_count)

def predict(training_X, label_y, point, k):
    neighbors_labels = get_k_neighbors(training_X, label_y, point, k)
    return highest_votes(neighbors_labels)

def accuracy_score(predicts, labels):
    total = len(predicts)
    correct_count = 0
    for i in range(total):
        if predicts[i] == labels[i]:
            correct_count += 1
    accuracy = correct_count/total
    return accuracy

In [27]:
k=5
y_predict = []
for p in X_test:
    label = predict(X_train, y_train, p, k)
    y_predict.append(label)

acc = accuracy_score(y_predict, y_test)
print('Accuracy My KNN: ' + str(acc))


Accuracy My KNN: 0.9364161849710982


In [28]:
from sklearn.metrics import accuracy_score
from sklearn import datasets, neighbors

In [26]:
knn = neighbors.KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
y_predict = knn.predict(X_test)

accuracy = accuracy_score(y_predict, y_test)
print('accuracy sklearn KNN: ' + str(accuracy))

accuracy sklearn KNN: 0.9364161849710982
