In [9]:
import numpy as np
import pandas as pd
from sklearn import neighbors, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [10]:
#importing the data
data = pd.read_csv('!car.data')
print(data.head())

  buying  maint doors persons lug_boot safety  class
0  vhigh  vhigh     2       2    small    low  unacc
1  vhigh  vhigh     2       2    small    med  unacc
2  vhigh  vhigh     2       2    small   high  unacc
3  vhigh  vhigh     2       2      med    low  unacc
4  vhigh  vhigh     2       2      med    med  unacc


In [11]:
#at the top, there are the names of the attributes
#now, lets create features and labels

X = data[['buying', 'maint', 'safety']].values
#don't forget the second bracket, and don't forget the dot values
y = data[['class']]

#now lets check to see if it worked (it does)
print(X, y)

[['vhigh' 'vhigh' 'low']
 ['vhigh' 'vhigh' 'med']
 ['vhigh' 'vhigh' 'high']
 ...
 ['low' 'low' 'low']
 ['low' 'low' 'med']
 ['low' 'low' 'high']]       class
0     unacc
1     unacc
2     unacc
3     unacc
4     unacc
...     ...
1723   good
1724  vgood
1725  unacc
1726   good
1727  vgood

[1728 rows x 1 columns]


In [12]:
#these are names, so we can't fetch these into a machine learning algorithm, so we have to convert these into numbers

#conversion of X: converting the data using a label encoder
Le = LabelEncoder()
# X[0] references the first instance in the dataset, and reads how many there are in that row, which should be 3
# X[:, i] references everything in a column i by taking all rows but keeping the column i
for i in range(len(X[0])):
    X[:, i] = Le.fit_transform(X[:,i])
    
print(X)

    


[[3 3 1]
 [3 3 2]
 [3 3 0]
 ...
 [1 1 1]
 [1 1 2]
 [1 1 0]]


In [13]:
#conversion of y

#creating a dictionary to tell us which labels will be which
label_mapping = {
    'unacc':0,
    'acc':1,
    'good':2,
    'vgood':3
}

y['class'] = y['class'].map(label_mapping)

y = np.array(y)
print(y)

[[0]
 [0]
 [0]
 ...
 [0]
 [2]
 [3]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['class'] = y['class'].map(label_mapping)


In [14]:
# creating our model

knn = neighbors.KNeighborsClassifier(n_neighbors=25, weights='uniform')

In [17]:
#now, we need to seperate our data into two categories

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [18]:
print(X_test)

[[1 0 0]
 [1 2 2]
 [1 1 2]
 ...
 [1 3 1]
 [1 2 0]
 [2 3 2]]


In [19]:
#now, we can train our model

In [22]:
knn.fit(X_train, y_train)

  knn.fit(X_train, y_train)


KNeighborsClassifier(n_neighbors=25)

In [24]:
#now, we can look at the predictions

pred_y = knn.predict(X_test)

#now, we can measure accuracy

accuracy = metrics.accuracy_score(y_test, pred_y)
print("predictions", pred_y)
print("accuracy", accuracy)

predictions [3 2 2 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1
 3 1 0 0 1 0 3 0 0 1 0 1 0 0 2 0 0 1 1 0 2 1 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0
 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 2 0 0 0 3 0 0 1 0 0 0 0 2 3 0 0 0 0 0 1
 1 0 1 0 0 0 0 0 3 0 0 1 1 1 0 0 0 3 0 0 1 3 1 0 0 0 2 0 1 0 2 0 0 0 0 0 1
 0 2 0 0 1 0 0 0 1 0 0 2 0 0 0 0 0 0 1 0 0 2 0 1 2 1 0 1 2 0 0 3 0 0 1 0 0
 1 3 0 0 0 1 0 1 0 0 0 3 0 1 0 0 1 0 0 1 0 0 0 2 1 1 0 1 1 3 0 0 1 0 1 1 1
 0 0 2 0 1 0 0 0 3 0 0 3 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 2
 0 1 0 1 0 0 0 0 0 0 0 0 3 0 0 0 0 1 1 0 1 1 3 0 0 1 0 0 0 0 0 0 1 0 0 2 0
 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 1 1 0 1 1 0 0 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 0 0 0 0 0]
accuracy 0.7283236994219653


In [37]:
a = 1477
print("actual values ", y[a])
print("predicted value: ", knn.predict(X)[a])

actual values  [1]
predicted value:  1
