In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [33]:
data = pd.read_csv("Data/car.data")
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [34]:
# Create labels and features
X = data[[
    "buying",
    "maint",
    "safety"
]].values

Y = data[["class"]]

In [35]:
print(X.shape)
print(Y.shape)

(1728, 3)
(1728, 1)


In [36]:
# Convert into numbers to feed machine learning algorithm
# Converting X using LabelEncoder
Le = LabelEncoder()
for i in range(len(X[0])):
    X[: , i] = Le.fit_transform(X[: , i])

print(X)

[[3 3 1]
 [3 3 2]
 [3 3 0]
 ...
 [1 1 1]
 [1 1 2]
 [1 1 0]]


In [37]:
# Converting Y using map
features = {
    'unacc':0,
    'acc':1,
    'good':2,
    'vgood':3
}
Y = Y['class'].map(features)
# let's convert the dataframe(Y) into an numpy array
# Y = Y.to_numpy() # or Y = np.array(Y)
Y = np.array(Y)
print(Y)


[0 0 0 ... 0 2 3]


In [38]:
# Create the KNN classifier model
knn = neighbors.KNeighborsClassifier(n_neighbors=25 , weights="uniform")

In [39]:
# Train the model (our_model.fit())
# In order to train the model, we need labels and features

# We need to seperate our data as training and testing
# Setting ‘stratify’ to Y makes our training split represent the proportion of each value in the Y variable.
X_train , X_test , Y_train , Y_test = train_test_split(X , Y , test_size=0.2 , random_state= 100 , stratify= Y)

# Fit the classifier to the data
knn.fit(X_train , Y_train)

# Test the model (our_model.predict())
knn.predict(X_test)

# Check the accuracy of our model (our_model.score())
knn.score(X_test , Y_test)



0.7167630057803468

## Our model has an accuracy of approximately 75.72%.
#### We will see how we can increase model performance below

##### * train-test-split method is also know as "holdout".
##### * k-Fold Cross-Validation is better than using holdout method.
##### * In order to train and test our model using cross-validation, we will use the "cross_val_score" function

In [40]:
# Create a new KNN model
# Let's set the parameter 'n_neighbors' to 5 with no logic behind that choice.
knn_cv = neighbors.KNeighborsClassifier(n_neighbors=5)

# Train model with cv of 5
cv_scores = cross_val_score(knn_cv , X , Y , cv=5)

print(cv_scores)
print("mean :" , np.mean(cv_scores))

[0.50289017 0.55780347 0.28034682 0.44347826 0.35652174]
mean : 0.42820809248554914


##### We got the mean score is about 42.82% using cross-validation
##### Now, we will use "GridSearchCV" to find the optimal value for 'n_neighbors'.

In [41]:
# Create a new knn model
knn_gscv_model= neighbors.KNeighborsClassifier()

# Create a dictionary of all values we want to test for 'n_neighbors'
n_neighbors_dict = {'n_neighbors' : np.arange(1 , 25)}

# Use gridsearch to test all values for n_neighbors
knn_gscv = GridSearchCV(knn_gscv_model , n_neighbors_dict , cv = 5)

# fit model to data
knn_gscv.fit(X , Y)

In [42]:
knn_gscv.best_params_

{'n_neighbors': 21}

In [43]:
knn_gscv.best_score_


0.4849610454888163