In [77]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.model_selection import train_test_split



In [78]:
# Load Data
# A relative path is used (instead of a mounted Google Drive path) for GitHub
# compatibility and easier grading.
df = pd.read_csv('AllCars.csv')

# remove non-ordinal features
df_filtered = df.drop(columns=['Make'])

# Two feature columns feed the model; 'Style' is the label to predict.
x = df_filtered[['Volume', 'Doors']]
y = df_filtered['Style']

# normalize feature data
# NOTE(review): the scaler is fit on every row, i.e. before the train/test
# split happens, so the test rows influence the scaling range.
scaler = MinMaxScaler().fit(x)
x_scaled = scaler.transform(x)
# Re-wrap the scaled array in a DataFrame that carries the feature names.
x_scaled_df = pd.DataFrame(x_scaled, columns=x.columns)

x_scaled_df, y

(       Volume     Doors
 0    0.198381  0.666667
 1    0.275304  1.000000
 2    0.242915  0.666667
 3    0.327935  1.000000
 4    0.327935  1.000000
 ..        ...       ...
 145  0.275304  0.666667
 146  0.186235  0.666667
 147  0.473684  0.666667
 148  0.170040  0.666667
 149  0.336032  0.666667
 
 [150 rows x 2 columns],
 0      Sedan
 1        SUV
 2      Sedan
 3        SUV
 4        SUV
        ...  
 145      SUV
 146    Sedan
 147      SUV
 148    Sedan
 149     Jeep
 Name: Style, Length: 150, dtype: object)

In [79]:

# split data
# Sequential (unshuffled) split: rows keep their original order, and the cut
# point sits at 80% of the row count, rounded down.
split_index = int(len(x_scaled_df) * 0.8)

# top 80% for Training, bottom 20% for Testing (features)
x_train, x_test = x_scaled_df.iloc[:split_index], x_scaled_df.iloc[split_index:]

# same cut applied to the labels so features and labels stay aligned
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

For this submission, I used a sequential split instead of a randomized one, which we discussed in class as a possible option. I also have another Colab notebook that uses a randomized split, but it produced a fairly high optimal k, so I decided to submit this one. If needed, I can submit the other notebook as well.

In [80]:
# create Training.csv and Testing.csv
# Each file holds the scaled feature columns plus a 'Style' label column.
# `.values` strips the label index so the labels are attached by position.

train_csv = x_train.assign(Style=y_train.values)
train_csv.to_csv('Training.csv', index=False)

test_csv = x_test.assign(Style=y_test.values)
test_csv.to_csv('Testing.csv', index=False)

In [81]:
def get_predictions(K, x_train, y_train, x_test):
    """Fit a K-nearest-neighbours classifier and predict labels for x_test.

    Parameters
    ----------
    K : number of neighbours passed to the classifier as ``n_neighbors``.
    x_train, y_train : training features and their labels.
    x_test : features to predict labels for.

    Returns
    -------
    tuple
        ``(classifier, predictions)``: the fitted classifier and whatever its
        ``predict`` method returns for ``x_test``.
    """
    # Build the classifier with the requested neighbourhood size.
    classifier = KNeighborsClassifier(n_neighbors=K)

    # Train on the training split only.
    classifier.fit(x_train, y_train)

    # Hand back the model too, so callers can query it further.
    return classifier, classifier.predict(x_test)

In [82]:
#knn,predictions = get_predictions(3,x_train,y_train,x_test)
#knn,predictions

In [83]:
# find the best K value
# Every candidate K is scored on the test rows; the K with the highest
# accuracy wins, and the full K/accuracy table is saved to Accuracy.csv.
# NOTE(review): K is tuned on the same test split that is later used to report
# the final accuracy, so that reported accuracy is likely optimistic.
MAX_K = 20  # largest neighbourhood size to try

# KNN cannot use more neighbours than there are training rows; cap the search
# range so a small training set does not make scikit-learn raise ValueError.
largest_k = min(MAX_K, len(x_train))

accuracy_list = []
best_k = 1
max_accuracy = 0

for k in range(1, largest_k + 1):
    _, preds = get_predictions(k, x_train, y_train, x_test)
    accuracy = accuracy_score(y_test, preds)

    accuracy_list.append({'K': k, 'Accuracy': accuracy})

    # Strict '>' keeps the smallest K when several K values tie.
    if accuracy > max_accuracy:
        max_accuracy = accuracy
        best_k = k

# Explicit columns keep the CSV header present even if no K was evaluated.
pd.DataFrame(accuracy_list, columns=['K', 'Accuracy']).to_csv('Accuracy.csv', index=False)
print(f'Best K: {best_k}, Accuracy: {max_accuracy}')

Best K: 5, Accuracy: 0.6333333333333333


In [84]:
def print_predictions(knn, predictions, x_test, y_test):
    """Print every test prediction with its confidence, then the accuracy.

    Parameters
    ----------
    knn : fitted classifier exposing ``predict_proba``.
    predictions : predicted labels for ``x_test``, indexed by position.
    x_test : test features, passed to ``knn.predict_proba``.
    y_test : actual labels; read by position through ``.iloc``.

    Returns
    -------
    The per-row confidence values, i.e. the largest class probability the
    model reports for each test row.
    """
    # How sure is the model? Take the highest class probability per row.
    confidences = np.max(knn.predict_proba(x_test), axis=1)

    # Report each test row by position: actual label, predicted label, confidence.
    for position in range(len(y_test)):
        actual, predicted, confidence = (
            y_test.iloc[position],
            predictions[position],
            confidences[position],
        )
        print(f"The {actual} is classified as: {predicted}")
        print(f"Confidence is {confidence}")

    # Overall share of correct predictions.
    print(f"The accuracy is {accuracy_score(y_test,predictions)}")

    return confidences

In [85]:
# Generate final predictions using the best k
best_knn, final_preds = get_predictions(best_k, x_train, y_train, x_test)

# Print the per-row report and keep the confidence of each prediction.
confidences = print_predictions(best_knn, final_preds, x_test, y_test)

# Re-read the saved test set, attach the two result columns, and overwrite it.
testing_df = pd.read_csv('Testing.csv').assign(
    Prediction=final_preds,
    Confidence=confidences,
)
testing_df.to_csv('Testing.csv', index=False)


The Sedan is classified as: Sedan
Confidence is 0.8
The SUV is classified as: Sedan
Confidence is 0.8
The SUV is classified as: Pickup
Confidence is 0.6
The SUV is classified as: Sedan
Confidence is 0.8
The Sedan is classified as: Sedan
Confidence is 0.6
The Sedan is classified as: Sedan
Confidence is 0.6
The Sedan is classified as: Sedan
Confidence is 0.6
The Sedan is classified as: Sedan
Confidence is 0.8
The Sedan is classified as: Sedan
Confidence is 0.8
The SUV is classified as: Sedan
Confidence is 0.6
The SUV is classified as: SUV
Confidence is 0.8
The SUV is classified as: SUV
Confidence is 0.8
The SUV is classified as: SUV
Confidence is 0.8
The SUV is classified as: SUV
Confidence is 0.8
The Pickup is classified as: Sedan
Confidence is 0.8
The SUV is classified as: SUV
Confidence is 0.8
The Pickup is classified as: Sedan
Confidence is 0.8
The SUV is classified as: SUV
Confidence is 0.8
The Pickup is classified as: Sedan
Confidence is 0.8
The SUV is classified as: SUV
Confidence