In [33]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only) so files stored there can be read below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [35]:
# Load the AllCars.csv dataset from the mounted Drive folder into a DataFrame.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AllCars.csv")

In [36]:
# Style is the label to classify; Volume and Doors are the predictors.
target = df["Style"]
features = df.loc[:, ["Volume", "Doors"]]

In [37]:
# Rescale both predictors with MinMaxScaler and wrap the resulting array back
# into a DataFrame with the original column names.
# NOTE(review): the scaler is fit on every row before the train/test split
# further down, so the test rows influence the scaling parameters.
scaler = MinMaxScaler()
features_scaled = pd.DataFrame(
    scaler.fit_transform(features),
    columns=["Volume", "Doors"],
)

In [38]:
# Re-attach the label column to a copy of the scaled features
# (assign() returns a new frame, features_scaled itself is left untouched).
processed_df = features_scaled.assign(Style=target)

In [39]:
# 80/20 split with a fixed seed; stratifying on Style keeps the class
# proportions the same in both parts.
train_df, test_df = train_test_split(
    processed_df,
    stratify=processed_df["Style"],
    test_size=0.2,
    random_state=3,
)

In [40]:
# Persist both splits (without the index column) so the later cells can
# re-read them from disk.
for split_df, csv_name in ((train_df, "Training.csv"), (test_df, "Testing.csv")):
    split_df.to_csv(csv_name, index=False)

print("Files created successfully!")

Files created successfully!


In [41]:
def get_data_from_csv(file_name):
    """Load a headered, comma-separated file and split it into features and labels.

    The last column is treated as the class label and every other column as a
    feature. Missing values are replaced by 0 (``filling_values=0``).

    Parameters
    ----------
    file_name : str or path-like
        CSV file whose first row holds the column names.

    Returns
    -------
    features : list of tuple
        One tuple per row with the values of the feature columns.
    classes : list of str
        Class label of each row, converted with ``str``.
    feature_names : list of str
        Names of all columns except the last one.
    class_names : list
        Sorted unique values found in the last column.
    """
    # The encoding is spelled out on purpose: before NumPy 2.0 the default
    # ("bytes") returned text columns as bytes, which turned the labels into
    # "b'SUV'"-style strings after str(); utf-8 matches what pandas.to_csv writes.
    data = np.genfromtxt(
        file_name,
        delimiter=',',
        names=True,
        filling_values=0,
        dtype=None,
        encoding="utf-8",
        ndmin=1,  # keep a one-row file as a 1-D array so indexing below still works
    )

    column_names = data.dtype.names
    label_column = column_names[-1]

    # Every column except the last is a feature.
    feature_names = list(column_names[:-1])
    print(f"Feature names {feature_names}")

    # Unique values of the last column are the class names.
    class_names = np.unique(data[label_column]).tolist()

    # Rows of the feature columns, one tuple per record.
    features = data[feature_names].tolist()

    # Labels of the last column as plain Python strings.
    classes = [str(item) for item in data[label_column]]

    return features, classes, feature_names, class_names

In [42]:
# Training file supplies the features, labels and both name lists.
(features_train, classes_train,
 feature_names, class_names) = get_data_from_csv("Training.csv")
# Only features and labels are needed from the testing file; the name lists
# it returns are discarded into `_`.
features_test, classes_test, *_ = get_data_from_csv("Testing.csv")
# Bare tuple as the last expression so the notebook displays everything loaded.
(feature_names, class_names,
 features_train, classes_train,
 features_test, classes_test)

Feature names ['Volume', 'Doors']
Feature names ['Volume', 'Doors']


(['Volume', 'Doors'],
 ['Jeep', 'Pickup', 'SUV', 'Sedan', 'Van'],
 [(0.17004048582995954, 0.6666666666666666),
  (0.2145748987854251, 0.6666666666666666),
  (0.0769230769230769, 0.6666666666666666),
  (0.12955465587044537, 0.0),
  (0.2105263157894737, 0.9999999999999999),
  (0.22267206477732795, 0.6666666666666666),
  (0.17813765182186234, 0.6666666666666666),
  (0.20647773279352225, 0.6666666666666666),
  (0.2186234817813765, 0.6666666666666666),
  (0.1862348178137652, 0.6666666666666666),
  (0.4655870445344129, 0.6666666666666666),
  (0.33603238866396756, 0.9999999999999999),
  (0.31983805668016196, 0.6666666666666666),
  (0.1902834008097166, 0.6666666666666666),
  (0.3157894736842105, 0.6666666666666666),
  (0.26315789473684215, 0.6666666666666666),
  (0.31983805668016196, 0.6666666666666666),
  (0.27125506072874495, 0.6666666666666666),
  (0.2510121457489879, 0.6666666666666666),
  (0.2388663967611336, 0.6666666666666666),
  (0.5546558704453441, 0.9999999999999999),
  (0.3319838056

In [43]:
def get_predictions(K,features_train,classes_train,features_test):
    """Fit a K-nearest-neighbours classifier and predict the test rows.

    Parameters
    ----------
    K : int
        Number of neighbours passed to ``KNeighborsClassifier``.
    features_train, classes_train
        Training features and their labels, used to fit the model.
    features_test
        Rows to predict.

    Returns
    -------
    tuple
        ``(fitted_classifier, predictions_for_features_test)``.
    """
    # fit() returns the estimator itself, so construction and training chain.
    model = KNeighborsClassifier(n_neighbors=K).fit(features_train, classes_train)
    return model, model.predict(features_test)

In [44]:
# Sweep K and remember the model that scores best on the test split.
# NOTE(review): K is chosen using test-set accuracy, so best_accuracy is an
# optimistic estimate; a validation split or cross-validation would avoid that.
results = []

best_k = None
best_accuracy = 0
best_knn = None
best_predictions = None

# A neighbour query cannot ask for more neighbours than there are fitted
# samples, so never sweep past the size of the training set.
max_k = min(100, len(features_train))

for k in range(1, max_k + 1):

    knn, predictions = get_predictions(
        k,
        features_train,
        classes_train,
        features_test
    )

    acc = accuracy_score(classes_test, predictions)

    results.append([k, acc])

    # "best_knn is None" guarantees a model is kept even when every accuracy
    # is 0; the strict ">" keeps the smallest K among ties.
    if best_knn is None or acc > best_accuracy:
        best_accuracy = acc
        best_k = k
        best_knn = knn
        best_predictions = predictions


In [45]:
# One row per tried K with its accuracy, saved without the index column.
accuracy_df = pd.DataFrame(
    results,
    columns=["K", "Accuracy"],
)
accuracy_df.to_csv("Accuracy.csv", index=False)

print("Accuracy.csv created")

Accuracy.csv created


In [46]:
# Confidence of a prediction = share of its best_k nearest training
# neighbours whose label equals the predicted class.
# All test rows are queried in one call (the same batch that predict() was
# given) instead of one kneighbors() call per row.
neighbor_indices = best_knn.kneighbors(features_test, return_distance=False)

confidences = [
    sum(classes_train[n] == pred_class for n in neighbors) / best_k
    for neighbors, pred_class in zip(neighbor_indices, best_predictions)
]


In [47]:
# Append the best model's predictions and their confidences to the test rows
# and write the result back over Testing.csv.
# NOTE: after this cell the last column of Testing.csv is Confidence, not the
# class, so get_data_from_csv("Testing.csv") must not be re-run on this file
# without regenerating it first.
testing_df = pd.read_csv("Testing.csv").assign(
    Prediction=best_predictions,
    Confidence=confidences,
)

testing_df.to_csv("Testing.csv", index=False)

print("Testing.csv updated with predictions")

Testing.csv updated with predictions


In [48]:
# Report the winning K and its accuracy, one per line.
print(
    f"Best K: {best_k}",
    f"Best Accuracy: {best_accuracy}",
    sep="\n",
)


Best K: 6
Best Accuracy: 0.7333333333333333
