In [10]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
def get_data():

#----1. Prepare the Training Data
#----Features: [Weight in grams, Texture scale 1-10]
#----Labels: 0 for Apple, 1 for Orange
    features_train = [
        [140, 8],  #----Apple 1
        [150, 7],  #----Apple 2
        [130, 9],  #----Apple 3
        [200, 2],  #----Orange 1
        [220, 3],  #----Orange 2
        [210, 1],  #----Orange 3
    ]
    classes_train = ['Apple','Apple','Apple', 'Orange', 'Orange', 'Orange']

    features_test = [
        [170, 8],  #----Apple
        [190, 4]   #----Orange
    ]
    classes_test = ['Apple', 'Orange']

#----Give meaningful names to features and classes. Classes must be in order of their labels.
    feature_names = ['weight in grams','texture']
    class_names = ['Apple','Orange']

    return features_train,classes_train,features_test,classes_test,feature_names,class_names

In [12]:
features_train,classes_train,features_test,classes_test,feature_names,class_names = get_data()
feature_names,class_names,features_train,classes_train,features_test,classes_test


(['weight in grams', 'texture'],
 ['Apple', 'Orange'],
 [[140, 8], [150, 7], [130, 9], [200, 2], [220, 3], [210, 1]],
 ['Apple', 'Apple', 'Apple', 'Orange', 'Orange', 'Orange'],
 [[170, 8], [190, 4]],
 ['Apple', 'Orange'])

In [13]:
def get_data_from_csv(file_name):

    data = np.genfromtxt(file_name, delimiter=',', names=True, filling_values=0, dtype=None, \
ndmin=1)
    # print(f"Extracting from {file_name} and got {data}")
#----Extract all except the last column name as feature names
    feature_names = list(data.dtype.names[:-1])
    print(f"Feature names {feature_names}")
#----Extract unique values from last column as class names
    class_names = np.unique(data[data.dtype.names[-1]]).tolist()
    # print(f"Class names {class_names}")

#----Extract all except the last column as features
    features = data[feature_names].tolist()
    # print(f"Features are {features}")
#----Extract last column as classes
    np_classes = list(data[data.dtype.names[-1]])
    classes = [str(item) for item in np_classes]
    # print(f"All classes {classes}")

    return features,classes,feature_names,class_names

In [14]:
features_all, classes_all, feature_names, class_names = get_data_from_csv("/content/drive/MyDrive/Colab Notebooks/AllCars.csv")

# Showing what is loaded
print(f"Total cars: {len(features_all)}")
print(f"Features: {feature_names}")
print(f"Styles: {class_names}")
print(f"First car example: {features_all[0]} -> {classes_all[0]}")

Feature names ['Make', 'Volume', 'Doors']
Total cars: 150
Features: ['Make', 'Volume', 'Doors']
Styles: ['Jeep', 'Pickup', 'SUV', 'Sedan', 'Van']
First car example: ('Toyota', 102, 4) -> Sedan


In [15]:
# Making them numbers
from sklearn.preprocessing import LabelEncoder

# Create encoder for Make
make_encoder = LabelEncoder()
makes_numeric = make_encoder.fit_transform([car[0] for car in features_all])

# Getting numbers val/door
volumes = [car[1] for car in features_all]
doors = [car[2] for car in features_all]

# Combine into array with all numeric features
import numpy as np
features_numeric = np.column_stack([makes_numeric, volumes, doors])

print(f"Original first car: {features_all[0]}")
print(f"Numeric first car: {features_numeric[0]}")
print(f"Make encoding: {list(make_encoder.classes_)}")

Original first car: ('Toyota', 102, 4)
Numeric first car: [ 34 102   4]
Make encoding: [np.str_('Acura'), np.str_('Audi'), np.str_('Audi A5'), np.str_('BMW'), np.str_('Buick'), np.str_('Cadilac'), np.str_('Chevrolet'), np.str_('Chevrolet Equinox LT'), np.str_('Chevrolet Trailblazer'), np.str_('Chrysler'), np.str_('Dodge'), np.str_('Fiat'), np.str_('Ford'), np.str_('Ford Bronco'), np.str_('Ford Focus SE'), np.str_('Honda'), np.str_('Honda Accord V6'), np.str_('Hyundai'), np.str_('Infiniti'), np.str_('Infiniti Q50'), np.str_('Jeep'), np.str_('Kia'), np.str_('Land Rover'), np.str_('Land Rover Discovery'), np.str_('Lexus'), np.str_('Mazda'), np.str_('Mercedes'), np.str_('Mercedes GLA 250'), np.str_('Nissan'), np.str_('Porsche'), np.str_('Porsche '), np.str_('Porshe'), np.str_('Ram'), np.str_('Tesla'), np.str_('Toyota'), np.str_('Toyota RAV4 XSE'), np.str_('Volkswagen'), np.str_('Volvo')]


In [16]:
from sklearn.preprocessing import MinMaxScaler

# Normalize  features to 0-1 range
scaler = MinMaxScaler()
features_normalized = scaler.fit_transform(features_numeric)

print(f"Before normalization: {features_numeric[0]}")
print(f"After normalization: {features_normalized[0]}")
print(f"Shape: {features_normalized.shape}")

Before normalization: [ 34 102   4]
After normalization: [0.91891892 0.19838057 0.66666667]
Shape: (150, 3)


In [17]:
from sklearn.model_selection import train_test_split

# Split 80% training, 20% testing
features_train, features_test, classes_train, classes_test = train_test_split(
    features_normalized,
    classes_all,
    test_size=0.2,
    random_state=42
)

print(f"Training set: {len(features_train)} cars")
print(f"Testing set: {len(features_test)} cars")
print(f"First training example: {features_train[0]} -> {classes_train[0]}")
print(f"First testing example: {features_test[0]} -> {classes_test[0]}")

Training set: 120 cars
Testing set: 30 cars
First training example: [0.75675676 0.21052632 0.66666667] -> SUV
First testing example: [0.10810811 0.36842105 1.        ] -> SUV


In [18]:
def get_predictions(K,features_train,classes_train,features_test):

#----Initialize the K-NN Classifier
    knn = KNeighborsClassifier(n_neighbors=K)
#----Train the model
    knn.fit(features_train, classes_train)
#----Predict for the test data
    predictions = knn.predict(features_test)

    return knn,predictions

In [19]:
# Checking to see which K would be best
accuracy_results = []

for K in range(1, 21, 2):
    knn, predictions = get_predictions(K, features_train, classes_train, features_test)
    accuracy = accuracy_score(classes_test, predictions)
    accuracy_results.append([K, accuracy])
    print(f"K={K}: Accuracy = {accuracy:.4f}")

# Find the best K
best_result = max(accuracy_results, key=lambda x: x[1])
print(f"\nBest K: {best_result[0]} with accuracy: {best_result[1]:.4f}")

K=1: Accuracy = 0.6667
K=3: Accuracy = 0.5000
K=5: Accuracy = 0.5667
K=7: Accuracy = 0.5000
K=9: Accuracy = 0.5000
K=11: Accuracy = 0.5667
K=13: Accuracy = 0.5667
K=15: Accuracy = 0.5667
K=17: Accuracy = 0.5333
K=19: Accuracy = 0.5333

Best K: 1 with accuracy: 0.6667


In [20]:
knn,predictions = get_predictions(3,features_train,classes_train,features_test)
knn,predictions

(KNeighborsClassifier(n_neighbors=3),
 array(['SUV', 'Pickup', 'Jeep', 'Sedan', 'SUV', 'SUV', 'Sedan', 'SUV',
        'Sedan', 'SUV', 'SUV', 'Jeep', 'Pickup', 'Sedan', 'Sedan', 'Sedan',
        'Jeep', 'Pickup', 'SUV', 'SUV', 'Sedan', 'SUV', 'SUV', 'Jeep',
        'Van', 'Pickup', 'Sedan', 'SUV', 'SUV', 'Sedan'], dtype='<U6'))

In [21]:
def print_predictions(knn,predictions,features_test,classes_test):

#----Check the probability (How sure is the model?)
    probability = knn.predict_proba(features_test)
    confidences = np.max(probability, axis=1)
#----Output the result for each test data
    for index, value in enumerate(classes_test):
        print(f"The {classes_test[index]} is classified as: {predictions[index]}")
        print(f"Confidence is {confidences[index]}")

#----Compute the accuracy
    accuracy = accuracy_score(classes_test,predictions)
    print(f"The accuracy is {accuracy}")

In [22]:
print_predictions(knn,predictions,features_test,classes_test)

The SUV is classified as: SUV
Confidence is 1.0
The SUV is classified as: Pickup
Confidence is 0.6666666666666666
The Jeep is classified as: Jeep
Confidence is 0.3333333333333333
The Pickup is classified as: Sedan
Confidence is 0.6666666666666666
The Sedan is classified as: SUV
Confidence is 0.6666666666666666
The SUV is classified as: SUV
Confidence is 0.6666666666666666
The SUV is classified as: Sedan
Confidence is 0.6666666666666666
The SUV is classified as: SUV
Confidence is 0.6666666666666666
The SUV is classified as: Sedan
Confidence is 0.6666666666666666
The Jeep is classified as: SUV
Confidence is 1.0
The SUV is classified as: SUV
Confidence is 1.0
The Jeep is classified as: Jeep
Confidence is 0.3333333333333333
The Sedan is classified as: Pickup
Confidence is 0.3333333333333333
The Sedan is classified as: Sedan
Confidence is 1.0
The Sedan is classified as: Sedan
Confidence is 0.6666666666666666
The Sedan is classified as: Sedan
Confidence is 0.6666666666666666
The Pickup is cl

In [23]:
import pandas as pd

#SAVING ALL THE THINGS

# 1. Save Accuracy.csv
accuracy_df = pd.DataFrame(accuracy_results, columns=['K', 'Accuracy'])
accuracy_df.to_csv('/content/drive/MyDrive/Colab Notebooks/Accuracy.csv', index=False)
print("✓ Saved Accuracy.csv")

# 2. Save Training.csv
training_df = pd.DataFrame(features_train, columns=['Make_Normalized', 'Volume_Normalized', 'Doors_Normalized'])
training_df['Style'] = classes_train
training_df.to_csv('/content/drive/MyDrive/Colab Notebooks/Training.csv', index=False)
print("✓ Saved Training.csv")

# 3. Save Testing.csv with predictions using best K
best_K = best_result[0]
knn, predictions = get_predictions(best_K, features_train, classes_train, features_test)
probabilities = knn.predict_proba(features_test)
confidences = np.max(probabilities, axis=1)

testing_df = pd.DataFrame(features_test, columns=['Make_Normalized', 'Volume_Normalized', 'Doors_Normalized'])
testing_df['Style'] = classes_test
testing_df['Prediction'] = predictions
testing_df['Confidence'] = confidences
testing_df.to_csv('/content/drive/MyDrive/Colab Notebooks/Testing.csv', index=False)
print("✓ Saved Testing.csv")

print("\nAll files saved!")

✓ Saved Accuracy.csv
✓ Saved Training.csv
✓ Saved Testing.csv

All files saved!
