In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this notebook (relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the lab folder on the mounted Drive the working directory, then list
# its contents to check which files are available there.
%cd /content/drive/MyDrive/ColabNotebooks/CSC345/LabTask3
!ls

/content/drive/MyDrive/ColabNotebooks/CSC345/LabTask3
AllCars.csv


In [None]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler


def load_and_normalize(input_csv: str) -> tuple[pd.DataFrame, np.ndarray]:
    """Load the car data set and min-max scale its numeric features.

    Reads ``input_csv``, keeps only the ``Volume``, ``Doors`` and ``Style``
    columns, discards rows where any of them is missing or where ``Volume`` /
    ``Doors`` cannot be parsed as a number, and rescales the two numeric
    columns with a default-configured ``MinMaxScaler``.

    Args:
        input_csv: Path of the CSV file to read.

    Returns:
        A ``(cars, features)`` pair. ``cars`` is a DataFrame with the scaled
        ``Volume`` and ``Doors`` columns plus ``Style`` converted to ``str``;
        surviving rows keep their original index labels. ``features`` is the
        array produced by the scaler, one row per row of ``cars``, with
        ``Volume`` in column 0 and ``Doors`` in column 1.

    Raises:
        ValueError: If a required column is absent, or if no row survives
            cleaning (there would be nothing to scale or cluster).
    """
    df = pd.read_csv(input_csv)

    required = {"Volume", "Doors", "Style"}
    if not required.issubset(df.columns):
        raise ValueError(f"Missing required columns {required}. Found: {set(df.columns)}")

    cars = df[["Volume", "Doors", "Style"]].copy()

    # Clean + enforce numeric: non-numeric text is coerced to NaN and the
    # affected rows are dropped together with rows that were already missing.
    cars = cars.dropna(subset=["Volume", "Doors", "Style"])
    cars["Volume"] = pd.to_numeric(cars["Volume"], errors="coerce")
    cars["Doors"] = pd.to_numeric(cars["Doors"], errors="coerce")
    cars = cars.dropna(subset=["Volume", "Doors"])

    # Fail early with a message that names the file and the cause instead of
    # letting the scaler reject a zero-row frame further down.
    if cars.empty:
        raise ValueError(
            f"No usable rows in {input_csv!r}: every row has a missing or "
            "non-numeric Volume/Doors value or a missing Style."
        )

    # Rescale the numeric features so both contribute on a comparable scale.
    scaler = MinMaxScaler()
    features = scaler.fit_transform(cars[["Volume", "Doors"]])

    # Store normalized values back into cars (positional, row order matches).
    cars["Volume"] = features[:, 0]
    cars["Doors"] = features[:, 1]

    # Style is used as a label later on; make sure every value is a str.
    cars["Style"] = cars["Style"].astype(str)

    return cars, features


def majority_style_per_cluster(cars: pd.DataFrame, cluster_ids: np.ndarray) -> dict[int, str]:
    """Pick the most frequent ``Style`` among the cars of every cluster.

    Args:
        cars: Frame with a ``Style`` column; it is not modified.
        cluster_ids: One cluster label per row of ``cars``, in row order.

    Returns:
        Mapping from cluster id (as ``int``) to the most common style of that
        cluster (as ``str``), with keys inserted in ascending id order.
    """
    labelled = cars.assign(ClusterId=cluster_ids)

    def dominant_style(cid) -> str:
        # Tally the styles of this cluster's members and keep the top one.
        in_cluster = labelled["ClusterId"] == cid
        tally = labelled.loc[in_cluster, "Style"].value_counts()
        return str(tally.idxmax())

    ordered_ids = sorted(labelled["ClusterId"].unique())
    return {int(cid): dominant_style(cid) for cid in ordered_ids}


def build_cluster_cars(cars: pd.DataFrame, cluster_ids: np.ndarray, cluster_to_style: dict[int, str]) -> pd.DataFrame:
    """Attach to every car the style assigned to its cluster.

    Args:
        cars: Frame with ``Volume``, ``Doors`` and ``Style`` columns; it is
            not modified.
        cluster_ids: One cluster label per row of ``cars``, in row order.
        cluster_to_style: Style chosen for each cluster id.

    Returns:
        A new frame with exactly the columns ``Volume``, ``Doors``, ``Style``
        and ``ClusterStyle`` (the cluster id itself is not part of the output).
    """
    output_columns = ["Volume", "Doors", "Style", "ClusterStyle"]

    labelled = cars.assign(ClusterId=cluster_ids)
    # Translate each row's cluster id into the style chosen for that cluster.
    labelled = labelled.assign(ClusterStyle=labelled["ClusterId"].map(cluster_to_style))

    return labelled[output_columns]


def build_cluster_accuracy(cars: pd.DataFrame, cluster_ids: np.ndarray, cluster_to_style: dict[int, str]) -> pd.DataFrame:
    """Summarise how well each cluster matches the style assigned to it.

    Args:
        cars: Frame with a ``Style`` column; it is not modified.
        cluster_ids: One cluster label per row of ``cars``, in row order.
        cluster_to_style: Style chosen for each cluster id; every id present
            in ``cluster_ids`` must be a key.

    Returns:
        A frame with one row per cluster, in ascending cluster-id order, and
        the columns ``ClusterStyle``, ``SizeOfCluster`` and ``Accuracy`` (the
        fraction of the cluster's cars whose ``Style`` equals the cluster's
        assigned style).
    """
    labelled = cars.assign(ClusterId=cluster_ids)

    def summarise(cid) -> dict:
        # One output record: assigned style, member count, share of matches.
        assigned = cluster_to_style[int(cid)]
        member_styles = labelled.loc[labelled["ClusterId"] == cid, "Style"]
        total = int(len(member_styles))
        hits = int((member_styles == assigned).sum())
        if total > 0:
            share = float(hits / total)
        else:
            share = 0.0
        return {"ClusterStyle": assigned, "SizeOfCluster": total, "Accuracy": share}

    records = [summarise(cid) for cid in sorted(labelled["ClusterId"].unique())]
    return pd.DataFrame(records, columns=["ClusterStyle", "SizeOfCluster", "Accuracy"])


def main(
    input_csv: str = "AllCars.csv",
    output_cluster_cars: str = "ClusterCars.csv",
    output_cluster_accuracy: str = "ClusterAccuracy.csv",
    n_clusters: int = 5,
    random_state: int = 42,
) -> None:
    """Cluster the cars with K-Means and write the per-car and per-cluster results.

    Loads and normalizes the data, clusters the scaled features, labels each
    cluster with its majority style, writes two CSV files and prints the
    accuracy table. Calling ``main()`` with no arguments uses the file names
    and settings listed as defaults.

    Args:
        input_csv: CSV file to read the cars from.
        output_cluster_cars: Destination CSV for the per-car table
            (``Volume``, ``Doors``, ``Style``, ``ClusterStyle``).
        output_cluster_accuracy: Destination CSV for the per-cluster table
            (``ClusterStyle``, ``SizeOfCluster``, ``Accuracy``).
        n_clusters: Number of clusters passed to ``KMeans``.
        random_state: Seed passed to ``KMeans`` so repeated runs give the
            same clustering.
    """
    # Load + normalize
    cars, features = load_and_normalize(input_csv)

    # K-Means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    cluster_ids = kmeans.fit_predict(features)

    # Majority style per cluster
    cluster_to_style = majority_style_per_cluster(cars, cluster_ids)

    # Outputs
    cluster_cars = build_cluster_cars(cars, cluster_ids, cluster_to_style)
    cluster_accuracy = build_cluster_accuracy(cars, cluster_ids, cluster_to_style)

    # index=False: the row index carries no information in either table.
    cluster_cars.to_csv(output_cluster_cars, index=False)
    cluster_accuracy.to_csv(output_cluster_accuracy, index=False)

    print("Saved:", output_cluster_cars, output_cluster_accuracy)
    print(cluster_accuracy)


# Entry-point guard: run the pipeline only when this code is executed
# directly (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


Saved: ClusterCars.csv ClusterAccuracy.csv
  ClusterStyle  SizeOfCluster  Accuracy
0          SUV              9  0.555556
1          SUV             30  0.733333
2        Sedan             83  0.578313
3        Sedan              7  0.714286
4          SUV             31  0.580645
