In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/ColabNotebooks/CSC345/LabTask3
!ls

/content/drive/MyDrive/ColabNotebooks/CSC345/LabTask3
AllCars.csv  ClusterAccuracy.csv  ClusterCars.csv  KMeansClassifier.ipynb


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt


def load_clean_data(input_csv: str) -> pd.DataFrame:
    """
    Read the cars CSV and keep only usable Volume / Doors / Style rows.

    Rows are discarded when any of the three fields is missing or when
    Volume or Doors cannot be parsed as a number. Style is returned as text.
    The original row index of the surviving rows is preserved.

    Raises:
        ValueError: if the CSV lacks any of the three required columns.
    """
    df = pd.read_csv(input_csv)

    required = {"Volume", "Doors", "Style"}
    if required - set(df.columns):
        raise ValueError(f"Missing required columns {required}. Found: {set(df.columns)}")

    return (
        df[["Volume", "Doors", "Style"]]
        # Drop rows with any missing field first so that a missing Style
        # never gets turned into the literal text "nan" further down.
        .dropna()
        # Non-numeric entries become NaN here and are removed right after.
        .assign(
            Volume=lambda frame: pd.to_numeric(frame["Volume"], errors="coerce"),
            Doors=lambda frame: pd.to_numeric(frame["Doors"], errors="coerce"),
        )
        .dropna(subset=["Volume", "Doors"])
        .assign(Style=lambda frame: frame["Style"].astype(str))
    )


def split_and_normalize(
    cars: pd.DataFrame,
    test_size: float,
    random_state: int
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, pd.DataFrame, pd.DataFrame]:
    """
    Split the cars into train/test sets and min-max scale Volume and Doors.

    Args:
        cars: Frame with "Volume", "Doors" and "Style" columns.
        test_size: Passed through to ``train_test_split``.
        random_state: Seed passed to ``train_test_split`` for a repeatable split.

    Returns:
        A 6-tuple:
        ``(X_train_scaled, X_test_scaled, y_train, y_test,
        X_test_scaled_df, X_test_original)`` where the scaled arrays are
        NumPy arrays, ``X_test_scaled_df`` is the scaled test set as a
        DataFrame with "Volume"/"Doors" columns, and ``X_test_original`` is
        the unscaled test features with a fresh 0..n-1 index.
    """
    feature_columns = ["Volume", "Doors"]
    X = cars[feature_columns]
    y = cars["Style"].to_numpy()

    # A stratified split needs at least two classes, and scikit-learn raises
    # ValueError when any class has fewer than two rows. Fall back to a plain
    # random split in either situation instead of crashing.
    class_counts = cars["Style"].value_counts()
    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
    stratify_labels = cars["Style"] if can_stratify else None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify_labels
    )

    # Fit the scaler on the training rows only; the test rows are transformed
    # with the training min/max so they do not influence the scaling.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_columns)

    return X_train_scaled, X_test_scaled, y_train, y_test, X_test_scaled_df, X_test.reset_index(drop=True)


def save_tree_png(model: DecisionTreeClassifier, output_png: str) -> None:
    """
    Render a fitted decision tree and write it to an image file.

    Args:
        model: A fitted classifier; its ``classes_`` are used as node labels.
        output_png: Destination path for the image (saved at 200 dpi).
    """
    fig = plt.figure(figsize=(18, 10))
    try:
        plot_tree(
            model,
            feature_names=["Volume", "Doors"],
            # Pass a plain list of strings rather than the raw ndarray:
            # some scikit-learn releases validate class_names as a list.
            class_names=[str(label) for label in model.classes_],
            filled=True,
            rounded=True
        )
        fig.tight_layout()
        fig.savefig(output_png, dpi=200)
    finally:
        # Always release this specific figure, even if drawing or saving fails.
        plt.close(fig)


def main():
    """
    Train a decision tree on AllCars.csv and write its outputs.

    Produces TreeCars.png (a drawing of the fitted tree) and TreeCars.csv
    (scaled test features, true and predicted Style, plus a final row that
    carries the test accuracy), then prints the file names and the accuracy.
    """
    seed = 42
    holdout_fraction = 0.2
    source_csv = "AllCars.csv"
    png_path = "TreeCars.png"
    csv_path = "TreeCars.csv"

    cleaned = load_clean_data(source_csv)
    (
        X_train_scaled,
        X_test_scaled,
        y_train,
        y_test,
        X_test_scaled_df,
        _,
    ) = split_and_normalize(cleaned, test_size=holdout_fraction, random_state=seed)

    # Default (unconstrained) tree; only the seed is fixed.
    classifier = DecisionTreeClassifier(random_state=seed)
    classifier.fit(X_train_scaled, y_train)

    predicted = classifier.predict(X_test_scaled)
    accuracy = float(accuracy_score(y_test, predicted))

    save_tree_png(classifier, png_path)

    # Per-car rows use the normalized Volume/Doors values.
    columns = {
        "Volume": X_test_scaled_df["Volume"].to_numpy(),
        "Doors": X_test_scaled_df["Doors"].to_numpy(),
        "Style": y_test,
        "PredictedStyle": predicted,
    }
    per_car = pd.DataFrame(columns)

    # The last CSV row is a summary line holding the accuracy.
    summary = pd.DataFrame({
        "Volume": [""],
        "Doors": [""],
        "Style": ["Accuracy"],
        "PredictedStyle": [f"{accuracy:.4f}"],
    })

    report = pd.concat([per_car, summary], ignore_index=True)
    report.to_csv(csv_path, index=False)

    print("Saved:", png_path, csv_path)
    print("Accuracy:", accuracy)


# Run the pipeline only when executed as a script (or notebook cell), not on import.
if __name__ == "__main__":
    main()


Saved: TreeCars.png TreeCars.csv
Accuracy: 0.625
