# In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

# Load the full dataset from Labtask 2.
# The file must provide the 'Volume', 'Doors' and 'Style' columns used below.
df = pd.read_csv('AllCars.csv')

# Step 1: Normalize Volume and Doors so they are on the same scale.
# MinMaxScaler rescales each column independently (to [0, 1] by default), so
# neither feature dominates the distance computation purely by its units.
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(df[['Volume', 'Doors']])

# Step 2: Use K-Means to find 5 clusters.
# random_state pins the centroid initialisation so reruns give the same labels.
model = KMeans(n_clusters=5, random_state=42, n_init=10)
df['Cluster'] = model.fit_predict(features_scaled)

# Step 3: Find the majority style for each cluster.
cluster_map = {}     # cluster label -> most common Style in that cluster
accuracy_list = []   # one summary row per cluster, in ascending label order

# Iterate over the labels that were actually assigned rather than a
# hard-coded range(5): K-Means can end up with fewer distinct clusters than
# requested (e.g. when the data has fewer distinct points), and an empty
# cluster would make mode()[0] raise and the accuracy division fail.
for label, cluster_data in df.groupby('Cluster'):
    styles = cluster_data['Style']

    # Identify the most common style (Sedan, SUV, etc.). mode() ignores
    # missing values and returns its results sorted, so ties go to the first
    # style in sort order. A cluster with no known Style at all yields an
    # empty result; record None instead of crashing.
    style_modes = styles.mode()
    majority_style = style_modes.iloc[0] if not style_modes.empty else None
    cluster_map[int(label)] = majority_style

    # Calculate how accurate this cluster is: the share of its rows whose
    # true Style equals the majority style (rows with a missing Style count
    # as mismatches because they stay in the denominator).
    correct_count = int((styles == majority_style).sum())
    accuracy_list.append({
        'ClusterStyle': majority_style,
        'SizeOfCluster': len(cluster_data),
        'Accuracy': correct_count / len(cluster_data),
    })

# Step 4: Map styles back and save CSVs.
# ClusterCars.csv: every car with its true Style and its cluster's majority
# style. ClusterAccuracy.csv: the per-cluster summary built above.
df['ClusterStyle'] = df['Cluster'].map(cluster_map)
df[['Volume', 'Doors', 'Style', 'ClusterStyle']].to_csv('ClusterCars.csv', index=False)
pd.DataFrame(accuracy_list).to_csv('ClusterAccuracy.csv', index=False)