# In [2]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Cluster the cars in AllCars.csv on their normalized Volume and Doors, label
# every cluster with its most common Style, and export two CSV reports:
#   ClusterCars.csv     - one row per car with its cluster's majority style
#   ClusterAccuracy.csv - one row per cluster with its size and accuracy

# Single source of truth for the settings shared by several steps below, so
# the scaler, the model and the per-cluster loop cannot drift apart.
FEATURES = ['Volume', 'Doors']
N_CLUSTERS = 5

# Load Data
df = pd.read_csv('AllCars.csv')

# Normalize Volume and Doors so both features are on a comparable scale for
# K-means (MinMaxScaler's default range is [0, 1]).
# NOTE: the columns are overwritten in place, so the Volume/Doors values later
# written to ClusterCars.csv are the normalized ones, not the raw inputs.
scaler = MinMaxScaler()
df[FEATURES] = scaler.fit_transform(df[FEATURES])

# K-means clustering into N_CLUSTERS clusters (fixed seed for reproducibility)
X = df[FEATURES]
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X)

# Find majority style for each cluster and compute accuracy
cluster_info = []   # one summary dict per non-empty cluster, in label order
cluster_map = {}    # cluster label -> majority style
for i in range(N_CLUSTERS):
    cluster_data = df[df['Cluster'] == i]
    if cluster_data.empty:
        # Nothing to summarize; rows can never map to this label anyway.
        continue

    # The style that appears the most in the cluster becomes its label.
    majority_style = cluster_data['Style'].value_counts().idxmax()
    cluster_map[i] = majority_style

    # Accuracy is the number of cars with the majority style divided by the
    # total number of cars in the cluster.
    accuracy = (cluster_data['Style'] == majority_style).sum() / len(cluster_data)

    cluster_info.append({
        'ClusterStyle': majority_style,
        'SizeOfCluster': len(cluster_data),
        'Accuracy': accuracy,
    })

# Attach each car's cluster-level majority style.
df['ClusterStyle'] = df['Cluster'].map(cluster_map)

# Create ClusterCars.csv
df[FEATURES + ['Style', 'ClusterStyle']].to_csv('ClusterCars.csv', index=False)

# Create ClusterAccuracy.csv
pd.DataFrame(cluster_info).to_csv('ClusterAccuracy.csv', index=False)
