In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# --- Load -------------------------------------------------------------------
# The file is whitespace-delimited and column names are supplied explicitly
# via `names=` rather than read from the file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']
# Raw string on purpose: '\s' is not a valid Python string escape, so a plain
# '\s+' literal emits an invalid-escape-sequence warning on current Python
# versions. The regex itself is unchanged.
df = pd.read_csv(url, sep=r'\s+', names=columns)

# Numeric features used both as clustering input and for the per-group
# summary statistics computed further down.
continuous_cols = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']

# --- Clean ------------------------------------------------------------------
# Any entry in 'horsepower' that cannot be parsed as a number becomes NaN so
# the imputer below can fill it.
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')

# Work on a copy so imputation/scaling never touches the columns of `df`.
X = df[continuous_cols].copy()

# Replace missing values with the column mean; SimpleImputer returns a plain
# array, so wrap it back into a DataFrame with the original column names.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed, columns=continuous_cols)

# --- Scale ------------------------------------------------------------------
# Standardise every feature before computing Euclidean distances, so that
# large-valued columns such as 'weight' do not dominate the linkage.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# --- Cluster ----------------------------------------------------------------
# Average-linkage agglomerative clustering into three groups.
clustering = AgglomerativeClustering(n_clusters=3, linkage='average', metric='euclidean')
labels = clustering.fit_predict(X_scaled)

# Attach the assignment to the original frame; the summaries below group on it.
df['cluster'] = labels

def _mean_and_variance_by(frame, key, value_cols):
    """Return the per-group mean and variance of `value_cols`, grouped by `key`."""
    return frame.groupby(key)[value_cols].agg(['mean', 'var'])


# Compute all three summary tables first, then report them in a fixed order.
cluster_stats = _mean_and_variance_by(df, 'cluster', continuous_cols)
origin_stats = _mean_and_variance_by(df, 'origin', continuous_cols)
# Contingency table: rows are cluster labels, columns are origin classes.
crosstab = pd.crosstab(df['cluster'], df['origin'])

print("Cluster Statistics (Mean and Variance):")
print(cluster_stats)

print("\nOrigin Class Statistics (Mean and Variance):")
print(origin_stats)

print("\nCrosstab of Cluster vs Origin:")
print(crosstab)

print("\nAnalysis:")
# Quantify the cluster/origin association with Cramer's V, computed from the
# contingency table:
#     V = sqrt(chi2 / (n * (min(rows, cols) - 1)))
# V is 0 when cluster and origin are independent and 1 when one determines the
# other. The previous check (largest single cell / total count) only reflected
# how large the biggest cluster and biggest origin class are, so it could not
# distinguish association from imbalance.
observed = crosstab.to_numpy(dtype=float)
if min(observed.shape) > 1:
    n_obs = observed.sum()
    # Counts expected under independence: outer product of the row and column
    # totals divided by n. Every total is > 0 because the crosstab only lists
    # categories that actually occur.
    expected = np.outer(observed.sum(axis=1), observed.sum(axis=0)) / n_obs
    chi2 = ((observed - expected) ** 2 / expected).sum()
    cramers_v = float(np.sqrt(chi2 / (n_obs * (min(observed.shape) - 1))))
else:
    # A single cluster or a single origin class carries no association signal.
    cramers_v = 0.0

print(f"Cramer's V between cluster and origin: {cramers_v:.3f}")
# Same 0.5 cutoff as before, now applied to a genuine association measure.
if cramers_v > 0.5:
    print("There is a clear relationship between cluster assignments and origin labels.")
else:
    print("There is no clear relationship between cluster assignments and origin labels.")

Cluster Statistics (Mean and Variance):
               mpg            cylinders           displacement               \
              mean        var      mean       var         mean          var   
cluster                                                                       
0        26.214576  41.397984  4.620339  0.998224   143.391525  3416.251799   
1        14.653535   5.377819  8.000000  0.000000   346.626263  2126.705834   
2        43.700000   0.300000  4.000000  0.000000    91.750000    12.250000   

         horsepower                   weight                acceleration  \
               mean         var         mean            var         mean   
cluster                                                                    
0         86.093426  308.994714  2593.162712  296800.095884    16.433220   
1        160.353535  726.679860  4128.393939  202509.139147    12.694949   
2         49.000000    4.000000  2133.750000   21672.916667    22.875000   

                  model_year