In [24]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score, accuracy_score
from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment
import numpy as np
import pandas as pd

In [25]:
SHOW_PLOTS = False


In [26]:

df = pd.read_csv("dc_motor_data.csv")

# Übersicht
print(df.shape)         # z. B. (58509, 49) → 48 Features + 1 Label
print(df.columns)       # Zeigt die Spaltennamen
print(df.head())        # Erste 5 Zeilen


(29255, 49)
Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31',
       'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41',
       'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'class'],
      dtype='object')
             V1            V2        V3            V4            V5  \
0  2.913200e-06 -5.247700e-06  0.000003 -6.056100e-06  2.778900e-06   
1 -9.584900e-07  5.214300e-08 -0.000047  6.453700e-07 -2.304100e-06   
2 -2.566600e-06 -1.679500e-07  0.000015 -1.598400e-06  8.709200e-07   
3  2.825600e-06  6.067600e-06  0.000118 -4.347500e-06  1.492300e-06   
4 -6.573800e-07 -3.951900e-07  0.000032 -5.296100e-06 -1.605800e-06   

             V6        V7        V8        V9       V10  ...      V40  \
0 -3.752400e-06  0.030804  0.030810  0.030806 -0.033520  ... -0.59314   
1  5.499900e-05  0.03

In [27]:
# Features (ohne Label)
X = df.drop(columns=["class"])

# Zielvariable / Labels
y = df["class"]


In [28]:

# Accuracy via optimal label mapping (Hungarian Algorithmus)
def cluster_accuracy(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    row_ind, col_ind = linear_sum_assignment(-cm)
    mapping = dict(zip(col_ind, row_ind))
    y_aligned = [mapping[label] if label in mapping else -1 for label in y_pred]
    return accuracy_score(y_true, y_aligned)

# Setup
scalers = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler(),
    "Log+StandardScaler": "log_std"
}

clusterers = {
    "KMeans": lambda X: KMeans(n_clusters=11, random_state=42, n_init=20).fit_predict(X),
    "GMM": lambda X: GaussianMixture(n_components=11, random_state=42).fit(X).predict(X),
    "DBSCAN": lambda X: DBSCAN(eps=0.5, min_samples=5).fit_predict(X),
    "Spectral": lambda X: SpectralClustering(n_clusters=11, random_state=42, assign_labels='kmeans', affinity='nearest_neighbors').fit_predict(X),
}

results = []

for scaler_name, scaler in scalers.items():
    print(f"\n Scaler: {scaler_name}")
    
    # Skalierung vorbereiten
    if scaler_name == "Log+StandardScaler":
        X_shifted = X - X.min().min() + 1e-6
        X_log = np.log(X_shifted)
        X_scaled = StandardScaler().fit_transform(X_log)
    else:
        X_scaled = scaler.fit_transform(X)

    # Optional: PCA zur Dimensionsreduktion (10D, nicht 2D!)
    pca = PCA(n_components=10)
    X_reduced = pca.fit_transform(X_scaled)

    for cluster_name, cluster_fn in clusterers.items():
        try:
            labels = cluster_fn(X_reduced)

            # DBSCAN kann Label -1 (Rauswurf) enthalten – bei Accuracy entfernen
            valid_idx = labels != -1
            y_valid = y[valid_idx]
            labels_valid = np.array(labels)[valid_idx]

            ari = adjusted_rand_score(y, labels)
            nmi = normalized_mutual_info_score(y, labels)
            sil = silhouette_score(X_reduced, labels) if len(set(labels)) > 1 else -1
            acc = cluster_accuracy(y_valid, labels_valid) if len(set(labels_valid)) > 1 else -1

            results.append((scaler_name, cluster_name, ari, nmi, sil, acc))

            print(f"  {cluster_name.ljust(12)} | ARI: {ari:.4f} | NMI: {nmi:.4f} | Sil: {sil:.4f} | Acc: {acc:.4f}")
        
        except Exception as e:
            print(f"  {cluster_name.ljust(12)} | Error: {e}")
            results.append((scaler_name, cluster_name, -1, -1, -1, -1))




 Scaler: StandardScaler
  KMeans       | ARI: 0.0599 | NMI: 0.1617 | Sil: 0.2297 | Acc: 0.1818
  GMM          | ARI: 0.0575 | NMI: 0.1529 | Sil: 0.1745 | Acc: 0.1823
  DBSCAN       | ARI: 0.0000 | NMI: 0.0010 | Sil: -0.2505 | Acc: 0.3913




  Spectral     | ARI: 0.0333 | NMI: 0.2153 | Sil: 0.0584 | Acc: 0.1440

 Scaler: MinMaxScaler
  KMeans       | ARI: 0.1667 | NMI: 0.3721 | Sil: 0.2632 | Acc: 0.2885
  GMM          | ARI: 0.1889 | NMI: 0.4099 | Sil: 0.0841 | Acc: 0.2977
  DBSCAN       | ARI: 0.0393 | NMI: 0.2268 | Sil: 0.3989 | Acc: 0.1844




  Spectral     | ARI: 0.0488 | NMI: 0.2516 | Sil: -0.0982 | Acc: 0.2053

Scaler: RobustScaler
  KMeans       | ARI: 0.0000 | NMI: 0.0008 | Sil: 0.9898 | Acc: 0.0927
  GMM          | ARI: 0.0000 | NMI: 0.0008 | Sil: 0.9898 | Acc: 0.0927
  DBSCAN       | ARI: 0.0000 | NMI: 0.0101 | Sil: -0.5251 | Acc: 0.4450
  Spectral     | ARI: 0.0889 | NMI: 0.2647 | Sil: -0.0394 | Acc: 0.2137

 Scaler: Log+StandardScaler
  KMeans       | ARI: 0.0545 | NMI: 0.1506 | Sil: 0.2313 | Acc: 0.1812
  GMM          | ARI: 0.0941 | NMI: 0.1493 | Sil: 0.1589 | Acc: 0.2196
  DBSCAN       | ARI: -0.0000 | NMI: 0.0021 | Sil: -0.3176 | Acc: 0.5405




  Spectral     | ARI: 0.0333 | NMI: 0.2151 | Sil: 0.0684 | Acc: 0.1441


In [30]:
# Ergebnisse anzeigen (lokal)
print("\n📊 Vergleichstabelle:")
print(df_results.to_string(index=False))



 Vergleichstabelle:
            Scaler Clusterer           ARI      NMI  Silhouette  Accuracy
      MinMaxScaler       GMM  1.889053e-01 0.409898    0.084145  0.297659
      MinMaxScaler    KMeans  1.667088e-01 0.372146    0.263182  0.288532
Log+StandardScaler       GMM  9.409728e-02 0.149313    0.158948  0.219621
      RobustScaler  Spectral  8.886618e-02 0.264723   -0.039378  0.213741
    StandardScaler    KMeans  5.986334e-02 0.161718    0.229730  0.181781
    StandardScaler       GMM  5.746371e-02 0.152904    0.174510  0.182328
Log+StandardScaler    KMeans  5.452684e-02 0.150627    0.231300  0.181200
      MinMaxScaler  Spectral  4.879415e-02 0.251627   -0.098249  0.205298
      MinMaxScaler    DBSCAN  3.926015e-02 0.226814    0.398877  0.184389
    StandardScaler  Spectral  3.333202e-02 0.215259    0.058449  0.144044
Log+StandardScaler  Spectral  3.331771e-02 0.215112    0.068356  0.144078
      RobustScaler    DBSCAN  4.784227e-06 0.010064   -0.525128  0.444976
    StandardScale

Überbleibsel

In [13]:
from sklearn.metrics import accuracy_score
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def cluster_accuracy(y_true, y_pred):
    """Berechnet bestmögliche Accuracy durch Matching der Cluster-Labels zu Klassen"""
    cm = confusion_matrix(y_true, y_pred)
    row_ind, col_ind = linear_sum_assignment(-cm)  # Maximiert Übereinstimmung
    optimal_mapping = dict(zip(col_ind, row_ind))
    y_aligned = [optimal_mapping[label] for label in y_pred]
    acc = accuracy_score(y_true, y_aligned)
    return acc


acc = cluster_accuracy(y, cluster_labels)
results.append((name, k, ari, nmi, silhouette, acc))
print(f"  ARI={ari:.4f} | NMI={nmi:.4f} | Silhouette={silhouette:.4f} | Accuracy={acc:.4f}")


  ARI=0.0620 | NMI=0.1620 | Silhouette=0.2368 | Accuracy=0.1976


In [12]:
# Zusammenfassung
print("\n🔍 Vergleich aller Scaler:")
print("Scaler".ljust(20), "ARI".rjust(8), "NMI".rjust(8), "Silhouette".rjust(12))
for name, ari, sil, nmi in results:
    print(f"{name.ljust(20)} {ari:8.4f} {nmi:8.4f} {sil:12.4f}")



 Vergleich aller Scaler:
Scaler                    ARI      NMI   Silhouette


ValueError: too many values to unpack (expected 4)