In [66]:
%pip install numpy matplotlib pandas scikit-learn seaborn ucimlrepo seaborn jinja2 dataframe_image

Collecting dataframe_image
  Downloading dataframe_image-0.2.7-py3-none-any.whl.metadata (9.3 kB)
Collecting nbconvert>=5 (from dataframe_image)
  Using cached nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Collecting aiohttp>=3.10.2 (from dataframe_image)
  Downloading aiohttp-3.13.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (8.1 kB)
Collecting requests (from dataframe_image)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting mistune (from dataframe_image)
  Downloading mistune-3.1.4-py3-none-any.whl.metadata (1.8 kB)
Collecting lxml (from dataframe_image)
  Downloading lxml-6.0.2-cp310-cp310-macosx_10_9_universal2.whl.metadata (3.6 kB)
Collecting beautifulsoup4 (from dataframe_image)
  Downloading beautifulsoup4-4.14.3-py3-none-any.whl.metadata (3.8 kB)
Collecting cssutils (from dataframe_image)
  Downloading cssutils-2.11.1-py3-none-any.whl.metadata (8.7 kB)
Collecting playwright (from dataframe_image)
  Downloading playwright-1.57.0-py3-none-macosx_1

In [173]:
import numpy as np
import pandas as pd
import dataframe_image as dfi
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, make_blobs, make_moons, make_circles
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from collections import Counter
from typing import Callable
from ucimlrepo import fetch_ucirepo
import warnings
warnings.filterwarnings('ignore')
import os

In [148]:
class FuzzyEquivalenceKMeans:
    def __init__(self, n_clusters: int = 3, 
                 E: str = 'E_GG',
                 A: str = 'power_mean',
                 p: float = 1,
                 max_iter: int = 100,
                 random_state: int = 42):
        self.n_clusters = n_clusters
        self.E_name = E
        self.A_name = A
        self.p = p
        self.max_iter = max_iter
        self.random_state = random_state
        
        self.E = self._get_fuzzy_equivalence(E)
        self.A = self._get_aggregation(A)
        
        self.centroids = None
        self.labels = None
        self.clusters = None
        
    def _get_fuzzy_equivalence(self, E_name: str) -> Callable:

        if E_name == 'E_LK':
            return self._E_LK
        elif E_name == 'E_GG':
            return self._E_GG
        elif E_name == 'E_GD':
            return self._E_GD
        elif E_name == 'E_FD':
            return self._E_FD
        elif E_name == 'E_LK_R':
            return self._E_LK_R
        elif E_name == 'E_GD_R':
            return self._E_GD_R
        elif E_name == 'E_FD_R':
            return self._E_FD_R
        elif E_name == 'E_3':
            return self._E_3
        elif E_name == 'E_4':
            return self._E_4
        elif E_name == 'E_5':
            return self._E_5
        elif E_name == 'E_6':
            return self._E_6
        else:
            raise ValueError(f"Unknown fuzzy equivalence function: {E_name}")
    
    def _get_aggregation(self, A_name: str) -> Callable:

        if A_name == 'A2':
            return lambda x: self._power_mean(x, p = self.p)
        
        elif A_name == 'A3':
            return self._minimum
        
        elif A_name == 'A4':
            return self._maximum
        
        else:
            raise ValueError(f"Unknown aggregation function: {A_name}")
    
    def _E_LK(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """E_LK(x,y) = 1 - |x - y|"""
        return 1 - np.abs(x - y)
    
    def _E_GG(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """E_GG(x,y) = min(x, y)/max(x, y) for x ‚â† y, 1 for x = y"""
        with np.errstate(divide='ignore', invalid='ignore'):
            result = np.where(x == y, 1.0, np.minimum(x, y) / np.maximum(x, y))
            # Handle division by zero
            result = np.nan_to_num(result, nan=1.0, posinf=1.0, neginf=0.0)
        return result
    
    def _E_GD(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """E_GD(x,y) = min(x,y) for x ‚â† y, 1 for x = y"""
        return np.where(x == y, 1.0, np.minimum(x, y))
    
    def _E_FD(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        E_FD(x,y) = max(1 - y, x) for x < y, max(1 - x, y) for y < x, 1 for x = y
        """
        return np.where(x == y, 1.0, np.where(x < y, np.maximum(1 - y, x), np.maximum(1 - x, y)))
    
    def _E_LK_R(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """E^R_LK(x,y) = (E_LK(x,y))^2"""
        return self._E_LK(x, y) ** 2
    
    def _E_GD_R(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """E^R_GD(x,y) = E_GD(x,y) * (2 - E_GD(x,y))"""
        e_gd = self._E_GD(x, y)
        return e_gd * (2 - e_gd)
    
    def _E_FD_R(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """E^R_FD(x,y) piecewise function"""
        e_fd = self._E_FD(x, y)
        return np.where(e_fd >= 0.5, 1 - 2 * ((1 - e_fd) ** 2), 2 * (e_fd ** 2))
    
    def _E_3(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """E_3(x,y) = 2*min(x,y)/(x+y) for x+y>0, otherwise 1"""
        denominator = x + y
        with np.errstate(divide='ignore', invalid='ignore'):
            result = np.where(denominator > 0, 
                            2 * np.minimum(x, y) / denominator, 
                            1.0)
            result = np.nan_to_num(result, nan=1.0)
        return np.clip(result, 0, 1)
    
    def _E_4(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """E_4(x,y) = 2*x*y/(x^2+y^2) for x+y>0, otherwise 1"""
        denominator = x**2 + y**2
        with np.errstate(divide='ignore', invalid='ignore'):
            result = np.where((x + y) > 0, 
                            2 * x * y / denominator, 
                            1.0)
            result = np.nan_to_num(result, nan=1.0, posinf=1.0, neginf=0.0)
        return np.clip(result, 0, 1)
    
    def _E_5(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """E_5(x,y) = 2*min(x^2,y^2)/(x^2+y^2) for x+y>0, otherwise 1"""
        denominator = x**2 + y**2
        with np.errstate(divide='ignore', invalid='ignore'):
            result = np.where((x + y) > 0, 
                            2 * np.minimum(x**2, y**2) / denominator, 
                            1.0)
            result = np.nan_to_num(result, nan=1.0, posinf=1.0, neginf=0.0)
        return np.clip(result, 0, 1)
    
    def _E_6(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """E_6(x,y) = (1 - |x-y|)/(1+|x-y|)"""
        numerator = 1 - np.abs(x - y)
        denominator = 1 + np.abs(x - y)
        with np.errstate(divide='ignore', invalid='ignore'):
            result = numerator / denominator
            result = np.nan_to_num(result, nan=1.0)
        return np.clip(result, 0, 1)
    
    def _power_mean(self, values: np.ndarray, p: float = 2) -> np.floating:
        """A2: Power mean of values with parameter p"""
        if p == 0:
            return np.prod(values) ** (1/len(values))
        return (np.mean(values ** p)) ** (1/p)
    
    def _minimum(self, values: np.ndarray) -> float:
        """A3: Minimum of values"""
        return np.min(values)
    
    def _maximum(self, values: np.ndarray) -> float:
        """A4: Maximum of values"""
        return np.max(values)
    
    def _initialize_centroids(self, X: np.ndarray) -> None:
        """Initialize centroids randomly from data points."""
        if self.random_state is not None:
            np.random.seed(self.random_state)
        
        indices = np.random.choice(X.shape[0], self.n_clusters, replace=False)
        self.centroids = X[indices].copy()
    
    def _normalize_data(self, X: np.ndarray) -> np.ndarray:

        X_norm = X.copy().astype(float)
        for i in range(X.shape[1]):
            min_val = np.min(X[:, i])
            max_val = np.max(X[:, i])
            if max_val > min_val:
                X_norm[:, i] = (X[:, i] - min_val) / (max_val - min_val)
            else:
                X_norm[:, i] = 0.5
        return X_norm
    
    def fit(self, X: np.ndarray) -> 'FuzzyEquivalenceKMeans':

        if self.random_state is not None:
            np.random.seed(self.random_state)
        
        n_samples, n_features = X.shape
        
        X_norm = self._normalize_data(X)
        self._initialize_centroids(X)
        centroids_norm = self._normalize_data(self.centroids)
        
        for iteration in range(self.max_iter):

            old_centroids = self.centroids.copy()
            labels = np.zeros(n_samples, dtype=int)
            
            for i in range(n_samples):
                closeness_values = []
                
                for j in range(self.n_clusters):
                    
                    eq_values = self.E(X_norm[i], centroids_norm[j])
                    closeness = self.A(eq_values)
                    closeness_values.append(closeness)

                labels[i] = np.argmax(closeness_values)
            
            self.labels = labels
            
            for j in range(self.n_clusters):
                cluster_points = X[labels == j]
                if len(cluster_points) > 0:
                    self.centroids[j] = np.mean(cluster_points, axis=0)
            
            centroids_norm = self._normalize_data(self.centroids)
            
            if np.allclose(old_centroids, self.centroids, rtol=1e-5):
                print(f"Converged at iteration {iteration + 1}")
                break
            
            if iteration == self.max_iter - 1:
                print(f"Reached maximum iterations ({self.max_iter})")
        
        self.clusters = {}
        for j in range(self.n_clusters):
            self.clusters[j] = {
                'center': self.centroids[j],
                'points': X[self.labels == j]
            }
        
        return self
    
    def predict(self, X: np.ndarray) -> np.ndarray:

        if self.centroids is None:
            raise ValueError("Model must be fitted before prediction")
        
        X_norm = self._normalize_data(X)
        centroids_norm = self._normalize_data(self.centroids)
        
        n_samples = X.shape[0]
        labels = np.zeros(n_samples, dtype=int)
        
        for i in range(n_samples):
            closeness_values = []
            
            for j in range(self.n_clusters):
                eq_values = self.E(X_norm[i], centroids_norm[j])
                closeness = self.A(eq_values)
                closeness_values.append(closeness)
            
            labels[i] = np.argmax(closeness_values)
        
        return labels
    
    def fit_predict(self, X: np.ndarray) -> np.ndarray:

        self.fit(X)
        return self.labels

In [149]:
def fuzzy_silhouette_score(X: np.ndarray, labels: np.ndarray,
                                 model: 'FuzzyEquivalenceKMeans') -> float:
    """
    Simplified version of silhouette score using distance-like measure.
    Uses 1 - closeness as distance.
    """
    n_samples = X.shape[0]
    n_clusters = len(np.unique(labels))
    
    if n_clusters <= 1:
        return 0.0
    
    # Normalize data
    X_norm = model._normalize_data(X)
    centroids_norm = model._normalize_data(model.centroids)
    
    # Compute distance matrix using 1 - closeness
    distance_matrix = np.zeros((n_samples, n_clusters))
    for i in range(n_samples):
        for j in range(n_clusters):
            eq_values = model.E(X_norm[i], centroids_norm[j])
            closeness = model.A(eq_values)
            distance_matrix[i, j] = 1 - closeness
    
    silhouette_vals = np.zeros(n_samples)
    
    for i in range(n_samples):
        cluster_i = labels[i]
        
        # a(i): average distance to points in same cluster
        mask_same = (labels == cluster_i)
        mask_same[i] = False
        if np.sum(mask_same) > 0:
            a = np.mean(distance_matrix[i, cluster_i] + distance_matrix[mask_same, cluster_i])
        else:
            a = 0
        
        # b(i): minimum average distance to other clusters
        b = np.inf
        for cluster_j in range(n_clusters):
            if cluster_j != cluster_i:
                mask_other = (labels == cluster_j)
                if np.sum(mask_other) > 0:
                    avg_dist = np.mean(distance_matrix[i, cluster_j] + distance_matrix[mask_other, cluster_j])
                    b = min(b, avg_dist)
        
        if np.isinf(b):
            b = 0
        
        if max(a, b) > 0:
            silhouette_vals[i] = (b - a) / max(a, b)
    
    return np.mean(silhouette_vals)


def fuzzy_davies_bouldin_score(X: np.ndarray, labels: np.ndarray,
                              model: 'FuzzyEquivalenceKMeans') -> float:
    
    n_samples = X.shape[0]
    n_clusters = len(np.unique(labels))
    
    if n_clusters <= 1:
        return 0.0
    
    X_norm = model._normalize_data(X)
    centroids_norm = model._normalize_data(model.centroids)
    
    S = np.zeros(n_clusters)
    cluster_sizes = np.zeros(n_clusters, dtype=int)
    
    for i in range(n_samples):
        cluster_i = labels[i]
        cluster_sizes[cluster_i] += 1
        
        eq_values = model.E(X_norm[i], centroids_norm[cluster_i])
        closeness = model.A(eq_values)
        S[cluster_i] += 1 - closeness
    
    for j in range(n_clusters):
        if cluster_sizes[j] > 0:
            S[j] = S[j] / cluster_sizes[j]
        else:
            S[j] = 0
    
    M = np.zeros((n_clusters, n_clusters))
    for i in range(n_clusters):
        for j in range(n_clusters):
            if i != j:
                eq_values = model.E(centroids_norm[i], centroids_norm[j])
                closeness = model.A(eq_values)
                M[i, j] = 1 - closeness
    
    R = np.zeros((n_clusters, n_clusters))
    for i in range(n_clusters):
        for j in range(n_clusters):
            if i != j:
                R[i, j] = (S[i] + S[j]) / M[i, j]
    
    D = np.zeros(n_clusters)
    for i in range(n_clusters):
        if np.any(R[i, :] > 0):
            D[i] = np.max(R[i, :])
    
    return np.mean(D)

def fuzzy_calinski_harabasz_score(X: np.ndarray, labels: np.ndarray,
                                 model: 'FuzzyEquivalenceKMeans') -> float:
    
    n_samples = X.shape[0]
    n_clusters = len(np.unique(labels))
    
    if n_clusters <= 1:
        return 0.0
    
    X_norm = model._normalize_data(X)
    centroids_norm = model._normalize_data(model.centroids)
    overall_centroid_norm = np.mean(X_norm, axis=0)
    
    total_dispersion = 0
    for i in range(n_samples):
        eq_values = model.E(X_norm[i], overall_centroid_norm)
        closeness = model.A(eq_values)
        total_dispersion += 1 - closeness
    
    within_dispersion = 0
    cluster_sizes = np.zeros(n_clusters, dtype=int)
    
    for j in range(n_clusters):
        mask = labels == j
        cluster_points = X_norm[mask]
        cluster_sizes[j] = len(cluster_points)
        
        for i in range(len(cluster_points)):
            eq_values = model.E(cluster_points[i], centroids_norm[j])
            closeness = model.A(eq_values)
            within_dispersion += 1 - closeness
    
    between_dispersion = 0
    for j in range(n_clusters):
        if cluster_sizes[j] > 0:
            eq_values = model.E(centroids_norm[j], overall_centroid_norm)
            closeness = model.A(eq_values)
            between_dispersion += cluster_sizes[j] * (1 - closeness)
    
    if within_dispersion == 0 or n_samples - n_clusters == 0:
        return float('inf') if between_dispersion > 0 else 0.0
    
    score = (between_dispersion / (n_clusters - 1)) / (within_dispersion / (n_samples - n_clusters))
    return score

In [172]:
def load_all_datasets():

    datasets = {}
    print("Loading sklearn datasets...")
    
    # Iris dataset
    iris = load_iris()
    X_iris = iris.data
    y_iris = iris.target
    datasets['Iris'] = {'X': X_iris, 'y': y_iris, 'n_clusters': 3, 'description': 'Iris flowers dataset (150 samples, 4 features)'}
    
    # Wine dataset
    wine = load_wine()
    X_wine = wine.data
    y_wine = wine.target
    datasets['Wine'] = {'X': X_wine, 'y': y_wine, 'n_clusters': 3, 'description': 'Wine chemical analysis dataset (178 samples, 13 features)'}
    
    # Breast cancer dataset
    cancer = load_breast_cancer()
    X_cancer = cancer.data
    y_cancer = cancer.target
    datasets['Breast_Cancer'] = {'X': X_cancer, 'y': y_cancer, 'n_clusters': 2, 'description': 'Breast cancer diagnostic dataset (569 samples, 30 features)'}
    
    # Synthetic datasets
    print("Generating synthetic datasets...")
    
    # Blobs dataset
    X_blobs, y_blobs = make_blobs(n_samples=300, centers=4, cluster_std=1.0, random_state=42)
    datasets['Blobs'] = {'X': X_blobs, 'y': y_blobs, 'n_clusters': 4, 'description': 'Synthetic Gaussian blobs (300 samples, 2 features)'}
    
    # Moons dataset
    X_moons, y_moons = make_moons(n_samples=300, noise=0.1, random_state=42)
    datasets['Moons'] = {'X': X_moons, 'y': y_moons, 'n_clusters': 2, 'description': 'Two interleaving moons (300 samples, 2 features)'}
    
    # Circles dataset
    X_circles, y_circles = make_circles(n_samples=300, noise=0.05, factor=0.5, random_state=42)
    datasets['Circles'] = {'X': X_circles, 'y': y_circles, 'n_clusters': 2, 'description': 'Two concentric circles (300 samples, 2 features)'}
    
    # 2. UCI datasets placeholder - will be loaded if files exist
    print("\nNote: UCI datasets will be loaded if files exist in 'data/' folder")
    
    return datasets

def load_uci_datasets():

    uci_datasets = {}
    
    # List of UCI datasets and their expected files
    uci_files = {
        'Wholesale_Customers': 'data/Wholesale_customers.csv',
        'Heart_Failure': 'data/heart_failure_clinical_records.csv',
        'Seeds': 'data/seeds_dataset.csv',
        'Absenteeism': 'data/Absenteeism_at_work.csv'
    }
    
    for dataset_name, file_path in uci_files.items():
        if os.path.exists(file_path):
            try:
                print(f"Loading {dataset_name} from {file_path}...")
                
                if dataset_name == 'Online_Retail':
                    df = pd.read_csv(file_path)
                    # Select numerical columns
                    numeric_cols = ['Quantity', 'UnitPrice']
                    df = df[numeric_cols].dropna()
                    X = df.values[:1000]  # Use subset for faster testing
                    n_clusters = 5
                    
                elif dataset_name == 'Wholesale_Customers':
                    df = pd.read_csv(file_path)
                    # Use all columns except Channel and Region if they exist
                    if 'Channel' in df.columns:
                        df = df.drop(['Channel', 'Region'], axis=1)
                    X = df.values
                    n_clusters = 3
                    
                elif dataset_name == 'Heart_Failure':
                    df = pd.read_csv(file_path)
                    X = df.drop('DEATH_EVENT', axis=1).values
                    n_clusters = 2
                    
                elif dataset_name == 'Seeds':
                    df = pd.read_csv(file_path)
                    X = df.iloc[:, :-1].values
                    n_clusters = 3
                    
                elif dataset_name == 'Absenteeism':
                    df = pd.read_csv(file_path, sep=';')
                    # Select numerical columns
                    numeric_cols = df.select_dtypes(include=[np.number]).columns
                    X = df[numeric_cols].values[:200]  # Use subset
                    n_clusters = 4
                
                # For datasets without true labels, create pseudo-labels for n_clusters determination
                if dataset_name in ['Online_Retail', 'Wholesale_Customers', 'Absenteeism']:
                    from sklearn.cluster import KMeans
                    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
                    y = kmeans.fit_predict(X)
                else:
                    # For datasets with true labels
                    y = None
                
                uci_datasets[dataset_name] = {
                    'X': X,
                    'y': y,
                    'n_clusters': n_clusters,
                    'description': f'{dataset_name} dataset ({X.shape[0]} samples, {X.shape[1]} features)'
                }
                
                print(f"  ‚úì Loaded: {X.shape[0]} samples, {X.shape[1]} features")
                
            except Exception as e:
                print(f"  ‚úó Error loading {dataset_name}: {str(e)}")
        else:
            print(f"  ‚ö† {dataset_name} file not found: {file_path}")
    
    return uci_datasets


def run_all_combinations_on_dataset(X, dataset_name, n_clusters=3, max_iter=100, random_state=42, p_values=[0.1, 1, 4]):
    
    fuzzy_equivs = [
        'E_LK', 'E_GG', 'E_GD', 'E_FD',
        'E_LK_R', 'E_GG_R', 'E_GD_R', 'E_FD_R',
        'E_3', 'E_4', 'E_5', 'E_6'
    ]
    
    aggregations = ['A2', 'A3', 'A4']
    
    results = []
    
    print(f"\n{'='*60}")
    print(f"Testing dataset: {dataset_name}")
    print(f"Shape: {X.shape}, Clusters: {n_clusters}")
    print(f"{'='*60}")
    
    total_combinations = len(fuzzy_equivs) * len(aggregations)
    if 'A2' in aggregations:
        total_combinations += len(fuzzy_equivs) * (len(p_values) - 1)
    
    print(f"Testing {total_combinations} combinations...")
    
    print("\n1. Running Standard KMeans for baseline comparison...")
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    kmeans = KMeans(n_clusters=n_clusters, 
                   random_state=random_state, 
                   n_init=10)
    
    kmeans_labels = kmeans.fit_predict(X_scaled)
    
    if len(np.unique(kmeans_labels)) > 1:
        standard_silhouette = silhouette_score(X_scaled, kmeans_labels)
        standard_db = davies_bouldin_score(X_scaled, kmeans_labels)
        standard_ch = calinski_harabasz_score(X_scaled, kmeans_labels)
    else:
        standard_silhouette = 0
        standard_db = float('inf')
        standard_ch = 0
    
    results.append({
        'dataset': dataset_name,
        'E': 'Euclidean',
        'A': 'Mean',
        'p': None,
        'silhouette_score': standard_silhouette,
        'davies_bouldin_score': standard_db,
        'calinski_harabasz_score': standard_ch,
        'n_clusters_found': len(np.unique(kmeans_labels)),
        'method': 'Standard_KMeans',
        'inertia': kmeans.inertia_
    })
    
    print(f"   Standard KMeans Results:")
    print(f"     Silhouette: {standard_silhouette:.4f}")
    print(f"     Davies-Bouldin: {standard_db:.4f}")
    print(f"     Calinski-Harabasz: {standard_ch:.4f}")
    print(f"     Inertia: {kmeans.inertia_:.4f}")
    print(f"\n2. Running Fuzzy Equivalence KMeans combinations...")
    
    current = 0

    for E_name in fuzzy_equivs:
        for A_name in aggregations:
            if A_name == 'A2':
                for p in p_values:
                    current += 1
                    combination_id = f"{E_name}_{A_name}_p{p}"
                    
                    try:
                        model = FuzzyEquivalenceKMeans(
                            n_clusters=n_clusters,
                            E=E_name,
                            A=A_name,
                            p=p,
                            max_iter=max_iter,
                            random_state=random_state
                        )
                        
                        labels = model.fit_predict(X)
                        
                        n_clusters_found = len(np.unique(labels))
                        
                        if n_clusters_found > 1:
                            sil_score = fuzzy_silhouette_score(X, labels, model)
                            db_score = fuzzy_davies_bouldin_score(X, labels, model)
                            ch_score = fuzzy_calinski_harabasz_score(X, labels, model)
                        else:
                            sil_score = 0
                            db_score = float('inf')
                            ch_score = 0
                        
                        results.append({
                            'dataset': dataset_name,
                            'E': E_name,
                            'A': A_name,
                            'p': p,
                            'silhouette_score': sil_score,
                            'davies_bouldin_score': db_score,
                            'calinski_harabasz_score': ch_score,
                            'n_clusters_found': n_clusters_found,
                            'method': 'Fuzzy_Equivalence',
                            'inertia': None  # Fuzzy method doesn't have inertia
                        })
                        
                        if current % 10 == 0:
                            print(f"  Progress: {current}/{total_combinations} | Current: {combination_id}")
                        
                    except Exception as e:
                        results.append({
                            'dataset': dataset_name,
                            'E': E_name,
                            'A': A_name,
                            'p': p,
                            'silhouette_score': 0,
                            'davies_bouldin_score': float('inf'),
                            'calinski_harabasz_score': 0,
                            'n_clusters_found': 0,
                            'method': 'Fuzzy_Equivalence',
                            'inertia': None,
                            'error': str(e)
                        })
            else:
                current += 1
                combination_id = f"{E_name}_{A_name}"
                
                try:
                    model = FuzzyEquivalenceKMeans(
                        n_clusters=n_clusters,
                        E=E_name,
                        A=A_name,
                        p=1,
                        max_iter=max_iter,
                        random_state=random_state
                    )
                    
                    labels = model.fit_predict(X)
                    n_clusters_found = len(np.unique(labels))
                    
                    if n_clusters_found > 1:
                        sil_score = fuzzy_silhouette_score(X, labels, model)
                        db_score = fuzzy_davies_bouldin_score(X, labels, model)
                        ch_score = fuzzy_calinski_harabasz_score(X, labels, model)
                    else:
                        sil_score = 0
                        db_score = float('inf')
                        ch_score = 0
                    
                    results.append({
                        'dataset': dataset_name,
                        'E': E_name,
                        'A': A_name,
                        'p': None,
                        'silhouette_score': sil_score,
                        'davies_bouldin_score': db_score,
                        'calinski_harabasz_score': ch_score,
                        'n_clusters_found': n_clusters_found,
                        'method': 'Fuzzy_Equivalence',
                        'inertia': None
                    })
                    
                    if current % 10 == 0:
                        print(f"  Progress: {current}/{total_combinations} | Current: {combination_id}")
                    
                except Exception as e:
                    results.append({
                        'dataset': dataset_name,
                        'E': E_name,
                        'A': A_name,
                        'p': None,
                        'silhouette_score': 0,
                        'davies_bouldin_score': float('inf'),
                        'calinski_harabasz_score': 0,
                        'n_clusters_found': 0,
                        'method': 'Fuzzy_Equivalence',
                        'inertia': None,
                        'error': str(e)
                    })
    
    results_df = pd.DataFrame(results)
    
    def get_combination_name(row):
        if row['method'] == 'Standard_KMeans':
            return 'Standard_KMeans'
        elif pd.isna(row['p']) or row['p'] is None:
            return f"{row['E']}_{row['A']}"
        else:
            return f"{row['E']}_{row['A']}_p{row['p']}"
    
    results_df['combination'] = results_df.apply(get_combination_name, axis=1)
    
    print(f"\n‚úì Completed {len(results_df)} runs for {dataset_name}")
    print(f"  - Standard KMeans: 1 run")
    print(f"  - Fuzzy Equivalence: {len(results_df) - 1} runs")
    
    print(f"\n{'='*60}")
    print(f"COMPARISON ANALYSIS")
    print(f"{'='*60}")
    
    fuzzy_results = results_df[results_df['method'] == 'Fuzzy_Equivalence']
    successful_fuzzy = fuzzy_results[fuzzy_results['n_clusters_found'] > 0]
    
    if len(successful_fuzzy) > 0:
        best_fuzzy_sil = successful_fuzzy.loc[successful_fuzzy['silhouette_score'].idxmax()]
        best_fuzzy_db = successful_fuzzy.loc[successful_fuzzy['davies_bouldin_score'].idxmin()]
        best_fuzzy_ch = successful_fuzzy.loc[successful_fuzzy['calinski_harabasz_score'].idxmax()]
        
        standard_row = results_df[results_df['method'] == 'Standard_KMeans'].iloc[0]
        
        print(f"\nStandard KMeans vs Best Fuzzy Equivalence:")
        print(f"{'-'*50}")
        
        print(f"\nSilhouette Score:")
        print(f"  Standard KMeans: {standard_row['silhouette_score']:.4f}")
        print(f"  Best Fuzzy: {best_fuzzy_sil['silhouette_score']:.4f} ({best_fuzzy_sil['combination']})")
        if best_fuzzy_sil['silhouette_score'] > standard_row['silhouette_score']:
            improvement = ((best_fuzzy_sil['silhouette_score'] - standard_row['silhouette_score']) / 
                          abs(standard_row['silhouette_score']) * 100)
            print(f"  ‚úÖ Fuzzy better by: {improvement:+.2f}%")
        else:
            print(f"  ‚ùå Standard better or equal")
        
        print(f"\nDavies-Bouldin Score (lower is better):")
        print(f"  Standard KMeans: {standard_row['davies_bouldin_score']:.4f}")
        print(f"  Best Fuzzy: {best_fuzzy_db['davies_bouldin_score']:.4f} ({best_fuzzy_db['combination']})")
        if best_fuzzy_db['davies_bouldin_score'] < standard_row['davies_bouldin_score']:
            improvement = ((standard_row['davies_bouldin_score'] - best_fuzzy_db['davies_bouldin_score']) / 
                          standard_row['davies_bouldin_score'] * 100)
            print(f"  ‚úÖ Fuzzy better by: {improvement:+.2f}%")
        else:
            print(f"  ‚ùå Standard better or equal")
        
        print(f"\nCalinski-Harabasz Score:")
        print(f"  Standard KMeans: {standard_row['calinski_harabasz_score']:.4f}")
        print(f"  Best Fuzzy: {best_fuzzy_ch['calinski_harabasz_score']:.4f} ({best_fuzzy_ch['combination']})")
        if best_fuzzy_ch['calinski_harabasz_score'] > standard_row['calinski_harabasz_score']:
            improvement = ((best_fuzzy_ch['calinski_harabasz_score'] - standard_row['calinski_harabasz_score']) / 
                          abs(standard_row['calinski_harabasz_score']) * 100)
            print(f"  ‚úÖ Fuzzy better by: {improvement:+.2f}%")
        else:
            print(f"  ‚ùå Standard better or equal")
        
        fuzzy_beats_standard_sil = len(successful_fuzzy[successful_fuzzy['silhouette_score'] > standard_row['silhouette_score']])
        fuzzy_beats_standard_db = len(successful_fuzzy[successful_fuzzy['davies_bouldin_score'] < standard_row['davies_bouldin_score']])
        fuzzy_beats_standard_ch = len(successful_fuzzy[successful_fuzzy['calinski_harabasz_score'] > standard_row['calinski_harabasz_score']])
        
        print(f"\nSummary of fuzzy methods beating standard:")
        print(f"  Silhouette: {fuzzy_beats_standard_sil}/{len(successful_fuzzy)} methods")
        print(f"  Davies-Bouldin: {fuzzy_beats_standard_db}/{len(successful_fuzzy)} methods")
        print(f"  Calinski-Harabasz: {fuzzy_beats_standard_ch}/{len(successful_fuzzy)} methods")
    
    return results_df


def analyze_and_save_results(results_df, dataset_name):
    
    standard_results = results_df[results_df['method'] == 'Standard_KMeans']
    fuzzy_results = results_df[results_df['method'] == 'Fuzzy_Equivalence']
    successful_fuzzy = fuzzy_results[fuzzy_results['n_clusters_found'] > 0].copy()
    
    print(f"\n{'='*60}")
    print(f"ANALYSIS FOR {dataset_name}")
    print(f"{'='*60}")
    
    print(f"\nMETHODS SUMMARY:")
    print(f"  Standard KMeans: 1 run")
    print(f"  Fuzzy Equivalence: {len(fuzzy_results)} runs")
    print(f"  Successful fuzzy runs: {len(successful_fuzzy)}")
    
    if len(standard_results) > 0:
        std_row = standard_results.iloc[0]
        print(f"\nSTANDARD KMEANS RESULTS:")
        print(f"  Silhouette Score: {std_row['silhouette_score']:.4f}")
        print(f"  Davies-Bouldin Score: {std_row['davies_bouldin_score']:.4f}")
        print(f"  Calinski-Harabasz Score: {std_row['calinski_harabasz_score']:.4f}")
        if 'inertia' in std_row and pd.notnull(std_row['inertia']):
            print(f"  Inertia (WCSS): {std_row['inertia']:.4f}")
    
    if len(successful_fuzzy) > 0:
        print(f"\nBEST FUZZY EQUIVALENCE RESULTS:")
        
        if successful_fuzzy['silhouette_score'].max() > 0:
            best_sil = successful_fuzzy.loc[successful_fuzzy['silhouette_score'].idxmax()]
            print(f"\n  Best Silhouette Score: {best_sil['silhouette_score']:.4f}")
            print(f"    Combination: {best_sil['combination']}")
            print(f"    Parameters: E={best_sil['E']}, A={best_sil['A']}, p={best_sil['p']}")
            
            # Compare with standard
            if len(standard_results) > 0:
                std_sil = standard_results.iloc[0]['silhouette_score']
                improvement = ((best_sil['silhouette_score'] - std_sil) / abs(std_sil) * 100)
                print(f"    vs Standard KMeans: {improvement:+.2f}%")
        
        if successful_fuzzy['davies_bouldin_score'].min() < float('inf'):
            best_db = successful_fuzzy.loc[successful_fuzzy['davies_bouldin_score'].idxmin()]
            print(f"\n  Best Davies-Bouldin Score: {best_db['davies_bouldin_score']:.4f}")
            print(f"    Combination: {best_db['combination']}")
            print(f"    Parameters: E={best_db['E']}, A={best_db['A']}, p={best_db['p']}")
            
            if len(standard_results) > 0:
                std_db = standard_results.iloc[0]['davies_bouldin_score']
                improvement = ((std_db - best_db['davies_bouldin_score']) / std_db * 100)
                print(f"    vs Standard KMeans: {improvement:+.2f}% improvement")
        
        if successful_fuzzy['calinski_harabasz_score'].max() > 0:
            best_ch = successful_fuzzy.loc[successful_fuzzy['calinski_harabasz_score'].idxmax()]
            print(f"\n  Best Calinski-Harabasz Score: {best_ch['calinski_harabasz_score']:.4f}")
            print(f"    Combination: {best_ch['combination']}")
            print(f"    Parameters: E={best_ch['E']}, A={best_ch['A']}, p={best_ch['p']}")
            
            if len(standard_results) > 0:
                std_ch = standard_results.iloc[0]['calinski_harabasz_score']
                improvement = ((best_ch['calinski_harabasz_score'] - std_ch) / abs(std_ch) * 100)
                print(f"    vs Standard KMeans: {improvement:+.2f}%")
        
        print(f"\nFUZZY METHODS SUMMARY STATISTICS:")
        print(f"  Average Silhouette Score: {successful_fuzzy['silhouette_score'].mean():.4f}")
        print(f"  Average Davies-Bouldin Score: {successful_fuzzy['davies_bouldin_score'].mean():.4f}")
        print(f"  Average Calinski-Harabasz Score: {successful_fuzzy['calinski_harabasz_score'].mean():.1f}")
    
    # Save results
    print(f"\nüíæ Saving results to CSV files...")
    os.makedirs('results', exist_ok=True)
    
    full_filename = f'results/full_results_{dataset_name}.csv'
    results_df.to_csv(full_filename, index=False)
    print(f"  Full results saved to: {full_filename}")
    
    if len(successful_fuzzy) > 0:
        top10_fuzzy = successful_fuzzy.sort_values('silhouette_score', ascending=False).head(10)
        top10_filename = f'results/top10_fuzzy_{dataset_name}.csv'
        top10_fuzzy.to_csv(top10_filename, index=False)
        print(f"  Top 10 fuzzy methods saved to: {top10_filename}")
        
        if len(standard_results) > 0:
            comparison_data = []
            std_row = standard_results.iloc[0]
            
            for idx, (_, row) in enumerate(top10_fuzzy.iterrows()):
                improvement_sil = ((row['silhouette_score'] - std_row['silhouette_score']) / 
                                  abs(std_row['silhouette_score']) * 100)
                improvement_db = ((std_row['davies_bouldin_score'] - row['davies_bouldin_score']) / 
                                 std_row['davies_bouldin_score'] * 100) if std_row['davies_bouldin_score'] > 0 else 0
                improvement_ch = ((row['calinski_harabasz_score'] - std_row['calinski_harabasz_score']) / 
                                  abs(std_row['calinski_harabasz_score']) * 100)
                
                comparison_data.append({
                    'Rank': idx + 1,
                    'Combination': row['combination'],
                    'Silhouette_Score': row['silhouette_score'],
                    'Silhouette_Improvement_%': improvement_sil,
                    'DB_Score': row['davies_bouldin_score'],
                    'DB_Improvement_%': improvement_db,
                    'CH_Score': row['calinski_harabasz_score'],
                    'CH_Improvement_%': improvement_ch,
                    'E_Function': row['E'],
                    'A_Function': row['A'],
                    'p_Value': row['p']
                })
            
            comparison_df = pd.DataFrame(comparison_data)
            comparison_df = pd.concat([
                pd.DataFrame([{
                    'Rank': 'Baseline',
                    'Combination': 'Standard_KMeans',
                    'Silhouette_Score': std_row['silhouette_score'],
                    'Silhouette_Improvement_%': 0,
                    'DB_Score': std_row['davies_bouldin_score'],
                    'DB_Improvement_%': 0,
                    'CH_Score': std_row['calinski_harabasz_score'],
                    'CH_Improvement_%': 0,
                    'E_Function': 'Euclidean',
                    'A_Function': 'Mean',
                    'p_Value': None
                }]),
                comparison_df
            ], ignore_index=True)
            
            comparison_filename = f'results/comparison_{dataset_name}.csv'
            comparison_df.to_csv(comparison_filename, index=False)
            print(f"  Comparison table saved to: {comparison_filename}")
    
    return results_df

def create_visualizations(results_df, dataset_name, X):
    
    standard_results = results_df[results_df['method'] == 'Standard_KMeans']
    fuzzy_results = results_df[results_df['method'] == 'Fuzzy_Equivalence']
    successful_fuzzy = fuzzy_results[fuzzy_results['n_clusters_found'] > 0].copy()
    
    if len(successful_fuzzy) == 0 or X.shape[1] < 2:
        return
    
    print(f"\nüìä Creating visualizations for {dataset_name}...")
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.flatten()
    
    if len(standard_results) > 0:
        ax = axes[0]
        
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        kmeans = KMeans(n_clusters=int(standard_results.iloc[0]['n_clusters_found']), 
                       random_state=42, n_init=10)
        kmeans_labels = kmeans.fit_predict(X_scaled[:, :2])
        
        scatter = ax.scatter(X_scaled[:, 0], X_scaled[:, 1], 
                           c=kmeans_labels, 
                           cmap='viridis', 
                           s=30, 
                           alpha=0.7,
                           edgecolors='w',
                           linewidth=0.5)
        
        std_row = standard_results.iloc[0]
        title = f"Standard KMeans\n"
        title += f"Silhouette: {std_row['silhouette_score']:.3f}\n"
        title += f"DB: {std_row['davies_bouldin_score']:.3f}\n"
        title += f"CH: {std_row['calinski_harabasz_score']:.1f}"
        
        ax.set_title(title, fontsize=10)
        ax.set_xlabel('Feature 1 (scaled)')
        ax.set_ylabel('Feature 2 (scaled)')
        ax.grid(True, alpha=0.3)
    
    top_fuzzy = successful_fuzzy.sort_values('silhouette_score', ascending=False).head(3)
    
    for idx, (_, row) in enumerate(top_fuzzy.iterrows()):
        ax = axes[idx + 1]  # +1 because axes[0] is for standard KMeans
        
        try:
            model = FuzzyEquivalenceKMeans(
                n_clusters=int(row['n_clusters_found']),
                E=row['E'],
                A=row['A'],
                p=row['p'] if pd.notnull(row['p']) else 1,
                max_iter=100,
                random_state=42
            )
            
            labels = model.fit_predict(X[:, :2])
            
            scatter = ax.scatter(X[:, 0], X[:, 1], 
                               c=labels, 
                               cmap='viridis', 
                               s=30, 
                               alpha=0.7,
                               edgecolors='w',
                               linewidth=0.5)
            
            title = f"{row['combination']}\n"
            title += f"Silhouette: {row['silhouette_score']:.3f}\n"
            title += f"DB: {row['davies_bouldin_score']:.3f}\n"
            title += f"CH: {row['calinski_harabasz_score']:.1f}"
            
            ax.set_title(title, fontsize=10)
            ax.set_xlabel('Feature 1')
            ax.set_ylabel('Feature 2')
            ax.grid(True, alpha=0.3)
            
        except Exception as e:
            ax.text(0.5, 0.5, f"{row['combination']}\nSilhouette: {row['silhouette_score']:.3f}\nError: {str(e)[:50]}",
                   ha='center', va='center', transform=ax.transAxes, fontsize=9)
            ax.set_title(row['combination'], fontsize=10)
    
    plt.suptitle(f'Clustering Comparison for {dataset_name}\nStandard KMeans vs Top 3 Fuzzy Methods', 
                fontsize=14, y=1.02)
    plt.tight_layout()
    
    os.makedirs('results/plots', exist_ok=True)
    plot_filename = f'results/plots/comparison_{dataset_name}.png'
    plt.savefig(plot_filename, dpi=150, bbox_inches='tight')
    plt.close()
    print(f"  Visualization saved to: {plot_filename}")

In [162]:
print("="*80)
print("FUZZY EQUIVALENCE K-MEANS COMPREHENSIVE TESTING")
print("="*80)

print("\nüìÇ LOADING DATASETS...")
datasets = load_all_datasets()

uci_datasets = load_uci_datasets()
datasets.update(uci_datasets)

print(f"\n‚úÖ Total datasets to test: {len(datasets)}")
for name, info in datasets.items():
    print(f"  {name}: {info['X'].shape[0]} samples, {info['X'].shape[1]} features, {info['n_clusters']} clusters")

all_results = []

test_params = {
    'max_iter': 200,
    'random_state': 42,
    'p_values': [0.1, 1, 4]
}

print(f"\n{'='*80}")
print("RUNNING TESTS...")
print(f"{'='*80}")

for dataset_name, dataset_info in datasets.items():
    try:
        X = dataset_info['X']
        n_clusters = dataset_info['n_clusters']
        
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        results_df = run_all_combinations_on_dataset(
            X=X_scaled,
            dataset_name=dataset_name,
            n_clusters=n_clusters,
            max_iter=test_params['max_iter'],
            random_state=test_params['random_state'],
            p_values=test_params['p_values']
        )

        successful = analyze_and_save_results(results_df, dataset_name)
    
        if X.shape[1] >= 2:
            create_visualizations(results_df, dataset_name, X_scaled[:, :2])
        
        all_results.append(results_df)
        
        print(f"\n‚úì Completed testing for {dataset_name}")
        print(f"{'='*60}")
        
    except Exception as e:
        print(f"\n‚úó Error testing {dataset_name}: {str(e)}")
        print(f"{'='*60}")
        continue

if all_results:
    print(f"\n{'='*80}")
    print("COMBINING ALL RESULTS...")
    print(f"{'='*80}")
    
    combined_results = pd.concat(all_results, ignore_index=True)
    
    combined_filename = 'results/ALL_DATASETS_COMBINED.csv'
    combined_results.to_csv(combined_filename, index=False)
    print(f"‚úÖ All results combined and saved to: {combined_filename}")
    print(f"\nüìà CREATING GLOBAL SUMMARY...")
    
    summary_data = []
    for dataset_name in datasets.keys():
        dataset_results = combined_results[combined_results['dataset'] == dataset_name]
        successful = dataset_results[dataset_results['n_clusters_found'] > 0]
        
        if len(successful) > 0:
            # Get best method by silhouette score
            best_row = successful.loc[successful['silhouette_score'].idxmax()]
            
            summary_data.append({
                'dataset': dataset_name,
                'samples': datasets[dataset_name]['X'].shape[0],
                'features': datasets[dataset_name]['X'].shape[1],
                'clusters': datasets[dataset_name]['n_clusters'],
                'best_silhouette': best_row['silhouette_score'],
                'best_combination': best_row['combination'],
                'best_E': best_row['E'],
                'best_A': best_row['A'],
                'best_p': best_row['p'],
                'best_db': successful['davies_bouldin_score'].min(),
                'best_ch': successful['calinski_harabasz_score'].max(),
                'avg_silhouette': successful['silhouette_score'].mean(),
                'avg_db': successful['davies_bouldin_score'].mean(),
                'avg_ch': successful['calinski_harabasz_score'].mean(),
                'successful_runs': len(successful),
                'total_runs': len(dataset_results)
            })
    
    if summary_data:
        summary_df = pd.DataFrame(summary_data)
        summary_filename = 'results/GLOBAL_SUMMARY.csv'
        summary_df.to_csv(summary_filename, index=False)
        print(f"‚úÖ Global summary saved to: {summary_filename}")
        
        print(f"\nüìä GLOBAL STATISTICS:")
        print(f"  Total datasets tested: {len(summary_df)}")
        print(f"  Total successful runs: {summary_df['successful_runs'].sum()}")
        print(f"  Average silhouette across datasets: {summary_df['avg_silhouette'].mean():.4f}")
        
        print(f"\nüèÜ MOST SUCCESSFUL FUZZY EQUIVALENCE FUNCTIONS:")
        E_counts = summary_df['best_E'].value_counts()
        for E, count in E_counts.items():
            print(f"  {E}: best on {count} datasets")
        
        print(f"\nüèÜ MOST SUCCESSFUL AGGREGATION FUNCTIONS:")
        A_counts = summary_df['best_A'].value_counts()
        for A, count in A_counts.items():
            print(f"  {A}: best on {count} datasets")
    
    print(f"\n{'='*80}")
    print("‚úÖ TESTING COMPLETE!")
    print(f"{'='*80}")
    print(f"\nAll results saved in the 'results/' folder:")
    print("  - Individual dataset results: results/full_results_*.csv")
    print("  - Top 10 summaries: results/summary_top10_*.csv")
    print("  - Best results: results/best_results_*.csv")
    print("  - Visualizations: results/plots/*.png")
    print("  - Combined results: results/ALL_DATASETS_COMBINED.csv")
    print("  - Global summary: results/GLOBAL_SUMMARY.csv")

else:
    print("\n‚ö† No results were generated!")

FUZZY EQUIVALENCE K-MEANS COMPREHENSIVE TESTING

üìÇ LOADING DATASETS...
Loading sklearn datasets...
Generating synthetic datasets...

Note: UCI datasets will be loaded if files exist in 'data/' folder
Loading Wholesale_Customers from data/Wholesale_customers.csv...
  ‚úì Loaded: 440 samples, 6 features
Loading Heart_Failure from data/heart_failure_clinical_records.csv...
  ‚úì Loaded: 299 samples, 12 features
Loading Seeds from data/seeds_dataset.csv...
  ‚úì Loaded: 209 samples, 0 features
Loading Absenteeism from data/Absenteeism_at_work.csv...
  ‚úì Loaded: 200 samples, 21 features

‚úÖ Total datasets to test: 10
  Iris: 150 samples, 4 features, 3 clusters
  Wine: 178 samples, 13 features, 3 clusters
  Breast_Cancer: 569 samples, 30 features, 2 clusters
  Blobs: 300 samples, 2 features, 4 clusters
  Moons: 300 samples, 2 features, 2 clusters
  Circles: 300 samples, 2 features, 2 clusters
  Wholesale_Customers: 440 samples, 6 features, 3 clusters
  Heart_Failure: 299 samples, 12 fe

In [163]:
absenteeism = pd.read_csv('results/ALL_DATASETS_COMBINED.csv')
absenteeism = absenteeism[absenteeism['dataset'] == 'Absenteeism']
absenteeism = absenteeism[['combination', 'p', 'silhouette_score', 'davies_bouldin_score', 
                           'calinski_harabasz_score', 'n_clusters_found']]
absenteeism['davies_bouldin_score'] = absenteeism['davies_bouldin_score'].replace([np.inf, -np.inf], np.nan)
absenteeism['calinski_harabasz_score'] = absenteeism['calinski_harabasz_score'].replace([np.inf, -np.inf], np.nan)
absenteeism_heatmap = absenteeism.style
absenteeism_heatmap = absenteeism_heatmap.background_gradient(subset=['silhouette_score', 'calinski_harabasz_score'], cmap='plasma')
absenteeism_heatmap = absenteeism_heatmap.background_gradient(subset=['davies_bouldin_score'], cmap='plasma_r')
dfi.export(absenteeism_heatmap, 'results/plots/Absenteeism_heatmap.png', table_conversion='matplotlib')

In [164]:
heart_failure = pd.read_csv('results/ALL_DATASETS_COMBINED.csv')
heart_failure = heart_failure[heart_failure['dataset'] == 'Heart_Failure']
heart_failure = heart_failure[['combination', 'p', 'silhouette_score', 'davies_bouldin_score', 
                             'calinski_harabasz_score', 'n_clusters_found']]
heart_failure['davies_bouldin_score'] = heart_failure['davies_bouldin_score'].replace([np.inf, -np.inf], np.nan)
heart_failure['calinski_harabasz_score'] = heart_failure['calinski_harabasz_score'].replace([np.inf, -np.inf], np.nan)
heart_failure_heatmap = heart_failure.style
heart_failure_heatmap = heart_failure_heatmap.background_gradient(subset=['silhouette_score', 'calinski_harabasz_score'], cmap='plasma')
heart_failure_heatmap = heart_failure_heatmap.background_gradient(subset=['davies_bouldin_score'], cmap='plasma_r')
dfi.export(heart_failure_heatmap, 'results/plots/Heart_Failure_heatmap.png', table_conversion='matplotlib')

In [165]:
wholesale_customers = pd.read_csv('results/ALL_DATASETS_COMBINED.csv')
wholesale_customers = wholesale_customers[wholesale_customers['dataset'] == 'Wholesale_Customers']
wholesale_customers = wholesale_customers[['combination', 'p', 'silhouette_score', 'davies_bouldin_score', 
                                         'calinski_harabasz_score', 'n_clusters_found']]
wholesale_customers['davies_bouldin_score'] = wholesale_customers['davies_bouldin_score'].replace([np.inf, -np.inf], np.nan)
wholesale_customers['calinski_harabasz_score'] = wholesale_customers['calinski_harabasz_score'].replace([np.inf, -np.inf], np.nan)
wholesale_customers_heatmap = wholesale_customers.style
wholesale_customers_heatmap = wholesale_customers_heatmap.background_gradient(subset=['silhouette_score', 'calinski_harabasz_score'], cmap='plasma')
wholesale_customers_heatmap = wholesale_customers_heatmap.background_gradient(subset=['davies_bouldin_score'], cmap='plasma_r')
dfi.export(wholesale_customers_heatmap, 'results/plots/Wholesale_Customers_heatmap.png', table_conversion='matplotlib')

In [166]:
circles = pd.read_csv('results/ALL_DATASETS_COMBINED.csv')
circles = circles[circles['dataset'] == 'Circles']
circles = circles[['combination', 'p', 'silhouette_score', 'davies_bouldin_score', 
                 'calinski_harabasz_score', 'n_clusters_found']]
circles['davies_bouldin_score'] = circles['davies_bouldin_score'].replace([np.inf, -np.inf], np.nan)
circles['calinski_harabasz_score'] = circles['calinski_harabasz_score'].replace([np.inf, -np.inf], np.nan)
circles_heatmap = circles.style
circles_heatmap = circles_heatmap.background_gradient(subset=['silhouette_score', 'calinski_harabasz_score'], cmap='plasma')
circles_heatmap = circles_heatmap.background_gradient(subset=['davies_bouldin_score'], cmap='plasma_r')
dfi.export(circles_heatmap, 'results/plots/Circles_heatmap.png', table_conversion='matplotlib')

In [167]:
moons = pd.read_csv('results/ALL_DATASETS_COMBINED.csv')
moons = moons[moons['dataset'] == 'Moons']
moons = moons[['combination', 'p', 'silhouette_score', 'davies_bouldin_score', 
               'calinski_harabasz_score', 'n_clusters_found']]
moons['davies_bouldin_score'] = moons['davies_bouldin_score'].replace([np.inf, -np.inf], np.nan)
moons['calinski_harabasz_score'] = moons['calinski_harabasz_score'].replace([np.inf, -np.inf], np.nan)
moons_heatmap = moons.style
moons_heatmap = moons_heatmap.background_gradient(subset=['silhouette_score', 'calinski_harabasz_score'], cmap='plasma')
moons_heatmap = moons_heatmap.background_gradient(subset=['davies_bouldin_score'], cmap='plasma_r')
dfi.export(moons_heatmap, 'results/plots/Moons_heatmap.png', table_conversion='matplotlib')

In [168]:
blobs = pd.read_csv('results/ALL_DATASETS_COMBINED.csv')
blobs = blobs[blobs['dataset'] == 'Blobs']
blobs = blobs[['combination', 'p', 'silhouette_score', 'davies_bouldin_score', 
               'calinski_harabasz_score', 'n_clusters_found']]
blobs['davies_bouldin_score'] = blobs['davies_bouldin_score'].replace([np.inf, -np.inf], np.nan)
blobs['calinski_harabasz_score'] = blobs['calinski_harabasz_score'].replace([np.inf, -np.inf], np.nan)
blobs_heatmap = blobs.style
blobs_heatmap = blobs_heatmap.background_gradient(subset=['silhouette_score', 'calinski_harabasz_score'], cmap='plasma')
blobs_heatmap = blobs_heatmap.background_gradient(subset=['davies_bouldin_score'], cmap='plasma_r')
dfi.export(blobs_heatmap, 'results/plots/Blobs_heatmap.png', table_conversion='matplotlib')

In [169]:
brest_cancer = pd.read_csv('results/ALL_DATASETS_COMBINED.csv')
brest_cancer = brest_cancer[brest_cancer['dataset'] == 'Breast_Cancer']
brest_cancer = brest_cancer[['combination', 'p', 'silhouette_score', 'davies_bouldin_score', 
                           'calinski_harabasz_score', 'n_clusters_found']]
brest_cancer['davies_bouldin_score'] = brest_cancer['davies_bouldin_score'].replace([np.inf, -np.inf], np.nan)
brest_cancer['calinski_harabasz_score'] = brest_cancer['calinski_harabasz_score'].replace([np.inf, -np.inf], np.nan)
brest_cancer_heatmap = brest_cancer.style
brest_cancer_heatmap = brest_cancer_heatmap.background_gradient(subset=['silhouette_score', 'calinski_harabasz_score'], cmap='plasma')
brest_cancer_heatmap = brest_cancer_heatmap.background_gradient(subset=['davies_bouldin_score'], cmap='plasma_r')
dfi.export(brest_cancer_heatmap, 'results/plots/Breast_Cancer_heatmap.png', table_conversion='matplotlib')

In [170]:
wine = pd.read_csv('results/ALL_DATASETS_COMBINED.csv')
wine = wine[wine['dataset'] == 'Wine']
wine = wine[['combination', 'p', 'silhouette_score', 'davies_bouldin_score', 
             'calinski_harabasz_score', 'n_clusters_found']]
wine['davies_bouldin_score'] = wine['davies_bouldin_score'].replace([np.inf, -np.inf], np.nan)
wine['calinski_harabasz_score'] = wine['calinski_harabasz_score'].replace([np.inf, -np.inf], np.nan)
wine_heatmap = wine.style
wine_heatmap = wine_heatmap.background_gradient(subset=['silhouette_score', 'calinski_harabasz_score'], cmap='plasma')
wine_heatmap = wine_heatmap.background_gradient(subset=['davies_bouldin_score'], cmap='plasma_r')
dfi.export(wine_heatmap, 'results/plots/Wine_heatmap.png', table_conversion='matplotlib')

In [171]:
iris = pd.read_csv('results/ALL_DATASETS_COMBINED.csv')
iris = iris[iris['dataset'] == 'Iris']
iris = iris[['combination', 'p', 'silhouette_score', 'davies_bouldin_score', 
             'calinski_harabasz_score', 'n_clusters_found']]
iris['davies_bouldin_score'] = iris['davies_bouldin_score'].replace([np.inf, -np.inf], np.nan)
iris['calinski_harabasz_score'] = iris['calinski_harabasz_score'].replace([np.inf, -np.inf], np.nan)
iris_heatmap = iris.style
iris_heatmap = iris_heatmap.background_gradient(subset=['silhouette_score', 'calinski_harabasz_score'], cmap='plasma')
iris_heatmap = iris_heatmap.background_gradient(subset=['davies_bouldin_score'], cmap='plasma_r')
dfi.export(iris_heatmap, 'results/plots/Iris_heatmap.png', table_conversion='matplotlib')

In [176]:
df = pd.read_csv('results/ALL_DATASETS_COMBINED.csv')

In [183]:
def analyze_dataset_sc(df):
    datasets = df['dataset'].unique()
    
    aggregation_top3_counts = Counter()
    equivalence_top3_counts = Counter()
    pair_top3_counts = Counter()
    
    for dataset in datasets:
        dataset_data = df[df['dataset'] == dataset]
        valid_scores = dataset_data.dropna(subset=['silhouette_score'])
        sorted_data = valid_scores.sort_values('silhouette_score', ascending=False)
        top3 = sorted_data.head(3)
        
        for _, row in top3.iterrows():
            agg_func_code = row['combination'].split('_')

            if agg_func_code[-1][0] == 'p':
                agg_func = agg_func_code[-2] + '(' + agg_func_code[-1] + ')'
            elif len(agg_func_code) == 2:
                continue
            else:
                agg_func = agg_func_code[-1]
                
            equiv_func = row['E']
            
            if pd.notna(agg_func):
                aggregation_top3_counts[agg_func] += 1
            
            if pd.notna(equiv_func) and equiv_func != 'Euclidean':
                if '_R' in equiv_func:
                    equiv_func = equiv_func.replace('_R', '')
                equivalence_top3_counts[equiv_func] += 1
            
            if pd.notna(agg_func) and pd.notna(equiv_func):
                pair = f"{equiv_func}_{agg_func}"
                pair_top3_counts[pair] += 1
    
    return aggregation_top3_counts, equivalence_top3_counts, pair_top3_counts

aggregation_counts, equivalence_counts, pair_counts = analyze_dataset_sc(df)

print("1. How many times each aggregation function fell in top 3 by silhouette score:")
for agg_func, count in sorted(aggregation_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"   {agg_func}: {count} times")

print("\n2. How many times each equivalence function fell in top 3 by silhouette score:")
for equiv_func, count in sorted(equivalence_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"   {equiv_func}: {count} times")

print("\n3. 3 most popular pairs of aggregation and equivalence based on silhouette score:")
for i, (pair, count) in enumerate(pair_counts.most_common(3), 1):
    print(f"   {i}. {pair}: {count} times")

1. How many times each aggregation function fell in top 3 by silhouette score:
   A4: 14 times
   A2(p4.0): 5 times
   A2(p0.1): 4 times
   A2(p1.0): 2 times

2. How many times each equivalence function fell in top 3 by silhouette score:
   E_FD: 12 times
   E_4: 4 times
   E_LK: 3 times
   E_GD: 3 times
   E_3: 1 times
   E_6: 1 times
   E_5: 1 times

3. 3 most popular pairs of aggregation and equivalence based on silhouette score:
   1. E_FD_A4: 4 times
   2. E_FD_A2(p4.0): 4 times
   3. E_4_A4: 3 times


In [184]:
def analyze_dataset_sc(df):
    datasets = df['dataset'].unique()
    
    aggregation_top3_counts = Counter()
    equivalence_top3_counts = Counter()
    pair_top3_counts = Counter()
    
    for dataset in datasets:
        dataset_data = df[df['dataset'] == dataset]
        valid_scores = dataset_data.dropna(subset=['davies_bouldin_score'])
        sorted_data = valid_scores.sort_values('davies_bouldin_score', ascending=True)
        top3 = sorted_data.head(3)
        
        for _, row in top3.iterrows():
            agg_func_code = row['combination'].split('_')

            if agg_func_code[-1][0] == 'p':
                agg_func = agg_func_code[-2] + '(' + agg_func_code[-1] + ')'
            elif len(agg_func_code) == 2:
                continue
            else:
                agg_func = agg_func_code[-1]

            equiv_func = row['E']
            
            if pd.notna(agg_func):
                aggregation_top3_counts[agg_func] += 1
            
            if pd.notna(equiv_func) and equiv_func != 'Euclidean':
                if '_R' in equiv_func:
                    equiv_func = equiv_func.replace('_R', '')
                equivalence_top3_counts[equiv_func] += 1
            
            if pd.notna(agg_func) and pd.notna(equiv_func):
                pair = f"{equiv_func}_{agg_func}"
                pair_top3_counts[pair] += 1
    
    return aggregation_top3_counts, equivalence_top3_counts, pair_top3_counts

aggregation_counts, equivalence_counts, pair_counts = analyze_dataset_sc(df)

print("1. How many times each aggregation function fell in top 3 by Davies-Bouldin score:")
for agg_func, count in sorted(aggregation_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"   {agg_func}: {count} times")

print("\n2. How many times each equivalence function fell in top 3 by Davies-Bouldin score:")
for equiv_func, count in sorted(equivalence_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"   {equiv_func}: {count} times")

print("\n3. 3 most popular pairs of aggregation and equivalence based on Davies-Bouldin score:")
for i, (pair, count) in enumerate(pair_counts.most_common(3), 1):
    print(f"   {i}. {pair}: {count} times")

1. How many times each aggregation function fell in top 3 by Davies-Bouldin score:
   A4: 22 times
   A2(p0.1): 3 times
   A2(p4.0): 1 times

2. How many times each equivalence function fell in top 3 by Davies-Bouldin score:
   E_FD: 8 times
   E_LK: 6 times
   E_4: 4 times
   E_3: 3 times
   E_GD: 2 times
   E_6: 2 times
   E_5: 1 times

3. 3 most popular pairs of aggregation and equivalence based on Davies-Bouldin score:
   1. E_FD_A4: 7 times
   2. E_4_A4: 4 times
   3. E_3_A4: 3 times


In [185]:
def analyze_dataset_sc(df):
    datasets = df['dataset'].unique()
    
    aggregation_top3_counts = Counter()
    equivalence_top3_counts = Counter()
    pair_top3_counts = Counter()
    
    for dataset in datasets:
        dataset_data = df[df['dataset'] == dataset]
        valid_scores = dataset_data.dropna(subset=['calinski_harabasz_score'])
        sorted_data = valid_scores.sort_values('calinski_harabasz_score', ascending=False)
        top3 = sorted_data.head(3)
        
        for _, row in top3.iterrows():
            agg_func_code = row['combination'].split('_')

            if agg_func_code[-1][0] == 'p':
                agg_func = agg_func_code[-2] + '(' + agg_func_code[-1] + ')'
            elif len(agg_func_code) == 2:
                continue
            else:
                agg_func = agg_func_code[-1]

            equiv_func = row['E']
            
            if pd.notna(agg_func):
                aggregation_top3_counts[agg_func] += 1
            
            if pd.notna(equiv_func) and equiv_func != 'Euclidean':
                if '_R' in equiv_func:
                    equiv_func = equiv_func.replace('_R', '')
                equivalence_top3_counts[equiv_func] += 1
            
            if pd.notna(agg_func) and pd.notna(equiv_func):
                pair = f"{equiv_func}_{agg_func}"
                pair_top3_counts[pair] += 1
    
    return aggregation_top3_counts, equivalence_top3_counts, pair_top3_counts

aggregation_counts, equivalence_counts, pair_counts = analyze_dataset_sc(df)

print("1. How many times each aggregation function fell in top 3 by Calinski-Harabasz score:")
for agg_func, count in sorted(aggregation_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"   {agg_func}: {count} times")

print("\n2. How many times each equivalence function fell in top 3 by Calinski-Harabasz score:")
for equiv_func, count in sorted(equivalence_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"   {equiv_func}: {count} times")

print("\n3. 3 most popular pairs of aggregation and equivalence based on Calinski-Harabasz score:")
for i, (pair, count) in enumerate(pair_counts.most_common(3), 1):
    print(f"   {i}. {pair}: {count} times")

1. How many times each aggregation function fell in top 3 by Calinski-Harabasz score:
   A4: 26 times

2. How many times each equivalence function fell in top 3 by Calinski-Harabasz score:
   E_FD: 13 times
   E_LK: 6 times
   E_4: 3 times
   E_6: 2 times
   E_GD: 2 times

3. 3 most popular pairs of aggregation and equivalence based on Calinski-Harabasz score:
   1. E_FD_A4: 13 times
   2. E_LK_A4: 6 times
   3. E_4_A4: 3 times
