<a href="https://colab.research.google.com/github/Tobiezhg/k-means-clustering-excel-tool/blob/testing-accuracy/Iris_accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pandas scikit-learn matplotlib



In [None]:
# Hungarian Algorithm Implementation
def hungarian_algorithm(A, n, m):
    """Solve a minimum-cost assignment of ``n`` rows to ``m`` columns.

    Parameters
    ----------
    A : sequence of sequences of numbers
        Cost matrix addressed with 1-based indices: only ``A[i][j]`` with
        ``1 <= i <= n`` and ``1 <= j <= m`` is read, so the caller must pad
        the matrix with a dummy row 0 and a dummy column 0.
    n : int
        Number of rows to assign.
    m : int
        Number of columns; must satisfy ``m >= n``.

    Returns
    -------
    list
        List of length ``m``. Entry ``j - 1`` holds the 1-based index of the
        row assigned to column ``j``, or ``0`` if column ``j`` is unassigned.

    Raises
    ------
    ValueError
        If ``n > m``. Every row needs its own column; without this check the
        search runs out of free columns and never terminates cleanly.
    """
    if n > m:
        raise ValueError(
            f"hungarian_algorithm requires n <= m (got n={n}, m={m})."
        )

    # A float infinity keeps the function self-contained (no dependency on a
    # module imported in another cell) and compares correctly with any cost.
    INF = float("inf")
    # Potential vectors for row and column adjustments
    u = [0] * (n + 1)
    v = [0] * (m + 1)
    # Assignment tracking: p[j] is the row matched to column j (0 = free);
    # way[j] is the previous column on the augmenting path that reaches j.
    p = [0] * (m + 1)
    way = [0] * (m + 1)

    for i in range(1, n + 1):
        # Column 0 is a virtual column that temporarily holds the new row i.
        p[0] = i
        j0 = 0
        minv = [INF] * (m + 1)  # Minimum values for potential updates
        used = [False] * (m + 1)  # Tracks visited columns

        while True:
            used[j0] = True
            i0 = p[j0]
            delta = INF
            j1 = -1

            for j in range(1, m + 1):
                if not used[j]:
                    cur = A[i0][j] - u[i0] - v[j]  # Compute reduced cost
                    if cur < minv[j]:
                        minv[j] = cur
                        way[j] = j0
                    if minv[j] < delta:
                        delta = minv[j]
                        j1 = j

            for j in range(m + 1):
                if used[j]:
                    u[p[j]] += delta  # Update potentials for assigned rows
                    v[j] -= delta  # Update potentials for assigned columns
                else:
                    minv[j] -= delta  # Reduce minimum values for next iteration

            j0 = j1  # Move to next column

            # A free column ends the search for this row.
            if p[j0] == 0:
                break

        # Walk the path back to the virtual column, shifting each row onto
        # the next column along the path.
        while j0:
            j1 = way[j0]
            p[j0] = p[j1]
            j0 = j1

    # Return final assignments (drop the virtual column 0)
    return p[1:]

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy.optimize import linear_sum_assignment
import sys
import time

In [None]:
# Start measuring total execution time.
# The final cell subtracts this wall-clock timestamp from its own reading, so
# when cells are run by hand the pauses between cells are included.
start_time = time.time()

In [None]:
# Read the Iris data from a CSV located next to the notebook.
# Note: despite its name, df_numeric also holds the non-numeric "species" column.
file_path = "iris_dataset.csv"
df_numeric = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
print(df_numeric.head(5))

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [None]:
# Select the numeric feature columns.
# "Cluster" is dropped explicitly: a later cell adds it to df_numeric as an
# integer column, so re-running this cell would otherwise feed the cluster
# labels back into the scaler as a feature.
numeric_columns = (
    df_numeric.select_dtypes(include=[np.number])
    .columns.drop(["Cluster"], errors="ignore")
)

# Standardize only the numeric columns (zero mean, unit variance per column)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_numeric[numeric_columns])

print("Preprocessed Data Shape (including species):", df_numeric.shape)
# Print the columns once; the nested print(print(...)) also printed "None".
print(df_numeric.columns)

Preprocessed Data Shape (including species): (150, 5)
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')
None


In [None]:
# Measure time for silhouette score calculation.
# Wall-clock start; the matching end timestamp is taken in a separate cell
# after the silhouette loop.
silhouette_start = time.time()

In [None]:
use_silhouette = True  # Toggle: True to use silhouette scores; False to use fixed number of clusters
cluster_range = range(3, 6)  # Range for silhouette score calculation (from GUI input)
n_init = 10  # Number of re-runs for K-Means
max_iter = 300  # Maximum iterations for K-Means
silhouette_seed = 42  # Fixed seed so the selected k is reproducible across runs

# To store (k, silhouette score) pairs
silhouette_results = []

if use_silhouette:
    print("Calculating silhouette scores to determine the optimal number of clusters...")

    # Loop through each possible number of clusters in the range
    for k in cluster_range:
        # Run K-Means with k-means++ initialization
        kmeans = KMeans(
            n_clusters=k,
            init="k-means++",
            n_init=n_init,
            max_iter=max_iter,
            random_state=silhouette_seed,
        )
        kmeans.fit(X_scaled)  # Fit to the scaled data

        # Compute Silhouette Score
        score = silhouette_score(X_scaled, kmeans.labels_)
        silhouette_results.append((k, score))
        print(f"Clusters: {k}, Silhouette Score: {score}")

    # An empty range would otherwise fail inside max() with an opaque message.
    if not silhouette_results:
        raise ValueError("cluster_range is empty; no silhouette scores were computed.")

    # Select the number of clusters with the highest silhouette score
    best_k, best_score = max(silhouette_results, key=lambda x: x[1])
    print(f"Best number of clusters: {best_k}, Silhouette Score: {best_score}")

else:
    # Use a fixed number of clusters
    best_k = 3


Calculating silhouette scores to determine the optimal number of clusters...
Clusters: 3, Silhouette Score: 0.45840087099074767
Clusters: 4, Silhouette Score: 0.3887799827106933
Clusters: 5, Silhouette Score: 0.3516422319268987
Best number of clusters: 3, Silhouette Score: 0.45840087099074767


In [None]:
# Stop the silhouette timer and report the elapsed wall-clock time since
# silhouette_start was recorded.
silhouette_end = time.time()
print(f"Silhouette score calculation time: {silhouette_end - silhouette_start:.4f} seconds")

Silhouette score calculation time: 0.0834 seconds


In [None]:
# Measure time for K-means clustering.
# Wall-clock start; the matching end timestamp is taken in a separate cell
# after the repeated K-Means runs.
kmeans_start = time.time()

In [None]:
# Run K-Means several times with the selected number of clusters (`best_k`)
# and keep the run with the lowest inertia.
results = []  # To store results of each run
best_inertia = float('inf')  # To track the best run
best_labels = None  # To track the best cluster labels
best_run = None  # To track which run was the best

# Reuse the configured number of re-runs instead of a second hard-coded 10.
n_runs = n_init
# Each run gets its own seed derived from a fixed base: the runs still start
# from different initializations, but the whole cell is reproducible.
base_seed = 42

for run in range(n_runs):
    print(f"Run {run + 1}:")

    # Single-initialization K-Means (n_init=1); the restarts are this loop.
    kmeans = KMeans(
        n_clusters=best_k,
        init="k-means++",
        n_init=1,
        max_iter=max_iter,
        random_state=base_seed + run,
    )
    kmeans.fit(X_scaled)

    # Store results (e.g., inertia and cluster centers)
    results.append({
        "run": run + 1,
        "inertia": kmeans.inertia_,
        "cluster_centers": kmeans.cluster_centers_,
        "labels": kmeans.labels_,
    })

    # Check if this run is the best (lowest inertia)
    if kmeans.inertia_ < best_inertia:
        best_inertia = kmeans.inertia_
        best_labels = kmeans.labels_
        best_run = run + 1

# After the loop, assign the best cluster labels to the DataFrame
df_numeric["Cluster"] = best_labels

print(f"Best run: {best_run} with inertia: {best_inertia}")

Run 1:
Run 2:
Run 3:
Run 4:
Run 5:
Run 6:
Run 7:
Run 8:
Run 9:
Run 10:
Best run: 2 with inertia: 140.96581663074699


In [None]:
# Stop the K-means timer and report the elapsed wall-clock time since
# kmeans_start was recorded.
kmeans_end = time.time()
print(f"K-means clustering time: {kmeans_end - kmeans_start:.4f} seconds")

K-means clustering time: 0.0491 seconds


In [None]:
# Measure time for Hungarian algorithm.
# Wall-clock start for the cell that builds the cost matrix and solves the
# cluster-to-species assignment.
hungarian_start = time.time()

In [None]:
# Build the cost matrix for the assignment problem.
# Rows = true species labels, columns = predicted cluster ids. Each entry is
# the NEGATED overlap count, so minimising total cost maximises agreement.
true_labels = df_numeric["species"].unique()
clusters = range(best_k)
cost_matrix = np.zeros((len(true_labels), best_k))

for row_idx, species_name in enumerate(true_labels):
    is_species = df_numeric["species"] == species_name
    for cluster_id in clusters:
        in_cluster = df_numeric["Cluster"] == cluster_id
        cost_matrix[row_idx, cluster_id] = -len(df_numeric[is_species & in_cluster])

# Solve the assignment with SciPy's linear_sum_assignment
row_ind, col_ind = linear_sum_assignment(cost_matrix)

# Pair every matched cluster id with its species label. Clusters left
# unmatched (possible when best_k exceeds the number of species) get no entry
# and therefore map to NaN below.
cluster_to_label = dict(zip(col_ind, true_labels[row_ind]))
df_numeric["Assigned_Label"] = df_numeric["Cluster"].map(cluster_to_label)

# Report the resulting cluster -> species mapping
print("Mapping of clusters to true labels based on Hungarian Algorithm:")
print(cluster_to_label)

# Show the true label, cluster id and assigned label side by side
print("\nDataFrame Results:")
print(df_numeric[["species", "Cluster", "Assigned_Label"]].head())

Mapping of clusters to true labels based on Hungarian Algorithm:
{0: 'setosa', 2: 'versicolor', 1: 'virginica'}

DataFrame Results:
  species  Cluster Assigned_Label
0  setosa        0         setosa
1  setosa        0         setosa
2  setosa        0         setosa
3  setosa        0         setosa
4  setosa        0         setosa


In [None]:
# Stop the Hungarian timer and report the elapsed wall-clock time since
# hungarian_start was recorded.
hungarian_end = time.time()
print(f"Hungarian algorithm time: {hungarian_end - hungarian_start:.4f} seconds")

Hungarian algorithm time: 0.0282 seconds


In [None]:
# Measure time for cluster-to-species mapping and match column creation.
# Wall-clock start; the matching end timestamp is taken in a separate cell.
mapping_start = time.time()

In [None]:
# Step 1: Map every cluster to the species that occurs most often inside it
# (a majority vote, so several clusters may map to the same species).
correct_assignment = {}

# Iterate over the cluster ids that actually occur in the data. Looping over
# range(nunique()) assumed the ids were contiguous: if one id was missing, the
# highest id was never visited and its rows were left unmapped.
for cluster_label in sorted(df_numeric["Cluster"].unique()):
    cluster_label = int(cluster_label)  # plain int keys, as range() produced

    # Filter for all rows in the current cluster
    cluster_points = df_numeric[df_numeric["Cluster"] == cluster_label]

    # Identify the most common species in the cluster
    most_common_species = cluster_points["species"].mode()

    # mode() is empty when the cluster has no non-missing species values
    if most_common_species.empty:
        print(f"Cluster {cluster_label} has no species labels. Skipping...")
        continue

    # Handle ties in mode
    if len(most_common_species) > 1:
        print(f"Warning: Cluster {cluster_label} has multiple modes. Using the first one.")

    # Map this cluster to the most frequent species
    correct_assignment[cluster_label] = most_common_species.iloc[0]

# Step 2: Create a column mapping clusters to the majority species
df_numeric["Assigned_Cluster"] = df_numeric["Cluster"].map(correct_assignment)

# Step 3: Create a match column to indicate if the true species equals the
# majority species of the row's cluster
df_numeric["Match"] = df_numeric["species"] == df_numeric["Assigned_Cluster"]

In [None]:
# Stop the mapping timer and report the elapsed wall-clock time since
# mapping_start was recorded.
mapping_end = time.time()
print(f"Cluster-to-species mapping and match column creation time: {mapping_end - mapping_start:.4f} seconds")

Cluster-to-species mapping and match column creation time: 0.0300 seconds


In [None]:
# Measure time for WCSS computation.
# Wall-clock start; the matching end timestamp is taken in a separate cell
# after the sweep over k.
wcss_start = time.time()

In [None]:
# Within-cluster sum of squares (WCSS) for a sweep of cluster counts
k_values = range(1, 11)  # Testing k from 1 to 10
wcss = []

for n_clusters in k_values:
    # Fit on the standardized data; inertia_ is the WCSS of the fitted model
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_scaled)
    wcss.append(kmeans.inertia_)

# Report one line per tested k
print("\nWCSS values for different k:")
for n_clusters, within_ss in zip(k_values, wcss):
    print(f"k = {n_clusters}, WCSS = {within_ss:.4f}")



WCSS values for different k:
k = 1, WCSS = 600.0000
k = 2, WCSS = 223.7320
k = 3, WCSS = 140.9658
k = 4, WCSS = 114.6179
k = 5, WCSS = 91.2954
k = 6, WCSS = 81.7566
k = 7, WCSS = 71.3198
k = 8, WCSS = 62.6518
k = 9, WCSS = 55.2618
k = 10, WCSS = 50.6232


In [None]:
# Stop the WCSS timer and report the elapsed wall-clock time since
# wcss_start was recorded.
wcss_end = time.time()
print(f"WCSS computation time: {wcss_end - wcss_start:.4f} seconds")

WCSS computation time: 0.2772 seconds


In [None]:
# Measure time for accuracy calculation.
# Wall-clock start; the matching end timestamp is taken in a separate cell.
accuracy_start = time.time()

In [None]:
# Step 4: Compute the accuracy of the Hungarian (one-to-one) mapping
# Validation: check the columns this cell actually reads. The previous check
# inspected a pre-existing "Match" column that is overwritten below anyway.
required_columns = ["species", "Cluster", "Assigned_Label"]
missing_columns = [col for col in required_columns if col not in df_numeric.columns]
if missing_columns:
    raise ValueError(
        f"Missing column(s) {missing_columns} in the DataFrame. "
        "Ensure earlier steps were executed correctly."
    )

# Handle edge case for empty DataFrame before computing a mean over it
total_observations = len(df_numeric)  # Total data points
if total_observations == 0:
    raise ValueError("The DataFrame is empty; accuracy cannot be computed.")

# Calculate accuracy: a row matches when its true species equals the label
# assigned to its cluster. Rows whose cluster received no label (NaN) compare
# as False and therefore count as errors.
df_numeric["Match"] = df_numeric["species"] == df_numeric["Assigned_Label"]
accuracy = df_numeric["Match"].mean()
print(f"Clustering Accuracy after Hungarian Algorithm: {accuracy:.4f}")

# Display the columns that "Match" was derived from, so the preview is
# consistent with the reported accuracy
print(df_numeric[["Cluster", "Assigned_Label", "Match", "species"]].head())  # Show relevant columns only

Clustering Accuracy after Hungarian Algorithm: 0.8333
   Cluster Assigned_Cluster  Match species
0        0           setosa   True  setosa
1        0           setosa   True  setosa
2        0           setosa   True  setosa
3        0           setosa   True  setosa
4        0           setosa   True  setosa


In [None]:
# Stop the accuracy timer and report the elapsed wall-clock time since
# accuracy_start was recorded.
accuracy_end = time.time()
print(f"Accuracy calculation time: {accuracy_end - accuracy_start:.4f} seconds")

Accuracy calculation time: 0.0450 seconds


In [None]:
# End measuring total execution time.
# The difference is taken against start_time from the earlier timing cell, so
# it spans every cell executed in between (including any idle time).
end_time = time.time()
print(f"\nTotal execution time: {end_time - start_time:.4f} seconds")


Total execution time: 0.6146 seconds
