# In [5]:
# ===============================================================
#   K-MEANS NOISE REMOVAL WITH PLOTLY
# ===============================================================

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# ------------------------------
# 1. Build the noisy synthetic dataset
# ------------------------------
# Seed NumPy's global generator so the uniform noise drawn below is repeatable.
np.random.seed(42)

# Three blobs, 300 samples in total, each with standard deviation 1.0.
X, y_true = make_blobs(
    n_samples=300,
    centers=3,
    cluster_std=1.0,
    random_state=42,
)

# Draw 30 extra 2-D points uniformly between -10 and 10 to act as noise,
# then append them after the blob samples.
n_noise = 30
noise = np.random.uniform(-10, 10, (n_noise, 2))
X_noisy = np.concatenate((X, noise), axis=0)

# ------------------------------
# 1a. Scatter plot of the raw (noisy) data
# ------------------------------
# One row per sample; the two coordinate columns become the plot axes.
df_original = pd.DataFrame(X_noisy, columns=["Feature 1", "Feature 2"])

fig = px.scatter(
    df_original,
    x="Feature 1",
    y="Feature 2",
    color_discrete_sequence=['gray'],
    title="Original Data with Noise",
)
fig.show()

# ------------------------------
# 2. Apply K-Means
# ------------------------------
k = 3  # number of clusters to fit
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# "auto" in 1.4 (and emits a FutureWarning about it in 1.2-1.3), so leaving it
# unset makes the fitted centroids depend on the installed version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
labels = kmeans.fit_predict(X_noisy)  # cluster index assigned to each row of X_noisy
centroids = kmeans.cluster_centers_   # fitted cluster centres, one row per cluster

# ------------------------------
# 3. Compute distances to cluster centroids (Euclidean)
# ------------------------------
# centroids[labels] picks, for every sample, the centroid of the cluster it was
# assigned to, so a single row-wise norm replaces the per-sample Python loop.
distances = np.linalg.norm(X_noisy - centroids[labels], axis=1)

# ------------------------------
# 4. Split samples into kept points and discarded outliers
# ------------------------------
# The cut-off is the 90th percentile of the centroid distances: samples at or
# below it are kept, the rest (roughly the farthest 10%) are treated as noise.
threshold = np.percentile(distances, 90)
mask = np.less_equal(distances, threshold)
X_denoised, X_noise_removed = X_noisy[mask], X_noisy[np.logical_not(mask)]

# ------------------------------
# 5. Assemble one long-format DataFrame for Plotly
# ------------------------------
# Rows are stacked group by group (kept, removed, centroids); "Type" tags each
# row with its group so the plot can colour and style the groups separately.
_groups = [
    ("Kept Points", X_denoised),
    ("Removed Noise", X_noise_removed),
    ("Centroids", centroids),
]
_coords = np.vstack([pts for _, pts in _groups])
df_denoised = pd.DataFrame({
    "Feature 1": _coords[:, 0],
    "Feature 2": _coords[:, 1],
    "Type": [tag for tag, pts in _groups for _ in range(len(pts))],
})

# ------------------------------
# 6. Plot denoised data with centroids
# ------------------------------
# Colour and symbol are both driven by "Type": kept points are green circles,
# removed noise is red circles, and centroids are blue "x" markers.
fig = px.scatter(df_denoised, x="Feature 1", y="Feature 2", color="Type",
                 color_discrete_map={"Kept Points":"green",
                                     "Removed Noise":"red",
                                     "Centroids":"blue"},
                 symbol="Type",
                 symbol_map={"Kept Points":"circle",
                             "Removed Noise":"circle",
                             "Centroids":"x"},
                 title="Denoised Data using K-Means")

# Small markers for every marker trace first ...
fig.update_traces(marker=dict(size=6), selector=dict(mode='markers'))
# ... then enlarge only the centroid trace. Selecting it by trace name avoids
# relying on the position of "Centroids" in df_denoised["Type"].unique()
# matching the order of the traces in fig.data.
fig.update_traces(marker=dict(size=15), selector=dict(name="Centroids"))

fig.show()
