In [33]:
# Importing the required modules and libraries.
import numpy as np
import matplotlib.pyplot as plt

In [34]:
# Read the raw samples from disk and save a scatter plot of the first two columns.
data = np.loadtxt("data/kmeans_data.txt")

fig, ax = plt.subplots()
ax.scatter(data[:, 0], data[:, 1], marker="x", color='b')
ax.set(
    title="Visualising the actual dataset",
    xlabel="Feature-1",
    ylabel="Feature-2",
)
fig.savefig("Original Dataset.png")
# Close the figure once it is written so it does not linger in memory.
plt.close(fig)

In [35]:
# Hand-crafted features: map every sample (x, y) to (sqrt(x^2 + y^2), y),
# i.e. replace the first coordinate by the sample's distance from the origin.
radius = np.sqrt((data ** 2).sum(axis=1))
transformed_data = np.column_stack((radius, data[:, 1]))

fig, ax = plt.subplots()
ax.scatter(transformed_data[:, 0], transformed_data[:, 1], marker="x", color='b')
ax.set(
    title="Visualising the transformed dataset",
    xlabel="Feature-1",
    ylabel="Feature-2",
)
fig.savefig("Transformed Dataset.png")
plt.close(fig)

def k_means(tranformed_data, k, max_iterations, tolerance):
    """Cluster a feature matrix into ``k`` groups with Lloyd's algorithm.

    The function works only on the array it is given; it does not read any
    module-level variables.

    Parameters
    ----------
    tranformed_data : array-like of shape (n_samples, n_features)
        Features to cluster. A 1-D input is treated as a single feature
        column. (The misspelled parameter name is kept on purpose so that
        existing keyword callers keep working.)
    k : int
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    max_iterations : int
        Upper bound on the number of assign/update rounds; must be >= 1.
    tolerance : float
        Iteration stops once the Frobenius norm of the centroid shift
        between two rounds drops below this value.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Index of the nearest centroid for every sample.

    Raises
    ------
    ValueError
        If ``k`` or ``max_iterations`` is out of range.
    """
    points = np.asarray(tranformed_data, dtype=float)
    if points.ndim == 1:
        points = points[:, np.newaxis]
    if not 1 <= k <= points.shape[0]:
        raise ValueError("k must be between 1 and the number of samples")
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    # Deterministic initialisation: the first k samples of the *given*
    # features act as the starting centroids (copied so the input is never
    # modified).
    centroids = points[:k].copy()
    for _ in range(max_iterations):
        # Assignment step: distance from every sample to every centroid.
        distances = np.linalg.norm(points[:, np.newaxis] - centroids, axis=2)
        labels = np.argmin(distances, axis=1)

        # Update step. A cluster that received no samples keeps its previous
        # centroid; taking the mean of an empty slice would give NaN, which
        # would make the convergence test below fail forever.
        new_centroids = centroids.copy()
        for j in range(k):
            members = points[labels == j]
            if len(members):
                new_centroids[j] = members.mean(axis=0)

        if np.linalg.norm(new_centroids - centroids) < tolerance:
            break
        centroids = new_centroids
    return labels

# Cluster the hand-crafted features and colour the original samples by label.
k, max_iterations, tolerance = 2, 10 ** 6, 1e-4
labels = k_means(transformed_data, k, max_iterations, tolerance)
# Label 1 is drawn in green, every other label in red.
colors = np.where(labels == 1, "green", "red").tolist()

fig, ax = plt.subplots()
ax.scatter(data[:, 0], data[:, 1], marker="x", color=colors)
ax.set(
    title="Results of K-means clustering after choosing hand-crafted features",
    xlabel="Feature-1",
    ylabel="Feature-2",
)
fig.savefig("Hand-Picked Transformation results.png")
plt.close(fig)

In [37]:
# Kernel K-means clustering using RBF features computed from a single landmark point.
def rbf_kernel(data, landmark, gamma):
    """Return the RBF similarity of every row of ``data`` to ``landmark``.

    Computes ``exp(-gamma * ||row - landmark||^2)`` for each row, where the
    norm is taken along axis 1, and returns one value per row.
    """
    distances = np.linalg.norm(data - landmark, axis=1)
    squared_distances = distances ** 2
    return np.exp(-gamma * squared_distances)

def k_means(tranformed_data, k, max_iterations, tolerance):
    """Cluster a feature matrix into ``k`` groups with Lloyd's algorithm.

    The function works only on the array it is given; it does not read any
    module-level variables.

    Parameters
    ----------
    tranformed_data : array-like of shape (n_samples, n_features)
        Features to cluster. A 1-D input is treated as a single feature
        column. (The misspelled parameter name is kept on purpose so that
        existing keyword callers keep working.)
    k : int
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    max_iterations : int
        Upper bound on the number of assign/update rounds; must be >= 1.
    tolerance : float
        Iteration stops once the Frobenius norm of the centroid shift
        between two rounds drops below this value.

    Returns
    -------
    centroids : ndarray of shape (k, n_features)
        Centroids that the returned labels were assigned against.
    labels : ndarray of shape (n_samples,)
        Index of the nearest centroid for every sample.

    Raises
    ------
    ValueError
        If ``k`` or ``max_iterations`` is out of range.
    """
    points = np.asarray(tranformed_data, dtype=float)
    if points.ndim == 1:
        points = points[:, np.newaxis]
    if not 1 <= k <= points.shape[0]:
        raise ValueError("k must be between 1 and the number of samples")
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    # Deterministic initialisation: the first k samples of the *given*
    # features act as the starting centroids (copied so the input is never
    # modified).
    centroids = points[:k].copy()
    for _ in range(max_iterations):
        # Assignment step: distance from every sample to every centroid.
        distances = np.linalg.norm(points[:, np.newaxis] - centroids, axis=2)
        labels = np.argmin(distances, axis=1)

        # Update step. A cluster that received no samples keeps its previous
        # centroid; taking the mean of an empty slice would give NaN, which
        # would make the convergence test below fail forever.
        new_centroids = centroids.copy()
        for j in range(k):
            members = points[labels == j]
            if len(members):
                new_centroids[j] = members.mean(axis=0)

        if np.linalg.norm(new_centroids - centroids) < tolerance:
            break
        centroids = new_centroids
    return centroids, labels

# Settings for the single-landmark kernel K-means experiment.
num_runs, k, gamma, max_iterations, tolerance = 10, 2, 0.1, 10 ** 6, 1e-4
size = data.shape[0]

# Seeded generator so that the landmark drawn in each run is the same after
# every "Restart & Run All"; change SEED to explore other landmarks.
SEED = 0
rng = np.random.default_rng(SEED)

for run in range(num_runs):
    # Choose a single random landmark point (uniformly over the row indices).
    landmark_index = rng.integers(size)
    landmark = data[landmark_index]
    # One RBF similarity per sample, shaped as a single feature column.
    features = rbf_kernel(data, landmark, gamma)[:, np.newaxis]

    # Run K-means, passing the landmark-based features.
    centroids, labels = k_means(features, k, max_iterations, tolerance)

    # Visualising the results: label 1 in green, other labels in red, and the
    # landmark drawn on top in blue.
    colors = ["green" if label == 1 else "red" for label in labels]
    fig, ax = plt.subplots()
    ax.scatter(data[:, 0], data[:, 1], marker="x", color=colors)
    ax.scatter(landmark[0], landmark[1], marker="x", color="blue")
    ax.set_title(f"Run {run + 1} - K-Means Clustering with Single Landmark")
    ax.set_xlabel("Feature-1")
    ax.set_ylabel("Feature-2")
    path = f"Run {run + 1} of K-means clustering with single landmark point.png"
    fig.savefig(path)
    # Close each figure explicitly so the loop does not accumulate open figures.
    plt.close(fig)