In [1]:
from sklearn import datasets
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import random


def q_error(centroids, cluster, data):
    error = 0.0
    for i, c in enumerate(centroids):
        idx = np.where(np.asarray(cluster) == i)
        dist = np.linalg.norm(data[idx] - c)
        dist = dist/len(idx)
        error = error + dist
    error = error/len(centroids)
    return error

def get_velocity(gbest, w, c1, c2, velocity, position, centroids):
    inertia = w * velocity
    personal = c1 * random.uniform(0,1) * (position - centroids)
    social = c2 * random.uniform(0,1) * (gbest - centroids)
    velocity = inertia + personal + social
    return velocity

def calculate_m(n, cluster, datapoints):
    mj = []
    for i in range(n):
        idx = np.where(np.asarray(cluster) == i)
        data = datapoints[idx]
        mj.append(np.mean(data, axis=0))
    return mj

def pso(datapoints, n, text, w=0.72, particles=10, iterations=30, c1=1.49, c2=1.49):
    index = np.random.choice(list(range(len(datapoints))), n)
    centroids = datapoints[index]     
    cluster = [0] * len(datapoints)
    best_score = [99999] * particles
    gbest_pos = centroids
    gbest_error = 99999

    # Create swarm (Step 1 in paper)
    swarm_centroids = []
    swarm_velocities = []
    for i in range(particles):
        index = np.random.choice(list(range(len(datapoints))), n)
        centroids = datapoints[index]                           # Randomly select centroids
        swarm_centroids.append(centroids)                       # Append centoids to swarm
        swarm_velocities.append(np.zeros_like(centroids))       # Append velocities of same shape to swarm
    best_pos = swarm_centroids

    # Loop and update (Step 2 in paper)
    for t in range(iterations):
        for particle in range(particles):
            for j in range(datapoints.shape[0]):
                dist = np.linalg.norm((datapoints[j]-swarm_centroids[particle]), axis=1)
                cluster[j] = np.argmin(dist)

            velocity = get_velocity(np.asarray(gbest_pos), w, c1, c2, np.asarray(swarm_velocities[particle]), np.asarray(best_pos[particle]), np.asarray(swarm_centroids[particle]))
            swarm_centroids[particle] = swarm_centroids[particle] + velocity
            
            mj = calculate_m(n, cluster, datapoints)
            err = q_error(mj, cluster, datapoints)

            # Update local best position and score
            if err < best_score[particle]:
                best_score[particle] = err
                swarm_centroids[particle] = mj

            # Update global best bosition and score
            if err < gbest_error:
                gbest_error = err
                gbest_pos = mj
    print("(PSO, " + text + ") Error: ", q_error(mj, cluster, datapoints))          
    return 0

def kmeans(datapoints, k):
    kmeans = KMeans(n_clusters=k, max_iter=30, random_state=0)
    predictions = kmeans.fit_predict(datapoints)
    return predictions, kmeans





In [9]:
# Load the iris dataset
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target

# Create the artificial dataset
get_y_value = lambda x: 1 if ((x[0]>=0.7) or (x[0]<=0.3 and x[1]>= -0.2-x[0])) else 0
X_artificial = np.random.uniform(-1,1,(400,2))
y_artificial = np.apply_along_axis(get_y_value, 1, X_artificial)

# kMeans
y_kmeans, kmeans_iris = kmeans(X_iris, k=3)
print("(kMeans, Iris) Error: " , q_error(kmeans_iris.cluster_centers_, y_kmeans, X_iris))
y_kmeans, kmeans_art = kmeans(X_artificial, k=2)
print("(kMeans, Artificial) Error: " , q_error(kmeans_art.cluster_centers_, y_kmeans, X_artificial))

# PSO 
pso(X_iris, 3, "Iris")
pso(X_artificial, 2, "Artificial")

(kMeans, Iris) Error:  5.02982569656552
(kMeans, Artificial) Error:  9.245853449777584
(PSO, Iris) Error:  5.02982569656552
(PSO, Artificial) Error:  8.982198757277216


0

In [8]:
# #Visualising the clusters
# plt.scatter(X_iris[y_kmeans == 0, 0], X_iris[y_kmeans == 0, 1], s = 100, c = 'red', label = 'Iris-setosa')
# plt.scatter(X_iris[y_kmeans == 1, 0], X_iris[y_kmeans == 1, 1], s = 100, c = 'blue', label = 'Iris-versicolour')
# plt.scatter(X_iris[y_kmeans == 2, 0], X_iris[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Iris-virginica')

# #Plotting the centroids of the clusters
# plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:,1], s = 100, c = 'yellow', label = 'Centroids')

# plt.legend()
# plt.show()
# # Apply kmeans
# predictions = KMeans(n_clusters=3, random_state=0).fit_predict(X)
# print(kmeans)