In [69]:
from sklearn import datasets
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Artificial dataset
data_artificial = np.random.uniform(-1,1,(400,2))
target_function = lambda x: 1 if ((x[0]>=0.7) or (x[0]<=0.3 and x[1]>= -0.2-x[0])) else 0
target_artificial = np.apply_along_axis(target_function, 1, data_artificial)

# Kmeans dataset
iris = datasets.load_iris()
data_iris = iris.data
target_iris = iris.target

In [62]:
def q_error(data, centroids, cluster):
    error = 0.0
    for i, centroid in enumerate(centroids):
        index = np.where(np.asarray(cluster) == i)
        dist = np.linalg.norm(data[index] - centroid)
        dist = dist/len(index)
        error = error + dist
    return error/len(centroids)

def kmeans_method(data, num_clusters):
    kmeans = KMeans(n_clusters=num_clusters, max_iter=1000)
    output = kmeans.fit_predict(data)
    centroids = kmeans.cluster_centers_ 
    return output, centroids

def get_velocity(gbest, w, c1, c2, velocity, position, centroids):
    inertia = w * velocity
    personal = c1 * random.uniform(0,1) * (position - centroids)
    social = c2 * random.uniform(0,1) * (gbest - centroids)
    velocity = inertia + personal + social
    return velocity

def get_centroids(data, num_clusters, cluster):
    centroids = []
    for i in range(num_clusters):
        index = np.where(np.asarray(cluster) == i)
        data1 = data[index]
        centroids.append(np.mean(data1, axis=0))
    return centroids

def pso_method(data, num_clusters, w=0.72, particles=10, iterations=30, c1=1.49, c2=1.49):
    #set centroids to random positions
    index = np.random.choice(list(range(len(data))), num_clusters)
    centroids = data[index]     
    cluster = [0] * len(data)
    #initialize the global and local fitness
    local_best_score = [99999] * particles
    global_best_pos = centroids
    global_best_error = 99999

    # Creating swarm 
    swarm_centroids = []
    swarm_velocities = []
    for i in range(particles):
        #select random centroids
        index = np.random.choice(list(range(len(data))), num_clusters)
        centroids = data[index]   
        #add centroids to swarm
        swarm_centroids.append(centroids)   
        #initialize velocoties to 0
        swarm_velocities.append(np.zeros_like(centroids)) 
    local_best_pos = swarm_centroids

    # Loop over number of iterations
    for it in range(iterations):
        # Loop over the particles
        for particle in range(particles):
            # Loop over each data vector
            for j in range(data.shape[0]):
                #Calculate euclidean distance
                dist = np.linalg.norm((data[j]-swarm_centroids[particle]), axis=1)
                #Assign data vector to cluster
                cluster[j] = np.argmin(dist)
            
            #calculate fitness
            velocity = get_velocity(np.asarray(global_best_pos), w, c1, c2, np.asarray(swarm_velocities[particle]), np.asarray(local_best_pos[particle]), np.asarray(swarm_centroids[particle]))
            swarm_centroids[particle] = swarm_centroids[particle] + velocity
            
            #update centroids
            new_centroids = get_centroids(data, num_clusters, cluster)
            qerror = q_error(data, new_centroids, cluster)

            # Update local best position and score
            if qerror < local_best_score[particle]:
                local_best_score[particle] = qerror
                swarm_centroids[particle] = new_centroids

            # Update global best position and score
            if qerror < global_best_error:
                global_best_error = qerror
                global_best_pos = new_centroids
    return cluster, new_centroids          


In [63]:
clusters_kmeans_iris, centroids_kmeans_iris = kmeans_method(data = data_iris,num_clusters = 3)
clusters_pso_iris, centroids_pso_iris = pso_method(data = data_iris,num_clusters = 3)
clusters_kmeans_art, centroids_kmeans_art = kmeans_method(data = data_artificial,num_clusters = 2)
clusters_pso_art, centroids_pso_art = pso_method(data = data_artificial,num_clusters = 2)


print(q_error(data_iris, centroids_kmeans_iris, clusters_kmeans_iris))
print(q_error(data_iris, centroids_pso_iris, clusters_pso_iris))
print(q_error(data_artificial, centroids_kmeans_art, clusters_kmeans_iris))
print(q_error(data_artificial, centroids_pso_art, clusters_pso_iris))

5.02982569656552
5.02982569656552
7.4656416712068765
6.038131961515676


In [88]:
from statistics import mean,stdev
num = 30
q_error_kmeans_iris = [0.0] * num
q_error_pso_iris = [0.0] * num
q_error_kmeans_artificial = [0.0] * num
q_error_pso_artificial = [0.0] * num

for n in range(num):
    clusters_kmeans_iris, centroids_kmeans_iris = kmeans_method(data = data_iris,num_clusters = 3)
    clusters_pso_iris, centroids_pso_iris = pso_method(data = data_iris,num_clusters = 3)
    clusters_kmeans_art, centroids_kmeans_art = kmeans_method(data = data_artificial,num_clusters = 2)
    clusters_pso_art, centroids_pso_art = pso_method(data = data_artificial,num_clusters = 2)
    q_error_kmeans_iris[n] = q_error(data_iris, centroids_kmeans_iris, clusters_kmeans_iris)
    q_error_pso_iris[n] = q_error(data_iris, centroids_pso_iris, clusters_pso_iris)
    q_error_kmeans_artificial[n] = q_error(data_artificial, centroids_kmeans_art, clusters_kmeans_iris)
    q_error_pso_artificial[n] = q_error(data_artificial, centroids_pso_art, clusters_pso_iris)


In [108]:
kmeans = [str(str(round(mean(q_error_kmeans_iris),5)) + " +/- " + str(round(stdev(q_error_kmeans_iris),5))),str(str(round(mean(q_error_kmeans_artificial),5)) + " +/- " + str(round(stdev(q_error_kmeans_artificial),5)))]
pso = [str(str(round(mean(q_error_pso_iris),5)) + " +/- " + str(round(stdev(q_error_pso_iris),5))),str(str(round(mean(q_error_pso_artificial),5)) + " +/- " + str(round(stdev(q_error_pso_artificial),5)))]

results = pd.DataFrame(list(zip(kmeans,pso)),columns = ['Kmeans','PSO'],index=['Iris', 'Artificial'])
results.head()


Unnamed: 0,Kmeans,PSO
Iris,5.02983 +/- 0.0,5.03046 +/- 0.00348
Artificial,7.00974 +/- 0.50511,6.73217 +/- 0.56261
