In [1]:
# Load libraries
import random
import sys
import warnings

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

In [109]:
# Fetch the iris dataset bundled with scikit-learn
iris = load_iris()
# Build a DataFrame from the feature columns and append the target as 'species'
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(species=iris.target)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [110]:
# Extract the DataFrame contents as a NumPy matrix.
# NOTE(review): iris_df still holds the 'species' label column at this point,
# so the label becomes one of the matrix columns - confirm this is intended
# for the clustering runs below.
iris_matrix = iris_df.to_numpy()
iris_matrix

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.6, 3.1, 1.5, 0.2, 0. ],
       [5. , 3.6, 1.4, 0.2, 0. ],
       [5.4, 3.9, 1.7, 0.4, 0. ],
       [4.6, 3.4, 1.4, 0.3, 0. ],
       [5. , 3.4, 1.5, 0.2, 0. ],
       [4.4, 2.9, 1.4, 0.2, 0. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [5.4, 3.7, 1.5, 0.2, 0. ],
       [4.8, 3.4, 1.6, 0.2, 0. ],
       [4.8, 3. , 1.4, 0.1, 0. ],
       [4.3, 3. , 1.1, 0.1, 0. ],
       [5.8, 4. , 1.2, 0.2, 0. ],
       [5.7, 4.4, 1.5, 0.4, 0. ],
       [5.4, 3.9, 1.3, 0.4, 0. ],
       [5.1, 3.5, 1.4, 0.3, 0. ],
       [5.7, 3.8, 1.7, 0.3, 0. ],
       [5.1, 3.8, 1.5, 0.3, 0. ],
       [5.4, 3.4, 1.7, 0.2, 0. ],
       [5.1, 3.7, 1.5, 0.4, 0. ],
       [4.6, 3.6, 1. , 0.2, 0. ],
       [5.1, 3.3, 1.7, 0.5, 0. ],
       [4.8, 3.4, 1.9, 0.2, 0. ],
       [5. , 3. , 1.6, 0.2, 0. ],
       [5. , 3.4, 1.6, 0.4, 0. ],
       [5.2, 3.5, 1.5, 0.2, 0. ],
       [5.2, 3.4, 1.4, 0.2, 0. ],
       [4.7, 3

# Common functions

## Function to calculate Euclidean Distance

In [111]:
def calculate_euclidean_distance(np_array1, np_array2):
    """
    Compute the Euclidean distance between two points.

    Args:
        np_array1 (np.ndarray): 1-dimensional array with n elements
        np_array2 (np.ndarray): 1-dimensional array with n elements

    Returns:
        The square root of the sum of the squared element-wise differences.
    """
    # d = sqrt((X1 - Y1)^2 + (X2 - Y2)^2 + ... + (Xn - Yn)^2)
    difference = np_array1 - np_array2
    squared_total = np.sum(difference ** 2)
    return np.sqrt(squared_total)

## Function to assign an entry to a cluster

In [112]:
def assign_to_cluster(entry, centroids):
    """
    Find the centroid closest (in Euclidean distance) to an entry.

    Args:
        entry (np.ndarray): the entry to be assigned. It is a 1-dimensional array with n elements.
        centroids (List[np.ndarray]): a list containing the centroids of each cluster. Each centroid is a 1-dimensional NumPy array with n elements.

    Returns:
        tuple: (index of the nearest centroid, distance to that centroid).
            When no centroid is strictly closer than the largest finite float
            (for example when ``centroids`` is empty) the result is
            ``(-1, sys.float_info.max)``.
    """
    # start from "no cluster" and the largest representable float
    nearest_index = -1
    nearest_distance = sys.float_info.max
    for index, centroid in enumerate(centroids):
        distance = calculate_euclidean_distance(entry, centroid)
        # strict comparison: on a tie the earliest centroid is kept
        if distance < nearest_distance:
            nearest_index, nearest_distance = index, distance
    return nearest_index, nearest_distance

## Function to assign all entries to clusters (get clusters)

In [113]:
def get_clusters(matrix, centroids):
    """
    Group the row indices of a matrix by their nearest centroid.

    Args:
        matrix (np.ndarray): an mxn matrix obtained from the pandas dataframe.
        centroids (List[np.ndarray]): a list containing the centroids of each cluster. Each centroid is a 1-dimensional NumPy array with n elements.

    Returns:
        List[List[int]]: one inner list per centroid holding the indices of
            the rows assigned to it, in ascending row order.
    """
    # first pass: nearest centroid index for every row (the distance is discarded)
    nearest = [
        assign_to_cluster(matrix[row, :], centroids)[0]
        for row in range(matrix.shape[0])
    ]
    # second pass: bucket the row indices, one bucket per centroid
    groups = [[] for _ in centroids]
    for row, cluster_index in enumerate(nearest):
        groups[cluster_index].append(row)
    return groups

## Function to initialize centroids with random numbers 

In [114]:
def init_centroids_random(matrix, centroids):
    """
    Fill every centroid, in place, with random coordinates.

    Each coordinate is drawn uniformly between the minimum and maximum of the
    corresponding matrix column.

    Args:
        matrix (np.ndarray): an mxn matrix obtained from the pandas dataframe.
        centroids (List[np.ndarray]): a list containing the centroids of each cluster. Each centroid is a 1-dimensional NumPy array with n elements.

    Returns:
        The same ``centroids`` list that was passed in, after being overwritten.
    """
    column_count = matrix.shape[1]
    for column in range(column_count):
        # value range of the current column
        column_values = matrix[:, column]
        low, high = np.min(column_values), np.max(column_values)
        # one draw per centroid, in list order
        for centroid in centroids:
            centroid[column] = np.random.uniform(low, high)
    return centroids

# K-means algorithm

In [115]:
def recalculate_centroids_k_means(matrix, clusters):
    """
    Compute a new centroid for every cluster.

    A non-empty cluster gets the column-wise mean of its member rows. An empty
    cluster gets a fresh random point whose coordinates are drawn uniformly
    from each column's [min, max] range.

    Args:
        matrix (np.ndarray): an mxn matrix obtained from the pandas dataframe.
        clusters (List[List[int]]): a list containing lists representing clusters.
            Each inner list contains the index of each entry that belongs to the cluster.

    Returns:
        List[np.ndarray]: one 1-dimensional array with n elements per cluster,
            in the same order as ``clusters``.
    """
    # Column bounds are the same for every empty cluster; compute them at most
    # once, and only if an empty cluster is actually encountered.
    column_bounds = None
    centroids = []
    for members in clusters:
        if len(members) > 0:
            # new centroid = mean of the member rows across each dimension
            centroids.append(np.mean(matrix[members], axis=0))
            continue
        if column_bounds is None:
            column_bounds = (np.min(matrix, axis=0), np.max(matrix, axis=0))
        lows, highs = column_bounds
        # Re-seed the empty cluster at a random position. Wrap the draws in an
        # ndarray so every centroid has the same type as the mean-based ones.
        centroids.append(
            np.array([np.random.uniform(low, high) for low, high in zip(lows, highs)])
        )
    return centroids

In [116]:
def k_means(matrix, k, max_iter=1000):
    """
    Cluster the rows of a matrix with the k-means algorithm.

    Centroids start at random positions inside each column's value range and
    are refined by alternating an assignment step and a centroid
    recalculation step until the centroids stop changing or ``max_iter``
    recalculations have been performed.

    Args:
        matrix (np.ndarray): an mxn matrix obtained from the pandas dataframe.
        k (int): number of clusters.
        max_iter (int): upper bound on the number of centroid recalculations.
            Empty clusters are re-seeded at random on every pass, so without a
            bound the loop can fail to terminate (for instance when ``k`` is
            larger than the number of distinct rows).

    Returns:
        tuple: ``(clusters, centroids)`` where ``clusters`` is a list of k lists
            of row indices and ``centroids`` is the list of centroids that
            produced that assignment.

    Raises:
        ValueError: if ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")
    # number of columns in the matrix
    n = matrix.shape[1]
    # one zero-filled array per centroid; overwritten in place below
    centroids = [np.zeros(n) for _ in range(k)]
    # (1) Initialize centroids with random numbers
    init_centroids_random(matrix, centroids)
    # (2) Assign entries to centroids
    clusters = get_clusters(matrix, centroids)
    for _ in range(max_iter):
        # (3) Recalculate centroid values
        updated_centroids = recalculate_centroids_k_means(matrix, clusters)
        # converged once a recalculation leaves every centroid unchanged
        if np.array_equal(centroids, updated_centroids):
            return clusters, centroids
        centroids = updated_centroids
        # keep the assignment in step with the centroids that will be returned
        clusters = get_clusters(matrix, centroids)
    warnings.warn(
        "k_means did not converge within %d iterations; "
        "returning the last assignment" % max_iter,
        RuntimeWarning,
        stacklevel=2,
    )
    return clusters, centroids

# Genetic algorithm

In [None]:
# TODO: implement the genetic algorithm (placeholder cell)

# Brute force algorithm

In [94]:
# TODO: implement the brute force algorithm (placeholder cell)

# Test

In [117]:
# Run k-means with k=32; keep only the cluster membership (the returned
# centroids are discarded) and print the row indices of each cluster.
clusters, _ = k_means(iris_matrix, 32)
print(clusters)

[[56, 70, 85], [102, 109, 120, 143], [103, 108, 111, 116, 128, 132, 137], [68, 72, 87], [110, 112, 139, 141, 145, 147], [57, 60, 93, 98], [54, 58, 65, 75, 76, 86], [64], [115, 136, 148], [79], [119], [50, 52, 77], [5, 10, 18, 20, 31, 36, 48], [125, 129], [53, 62, 69, 80, 81, 89], [101, 113, 121, 142], [117, 131], [59], [114], [55, 66, 84, 90], [17, 19, 21, 23, 24, 26, 27, 43, 44, 46], [83], [106], [105, 107, 118, 122, 130, 135], [0, 4, 6, 7, 9, 11, 22, 25, 28, 29, 30, 34, 35, 37, 39, 40, 49], [1, 2, 3, 8, 12, 13, 38, 41, 42, 45, 47], [51, 63, 73, 74, 78, 91, 97], [61, 67, 71, 82, 88, 92, 94, 95, 96, 99], [100, 104, 124, 140, 144], [134], [14, 15, 16, 32, 33], [123, 126, 127, 133, 138, 146, 149]]


In [118]:
# Run k-means with k=4; keep only the cluster membership (the returned
# centroids are discarded) and print the row indices of each cluster.
clusters, _ = k_means(iris_matrix, 4)
print(clusters)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], [50, 51, 52, 54, 56, 58, 63, 65, 68, 70, 72, 73, 74, 75, 76, 77, 78, 83, 85, 86, 87, 91, 97, 101, 113, 114, 119, 121, 123, 126, 127, 133, 134, 138, 142, 146, 149], [53, 55, 57, 59, 60, 61, 62, 64, 66, 67, 69, 71, 79, 80, 81, 82, 84, 88, 89, 90, 92, 93, 94, 95, 96, 98, 99, 106], [100, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 120, 122, 124, 125, 128, 129, 130, 131, 132, 135, 136, 137, 139, 140, 141, 143, 144, 145, 147, 148]]


In [100]:
# Run k-means with k=3; keep only the cluster membership (the returned
# centroids are discarded) and print the row indices of each cluster.
clusters, _ = k_means(iris_matrix, 3)
print(clusters)

[[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 106], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], [77, 100, 101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]]
