In [5]:
import math
import numpy as np

In [42]:
def create_cluster(i, n_clusters, collection):
    """Build the initial record for cluster number ``i``.

    The seed centroid is the element of ``collection`` found at position
    ``int(i * len(collection) / n_clusters)``.

    Args:
        i: Zero-based number of the cluster being created.
        n_clusters: Total number of clusters that will be created.
        collection: Indexable, sized sequence of data points.

    Returns:
        A dict with the keys ``'index'`` (``i``), ``'centroid'`` (the seed
        element), ``'centroid_index'`` (1-based position of the seed in
        ``collection``), and two fresh empty lists, ``'elements'`` and
        ``'elements_index'``.
    """
    seed_position = int(i * len(collection) / n_clusters)

    cluster = dict(
        index=i,
        centroid=collection[seed_position],
        # Positions are reported 1-based.
        centroid_index=seed_position + 1,
        elements=[],
        elements_index=[],
    )
    return cluster

def distance_function(a, b):
    """Return the norm of the difference between ``a`` and ``b``.

    The difference ``a - b`` is passed to ``np.linalg.norm`` with its
    default settings, which for 1-D vectors is the Euclidean distance.

    Args:
        a: First point; must support ``a - b`` (e.g. a NumPy array).
        b: Second point, compatible with ``a``.

    Returns:
        The norm of ``a - b`` as computed by ``np.linalg.norm``.
    """
    offset = a - b
    return np.linalg.norm(offset)

def calculate_average(elements):
    """Return the component-wise mean of ``elements``.

    Args:
        elements: Sized sequence of equally shaped points (e.g. a list of
            NumPy arrays).

    Returns:
        The sum of the points along the first axis divided by the number
        of points.
    """
    total = np.sum(elements, axis=0)
    count = len(elements)
    return total / count

def find_closest_cluster(e, clusters, fn):
    """Return the cluster whose centroid is nearest to ``e``.

    Args:
        e: The data point to place.
        clusters: Iterable of cluster dicts; each must have a
            ``'centroid'`` entry.
        fn: Callable invoked as ``fn(centroid, e)`` that returns a
            comparable distance value.

    Returns:
        The cluster dict itself (not a copy) with the smallest distance to
        ``e``. When several clusters are equally close, the one that comes
        first in ``clusters`` is returned.

    Raises:
        IndexError: If ``clusters`` is empty.
    """
    candidates = list(clusters)
    if not candidates:
        # Raise IndexError rather than letting min() raise ValueError, so an
        # empty cluster list reports the same exception type as indexing
        # into an empty list would.
        raise IndexError("find_closest_cluster() needs at least one cluster")

    # A single linear scan is enough; min() keeps the first of equally
    # distant clusters, so no full sort is needed to pick the winner.
    return min(candidates, key=lambda cluster: fn(cluster['centroid'], e))

def kmeans(collection, n_clusters, distance_function):
    """Group ``collection`` into ``n_clusters`` clusters in a single pass.

    Every cluster is first seeded with ``create_cluster``. The elements are
    then visited once, in order: each element joins the cluster chosen by
    ``find_closest_cluster``, and that cluster's centroid is immediately
    replaced by the average of the elements it holds so far. Later
    assignments therefore depend on earlier ones. A cluster that never
    receives an element keeps its seed centroid.

    Args:
        collection: Sequence of data points.
        n_clusters: Number of clusters to create.
        distance_function: Callable ``(centroid, element)`` returning the
            distance used to pick the closest cluster.

    Returns:
        The list of cluster dicts. ``'elements'`` holds the assigned points,
        ``'elements_index'`` their 1-based positions in ``collection``, and
        ``'centroid'`` the latest centroid.
    """
    clusters = []
    for cluster_number in range(n_clusters):
        clusters.append(create_cluster(cluster_number, n_clusters, collection))

    for position, element in enumerate(collection, start=1):
        target = find_closest_cluster(element, clusters, distance_function)
        target['elements'].append(element)
        target['elements_index'].append(position)
        # Update the centroid right away so the next element is compared
        # against the refreshed cluster centre.
        target['centroid'] = calculate_average(target['elements'])

    return clusters


In [43]:
# Sample data: seven 2-D points, each wrapped in a NumPy array so that
# distance_function can subtract them element-wise.
points = [
    np.array(coordinates)
    for coordinates in (
        (1.0, 1.0),
        (1.5, 2.0),
        (3.0, 4.0),
        (5.0, 7.0),
        (3.5, 5.0),
        (4.5, 5.0),
        (3.5, 4.5),
    )
]

# Split the sample into two clusters; the bare name on the last line makes
# the notebook display the result.
clusters = kmeans(points, n_clusters=2, distance_function=distance_function)
clusters

[{'index': 0,
  'centroid': array([1.83333333, 2.33333333]),
  'centroid_index': 1,
  'elements': [array([1., 1.]), array([1.5, 2. ]), array([3., 4.])],
  'elements_index': [1, 2, 3]},
 {'index': 1,
  'centroid': array([4.125, 5.375]),
  'centroid_index': 4,
  'elements': [array([5., 7.]),
   array([3.5, 5. ]),
   array([4.5, 5. ]),
   array([3.5, 4.5])],
  'elements_index': [4, 5, 6, 7]}]