# K-Means from scratch

In this notebook, we will implement K-Means clustering from scratch.  
We will go through the algorithm step by step before combining the steps into a single function.

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial import Voronoi, voronoi_plot_2d
from scipy.spatial.distance import euclidean
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

%matplotlib inline

In [None]:
# Generate a synthetic 2-D dataset made of five blobs,
# then plot it coloured by the true blob label of each sample
X, y = make_blobs(
    n_samples=150,
    n_features=2,
    centers=5,
    cluster_std=1.5,
    shuffle=True,
    random_state=42,
)

plt.scatter(X[:, 0], X[:, 1], c=y);

## 1. Choose the number of clusters and place the centroids at random locations

In [None]:
# Draw the starting centroids: n_centers random 2-D points, each coordinate in [0, 5)
n_centers = 3
np.random.seed(42)
centroids = 5 * np.random.rand(n_centers, 2)

In [None]:
# Show the initial centroids together with the Voronoi cells they span
vor = Voronoi(centroids)
_ = voronoi_plot_2d(
    vor,
    show_points=True,
    show_vertices=False,
    line_width=2,
    point_size=20,
)

## 2. Classify the given points by calculating the Euclidean distance between each data point and each centroid

In [None]:
def classify_points(centers, data):
    '''Classifies points from data according to which center they're closest to.

    Parameters
    ----------
    centers : sequence of points
        Current centroid coordinates, one entry per cluster.
    data : iterable of points
        The points to assign to a cluster.

    Returns
    -------
    dict
        Maps a class name (a hex color string) to a numpy array of the points
        assigned to that class. A center that is not the nearest one for any
        point gets no entry in the dictionary.
    '''
    # Create an empty dictionary to hold the classified points
    classes = defaultdict(list)

    # Create list of colors to use as the class names.
    # A named seaborn palette holds only 6 colors by default, so ask for at least
    # one color per center; otherwise the lookup below raises an IndexError as soon
    # as there are more than 6 centers. For up to 6 centers the names are unchanged.
    class_list = sns.color_palette('husl', n_colors=max(6, len(centers))).as_hex()

    # Loop through each point in the dataset, calculate the euclidean distance to each centroid
    # Result is a dists list with as many different distances as we have clusters
    for pt in data:
        dists = [euclidean(pt, center) for center in centers]
        # Based on the dists list, we build as many lists as we have clusters
        # Each list 'represents' one centroid/cluster
        # Each point is appended to one of the lists depending on its smallest distance
        # (on a tie, the center with the lowest index wins)
        classes[class_list[dists.index(min(dists))]].append(pt)

    # Concat the list of points in each class together to a np array.
    return {cl: np.array(pts) for cl, pts in classes.items()}

In [None]:
# Assign every point of X to the nearest of the initial centroids.
# The result is a dictionary with one entry per class that received points;
# each value is an array holding the (x, y) coordinates of the points in that class.
data_classified = classify_points(centers=centroids, data=X)
data_classified

## 3. Recalculate each cluster center as the mean of the data points assigned to it

In [None]:
def find_new_centers(data_classified):
    '''Return one new center per class: the mean of the points assigned to that class.

    `data_classified` maps class names to arrays of points; the centers are
    returned as a numpy array, in the iteration order of the dictionary.
    '''
    return np.array([cluster.mean(axis=0) for cluster in data_classified.values()])

In [None]:
# Compute the updated centroids: one mean point for every class in the classified data
find_new_centers(data_classified=data_classified)

**For a good clustering result, Steps 2 and 3 need to be repeated until no further changes occur.**

In the following block, you will find an overview of our defined functions as well as two plotting functions.

In [None]:
# Define the functions for each step of the k-means algorithm
# Use a voronoi plot to show which points belong to which centroids 
# Plot the original dataset on the same plot to show the convergence of the algorithm
# Algorithm source : https://blog.jsalv.com/pythonin-voronoi-and-k-means/

# Classify given points by calculating euclidean distance between data points and centroids

def classify_points(centers, data):
    '''Classifies points from data according to which center they're closest to.

    Parameters
    ----------
    centers : sequence of points
        Current centroid coordinates, one entry per cluster.
    data : iterable of points
        The points to assign to a cluster.

    Returns
    -------
    dict
        Maps a class name (a hex color string) to a numpy array of the points
        assigned to that class. Centers without any assigned point are absent.
    '''
    # Colors serve as the class names. A named seaborn palette defaults to 6 colors,
    # so request at least one per center: indexing would otherwise raise an IndexError
    # with more than 6 centers. With 6 or fewer centers the names stay the same.
    n_colors = max(6, len(centers))
    class_list = sns.color_palette('husl', n_colors=n_colors).as_hex()

    # Collect the points of each class in a list keyed by the class name
    classes = defaultdict(list)

    # Loop through each point in the dataset, calculate the euclidean distance to each centroid
    for pt in data:
        dists = [euclidean(pt, center) for center in centers]
        # The nearest center decides the class (the first one wins on a tie)
        nearest = dists.index(min(dists))
        classes[class_list[nearest]].append(pt)

    # Concat the list of points in each class together to a np array.
    return {cl: np.array(pts) for cl, pts in classes.items()}

# Recalculate the cluster centers as a mean of data points assigned to it

def find_new_centers(data_classified):
    '''Compute the new cluster centers from already classified points.

    Takes a dictionary mapping class names to arrays of points and returns a
    numpy array with the mean point of each class, in the dictionary's order.
    '''
    cluster_means = [points.mean(axis=0) for points in data_classified.values()]
    return np.array(cluster_means)

# Plot centroids and data

def plot_voronoi(iteration, centroids, ax):
    '''Draw the Voronoi diagram of the centroids onto the subplot `ax[iteration]`.'''
    # Build the diagram first, then draw its dividing lines and the centroids themselves
    diagram = Voronoi(centroids)
    target_ax = ax[iteration]
    voronoi_plot_2d(
        diagram,
        target_ax,
        show_points=True,
        show_vertices=False,
        line_width=2,
        point_size=20,
    )
    
def plot_scatter(iteration, data, ax, labels=None, xlim=(-14, 10), ylim=(-10, 14)):
    '''Plots the original data onto the current ax.

    Parameters
    ----------
    iteration : int
        Index of the subplot in `ax` to draw on; also shown in the title.
    data : array
        Points to plot; column 0 is used as x and column 1 as y.
    ax : sequence of axes
        The subplots of the figure, indexed by iteration.
    labels : sequence, optional
        One color value per point. When omitted, the notebook-level variable `y`
        is used, which is only valid when `data` is the dataset `y` belongs to.
    xlim, ylim : tuple of two numbers, optional
        Axis limits of the subplot.
    '''
    if labels is None:
        # Backward-compatible fallback to the labels defined at notebook level
        labels = y

    panel = ax[iteration]
    panel.scatter(data[:, 0], data[:, 1], c=labels, s=15, alpha=0.7)
    panel.set_title(f"Iteration {iteration}", fontsize=20)
    # Identical limits on every subplot keep the iterations visually comparable
    panel.set_xlim(*xlim)
    panel.set_ylim(*ylim)

# Defining the main function

Let's pack everything into one single function, which clusters our data based on a specified number of clusters and a specified number of iterations.  

In [None]:
def visualise_k_means(data, n_centers=3, n_iter=15, seed=42):
    '''Loop through the steps of the algorithm, stopping to make a plot each time to plot the results.

    Parameters
    ----------
    data : array
        The 2-D points to cluster.
    n_centers : int
        Number of initial centroids.
    n_iter : int
        Number of iterations to run; one subplot is drawn per iteration.
    seed : int
        Seed for numpy's global random generator, used to place the initial centroids.

    Notes
    -----
    A centroid that no point is assigned to has no entry in the classified data,
    so it drops out when the centroids are updated: the number of centroids can
    shrink from one iteration to the next.
    '''
    # Create the initial centroids: random points with both coordinates in [0, 5)
    np.random.seed(seed)
    centroids = (np.random.rand(n_centers, 2))*5

    # Set up the matplotlib figure with one subplot per iteration.
    # squeeze=False always yields a 2-D array of axes, so indexing by iteration
    # also works for n_iter=1 (where a single bare Axes would be returned otherwise)
    _, axes = plt.subplots(ncols=n_iter, figsize=(20, 5), squeeze=False)
    ax = axes[0]

    # run through iterations of k_means
    for i in range(n_iter):
        # Assign points to centroids
        data_classified = classify_points(centers=centroids, data=data)

        # Plot the Voronoi diagram of the centroids
        plot_voronoi(i, centroids, ax)

        # Plot the original data on the same ax
        plot_scatter(i, data, ax)

        # Update the centroids based on latest classified points
        centroids = find_new_centers(data_classified)

In [None]:
# Cluster X with five centroids and show the first seven iterations side by side
visualise_k_means(
    data=X,
    n_centers=5,
    n_iter=7,
    seed=42,
)