In [1]:
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.image as mpimg
'''
IMPORTANT: This code was taken from Tutorial 5 of the Elements of Data Processing workshop.
'''
class KMeans:
    """Perform k-means clustering using Euclidean distance.

    Attributes:
        k: number of clusters.
        means: cluster centroids, one row per cluster. ``None`` until
            ``train`` picks them, unless supplied to the constructor.
        display: when true, ``train`` prints its state after every step.
        assignments: set by ``train``; a list with the cluster index of
            each input row.
    """

    def __init__(self, k, means=None, display=False):
        self.k = k          # number of clusters
        self.means = means   # means of clusters
        self.display = display

    def classify(self, record):
        """Return the index of the cluster whose centroid is closest to ``record``."""
        return min(range(self.k),
                   key=lambda i: np.linalg.norm(record - self.means[i]))

    def _report(self, step, inputs):
        """Print the step number, the data, the assignments and the centroids."""
        print("Step", step)
        print("Data points:", inputs)
        print("Assignments:", self.assignments)
        print("Centroids:", self.means, "\n")

    def train(self, inputs):
        """Fit the centroids to ``inputs`` (one record per row).

        If no initial centroids were supplied, ``k`` input rows are drawn at
        random as the starting centroids. The method then alternates between
        assigning every row to its nearest centroid and moving each centroid
        to the mean of its rows, stopping once the assignments no longer
        change. Results are left in ``self.means`` and ``self.assignments``.
        """
        # `is None` rather than `== None`: comparing an ndarray with `==`
        # is elementwise and makes the `if` raise ValueError.
        if self.means is None:
            n_points = np.shape(inputs)[0]
            # Draw distinct rows so two clusters never start on the same
            # point; fall back to replacement only if k exceeds the data size.
            chosen = np.random.choice(n_points, self.k,
                                      replace=self.k > n_points)
            self.means = np.asarray(inputs)[chosen]

        # Work on a private float copy: the caller's initial centroids are not
        # mutated, and means of integer data are not truncated on assignment.
        self.means = np.array(self.means, dtype=float)

        self.assignments = None

        step = 0
        if self.display:
            self._report(step, inputs)

        while True:
            # Find new assignments
            new_assignments = list(map(self.classify, inputs))

            # If no assignments have changed, we're done.
            if self.assignments == new_assignments:
                if self.display:
                    self._report(step + 1, inputs)
                return

            # Otherwise keep the new assignments,
            self.assignments = new_assignments

            # and move every centroid to the mean of its assigned points.
            for i in range(self.k):
                i_points = [p for p, a in zip(inputs, self.assignments) if a == i]
                # an empty cluster keeps its previous centroid
                # (np.mean of an empty list would yield NaN)
                if i_points:
                    self.means[i] = np.mean(i_points, axis=0)

            step += 1
            if self.display:
                self._report(step, inputs)

In [2]:
# Seed NumPy's global RNG so the random centroid initialisation in
# KMeans.train, and therefore the clustering and the figure, is the same
# on every Restart & Run All.
np.random.seed(0)

# NOTE(review): genfromtxt turns cells it cannot parse (e.g. a header row)
# into NaN; confirm merge_output.csv is purely numeric.
input2d = np.genfromtxt('merge_output.csv', delimiter=',')

# One colour per cluster; the cluster count is derived from this palette so
# the two cannot drift apart and cause an IndexError when colouring points.
colors = ['red', 'green', 'blue']

clusterer = KMeans(len(colors))
clusterer.train(input2d)
centroids = np.array(clusterer.means)
assignments = list(map(clusterer.classify, input2d))

# generate string color list from cluster assignments
c_labels = [colors[grp] for grp in assignments]

# Scatter the records coloured by cluster, with the centroids as yellow stars.
fig = plt.figure(1)
ax = fig.add_subplot(1, 1, 1)
ax.scatter(input2d[:, 0], input2d[:, 1], c=c_labels, s=55)
ax.scatter(centroids[:, 0], centroids[:, 1], marker="*", s=200, color='yellow')
ax.set_xlabel("Offence rate", color=(.847, .333, 0), fontsize=25)
ax.set_ylabel("Number of Gvt schools", color=(.847, .333, 0), fontsize=25)
ax.grid(True)
plt.show()