In [1]:
import numpy as np
import scipy as sp
import networkx as nx
from kmeans import kmeans
from spectral_clustering import spectral_clustering, laplacian_matrix, similarity_matrix
from scipy import linalg
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon

If you decide that k-means++ is not sufficient, my recommendation is to try a method called graph spectral clustering, which is implemented here. We did not see enough benefit to continue working with it ourselves. However, you may find a reason to use the method, so the resource is still here for you. The method takes some time to understand, but the basic idea is that it treats each school as a node of a graph whose edge weights are derived from the distances between schools, and uses the spectral properties of that graph (the eigenvectors of its Laplacian matrix) to reduce the dimension of the data. Then an algorithm like k-means++ takes over and finishes the clustering. There is a lot of math (specifically linear algebra) involved in this method; the standard tutorial on the technique is von Luxburg's "A Tutorial on Spectral Clustering": https://arxiv.org/abs/0711.0189

This file takes in the unclustered 2A Baseball data and runs Shi and Malik's graph spectral clustering (GSC) algorithm. I found that it did not offer any worthwhile improvement over k-means, but in case you decide you want to investigate GSC further, this file will help you do that.

The Python code used for the actual clustering algorithm comes from https://ghost-clusters.github.io/icerm-spectral-clustering/

In [2]:
# Read the school coordinates: one school per line, with at least two
# whitespace-separated numeric columns. Only the first two columns are used.
# (Presumably latitude then longitude, going by the file name -- TODO confirm.)
with open("2ABaseballLatLonPython.txt", "r") as text_file:
    # Drop blank lines (e.g. a trailing newline at the end of the file) so they
    # do not crash the float() conversion below. The `with` block guarantees
    # the file is closed even if reading fails.
    lines = [line for line in text_file if line.strip()]

# Named n_schools rather than k: the clustering cell uses k for the number of
# clusters, and sharing the name makes re-running cells out of order hazardous.
n_schools = len(lines)
data = np.zeros((n_schools, 2))
for i, line in enumerate(lines):
    # split() with no argument accepts any run of spaces or tabs, not only the
    # exact two-space separator.
    fields = line.split()
    data[i, 0] = float(fields[0])
    data[i, 1] = float(fields[1])

In [3]:
# Graph spectral clustering of the schools.
# Signature of the imported helper, kept here for reference:
#   spectral_clustering(data, k, lform, with_eigen=False, kmeans_iters=100,
#                       numOfAtts=None, metric=None, **kwargs)
# NOTE(review): "sym" is passed as lform; presumably it selects the symmetric
# normalized Laplacian -- confirm in spectral_clustering.
k = 16
groups = np.array(
    spectral_clustering(data, k, "sym", with_eigen=False, kmeans_iters=1000)
)

# Tally how many entries of `groups` carry each cluster label.
count = np.zeros(k)
for label in groups:
    count[label] += 1
print(count) # gives number of schools per cluster

[ 6. 10.  5.  1.  4.  4.  6.  6.  9.  2. 10.  8.  7.  5.  5. 10.]


In [None]:
# Plotting the clusters on a map of Indiana
indiana = gpd.read_file('indianaShapeFile/tl_2010_18_tabblock10.shp')

# One colour per cluster label; label c is drawn with cluster_colors[c].
cluster_colors = [
    'bisque', 'olivedrab', 'gold', 'lightgreen',
    'darkorange', 'seagreen', 'turquoise', 'maroon',
    'tomato', 'deepskyblue', 'cornflowerblue', 'slateblue',
    'purple', 'blueviolet', 'magenta', 'crimson',
]

# matplotlib 3.6 renamed the bundled "seaborn" style to "seaborn-v0_8" and later
# removed the old name, so use whichever one this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"

with plt.style.context((seaborn_style, "ggplot")):
    # Keep the Axes returned by the map plot so the points are drawn on it
    # explicitly rather than through pyplot's implicit "current axes".
    ax = indiana['geometry'].plot(figsize=(18,10),
               color="white",
               edgecolor = "white")
    # One scatter call per cluster instead of one per school. Column 1 of
    # `data` is the x coordinate and column 0 the y coordinate (presumably
    # longitude and latitude -- TODO confirm against the input file).
    for cluster in np.unique(groups):
        members = groups == cluster
        ax.scatter(data[members, 1], data[members, 0], s=15,
                   # Recycle colours when there are more clusters than colours
                   # so that no cluster is silently left off the map.
                   color=cluster_colors[cluster % len(cluster_colors)],
                   marker="s", label=f"cluster {cluster}")
    ax.legend(title="Cluster", ncol=2, fontsize="small")