# Visualizing clusters
## Lecture objectives
1. Explore how to visualize and interpret clusters
2. Demonstrate radar plots
3. Provide more practice with mapping

Let's begin by recreating the clusters from the previous lecture.

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans

# Load the precinct-level vote counts, keyed by precinct id (srprec)
df = pd.read_csv('../data/c037_g20_sov_data_by_g20_srprec.csv').set_index('srprec')

# Biden's share of the two-party presidential vote, in percent
dem_votes = df['PRSDEM01']
rep_votes = df['PRSREP01']
df['Biden_pc'] = dem_votes / (dem_votes + rep_votes) * 100

# Percent voting yes on each proposition. The proposition number is taken
# from characters 3-4 of the 'PR_nn_Y' column names.
props = [name[3:5] for name in df.columns
         if name.startswith('PR_') and name.endswith('Y')]
for prop in props:
    yes_votes = df['PR_'+prop+'_Y']
    no_votes = df['PR_'+prop+'_N']
    df[prop+'_pc_yes'] = yes_votes / (yes_votes + no_votes) * 100

# For props 20 and 22, keep the percent voting no instead of percent yes.
# pop() removes the yes column and hands it back in a single step.
for prop in ['20','22']:
    df[prop+'_pc_no'] = 100 - df.pop(prop+'_pc_yes')

# Choose the columns to cluster on: every percentage column except three
cols_to_plot = [name for name in df.columns if '_pc' in name]
for unwanted in ('14_pc_yes', '23_pc_yes', '24_pc_yes'):
    cols_to_plot.remove(unwanted)

# Standardize the chosen columns, then discard precincts with missing values
features = df[cols_to_plot]
scaler = preprocessing.StandardScaler().fit(features)
df_scaled = pd.DataFrame(scaler.transform(features),
                         index=df.index, columns=cols_to_plot).dropna()

# Fit k-means with 5 clusters and record each precinct's cluster
kmeans = KMeans(n_clusters=5, random_state=1)
kmeans.fit(df_scaled)
df_scaled['cluster_id'] = kmeans.labels_

# Show the cluster sizes to verify that we got the same result as before
df_scaled.groupby('cluster_id').size()

How can we best visualize what the clusters mean? If we had just two columns, a scatterplot with a color code for each cluster would work well. But we have 10 dimensions (10 columns that are used to cluster).

One way is to redo our original scatter plot matrix, but with each cluster indicated.

In [None]:
import seaborn as sns
# Scatter plot matrix of the columns of df_scaled, with each point colored
# by its cluster_id.
# NOTE(review): sns.pairplot returns a seaborn grid object rather than a
# single matplotlib Axes, so the name `ax` is a little misleading.
ax = sns.pairplot(df_scaled, hue='cluster_id', )

My preferred option, however, is a radar chart. Neither `seaborn` nor `matplotlib` do this natively, but [there is an example in the `matplotlib` gallery](https://matplotlib.org/stable/gallery/specialty_plots/radar_chart.html). I've just copied and pasted that code.

In [None]:
# code from https://matplotlib.org/stable/gallery/specialty_plots/radar_chart.html
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections.polar import PolarAxes
from matplotlib.projections import register_projection
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D

def radar_factory(num_vars, frame='circle'):
    """
    Create a radar chart with `num_vars` axes.

    This function creates a RadarAxes projection and registers it under the
    name 'radar', so that axes can then be created with
    ``projection='radar'``.

    Parameters
    ----------
    num_vars : int
        Number of variables for radar chart.
    frame : {'circle', 'polygon'}
        Shape of frame surrounding axes.

    Returns
    -------
    theta : numpy.ndarray
        The `num_vars` evenly-spaced axis angles in radians, starting at 0
        and excluding 2*pi.

    Raises
    ------
    ValueError
        Raised from `_gen_axes_patch` / `_gen_axes_spines` (i.e. when the
        axes are built, not by this call itself) if `frame` is neither
        'circle' nor 'polygon'.

    """
    # calculate evenly-spaced axis angles
    theta = np.linspace(0, 2*np.pi, num_vars, endpoint=False)

    class RadarAxes(PolarAxes):

        name = 'radar'
        # use 1 line segment to connect specified points
        RESOLUTION = 1

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # rotate plot such that the first axis is at the top
            self.set_theta_zero_location('N')

        def fill(self, *args, closed=True, **kwargs):
            """Override fill so that line is closed by default"""
            return super().fill(closed=closed, *args, **kwargs)

        def plot(self, *args, **kwargs):
            """Override plot so that line is closed by default"""
            lines = super().plot(*args, **kwargs)
            for line in lines:
                self._close_line(line)
            # Hand the lines back, as the parent plot() does, so callers can
            # still style them or build a legend from the handles.
            return lines

        def _close_line(self, line):
            """Append the first point to the end of `line` if it is open."""
            x, y = line.get_data()
            # Nothing to close for an empty line (and x[0] would fail).
            if len(x) == 0:
                return
            # FIXME: markers at x[0], y[0] get doubled-up
            if x[0] != x[-1]:
                x = np.append(x, x[0])
                y = np.append(y, y[0])
                line.set_data(x, y)

        def set_varlabels(self, labels):
            """Label each of the `num_vars` spokes with the given labels."""
            self.set_thetagrids(np.degrees(theta), labels)

        def _gen_axes_patch(self):
            # The Axes patch must be centered at (0.5, 0.5) and of radius 0.5
            # in axes coordinates.
            if frame == 'circle':
                return Circle((0.5, 0.5), 0.5)
            elif frame == 'polygon':
                return RegularPolygon((0.5, 0.5), num_vars,
                                      radius=.5, edgecolor="k")
            else:
                raise ValueError("Unknown value for 'frame': %s" % frame)

        def _gen_axes_spines(self):
            if frame == 'circle':
                return super()._gen_axes_spines()
            elif frame == 'polygon':
                # spine_type must be 'left'/'right'/'top'/'bottom'/'circle'.
                spine = Spine(axes=self,
                              spine_type='circle',
                              path=Path.unit_regular_polygon(num_vars))
                # unit_regular_polygon gives a polygon of radius 1 centered at
                # (0, 0) but we want a polygon of radius 0.5 centered at (0.5,
                # 0.5) in axes coordinates.
                spine.set_transform(Affine2D().scale(.5).translate(.5, .5)
                                    + self.transAxes)
                return {'polar': spine}
            else:
                raise ValueError("Unknown value for 'frame': %s" % frame)

    register_projection(RadarAxes)
    return theta

I then adapted the example from the website, putting it in a function called `radar_plot` that takes two arguments:
* the `kmeans` object
* the dataframe with the input data

In [None]:
def radar_plot(kmeans, df_scaled):
    """
    Draw a radar chart of the cluster centers of a fitted k-means model.

    Each cluster is drawn as one closed line, with one spoke per variable
    that was used for clustering.

    Parameters
    ----------
    kmeans : fitted KMeans object
        Provides `cluster_centers_` and `n_clusters`.
    df_scaled : DataFrame
        The data used to fit `kmeans`. Only its column names are used, as
        the spoke labels; a `cluster_id` column, if present, is ignored.

    """
    centers = kmeans.cluster_centers_
    n_vars = centers.shape[1]  # one spoke per clustering variable

    # registers the 'radar' projection and gives us the spoke angles
    theta = radar_factory(n_vars, frame='polygon')

    fig, ax = plt.subplots(subplot_kw={'projection': 'radar'},
                           figsize=(4, 4))
    fig.subplots_adjust(wspace=0.25, hspace=0.20, top=0.85, bottom=0.05)

    # transpose so that each column (= one cluster) becomes one line
    ax.plot(theta, centers.T)
    spoke_labels = [name for name in df_scaled.columns
                    if name != 'cluster_id']
    ax.set_varlabels(spoke_labels)

    # legend sits just outside the top-right corner of the axes
    labels = list(map('Cluster {}'.format, range(kmeans.n_clusters)))
    ax.legend(labels, loc=(0.95, .95), labelspacing=0.1, fontsize=7)

Let's call this function with our data.

In [None]:
# Draw the radar chart for the current k-means fit
radar_plot(kmeans, df_scaled)

<div class="alert alert-block alert-info">
    <strong>Exercise:</strong> Add the size of each cluster to the legend. <em>Hint</em>: Look at the second to last line that defines the labels. And remember that you can group by <strong>cluster_id</strong> to get the cluster sizes.
</div>

In [None]:
# Cluster sizes, indexed by cluster_id (the same groupby as before)
csizes = df_scaled.groupby('cluster_id').size()

# Build one legend label per cluster, with its size in parentheses.
# In practice you'd change the `labels` line in the function above, not here.
k = kmeans.n_clusters
label_template = 'Cluster {} (N={})'
labels = [label_template.format(cid, csizes.loc[cid]) for cid in range(k)]
labels

### Exploring different numbers of clusters
Here, the interesting finding is that all the clusters form concentric circles. There isn't a cluster of precincts that (say) votes against rent control but is progressive on the other items on the ballot.

We can certainly find these clusters if we increase `k`, but then these "weird" clusters have few precincts.

For example, let's try with `k=10`.

In [None]:
# Remove the old cluster assignments first; otherwise cluster_id would be
# included as an input column in the new estimates
del df_scaled['cluster_id']

# Same steps as before, but with 10 clusters
kmeans = KMeans(n_clusters=10, random_state=1)
kmeans.fit(df_scaled)
df_scaled['cluster_id'] = kmeans.labels_

cluster_sizes = df_scaled.groupby('cluster_id').size()
print(cluster_sizes)
radar_plot(kmeans, df_scaled)

Let's go back to our original 5 clusters.

In [None]:
# Discard the k=10 assignments, then re-fit and re-plot with 5 clusters
del df_scaled['cluster_id']
kmeans = KMeans(n_clusters=5, random_state=1)
kmeans.fit(df_scaled)
df_scaled['cluster_id'] = kmeans.labels_
radar_plot(kmeans, df_scaled)

### Mapping the clusters
The Statewide Database team provides geographic boundary files as well as the vote counts. The shapefile for Los Angeles County is in your GitHub repository.

In [None]:
import geopandas as gpd

# Read the precinct boundary shapefile into a GeoDataFrame and preview
# the first few rows
gdf = gpd.read_file('../data/srprec_037_g20_v01_shp/srprec_037_g20_v01.shp')
gdf.head()

Note that there is no projection file, so geopandas doesn't know the coordinate system.

In [None]:
# Show the coordinate reference system currently attached to gdf
print(gdf.crs)

The documentation online says it's in lat/lon, so let's set it to EPSG 4326.

In [None]:
# Declare (not reproject) the coordinate system: the file ships without a
# projection file, and the online documentation says it is in lat/lon.
# set_crs() is the documented way to attach a CRS to data that has none.
gdf = gdf.set_crs('EPSG:4326')

Before we do a join, let's look at the data to figure out the number of rows and the join column, and whether `srprec` is a unique identifier.

In [None]:
# Preview both tables side by side to find the join column.
# looks like we can join on srprec (the index of df_scaled),
# but we'll need to set that as the index for gdf
# display() is used so that both previews are rendered, not just the
# last expression in the cell.
from IPython.display import display
display(df_scaled.head())
display(gdf.head())

In [None]:
# Compare the number of rows in the spatial data and in the cluster data.
# we have more observations in our spatial data, so we can do a left join
# from the spatial data to the clusters
# maybe some precincts have no voters?
print(len(gdf))
print(len(df_scaled))

In [None]:
# Check that the join key has no duplicates on either side.
# both are unique, which makes things easier
print(df_scaled.index.is_unique)
print(gdf.SRPREC.is_unique)

In [None]:
# Index the boundaries by precinct id so the two tables share a key, then
# left-join the cluster data onto them. Precincts without cluster data are
# kept, with missing values in the joined columns.
gdf = gdf.set_index('SRPREC')
joinedGdf = gdf.join(df_scaled, how='left')
joinedGdf.head()

Let's map the clusters. We should color code by `cluster_id`.

In [None]:
import contextily as ctx
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(5,5))

# Reproject to EPSG:3857 before plotting, then color precincts by cluster
web_mercator_gdf = joinedGdf.to_crs('EPSG:3857')
web_mercator_gdf.plot(column='cluster_id', ax=ax, legend=True, alpha=0.4)
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, zoom=12)

# drop Catalina Island by cropping the y-axis
ax.set_ylim(3.98e6, 4.14e6)

# and we really don't need the axis ticks and labels, so we set them to an empty list
ax.set(xticks=[], yticks=[])

ax.set_title('Typology of voting, 2020 General Election', fontsize=10)

What can we do to improve the map?

The `source` keyword gives access to lots of options. Take a look at the possibilities with `ctx.providers`.

In [None]:
# Evaluate ctx.providers so the notebook displays the available basemap sources
ctx.providers

<div class="alert alert-block alert-info">
<strong>Exercise:</strong> How else would you improve the map?
</div>

There's no right answer here, but I first replace the missing data with an explicit "no data" label. To do that, we need to change the data type of `cluster_id` to string.

We can also remove the decimal point from the other cluster labels using the `str.replace()` function. We replace `.0` with an empty string.

In [None]:
# Convert the cluster ids to strings so that missing values can be shown
# as an explicit category ('0.0', '1.0', ..., 'nan' at this point)
joinedGdf.cluster_id = joinedGdf.cluster_id.astype(str)

# regex=False makes both replacements literal. Without it, older pandas
# versions treat the pattern as a regular expression, where '.' matches any
# character, so a label such as '10.0' would be wiped out entirely.
joinedGdf.cluster_id = joinedGdf.cluster_id.str.replace('.0', '', regex=False)
joinedGdf.cluster_id = joinedGdf.cluster_id.str.replace('nan', 'No data', regex=False)

joinedGdf.cluster_id.head()

In the plot itself, we might:
* replace the colorbar with a legend. This is because we have discrete categories (clusters 0–4, plus "No data"), not a continuous variable. That is done with the `categorical=True` keyword argument.
* add a legend title. We get the legend and then use the `set_title()` function.
* specify the colors. I find https://colorbrewer2.org the most helpful. 
* specify a gray for missing data. A grayscale color is given as a string containing a number between 0 and 1: `'0'` is black and `'1'` is white, with values in between representing progressively lighter shades.

In [None]:
# getting the colors into a colormap required some searching
# https://stackoverflow.com/questions/38882233/geopandas-matplotlib-plot-custom-colors
from matplotlib.colors import LinearSegmentedColormap

# one (position, color) stop per category; the last stop is the gray
# used for "No data"
color_stops = [(0, '#7fc97f'), (0.2, '#beaed4'), (0.4, '#fdc086'),
               (0.6, '#ffff99'), (0.8, '#386cb0'), (1.0, '0.5')]
cmap = LinearSegmentedColormap.from_list('mycmap', color_stops)

fig, ax = plt.subplots(figsize=(5,5))
web_mercator_gdf = joinedGdf.to_crs('EPSG:3857')
web_mercator_gdf.plot(column='cluster_id', categorical=True, cmap=cmap,
                      alpha=0.4, legend=True,
                      legend_kwds={'loc': 'upper left'}, ax=ax)

# add a legend title
legend = ax.get_legend()
legend.set_title("Cluster", prop={'size':10})

# basemap, title, cropping and tick removal are the same as in the first map
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, zoom=12)
ax.set_title('Typology of voting, 2020 General Election', fontsize=10)
ax.set_ylim([3.98e6, 4.14e6])
ax.set_xticks([])
ax.set_yticks([])

<div class="alert alert-block alert-info">
<h3>Key Takeaways</h3>
<ul>
  <li>Even if the clusters are pretty self-explanatory, they can be useful</li>
  <li>Radar plots and maps are two useful visualizations that help you interpret your clusters.</li>
  <li>They can be a starting point for further quantitative research—perhaps, use them as a variable in a regression model</li>
  <li>They can also be useful for qualitative research. Perhaps you might do a case study of each cluster, picking the precinct/city/agency that is closest to each cluster center</li>
</ul>
</div>