In [None]:
# Import the requisite packages
import os

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from census import Census
from us import states

In [None]:
# Set API key
# The key is read from the CENSUS_API_KEY environment variable so that the secret
# does not have to be written into (and shared with) the notebook. The original
# placeholder string is kept as the fallback when the variable is not set.
c = Census(os.environ.get("CENSUS_API_KEY", "CENSUS API KEY HERE"))

In [None]:
# ACS variable codes describing the economic vitality of an area, grouped by theme.
# The descriptions are the notebook author's labels.
# NOTE(review): B25008_002E / B25008_003E are labelled as unit counts, but the
# B25008 table appears to count population by tenure - confirm against the ACS
# variable list before interpreting these two columns.
geo_demo = [
    # --- Population and housing ---
    "B01003_001E",  # Total population
    "B25077_001E",  # Median value of owner occupied units
    "B25026_001E",  # Total population in occupied housing units
    "B25008_002E",  # Total number of owner occupied units
    "B25008_003E",  # Total number of renter occupied units

    # --- Educational attainment ---
    "B06009_002E",  # Population with less than a high school diploma
    "B06009_003E",  # Population with high school diploma or equivalent
    "B06009_004E",  # Population with some college/associates degree
    "B06009_005E",  # Population with bachelors degree
    "B06009_006E",  # Population with a graduate degree

    # --- Age ---
    "B01002_001E",  # Median age

    # --- Income bands ---
    "B06010_004E",  # Population with income less than 9999
    "B06010_005E",  # Population with income between 10000 and 14999
    "B06010_006E",  # Population with income between 15000 and 24999
    "B06010_007E",  # Population with income between 25000 and 34999
    "B06010_008E",  # Population with income between 35000 and 49999
    "B06010_009E",  # Population with income between 50000 and 64999
    "B06010_010E",  # Population with income between 65000 and 74999
    "B06010_011E",  # Population with income of 75000 or more

    # --- Employment, retirement, commuting and poverty ---
    "B28007_009E",  # Population in labor force and unemployed
    "B19059_002E",  # Population that is retired with retirement income
    "B19059_003E",  # Retired without retirement income
    "B08013_001E",  # Travel time to work in minutes
    "B17013_002E",  # Population with income below poverty level in past 12 months
]

In [None]:
# Sources: https://api.census.gov/data/2019/acs/acs5/variables.html; https://pypi.org/project/census/
# Request, for every census tract in every county of New York, the tract NAME plus
# each ACS variable listed in geo_demo (2019 ACS 5-year endpoint).
# The field tuple is built from geo_demo so the variable list lives in one place;
# it contains exactly the same codes that were previously typed out here by hand
# (only their order within the request differs).
ny_census = c.acs5.state_county_tract(
    fields=("NAME", *geo_demo),
    state_fips=states.NY.fips,
    county_fips="*",
    tract="*",
    year=2019,
)

In [None]:
# Create a dataframe from the census data
# (ny_census is the result of the state_county_tract request made above)
ny_df = pd.DataFrame(ny_census)

# Show the first two rows of the dataframe and print its (rows, columns) shape
print(ny_df.head(2))
print('Shape of NY DataFrame:', ny_df.shape)

In [None]:
# Read the 2019 TIGER/Line shapefile of NY census tracts and reproject it, in one
# step, to the New York State Plane Long Island Zone
# (EPSG:2263 - https://spatialreference.org/ref/epsg/2263/).
tract_url = "https://www2.census.gov/geo/tiger/TIGER2019/TRACT/tl_2019_36_tract.zip"
ny_tract = gpd.read_file(tract_url).to_crs(epsg=2263)

# Preview the tract GeoDataFrame and report its shape
print(ny_tract.head(2))
print('NY Tract Shape: ', ny_tract.shape)

# Confirm the coordinate reference system after reprojection
print("\nThe shapefile projection for this data is: {}".format(ny_tract.crs))

In [None]:
# Build the GEOID key by concatenating the state, county and tract values of
# ny_df, then discard those three component columns because they are no longer
# needed once GEOID exists.
geoid_parts = ["state", "county", "tract"]
ny_df = (
    ny_df
    .assign(GEOID=ny_df["state"] + ny_df["county"] + ny_df["tract"])
    .drop(columns=geoid_parts)
)

# Display the updated dataframe
ny_df.head(2)

In [None]:
# Join the census attributes onto the tract geometries using the shared GEOID
# column, which geo-enables the census data.
# NOTE(review): no `how` argument is passed, so merge()'s default join type
# applies (an inner join per the pandas documentation), meaning rows without a
# matching GEOID on the other side are dropped - confirm this is intended.
ny_merge = ny_tract.merge(ny_df, on = "GEOID")

# Display the results
ny_merge.head(2)

## 2. Cleaning the Data

In [None]:
# Readable column names for the ACS variable codes (code -> short name).
# The trailing comments carry the notebook's description of each variable.
acs_column_names = {
    "B01003_001E": "TotPop",             # Total population
    "B25077_001E": "MedVal_OwnOccUnit",  # Median value of owner occupied units
    "B25026_001E": "TotPopOccUnits",     # Total population in occupied housing units
    "B25008_002E": "TotNumOwnOccUnit",   # Total number of owner occupied units
    "B25008_003E": "TotNumRentOccUnit",  # Total number of renter occupied units
    "B06009_002E": "PopLTHSDip",         # Population with less than a high school diploma
    "B06009_003E": "PopHSDip",           # Population with high school diploma or equivalent
    "B06009_004E": "PopAssoc",           # Population with some college/associates degree
    "B06009_005E": "PopBA",              # Population with bachelors degree
    "B06009_006E": "PopGrad",            # Population with a graduate degree
    "B01002_001E": "MedAge",             # Median age
    "B06010_004E": "PopIncLT10",         # Population with income less than 9999
    "B06010_005E": "PopInc1015",         # Population with income between 10000 and 14999
    "B06010_006E": "PopInc1525",         # Population with income between 15000 and 24999
    "B06010_007E": "PopInc2535",         # Population with income between 25000 and 34999
    "B06010_008E": "PopInc3550",         # Population with income between 35000 and 49999
    "B06010_009E": "PopInc5065",         # Population with income between 50000 and 64999
    "B06010_010E": "PopInc6575",         # Population with income between 65000 and 74999
    "B06010_011E": "PopIncGT75",         # Population with income of 75000 or more
    "B28007_009E": "UnempPop",           # Population in labor force and unemployed
    "B19059_002E": "RetPop",             # Population that is retired with retirement income
    "B19059_003E": "RetPopNoRetInc",     # Retired without retirement income
    "B08013_001E": "TrvTimWrk",          # Travel time to work in minutes
    "B17013_002E": "PopBlwPovLvl",       # Population with income below poverty level in past 12 months
}

# Apply the mapping to the merged dataset
ny_merge = ny_merge.rename(columns=acs_column_names)

In [None]:
# Renamed variables used for mapping, scaling and clustering, grouped by theme.
# The descriptions are the notebook author's labels.
geo_demo_rn = [
    # --- Population and housing ---
    "TotPop",             # Total population
    "TotPopOccUnits",     # Total population in occupied housing units
    "TotNumOwnOccUnit",   # Total number of owner occupied units
    "TotNumRentOccUnit",  # Total number of renter occupied units

    # --- Educational attainment ---
    "PopLTHSDip",         # Population with less than a high school diploma
    "PopHSDip",           # Population with high school diploma or equivalent
    "PopAssoc",           # Population with some college/associates degree
    "PopBA",              # Population with bachelors degree
    "PopGrad",            # Population with a graduate degree

    # --- Income bands ---
    "PopIncLT10",         # Population with income less than 9999
    "PopInc1015",         # Population with income between 10000 and 14999
    "PopInc1525",         # Population with income between 15000 and 24999
    "PopInc2535",         # Population with income between 25000 and 34999
    "PopInc3550",         # Population with income between 35000 and 49999
    "PopInc5065",         # Population with income between 50000 and 64999
    "PopInc6575",         # Population with income between 65000 and 74999
    "PopIncGT75",         # Population with income of 75000 or more

    # --- Employment, retirement and poverty ---
    "UnempPop",           # Population in labor force and unemployed
    "RetPop",             # Population that is retired with retirement income
    "RetPopNoRetInc",     # Retired without retirement income
    "PopBlwPovLvl",       # Population with income below poverty level in past 12 months
]

# Cleaning up the dataframe: keep only the analysis variables plus the geometry.
# The column selection is built as a new list, so geo_demo_rn itself is never
# modified. The earlier append/remove pair would have left 'geometry' inside
# geo_demo_rn if the selection raised between the two calls.
ny_merge_2 = ny_merge[geo_demo_rn + ['geometry']]

In [None]:
# Dropping any areas without population, then renumbering the rows 0..n-1 to
# assist in index based operations later on.
# drop=True discards the old index instead of inserting it as a new 'index'
# column. Without it the extra column is picked up by whole-frame operations
# (for example ny_merge_2.corr() and summed aggregations), and re-running the
# cell keeps adding index columns until it fails.
ny_merge_2 = ny_merge_2[ny_merge_2['TotPop'] > 0].reset_index(drop=True)

## 3. Exploratory Data Analysis

In [None]:
# Map every extracted variable as a quantile choropleth, one subplot per variable.
# The number of rows is derived from the number of variables (21 variables give
# the original 7 x 3 grid), so editing geo_demo_rn can no longer index past the
# end of the axes array.
n_cols = 3
n_rows = max(1, -(-len(geo_demo_rn) // n_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(75,75), layout='tight')

axes = axes.flatten()

# Setting the font size (a global rcParams change: it also applies to figures
# created after this cell)
plt.rcParams['font.size'] = '40'

# Plotting each of the extracted variables in its own subplot
for ax, col in zip(axes, geo_demo_rn):
    ny_merge_2.plot(column=col,
                    ax=ax, scheme="quantiles", linewidth=0, cmap="coolwarm",
                    legend=True,
                    legend_kwds={'loc': 'center left','bbox_to_anchor':(1,0.5),'fmt': "{:.0f}"}
                    )
    ax.set_axis_off()
    ax.set_title(col)

# Hide grid cells left over when the variable count is not a multiple of n_cols
for spare_ax in axes[len(geo_demo_rn):]:
    spare_ax.set_axis_off()

plt.subplots_adjust(wspace=None, hspace=None)
plt.show()

In [None]:
# Importing packages required for testing spatial autocorrelation
from libpysal.weights import Queen, KNN
from esda.moran import Moran
import numpy as np

# Testing for spatial autocorrelation using Moran's I. First, we need to set up
# the spatial weights matrix: Queen weights are built from the tract geometries
# in ny_merge_2. The same `w` is reused further down, both for the Moran's I
# calculations and (via w.sparse) as a clustering connectivity constraint.
w = Queen.from_dataframe(ny_merge_2)

In [None]:
# Seed NumPy's global random generator for reproducibility
np.random.seed(54321)

# Run Moran's I for each geodemographic variable, in list order, and keep the
# variable name together with the statistic (res.I) and its p_sim value
moransi_results = []
for v in geo_demo_rn:
    res = Moran(ny_merge_2[v], w)
    moransi_results.append((v, res.I, res.p_sim))

# Tabulate the results, indexed by variable name
table = pd.DataFrame(
    moransi_results, columns=["GEODEMO Var", "Moran's I", "P-value"]
)
table = table.set_index("GEODEMO Var")

# Show the first five variables
table.head(5)

In [None]:
# Importing packages required for additional visualization
import seaborn as sns
sns.set(font_scale=2)

# Plotting every variable against every other one would produce far too many
# panels to inspect visually, so only a handful of variables are examined here.
sel_vars = [
    "TotNumRentOccUnit",  # Total number of renter occupied units
    "PopGrad",            # Population with a graduate degree
    "UnempPop",           # Population in labor force and unemployed
    "PopBlwPovLvl",       # Population with income below poverty level in past 12 months
]

# Pairwise plots of the selected variables: regression fits off the diagonal,
# kernel density estimates on the diagonal
pairplot_data = ny_merge_2[sel_vars]
pplt = sns.pairplot(data=pairplot_data, kind="reg", diag_kind="kde", height=5)

plt.show()

In [None]:
# Saving out the pair plot as an image in the current working directory.
# Note: this rebinds `fig`, which previously referred to the choropleth grid figure.
# NOTE(review): newer seaborn releases document `.figure` as the preferred
# attribute - confirm `.fig` is still available in the installed version.
fig = pplt.fig
fig.savefig("NY_Pairplot.png")

In [None]:
# Set the figure size
plt.figure(figsize=(16, 12))

# Setting the font size
plt.rcParams['font.size'] = '10'

# Correlate only the ACS analysis variables. Selecting geo_demo_rn keeps the
# geometry column and any helper or cluster-label columns out of the matrix.
# The matrix is computed once and reused for both the mask and the heatmap.
corr_matrix = ny_merge_2[geo_demo_rn].corr()

# Create the mask to only show the lower triangle.
# The builtin `bool` replaces `np.bool`, an alias that was deprecated and then
# removed from NumPy, so the old spelling raises AttributeError on current releases.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
heatmap = sns.heatmap(corr_matrix, mask=mask, vmin=-1, vmax=1, annot=True, cmap='coolwarm')
heatmap.set_title('ACS Variable Correlation Heatmap', fontdict={'fontsize':18}, pad=16);

## 4. Processing the data

In [None]:
# Importing the function needed to scale the data
from sklearn.preprocessing import robust_scale

# Scale only the analysis variables listed in geo_demo_rn. The result,
# ny_merged_scaled, is the input passed to every clustering model fitted below.
ny_merged_scaled = robust_scale(ny_merge_2[geo_demo_rn])
# Display the scaled values
ny_merged_scaled

## 5. Modeling

### 5.1 KMeans

In [None]:
# Exploring an initial k-means baseline model 
from sklearn.cluster import KMeans

# Setting the random seed to ensure reproducibility. This seeds NumPy's global
# random number generator, so anything drawing from that generator afterwards is
# repeatable only when the cells are executed in order from this point.
np.random.seed(54321)

In [None]:
# Elbow method: fit KMeans for k = 1..14 and record each model's inertia_
distortions = []
K = range(1,15)
for k in K:
    # Instantiating the model.
    # random_state pins the centroid initialisation for every k, so the curve does
    # not depend on how much of the global random stream earlier cells consumed.
    # n_init is stated explicitly because its default differs between
    # scikit-learn releases.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=54321)
    kmeans.fit(ny_merged_scaled)
    distortions.append(kmeans.inertia_)

# Setting the font size
# NOTE(review): '2' is far smaller than the sizes used elsewhere in this notebook
# ('40' and '10') - confirm whether a larger value such as '20' was intended.
plt.rcParams['font.size'] = '2'

plt.figure(figsize=(16,10))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
# Title typo fixed ("Optimial" -> "Optimal")
plt.title('Elbow Method For Finding Optimal Number of Clusters')
plt.show()

In [None]:
# Running the KMeans model with 5 clusters.
# random_state makes the cluster assignment repeatable regardless of which cells
# ran before this one. n_init is stated explicitly because its default differs
# between scikit-learn releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=54321)
kmeans_5 = kmeans.fit(ny_merged_scaled)

# Printing the cluster labels
kmeans_5.labels_

In [None]:
# Store each tract's KMeans cluster label in a new column called kmeans_5_label
ny_merge_2["kmeans_5_label"] = kmeans_5.labels_

# Legend settings: anchored at the middle of the right-hand edge of the axes
legend_options = {'loc': 'center left', 'bbox_to_anchor': (1, 0.5), 'fmt': "{:.0f}"}

# Draw the cluster labels as a categorical choropleth on a single large axis
f, ax = plt.subplots(1, figsize=(20, 20))
ny_merge_2.plot(
    column="kmeans_5_label",
    categorical=True,
    cmap="Set2",
    linewidth=0,
    legend=True,
    legend_kwds=legend_options,
    ax=ax,
)

# Hide the axis, then display the map
ax.set_axis_off()
plt.show()

In [None]:
# Setting the file path.
# The data folder can be supplied through the GEO_DATA_DIR environment variable so
# the notebook is not tied to one machine. The original Google Drive location is
# kept as the default.
path = os.environ.get(
    "GEO_DATA_DIR", r'G:\My Drive\Geospatial Data Science with Python\Data\\'
)

# Reproject the tracts to EPSG:4326 so they share a CRS with the borough outline
ny_merge_3 = ny_merge_2.to_crs('EPSG:4326')

# os.path.join assembles the shapefile path with the platform's own separator
# instead of hard-coded backslashes
boroughs = gpd.read_file(os.path.join(path, "NYC Boroughs", "nybb_22a", "nybb.shp"))
boroughs = boroughs.to_crs('EPSG:4326')
# Merge all borough polygons into a single geometry (row 0 after dissolve)
boroughs = boroughs.dissolve()

# Boolean mask of the tracts that lie within the dissolved borough geometry
ny_merge_3_mask = ny_merge_3.within(boroughs.loc[0,'geometry'])

# Keep only those tracts
ny_merge_4 = ny_merge_3.loc[ny_merge_3_mask]

In [None]:
# Legend settings: anchored at the middle of the right-hand edge of the axes
legend_options = {'loc': 'center left', 'bbox_to_anchor': (1, 0.5), 'fmt': "{:.0f}"}

# Map the KMeans cluster labels for the subset of tracts held in ny_merge_4
f, ax = plt.subplots(1, figsize=(20, 20))
ny_merge_4.plot(
    column="kmeans_5_label",
    categorical=True,
    cmap="Set2",
    linewidth=0,
    legend=True,
    legend_kwds=legend_options,
    ax=ax,
)

# Hide the axis, then display the map
ax.set_axis_off()
plt.show()

In [None]:
# Group data table by cluster label and count observations
# (k5distr holds the number of rows, i.e. tracts, assigned to each KMeans cluster)
k5distr = ny_merge_2.groupby("kmeans_5_label").size()
k5distr

In [None]:
# Getting the unit of measurement from the CRS
print(ny_merge_2.crs.axis_info)

# Calculate the average tract area of each cluster
# 1. Create a new column with the area of each census tract, converted from
#    square feet to square miles (1 mi = 5,280 ft, so 1 sq. mi. = 5280**2 sq. ft.).
#    The exact factor replaces the rounded constant 3.587E-8.
#    NOTE(review): assumes ny_merge_2 is still in the foot-based EPSG:2263
#    projection applied when the tracts were loaded - check the axis info printed above.
SQFT_PER_SQMI = 5280 ** 2
ny_merge_2['area'] = ny_merge_2.geometry.area / SQFT_PER_SQMI

# 2. Sum the tract areas per cluster. A plain groupby gives the same per-cluster
#    totals as dissolving and summing, without the costly geometry union that
#    dissolve() performs and whose result was never used.
area = ny_merge_2.groupby("kmeans_5_label")["area"].sum()
print("\nThe area of the clusters is: {}".format(area))

# 3. Create a table with the number of tracts per cluster and the summed area
tract_area = pd.DataFrame({"Num. Tracts": k5distr, "Area": area})
tract_area['Area_per_tract'] = tract_area["Area"]/tract_area["Num. Tracts"]
tract_area = tract_area.reset_index()

# 4. Plot the area per tract
ax = tract_area.plot.bar(x="kmeans_5_label",y="Area_per_tract")

# Setting size of the labels
ax.set_xlabel('Cluster labels', fontsize=20)
ax.set_ylabel('Avg. Area per Tract', fontsize=20)
ax.tick_params(axis='both', labelsize=20)

In [None]:
# Calculating descriptive statistics for each cluster: the mean of every
# (unscaled) analysis variable per KMeans cluster label
k5means = ny_merge_2.groupby("kmeans_5_label")[geo_demo_rn].mean()

# Display the table rounded to 2 decimal places (the table is not transposed and
# k5means itself keeps its unrounded values)
k5means.round(2)

In [None]:
# Select the 'iframe' renderer as Plotly's default for the figures shown below
import plotly.io as pio
pio.renderers.default = 'iframe'

In [None]:
import plotly.graph_objects as go

# Wrap the scaled values in a DataFrame with one column per analysis variable,
# then attach the KMeans cluster labels
ny_merged_scaled_df = pd.DataFrame(ny_merged_scaled, columns=geo_demo_rn)
ny_merged_scaled_df["kmeans_5_label"] = kmeans_5.labels_

# Mean of each scaled variable within every cluster
k5means_s = ny_merged_scaled_df.groupby("kmeans_5_label")[geo_demo_rn].mean()

# Rounded view of the table. The result is neither assigned nor the last
# expression of the cell, so it is not displayed and k5means_s is unchanged.
k5means_s.round(2)

# One filled radar trace per cluster, with the variables on the angular axis
categories = k5means_s.columns
fig = go.Figure(
    data=[
        go.Scatterpolar(
            r=cluster_means.values,
            theta=categories,
            fill='toself',
            name=f'cluster #{g}',
        )
        for g, cluster_means in k5means_s.iterrows()
    ]
)

fig.update_layout(
    # radial axis limits can be adjusted here
    polar={"radialaxis": {"visible": True, "range": [-2, 5]}},
    showlegend=True,
    title="KMeans Cluster Radial Plot",
    title_x=0.5,
)

fig.show()

### 5.2 Hierarchical Clustering w/o Spatial Constraint

In [None]:
# Importing the package needed for hierarchical clustering
from sklearn.cluster import AgglomerativeClustering

# Set seed for reproducibility (seeds NumPy's global random number generator)
np.random.seed(54321)

In [None]:
# Agglomerative clustering with Ward linkage into 5 clusters, fitted on the
# scaled variables (fit() hands back the fitted estimator)
model = AgglomerativeClustering(n_clusters=5, linkage="ward").fit(ny_merged_scaled)

# Record the cluster labels on both the main and the scaled dataframe
for frame in (ny_merge_2, ny_merged_scaled_df):
    frame["ward5_label"] = model.labels_

In [None]:
# Count the number of rows (tracts) assigned to each ward5_label cluster
ward5sizes = ny_merge_2.groupby("ward5_label").size()
ward5sizes

In [None]:
# Mean of each scaled variable within every AHC cluster
ward5means_s = ny_merged_scaled_df.groupby("ward5_label")[geo_demo_rn].mean()

# Rounded view of the table. The result is neither assigned nor the last
# expression of the cell, so it is not displayed and ward5means_s is unchanged.
ward5means_s.round(2)

# One filled radar trace per cluster, with the variables on the angular axis
categories = ward5means_s.columns
fig = go.Figure(
    data=[
        go.Scatterpolar(
            r=cluster_means.values,
            theta=categories,
            fill='toself',
            name=f'cluster #{g}',
        )
        for g, cluster_means in ward5means_s.iterrows()
    ]
)

fig.update_layout(
    # radial axis limits can be adjusted here
    polar={"radialaxis": {"visible": True, "range": [-2, 6]}},
    showlegend=True,
    title="AHC Cluster Radial Plot",
    title_x=0.5,
)

fig.show()

In [None]:
# Compare the KMeans and AHC cluster maps side by side on the top row of a 2 x 2 grid
fig, axs = plt.subplots(2, 2, figsize=(12, 6))

# (axis, label column, colour map, title) for each map to draw
map_specs = [
    (axs[0, 0], "kmeans_5_label", "Set2", "KMeans with $k=5$"),
    (axs[0, 1], "ward5_label", "Set3", "AHC with $k=5$"),
]

for ax, label_col, cmap_name, title in map_specs:
    # Categorical choropleth of the cluster labels. The legend settings are
    # written out inside the loop so each plot call receives its own dict.
    ny_merge_2.plot(
        column=label_col,
        categorical=True,
        cmap=cmap_name,
        legend=True,
        legend_kwds={'loc': 'center left','bbox_to_anchor':(1,0.5),'fmt': "{:.0f}"},
        linewidth=0,
        ax=ax,
    )
    # Remove the axis and add the title
    ax.set_axis_off()
    ax.set_title(title)

# Blank out the two unused axes on the bottom row
for unused_ax in axs[1]:
    unused_ax.set_axis_off()

# Display the maps
plt.show()

### 5.3 Hierarchical Clustering w/ Spatial Constraint

In [None]:
# Set the seed for reproducibility
np.random.seed(54321)

# Ward-linkage clustering into 5 clusters with a spatial constraint: the sparse
# form of the weights object `w` is supplied as the connectivity argument.
# The model is fitted on the scaled variables.
model = AgglomerativeClustering(
    n_clusters=5, linkage="ward", connectivity=w.sparse
).fit(ny_merged_scaled)

# Record the cluster labels on both the main and the scaled dataframe
for frame in (ny_merge_2, ny_merged_scaled_df):
    frame["ward5wgt_label"] = model.labels_

In [None]:
# Compare the KMeans, AHC and spatially constrained AHC cluster maps on a 2 x 2 grid
fig, axs = plt.subplots(2, 2, figsize=(12, 6))

# (axis, label column, colour map, title) for each map to draw
map_specs = [
    (axs[0, 0], "kmeans_5_label", "Set2", "KMeans with $k=5$"),
    (axs[0, 1], "ward5_label", "Set3", "AHC with $k=5$"),
    (axs[1, 0], "ward5wgt_label", "Set1", "SC AHC with $k=5$"),
]

for ax, label_col, cmap_name, title in map_specs:
    # Categorical choropleth of the cluster labels. The legend settings are
    # written out inside the loop so each plot call receives its own dict.
    ny_merge_2.plot(
        column=label_col,
        categorical=True,
        cmap=cmap_name,
        legend=True,
        legend_kwds={'loc': 'center left','bbox_to_anchor':(1,0.5),'fmt': "{:.0f}"},
        linewidth=0,
        ax=ax,
    )
    # Remove the axis and add the title
    ax.set_axis_off()
    ax.set_title(title)

# Blank out the one unused axis
axs[1, 1].set_axis_off()

# Display the maps
plt.show()

In [None]:
# Radial plot for the spatially constrained model: start from the mean of each
# scaled variable within every cluster
ward5_wgt_means_s = ny_merged_scaled_df.groupby("ward5wgt_label")[geo_demo_rn].mean()

# Rounded view of the table. The result is neither assigned nor the last
# expression of the cell, so it is not displayed and the table is unchanged.
ward5_wgt_means_s.round(2)

# One filled radar trace per cluster, with the variables on the angular axis
categories = ward5_wgt_means_s.columns
fig = go.Figure(
    data=[
        go.Scatterpolar(
            r=cluster_means.values,
            theta=categories,
            fill='toself',
            name=f'cluster #{g}',
        )
        for g, cluster_means in ward5_wgt_means_s.iterrows()
    ]
)

fig.update_layout(
    # radial axis limits can be adjusted here
    polar={"radialaxis": {"visible": True, "range": [-2, 6]}},
    showlegend=True,
    title="SCAHC Cluster Radial Plot",
    title_x=0.5,
)

fig.show()

In [None]:
# Changing the spatial constraint to use KNN weights built with k=10.
# Note: this rebinds `w`, which previously held the Queen weights. Any earlier
# cell that reads `w` will pick up the KNN weights if it is re-run after this one.
w = KNN.from_dataframe(ny_merge_2, k=10)

In [None]:
# Setting the seed for reproducibility
np.random.seed(54321)

# Ward-linkage clustering into 5 clusters, constrained by the KNN weights: the
# sparse form of `w` is supplied as the connectivity argument. The model is
# fitted on the scaled variables.
model = AgglomerativeClustering(
    n_clusters=5, linkage="ward", connectivity=w.sparse
).fit(ny_merged_scaled)

# Record the cluster labels on both the main and the scaled dataframe
for frame in (ny_merge_2, ny_merged_scaled_df):
    frame["ward5_knnwgt_label"] = model.labels_

In [None]:
# Compare all four clusterings (KMeans, AHC, spatially constrained AHC and
# KNN-constrained AHC) on a 2 x 2 grid
f, axs = plt.subplots(2, 2, figsize=(12, 6))

# (axis, label column, colour map, title) for each map to draw
map_specs = [
    (axs[0, 0], "kmeans_5_label", "Set2", "KMeans with $k=5$"),
    (axs[0, 1], "ward5_label", "Set3", "AHC with $k=5$"),
    (axs[1, 0], "ward5wgt_label", "Set1", "SC AHC with $k=5$"),
    (axs[1, 1], "ward5_knnwgt_label", "RdPu", "KNN SC AHC with $k=5$"),
]

for ax, label_col, cmap_name, title in map_specs:
    # Categorical choropleth of the cluster labels. The legend settings are
    # written out inside the loop so each plot call receives its own dict.
    ny_merge_2.plot(
        column=label_col,
        categorical=True,
        cmap=cmap_name,
        legend=True,
        legend_kwds={'loc': 'center left','bbox_to_anchor':(1,0.5),'fmt': "{:.0f}"},
        linewidth=0,
        ax=ax,
    )
    # Remove the axis and add the title
    ax.set_axis_off()
    ax.set_title(title)

# Display the maps
plt.show()

In [None]:
# Mean of each scaled variable within every KNN-constrained AHC cluster
ward5_knnwgt_s = ny_merged_scaled_df.groupby("ward5_knnwgt_label")[geo_demo_rn].mean()

# Rounded view of the table. The result is neither assigned nor the last
# expression of the cell, so it is not displayed and the table is unchanged.
ward5_knnwgt_s.round(2)

# One filled radar trace per cluster, with the variables on the angular axis
categories = ward5_knnwgt_s.columns
fig = go.Figure(
    data=[
        go.Scatterpolar(
            r=cluster_means.values,
            theta=categories,
            fill='toself',
            name=f'cluster #{g}',
        )
        for g, cluster_means in ward5_knnwgt_s.iterrows()
    ]
)

fig.update_layout(
    # radial axis limits can be adjusted here
    polar={"radialaxis": {"visible": True, "range": [-2, 6]}},
    showlegend=True,
    title="KNN Cluster Radial Plot",
    title_x=0.5,
)

fig.show()

## 6. Model Performance

In [None]:
# Clustering-quality metrics used to compare the fitted models
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score

# Accumulators of (label column, score) tuples, one list per metric; they are
# filled by the scoring loop in the next cell
ch_scores = []  # Calinski-Harabasz scores
db_scores = []  # Davies-Bouldin scores
s_scores = []   # silhouette scores

In [None]:
# Score every clustering on the same scaled feature matrix.
# The result lists are (re)created here so that re-running this cell does not
# append a second copy of every score; duplicate "model" rows would otherwise
# multiply when the score tables are merged.
ch_scores, db_scores, s_scores = [], [], []

# The feature matrix is identical for every model, so select it once
features = ny_merged_scaled_df[geo_demo_rn]

# The loop variable is called label_col (not `model`) so that it does not
# overwrite the fitted estimator stored in `model`.
for label_col in ("kmeans_5_label", "ward5_label", "ward5wgt_label", "ward5_knnwgt_label"):
    labels = ny_merged_scaled_df[label_col]

    # compute the CH score
    ch_scores.append((label_col, calinski_harabasz_score(features, labels)))

    # compute the DB score
    db_scores.append((label_col, davies_bouldin_score(features, labels)))

    # compute the silhouette score
    s_scores.append((label_col, silhouette_score(features, labels)))

In [None]:
# One single-column table per metric, each indexed by the model's label column
ch_df = pd.DataFrame(ch_scores, columns=["model", "CH Score"]).set_index("model")
db_df = pd.DataFrame(db_scores, columns=["model", "DB Score"]).set_index("model")
s_df = pd.DataFrame(s_scores, columns=["model", "Silhouette Score"]).set_index("model")

# Combine the three metric tables on the shared "model" key
scores_df = ch_df.merge(db_df, on="model").merge(s_df, on="model")

# Display the combined table
scores_df