In [None]:
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
## Load the park typology workbook. Change the file path as needed
dataset_path = r"C:\Users\Daniel.Lang\Downloads\Park Typology Dataset 3.xlsx"
df = pd.read_excel(dataset_path)

## Relabel the integer year headers (2012, 2016, 2019, 2022) as strings so
## later cells can select them by text name, e.g. '2022'
year_labels = {year: str(year) for year in (2012, 2016, 2019, 2022)}
df = df.rename(columns=year_labels)
df.head()

In [None]:
# extract variables of interest by column position (0, 1, 7 and 35)
# .copy() makes voi an independent frame rather than a slice of df, so the
# cluster-label columns assigned to it in later cells do not raise pandas'
# SettingWithCopyWarning
voi = df.iloc[:, [0, 1, 7, 35]].copy()
voi.head()

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
## columns that feed the first-level clustering
data_columns = ['2022', 'Road Length in Park (Mi)']
data = voi.loc[:, data_columns]
## standardise the selected columns before distances are computed
scaler = StandardScaler()
data_scaled = scaler.fit(data).transform(data)
## agglomerative clustering into 10 groups; labels_ holds one cluster id per row
agg_clustering = AgglomerativeClustering(n_clusters=10)
labels = agg_clustering.fit(data_scaled).labels_
print(labels)

In [None]:
## append the first-level cluster labels to the variables-of-interest table
## assign() returns a new frame instead of writing into a slice of df, which
## avoids pandas' SettingWithCopyWarning and keeps this cell safe to re-run
voi = voi.assign(Cluster_Labels=labels)
voi.head()

In [None]:
from scipy.cluster.hierarchy import dendrogram , linkage
## Z is the Ward linkage matrix built from the scaled data
Z = linkage(data_scaled, 'ward')

## draw the dendrogram on the current axes, then title and label those axes
dendro = dendrogram(Z)
ax = plt.gca()
ax.set_title('Dendrogram')
ax.set_ylabel('Euclidean distance')
plt.show()

In [None]:
# Count the rows carrying each first-level label, ordered by label
cluster_sizes = voi['Cluster_Labels'].value_counts().sort_index()
unique_clusters = list(cluster_sizes.index)

# Report one line per cluster
for cluster_label, cluster_length in cluster_sizes.items():
    print(f"{cluster_length} in cluster {cluster_label}")

In [None]:
## find the largest cluster (the label that occurs on the most rows)
max_cluster = voi['Cluster_Labels'].value_counts().idxmax()

# Extract rows with the max_cluster value into a new DataFrame.
# .copy() detaches the result from voi so that adding a second-level label
# column to it later does not raise pandas' SettingWithCopyWarning
second_sort = voi[voi['Cluster_Labels'] == max_cluster].copy()
second_sort.head()

In [None]:
## columns that feed the second-level clustering (rows of the largest cluster only)
data_columns = ['2022', 'Road Length in Park (Mi)']
data = second_sort.loc[:, data_columns]
## re-standardise using only this subset's rows
scaler = StandardScaler()
scaler.fit(data)
data_scaled = scaler.transform(data)
## split the subset into 10 agglomerative clusters
agg_clustering = AgglomerativeClustering(n_clusters=10)
agg_clustering.fit(data_scaled)
labels = agg_clustering.labels_
print(labels)

In [None]:
## Z is the Ward linkage matrix for the current data_scaled array
Z = linkage(data_scaled, 'ward')

## Plot the dendrogram and label the axes it was drawn on
dendro = dendrogram(Z)
plt.gca().set(title='Dendrogram', ylabel='Euclidean distance')
plt.show()

In [None]:
## add the second-level labels to the largest-cluster table
## assign() returns a new frame instead of writing into a filtered slice of
## voi, which avoids pandas' SettingWithCopyWarning and is safe to re-run
second_sort = second_sort.assign(Cluster_Labels2=labels)
second_sort.head()

In [None]:
# Second-level labels present in the subset, in ascending order
cluster_column = second_sort['Cluster_Labels2']
unique_clusters = sorted(cluster_column.unique())

# Print how many rows carry each label
for cluster_label in unique_clusters:
    cluster_length = (cluster_column == cluster_label).sum()
    print(f"{cluster_length} in cluster {cluster_label}")

In [None]:
## clean and merge second level results
## Drop any Cluster_Labels2 column left by an earlier run of this cell first;
## otherwise a re-run keeps the stale labels and adds a stray
## 'Cluster_Labels2_second_sort' column next to them
## NOTE(review): the join assumes 'NPS Full Name' identifies a row uniquely;
## duplicate names would multiply rows in the merge -- confirm against the data
second_level = second_sort[['NPS Full Name', 'Cluster_Labels2']]
voi = voi.drop(columns='Cluster_Labels2', errors='ignore').merge(
    second_level, on='NPS Full Name', how='left')
## rows that were not in second_sort come back as NaN; mark them with 999
voi['Cluster_Labels2'] = voi['Cluster_Labels2'].fillna(999).astype(int)
voi.head()

In [None]:
## collect the distinct 'Simplified Name' values under each second-level label
grouped = voi.groupby('Cluster_Labels2')['Simplified Name'].unique()

## one line per label, names separated by commas
for cluster_label, names in zip(grouped.index, grouped):
    print(f"Cluster {cluster_label}: {', '.join(names)}")

In [None]:
## non-scaled analysis: take the same two columns without standardising them
data_columns = ['2022', 'Road Length in Park (Mi)']
data = voi.loc[:, data_columns]

In [None]:
## cluster the unscaled columns into 10 groups and store the label of each row
agg_clustering = AgglomerativeClustering(n_clusters=10)
agg_clustering.fit(data)
labels = agg_clustering.labels_
voi['Cluster_Labels_US'] = labels
voi.head()

In [None]:
# Count the rows carrying each unscaled first-level label, ordered by label
cluster_sizes = voi['Cluster_Labels_US'].value_counts().sort_index()
unique_clusters = list(cluster_sizes.index)

# Report one line per cluster
for cluster_label, cluster_length in cluster_sizes.items():
    print(f"{cluster_length} in cluster {cluster_label}")

In [None]:
## build the Ward linkage matrix from the unscaled columns and plot its dendrogram
Z = linkage(data, 'ward')
dendro = dendrogram(Z)
ax = plt.gca()
ax.set_title('Dendrogram')
ax.set_ylabel('Euclidean distance')
plt.show()

In [None]:
## find the largest unscaled cluster (the label that occurs on the most rows)
max_cluster = voi['Cluster_Labels_US'].value_counts().idxmax()

## Extract rows with the max_cluster value into a new DataFrame.
## .copy() detaches the result from voi so that adding a second-level label
## column to it later does not raise pandas' SettingWithCopyWarning
second_sort = voi[voi['Cluster_Labels_US'] == max_cluster].copy()
second_sort.head()

In [None]:
## columns that feed the second unscaled analysis (largest cluster's rows only)
data_columns = ['2022', 'Road Length in Park (Mi)']
data = second_sort.loc[:, data_columns]
## split the subset into 10 agglomerative clusters, without scaling
agg_clustering = AgglomerativeClustering(n_clusters=10)
labels = agg_clustering.fit(data).labels_
print(labels)

In [None]:
## add the unscaled second-level labels to the largest-cluster table
## assign() returns a new frame instead of writing into a filtered slice of
## voi, which avoids pandas' SettingWithCopyWarning and is safe to re-run
second_sort = second_sort.assign(Cluster_Labels_US2=labels)
second_sort.head()

In [None]:
# Unscaled second-level labels present in the subset, in ascending order
cluster_column = second_sort['Cluster_Labels_US2']
unique_clusters = sorted(cluster_column.unique())

# Print how many rows carry each label
for cluster_label in unique_clusters:
    cluster_length = (cluster_column == cluster_label).sum()
    print(f"{cluster_length} in cluster {cluster_label}")

In [None]:
## build the Ward linkage matrix for the current subset and plot its dendrogram
Z = linkage(data, 'ward')
dendro = dendrogram(Z)
plt.gca().set(title='Dendrogram', ylabel='Euclidean distance')
plt.show()

In [None]:
## clean and merge second level results
## Drop any Cluster_Labels_US2 column left by an earlier run of this cell first;
## otherwise a re-run keeps the stale labels and adds a stray
## 'Cluster_Labels_US2_second_sort' column next to them
## NOTE(review): the join assumes 'NPS Full Name' identifies a row uniquely;
## duplicate names would multiply rows in the merge -- confirm against the data
second_level = second_sort[['NPS Full Name', 'Cluster_Labels_US2']]
voi = voi.drop(columns='Cluster_Labels_US2', errors='ignore').merge(
    second_level, on='NPS Full Name', how='left')
## rows that were not in second_sort come back as NaN; mark them with 999
voi['Cluster_Labels_US2'] = voi['Cluster_Labels_US2'].fillna(999).astype(int)
voi.head()

In [None]:
## collect the distinct 'Simplified Name' values under each unscaled second-level label
grouped = voi.groupby('Cluster_Labels_US2')['Simplified Name'].unique()

## one line per label, names separated by commas
for cluster_label, names in zip(grouped.index, grouped):
    print(f"Cluster {cluster_label}: {', '.join(names)}")