In [None]:
#Import packages
import pandas as pd
import geopandas as gpd
from sklearn.cluster import KMeans

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the preprocessed CSV file into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer='preprocessed.csv')

In [None]:
# Load the London ward boundaries from the shapefile.
shapefile_path = "London-wards/London_Ward.shp"
gdf = gpd.read_file(shapefile_path)

# Rename the shapefile columns to the names used as merge keys later on.
gdf = gdf.rename(columns={'NAME': 'ward_n', 'DISTRICT': 'borough'})

# Remove the rows that are about the City of London.
gdf = gdf.drop(gdf[gdf['borough'] == 'City and County of the City of London'].index)

# Remove the dots in the ward names.
# regex=False makes '.' a literal dot on every pandas version; interpreted
# as a regular expression it would match (and delete) every character.
gdf['ward_n'] = gdf['ward_n'].str.replace('.', '', regex=False)

# Fix a difference in spelling so the name matches the other table.
gdf = gdf.replace({'Shirly South': 'Shirley South'})

In [None]:
# Combine the tables by adding the corresponding geometry polygon for each
# ward in the PAS data and leaving the other columns as they are.
# A left join keeps every row of `df`; rows whose (ward_n, borough) pair has
# no match in the shapefile end up with a missing geometry.
#
# `df` is a plain pandas DataFrame, so `df.merge(...)` returns a plain
# DataFrame as well. Wrapping the result in a GeoDataFrame (with the
# shapefile's CRS) keeps the spatial methods such as `.plot(column=...)`
# available to later cells.
merged_df = gpd.GeoDataFrame(
    df.merge(
        gdf.loc[:, ['ward_n', 'borough', 'geometry']],
        how='left',
        on=['ward_n', 'borough'],
    ),
    geometry='geometry',
    crs=gdf.crs,
)

# Preview only the first rows instead of rendering the whole table.
merged_df.head()

In [None]:
# Inclusive bounds of the cluster counts that will be evaluated.
min_clusters, max_clusters = 4, 20

# Collects one silhouette score per evaluated cluster count.
silhouette_scores = list()

# Keep only the borough and geometry columns, then pull out the geometry
# column on its own as the clustering input.
cluster_data = merged_df[['borough', 'geometry']]
geo_data = cluster_data.loc[:, 'geometry']

In [None]:
# Fit K-means for every cluster count in [min_clusters, max_clusters] and
# record the mean silhouette score of each fit.
#
# K-means needs a numeric feature matrix, but `geo_data` is a column of
# geometry objects that also contains missing values (the left join leaves
# unmatched rows without a geometry). Each non-missing geometry is therefore
# represented by its centroid (x, y).
# NOTE(review): this assumes the intent is to cluster on ward location --
# confirm, or swap in the intended numeric features here.
valid_geometries = geo_data.dropna()
centroid_coords = np.array(
    [(geom.centroid.x, geom.centroid.y) for geom in valid_geometries]
)

# Start from an empty list so re-running this cell does not append a second
# set of scores to the previous run's results.
silhouette_scores = []

for num_clusters in range(min_clusters, max_clusters + 1):
    # Perform K-means clustering; n_init is pinned so results do not depend
    # on the scikit-learn version's default.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    # Labels align with the rows of `valid_geometries`, not `merged_df`.
    cluster_labels = kmeans.fit_predict(centroid_coords)

    # Calculate silhouette score
    silhouette_avg = silhouette_score(centroid_coords, cluster_labels)
    silhouette_scores.append(silhouette_avg)

In [None]:
# Keep only the rows that actually received a geometry.
has_geometry = merged_df['geometry'].notna()
no_nans = merged_df[has_geometry]

In [None]:
# Distinct geometry values found in the merged table. This is taken from
# `merged_df`, so it may include a missing-value entry.
polygon_list = merged_df['geometry'].unique()

In [None]:
# Show the first row of the filtered table as a quick sanity check.
no_nans.iloc[:1]

In [None]:
# Draw every ward polygon once, coloured by ward name, and label each
# borough once.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# `no_nans` can contain the same ward on several rows; drawing each ward once
# gives the same picture with far fewer polygons. Wrapping in a GeoDataFrame
# guarantees the GeoPandas map plot is used even when `no_nans` is a plain
# pandas DataFrame.
wards = gpd.GeoDataFrame(
    no_nans.drop_duplicates(subset=['ward_n', 'borough']),
    geometry='geometry',
)

# The legend is switched off: a categorical legend with one entry per ward
# name would be far larger than the map itself.
wards.plot(column='ward_n', cmap='viridis', legend=False, ax=ax)

# Place one label per borough at the mean of its wards' centroids, instead of
# printing the borough name on top of itself once for every row.
ward_centroids = wards.geometry.centroid
borough_label_xy = (
    pd.DataFrame({
        'borough': wards['borough'],
        'x': ward_centroids.x,
        'y': ward_centroids.y,
    })
    .groupby('borough')[['x', 'y']]
    .mean()
)
for label in borough_label_xy.itertuples():
    ax.annotate(text=label.Index, xy=(label.x, label.y), color='black', fontsize=6)

# NOTE(review): the map is coloured by ward name; no cluster label or
# education variable is plotted here -- confirm the title is still accurate.
ax.set_title("Clustered Districts by Education Index")
plt.show()