In [46]:
import pickle
import os
import geopandas as gpd
from shapely.geometry import Point, LineString
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pandas as pd
from PIL import Image

data_dir = os.path.join("../../data/embeddings/")
lsoas_file = os.path.join("../../data/SpatialData/", "LSOAs_2011", "LSOA_2011_EW_BSC_V4.shp")

### Get spatial data

In [3]:
lsoas =  gpd.read_file(lsoas_file)
manc_lads = ['Manchester', 'Rochdale', 'Bolton', 'Bury', 'Wigan', 'Oldham',  'Trafford', 'Salford', 'Tameside', 'Stockport']
manc_lads_pattern = '|'.join(manc_lads)
gm_lsoa=lsoas[lsoas['LSOA11NMW'].str.contains(manc_lads_pattern)]
gm_lsoa = gm_lsoa.to_crs(epsg=4326)

### Get embeddings (four per location)

In [4]:
points_data_cache = data_dir + "points_with_embeddings.pkl"
with open(points_data_cache, "rb") as f:
    point_records = pickle.load(f)

### Join image embeddings points to gentrification LSOAs

In [8]:
point_coords = [Point(rec['longitude'], rec['latitude']) for rec in point_records]
points_labels_gdf = gpd.GeoDataFrame(point_records, geometry=point_coords, crs="EPSG:4326")

# Perform spatial join to get gentrification label for each point
points_labels_gdf = gpd.sjoin(points_labels_gdf, gm_lsoa, how='inner', predicate='within')
# sjoin may add an index from the polygon ('index_right'); we can drop it
if 'index_right' in points_labels_gdf.columns:
    points_labels_gdf = points_labels_gdf.drop(columns=['index_right'])

print(f"Points after spatial join: {len(points_labels_gdf)} / {len(point_records)}"
      f" `(some points may lie outside the label polygons and were dropped)")

Points after spatial join: 18897 / 18897 `(some points may lie outside the label polygons and were dropped)


### Find the cosine similarity of the 4 images at each location

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Example: suppose each row has a list of 4 embeddings in 'embedding' column
# embeddings should be NumPy arrays of same length

def mean_cosine_similarity(embeddings_list):
    """
    embeddings_list: list of 4 embeddings (arrays)
    returns mean pairwise cosine similarity
    """
    # Stack embeddings into a 2D array (4, embedding_dim)
    emb_array = np.stack(embeddings_list)
    
    # Compute cosine similarity matrix
    sim_matrix = cosine_similarity(emb_array)
    
    # Take upper triangle without diagonal
    n = sim_matrix.shape[0]
    upper_tri_indices = np.triu_indices(n, k=1)
    mean_sim = sim_matrix[upper_tri_indices].mean()
    return mean_sim

# Apply to dataframe
points_labels_gdf['embedding_cosine_similarity'] = points_labels_gdf['embedding'].apply(mean_cosine_similarity)




### Group by LSOA

In [None]:
for lsoa_code in np.unique(points_labels_gdf['LSOA11CD']):
    one_lsoa = points_labels_gdf[points_labels_gdf['LSOA11CD'] ==lsoa_code]
    print(len(one_lsoa))

## For each LSOA check the optimal number of clusters with silhoutte scores

In [45]:
for lsoa_code in np.unique(points_labels_gdf['LSOA11CD']):
    one_lsoa = points_labels_gdf[points_labels_gdf['LSOA11CD'] ==lsoa_code]

    # --- Gather all embeddings from one LSOA ---
    all_embeddings = []
    for _, row in one_lsoa.iterrows():
        all_embeddings.extend(row['embedding'])
    all_embeddings = np.stack(all_embeddings)

    # --- Range of cluster counts to test ---
    n_samples = len(all_embeddings)
    max_k = min(20, n_samples - 1)
    k_values = range(2, max_k + 1)

    inertias = []
    sil_scores = []

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(all_embeddings)

        inertias.append(kmeans.inertia_)
        sil_scores.append(silhouette_score(all_embeddings, labels))

    best_k = k_values[np.argmax(sil_scores)]
    print(f"{lsoa_code}: {best_k}")

E01004766: 2
E01004767: 2
E01004768: 2
E01004769: 8
E01004770: 2
E01004771: 2
E01004772: 3
E01004773: 4
E01004774: 2
E01004775: 2
E01004776: 6
E01004777: 12
E01004778: 4
E01004779: 3
E01004780: 2
E01004781: 3
E01004782: 8
E01004783: 2
E01004784: 3
E01004785: 5
E01004786: 2
E01004787: 8
E01004788: 12
E01004789: 2
E01004790: 2
E01004791: 5
E01004792: 2
E01004793: 3
E01004794: 2
E01004795: 7
E01004796: 2
E01004797: 3
E01004798: 3
E01004799: 2
E01004800: 3
E01004801: 7
E01004802: 2
E01004803: 2
E01004804: 5
E01004805: 3
E01004806: 2
E01004807: 2
E01004808: 5
E01004809: 5
E01004810: 2
E01004811: 8
E01004812: 3
E01004813: 10
E01004814: 2
E01004815: 2
E01004816: 8
E01004817: 2
E01004818: 2
E01004819: 3
E01004820: 2
E01004822: 2
E01004823: 14
E01004824: 5
E01004825: 2
E01004826: 3
E01004827: 2
E01004828: 3
E01004829: 9
E01004830: 5
E01004831: 6
E01004832: 5
E01004833: 3
E01004834: 4
E01004835: 2
E01004836: 5
E01004837: 2
E01004838: 2
E01004839: 4
E01004840: 3
E01004841: 3
E01004842: 5
E0100484


KeyboardInterrupt



In [47]:
def find_optimal_k(all_embeddings):
    # --- Range of cluster counts to test ---
    n_samples = len(all_embeddings)
    max_k = min(20, n_samples - 1)
    k_values = range(2, max_k + 1)

    inertias = []
    sil_scores = []

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(all_embeddings)

        inertias.append(kmeans.inertia_)
        sil_scores.append(silhouette_score(all_embeddings, labels))

    best_k = k_values[np.argmax(sil_scores)]
    return best_k

In [55]:
one_lsoa = points_labels_gdf# [points_labels_gdf['LSOA11CD'] =="E01004844"]

# --- STEP 1: Collect all embeddings + image paths from one LSOA ---
all_embeddings = []
all_image_paths = []

for _, row in one_lsoa.iterrows():
    for emb, img_path in zip(row['embedding'], row['image_files']):
        all_embeddings.append(emb)
        all_image_paths.append(img_path)

all_embeddings = np.stack(all_embeddings)

# --- STEP 2: Cluster embeddings ---
n_clusters =find_optimal_k(all_embeddings)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(all_embeddings)

cluster_df = pd.DataFrame({
    'image_path': all_image_paths,
    'cluster': cluster_labels
}).sort_values('cluster').reset_index(drop=True)

# --- STEP 3: Plot images by cluster ---
for c in range(n_clusters):
    cluster_imgs = cluster_df[cluster_df['cluster'] == c]['image_path'].tolist()
    n_imgs = len(cluster_imgs)
    
    if n_imgs == 0:
        continue
    
    n_cols = min(4, n_imgs)  # number of columns in grid
    n_rows = int(np.ceil(n_imgs / n_cols))
    
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 3, n_rows * 3))
    axes = np.atleast_2d(axes)
    
    fig.suptitle(f"Cluster {c} ({n_imgs} images)", fontsize=14, fontweight='bold')
    
    for i, img_path in enumerate(cluster_imgs):
        r, col = divmod(i, n_cols)
        ax = axes[r, col]
        adj_path = img_path.replace("airbnb-manchester/", "embeddings/").replace("../", "../../")
        if os.path.exists(adj_path):
            img = Image.open(adj_path)
            ax.imshow(img)
        ax.axis("off")
    
    # Hide empty subplots if any
    for j in range(i + 1, n_rows * n_cols):
        r, col = divmod(j, n_cols)
        axes[r, col].axis("off")
    
    plt.tight_layout()
    plt.show()


ValueError: Image size of 1200x3091500 pixels is too large. It must be less than 2^16 in each direction.

Error in callback <function _draw_all_if_interactive at 0x7f7f411659d0> (for post_execute):


ValueError: Image size of 1200x3091500 pixels is too large. It must be less than 2^16 in each direction.

ValueError: Image size of 1200x3091500 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 1200x3.0915e+06 with 41220 Axes>

In [32]:
# # Group by LSOA and compute diversity
# lsoa_diversity = points_labels_gdf.groupby('LSOA11CD')['embedding'].apply(
#     lambda x: embedding_diversity(list(x))).reset_index(name='diversity')