## SummariseEmbeddingsOverCategories
This script:
- Reads in a pickle file containing one record per sampled location. Each location has a position (lat, lon) and, for each of its images, a link to the image file, an embedding and a set of category scores
- Assigns each image to a single category: the one with the highest score
- Finds the percentage of images in each LSOA, within each of the categories
- Finds the mean/median/max embedding within each category, within each LSOA
- Saves a pickle file containing a dataframe containing this information

In [50]:
import pickle
import os
import numpy as np
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, LineString
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from functools import reduce
from collections import defaultdict
import hdbscan 
from sklearn.ensemble import IsolationForest
import seaborn as sns
from math import ceil
from PIL import Image
import umap
import random
from sklearn.cluster import KMeans, MiniBatchKMeans
from functions import *

In [51]:
# Root folder for embedding inputs/outputs (trailing slash is relied on by
# later string concatenation).
data_dir = "../../../../data/embeddings/"

# Shapefile with the 2021 LSOA boundaries.
spatial_data_dir = "../../../../data/SpatialData/"
lsoas_file = os.path.join(spatial_data_dir, "LSOAs_2021", "LSOA_2021_EW_BSC_V4.shp")

In [52]:
# Choose which set of semantic categories is in use. The same string is used
# as the key into `categories_dict` and as a suffix in data file names.
# Alternatives noted by the author: 'planninguseclasses', 'userdefinedclasses'.
file_ending = "userdefinedclasses"
headline_categories = categories_dict[file_ending]

### Get spatial data

In [53]:
# Names used to pick out the LSOAs of interest (presumably the Greater
# Manchester local authority districts -- confirm).
manc_lads = [
    "Manchester", "Rochdale", "Bolton", "Bury", "Wigan",
    "Oldham", "Trafford", "Salford", "Tameside", "Stockport",
]
manc_lads_pattern = "|".join(manc_lads)

lsoas = gpd.read_file(lsoas_file)

# NOTE(review): this is an unanchored, case-sensitive regex search, so any
# LSOA whose name contains one of the strings above anywhere is kept.
in_selected_lads = lsoas["LSOA21NM"].str.contains(manc_lads_pattern)

# Reproject to EPSG:4326 so the polygons share the CRS of the lon/lat points.
gm_lsoa = lsoas[in_selected_lads].to_crs(epsg=4326)

### Get embeddings (four per location)

In [54]:
# Cached records (one per sampled location) for the selected category set.
points_data_cache = (
    data_dir
    + f"sample_points_cache/points_data_cache_with_CLIP_embeddings_and_scores_{file_ending}.pkl"
)

# NOTE: pickle.load executes arbitrary code from the file; only load caches
# produced by this project.
with open(points_data_cache, "rb") as cache_file:
    point_records = pickle.load(cache_file)

### Join image embeddings points to gentrification LSOAs

In [55]:
# One point geometry per record, built as (x=longitude, y=latitude).
point_coords = [Point(rec["longitude"], rec["latitude"]) for rec in point_records]

# Attach the attributes of the containing LSOA polygon to every point. The
# inner join drops points that fall within none of the polygons, and the
# helper column 'index_right' that sjoin may add is removed if present.
points_labels_gdf = gpd.sjoin(
    gpd.GeoDataFrame(point_records, geometry=point_coords, crs="EPSG:4326"),
    gm_lsoa,
    how="inner",
    predicate="within",
).drop(columns=["index_right"], errors="ignore")

print(f"Points after spatial join: {len(points_labels_gdf)} / {len(point_records)}"
      f" `(some points may lie outside the label polygons and were dropped)")

Points after spatial join: 18897 / 18897 `(some points may lie outside the label polygons and were dropped)


# Find mean embedding at each location.

In [56]:
def _mean_of_embeddings(emb_list):
    """Stack one location's embeddings and average them element-wise."""
    return np.stack(emb_list).mean(axis=0)


# One averaged embedding vector per sampled location.
points_labels_gdf["mean_embedding"] = points_labels_gdf["embedding"].apply(_mean_of_embeddings)
# save
# points_labels_gdf[['LSOA21CD', 'mean_embedding']].to_pickle(data_dir + "embedding_summaries/mean_embeddings_per_location_CLIP.pkl")

# Expand dataframe so there is one row per image

In [57]:
# Turn the one-row-per-location table into one row per image. Each location
# row carries three parallel sequences (embeddings, image paths, category
# scores); every output row gets a single element from each of them and a copy
# of all the other columns.
expanded_rows = []

for _, row in points_labels_gdf.iterrows():
    embeddings = row['embedding']          # embeddings for this location's images
    images = row['image_files']            # image paths for this location
    cat_scores = row['category_scores']    # category scores, one entry per image

    # All three sequences must have the same length: zip() stops at the
    # shortest one, so a mismatch would otherwise drop images without warning.
    if not (len(embeddings) == len(images) == len(cat_scores)):
        print(f"Skipping row {row.name}: {len(embeddings)} embedding, {len(images)} images, "
              f"{len(cat_scores)} category scores")
        continue

    # Convert the row once per location rather than once per image.
    base = row.to_dict()
    for score, emb, img in zip(cat_scores, embeddings, images):
        expanded_rows.append(
            {**base, 'embedding': emb, 'image_files': img, 'category_scores': score}
        )

# Built with pd.DataFrame, so this is a plain DataFrame despite the name.
expanded_gdf = pd.DataFrame(expanded_rows)
print(f"Original rows: {len(points_labels_gdf)}, Expanded rows: {len(expanded_gdf)}")

Skipping row 9158: 3 embedding, 4 images
Skipping row 18664: 3 embedding, 4 images
Original rows: 18897, Expanded rows: 75580


## Assign each image to one classification, based on highest score

In [58]:
# After the expansion step each row holds ONE image path, so the paths are
# collected whole. (list.extend() on a string would add it character by
# character.)
all_image_paths = expanded_gdf["image_files"].tolist()

# Stack the per-image score vectors into one (n_images, n_categories) array.
all_scores = np.vstack(expanded_gdf["category_scores"].tolist())
category_probs = all_scores  # kept as an alias for any code that uses this name

# Each image is assigned the category with the highest score; on ties
# argmax returns the first (lowest-index) category.
assigned_labels = category_probs.argmax(axis=1)
assigned_names = [headline_categories[i] for i in assigned_labels]
expanded_gdf['category'] = assigned_names

# Create a dataframe with % of images in each category, in each LSOA 

In [59]:
# Name of the column holding each image's assigned category.
category_column = "category"

# `df` is another name for the per-image table (same object, not a copy).
df = expanded_gdf

# All image embeddings stacked into a single array, one row per image.
all_embeddings = np.stack(expanded_gdf["embedding"].to_numpy())

In [60]:
# Images per (LSOA, category) pair, in long format.
images_per_pair = df.groupby(["LSOA21CD", category_column]).size()
category_counts = images_per_pair.reset_index(name="count")

# Images per LSOA across all categories.
images_per_lsoa = df.groupby("LSOA21CD").size()
total_counts = images_per_lsoa.reset_index(name="total_images")

# Attach the LSOA total to every (LSOA, category) row and derive the share of
# that LSOA's images which fall in the category.
category_counts = category_counts.merge(total_counts, on="LSOA21CD")
category_counts["pct"] = category_counts["count"] / category_counts["total_images"] * 100

# Reshape to one row per LSOA and one column per category; categories that do
# not occur in an LSOA are filled with 0.
counts_wide = category_counts.pivot(index="LSOA21CD", columns=category_column, values="count")
counts_wide = counts_wide.fillna(0).add_prefix("count_")

pct_wide = category_counts.pivot(index="LSOA21CD", columns=category_column, values="pct")
pct_wide = pct_wide.fillna(0).add_prefix("pct_")

# Totals, counts and percentages side by side, indexed by LSOA code.
lsoa_summary = total_counts.set_index("LSOA21CD").join([counts_wide, pct_wide])

# plt.hist(lsoa_summary['total_images'], bins=20)

# Right merge against the selected LSOA codes: every LSOA in gm_lsoa gets a
# row, with NaN in the summary columns where it has no images.
lsoa_summary = lsoa_summary.merge(gm_lsoa["LSOA21CD"], how="right", on="LSOA21CD")
lsoa_summary.head()

Unnamed: 0,LSOA21CD,total_images,count_car,count_cucumber,count_highway,count_industrial,count_office,count_park,count_residential_street,count_shops_and_cafes,...,pct_car,pct_cucumber,pct_highway,pct_industrial,pct_office,pct_park,pct_residential_street,pct_shops_and_cafes,pct_single_house,pct_urban_street
0,E01004766,64.0,3.0,0.0,0.0,1.0,0.0,4.0,6.0,1.0,...,4.6875,0.0,0.0,1.5625,0.0,6.25,9.375,1.5625,40.625,35.9375
1,E01004767,72.0,7.0,0.0,3.0,0.0,0.0,3.0,10.0,2.0,...,9.722222,0.0,4.166667,0.0,0.0,4.166667,13.888889,2.777778,34.722222,30.555556
2,E01004768,44.0,1.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,...,2.272727,0.0,0.0,0.0,0.0,11.363636,11.363636,0.0,45.454545,29.545455
3,E01004769,40.0,3.0,0.0,0.0,2.0,0.0,6.0,6.0,0.0,...,7.5,0.0,0.0,5.0,0.0,15.0,15.0,0.0,35.0,22.5
4,E01004770,40.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,2.5,0.0,0.0,22.5,0.0,35.0,40.0


# Spatial plot of percentage in each category

In [61]:
# # merge with LSOA shapes
# plot_gdf = gm_lsoa.merge(category_pct_wide, on="LSOA21CD", how="left").fillna(0)

# # list of categories
# category_names = category_pct_wide.columns.tolist()

# # -------------------------
# # 2. Create subplots
# # -------------------------

# n = len(category_names)
# ncols = 3
# nrows = int(np.ceil(n / ncols))

# fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5*nrows))
# axs = axs.flatten()

# for ax, category in zip(axs, category_names):
#     plot_gdf.plot(column=category, ax=ax, legend=True, cmap="viridis",
#         edgecolor="black",  linewidth=0.2, legend_kwds={"shrink": 0.5})
#     ax.set_title(f"{category} (%)")
#     ax.axis("off")

# # turn off any empty axes
# for empty_ax in axs[len(category_names):]:
#     empty_ax.axis("off")

# plt.tight_layout()
# plt.show()

# Find mean/median/max embedding in each LSOA, overall and by category

In [62]:
# Aggregation functions
def mean_embed(series):
    """Return the element-wise mean of the embedding vectors in *series*."""
    stacked = np.stack(series.values)  # one row per embedding
    return stacked.mean(axis=0)

def max_embed(series):
    """Return the element-wise maximum of the embedding vectors in *series*."""
    stacked = np.stack(series.values)  # one row per embedding
    return stacked.max(axis=0)

def median_embed(series):
    """Return the element-wise median of the embedding vectors in *series*."""
    stacked = np.stack(series.values)  # one row per embedding
    return np.median(stacked, axis=0)

# Summary statistics to compute, keyed by the suffix used in column names.
agg_funcs = {"mean": mean_embed, "max": max_embed, "median": median_embed}

# Categories that actually occur in the data, in order of first appearance.
categories = df[category_column].unique()


def _outer_merge_on_lsoa(left, right):
    """Outer-merge two per-LSOA tables on their shared LSOA21CD column."""
    return pd.merge(left, right, on="LSOA21CD", how="outer")


def _embedding_by_lsoa(frame, func, column_name):
    """Apply *func* to each LSOA's embeddings; return LSOA21CD + *column_name*."""
    per_lsoa = frame.groupby("LSOA21CD")["embedding"].apply(func).reset_index()
    return per_lsoa.rename(columns={"embedding": column_name})


# One wide table per statistic: a column per category plus an overall column.
all_dfs = []
for agg_name, func in agg_funcs.items():
    # Statistic computed separately on the images of each category.
    per_category = [
        _embedding_by_lsoa(df[df[category_column] == cat], func, f"{cat}_{agg_name}")
        for cat in categories
    ]
    # Outer merge: an LSOA lacking a category keeps its row, with a missing value.
    merged = reduce(_outer_merge_on_lsoa, per_category)

    # Same statistic over every image in the LSOA, regardless of category.
    overall = _embedding_by_lsoa(expanded_gdf, func, f"overall_{agg_name}")

    all_dfs.append(merged.merge(overall, on="LSOA21CD", how="left"))

# Combine the mean, max and median tables into a single frame, one row per LSOA.
final_df = reduce(_outer_merge_on_lsoa, all_dfs)

# # Fill missing embeddings with zeros if needed
# final_df = final_df.fillna(0)

# Create one dataframe with all information

In [63]:
# Destination of the combined summary for the selected category set.
summary_path = data_dir + f"embedding_summaries/big_summary_df_{file_ending}.pkl"

# Default (inner) merge: only LSOAs present in both tables are kept.
final_df = pd.merge(final_df, lsoa_summary, on="LSOA21CD")
final_df.to_pickle(summary_path)