In [2]:
import pickle
import os
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, LineString

In [3]:
# Directory holding the cached point embeddings and the derived summaries.
# The trailing slash matters: other cells build file paths with `data_dir + "..."`.
data_dir = "../../data/embeddings/"

# Path to the LSOA boundary shapefile (the file name suggests 2021 LSOAs).
lsoas_file = os.path.join(
    "../../data/SpatialData/",
    "LSOAs_2021",
    "LSOA_2021_EW_BSC_V4.shp",
)

### Get spatial data

In [7]:
# Load every LSOA polygon from the shapefile.
lsoas = gpd.read_file(lsoas_file)

# District names used to pick out the study-area LSOAs.
manc_lads = ['Manchester', 'Rochdale', 'Bolton', 'Bury', 'Wigan', 'Oldham', 'Trafford', 'Salford', 'Tameside', 'Stockport']
manc_lads_pattern = '|'.join(manc_lads)

# Keep only LSOAs whose name STARTS with one of the districts. An unanchored
# `str.contains` would also keep any LSOA that merely has a district name as a
# substring somewhere in its name. `na=False` treats a missing name as
# "no match" instead of producing a NaN mask that cannot be used for indexing.
# Assumes LSOA21NM looks like "<district name> <numeric code>", e.g. "Bury 001A"
# -- TODO confirm against the shapefile.
gm_lsoa_mask = lsoas['LSOA21NM'].str.match(rf"(?:{manc_lads_pattern}) \d", na=False)

# Reproject the selection to EPSG:4326 so it shares a CRS with the lon/lat points.
gm_lsoa = lsoas[gm_lsoa_mask].to_crs(epsg=4326)

### Get embeddings (four per location)

In [8]:
# Cached point records; later cells read each record's 'longitude', 'latitude'
# and 'embedding' fields.
# NOTE: pickle.load can execute arbitrary code -- only open caches produced by
# a trusted run of this project.
points_data_cache = f"{data_dir}points_with_embeddings.pkl"
with open(points_data_cache, mode="rb") as cache_file:
    point_records = pickle.load(cache_file)

### Join image embedding points to gentrification LSOAs

In [9]:
# One point geometry per record (x = longitude, y = latitude).
point_coords = [
    Point(record['longitude'], record['latitude'])
    for record in point_records
]

# Build the point layer and attach the attributes of the LSOA polygon that
# contains each point. The inner join keeps only points that fall within a
# polygon; the polygon-index column sjoin may add ('index_right') is dropped
# when present.
points_labels_gdf = gpd.sjoin(
    gpd.GeoDataFrame(point_records, geometry=point_coords, crs="EPSG:4326"),
    gm_lsoa,
    how='inner',
    predicate='within',
).drop(columns=['index_right'], errors='ignore')

# Report how many of the input points survived the join.
print(f"Points after spatial join: {len(points_labels_gdf)} / {len(point_records)}"
      f" `(some points may lie outside the label polygons and were dropped)")

Points after spatial join: 18897 / 18897 `(some points may lie outside the label polygons and were dropped)


# 1. Find mean embedding at each location.

In [10]:
# Each 'embedding' entry holds several vectors for one location (the notebook
# heading says four); stack them and average along axis 0 to get a single
# vector per location.
points_labels_gdf["mean_embedding"] = points_labels_gdf["embedding"].apply(
    lambda vectors: np.stack(vectors).mean(axis=0)
)

In [12]:
# Save the LSOA code and mean embedding of every location.
mean_per_location_path = data_dir + "lsoa_embedding_summaries/mean_embeddings_per_location.pkl"
# to_pickle does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(mean_per_location_path), exist_ok=True)
points_labels_gdf[['LSOA21CD', 'mean_embedding']].to_pickle(mean_per_location_path)

# 2. Find mean embedding in each LSOA (mean of the per-location mean embeddings)

In [15]:
def mean_embed(x):
    """Return the element-wise mean of a collection of embedding vectors.

    Parameters
    ----------
    x : sequence of array-like
        Vectors of equal length; they are stacked row-wise before averaging.

    Returns
    -------
    numpy.ndarray
        The mean taken over the stacked rows (axis 0).
    """
    stacked = np.vstack(x)
    return stacked.mean(axis=0)

# Average the per-location mean embeddings within each LSOA; reset_index turns
# the LSOA21CD group key back into a regular column.
avg_embeddings = (
    points_labels_gdf.groupby("LSOA21CD")["mean_embedding"]
    .apply(mean_embed)
    .reset_index()
)

mean_per_lsoa_path = data_dir + "lsoa_embedding_summaries/mean_embeddings_per_lsoa.pkl"
# to_pickle does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(mean_per_lsoa_path), exist_ok=True)
avg_embeddings.to_pickle(mean_per_lsoa_path)

# 3. Find median embedding in each LSOA (median of the per-location mean embeddings)

In [16]:
def median_embed(x):
    """Return the element-wise median of a collection of embedding vectors.

    Parameters
    ----------
    x : sequence of array-like
        Vectors of equal length; they are stacked row-wise before the median
        is taken.

    Returns
    -------
    numpy.ndarray
        The median taken over the stacked rows (axis 0).
    """
    stacked = np.vstack(x)
    return np.median(stacked, axis=0)

# Element-wise median of the per-location mean embeddings within each LSOA.
# The value column is renamed inside the chain (instead of a separate
# inplace=True call) so the frame is fully built in one expression.
median_embeddings = (
    points_labels_gdf.groupby("LSOA21CD")["mean_embedding"]
    .apply(median_embed)
    .reset_index()
    .rename(columns={'mean_embedding': 'median_embedding'})
)

median_per_lsoa_path = data_dir + "lsoa_embedding_summaries/median_embeddings_per_lsoa.pkl"
# to_pickle does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(median_per_lsoa_path), exist_ok=True)
median_embeddings.to_pickle(median_per_lsoa_path)