## FindMedianEmbeddings_ForEachOf7Clusters
This script:
- Reads the expanded dataframe (one row per image) with cluster assignments (output of script 5)
- For a chosen k, computes per-LSOA summary statistics:
    - Count and percentage of images in each cluster
    - Mean, max, and median embedding within each cluster
    - Overall mean, max, and median embedding (across all clusters)
- Saves a single pickle file combining all of the above

In [1]:
import os
import numpy as np
import pandas as pd
from functools import reduce

In [2]:
from directory_filepaths import *

In [3]:
# Number of clusters to summarise; selects the `scene_cluster_{k}` assignment column used below.
# NOTE(review): the notebook title refers to 7 clusters, but k is 4 here — confirm which is intended.
k = 4

### Get data

In [4]:
# Read the one-row-per-image table with cluster assignments (output of script 5).
# NOTE: unpickling can execute arbitrary code — only load pickle files produced by this pipeline.
input_path = os.path.join(data_dir, "one_row_per_image_cleaned_with_cluster_numbers.pkl")
expanded_gdf = pd.read_pickle(input_path)
print(f"Loaded {len(expanded_gdf)} image rows")

Loaded 75586 image rows


### Create a dataframe with the count and % of images in each cluster, in each LSOA

In [5]:
# Column holding each image's cluster assignment for the chosen k
category_column = f"scene_cluster_{k}"

# Total images per LSOA
total_counts = expanded_gdf.groupby("LSOA21CD").size().reset_index(name="total_images")

# Count images per (LSOA, cluster), then express each count as a % of the LSOA's total
category_counts = (
    expanded_gdf.groupby(["LSOA21CD", category_column])
    .size()
    .reset_index(name="count")
    .merge(total_counts, on="LSOA21CD")
    .assign(pct=lambda df: df["count"] / df["total_images"] * 100)
)

# Pivot to wide format: one column per cluster for counts and percentages.
# An LSOA with no images in a cluster has no row in `category_counts`, so the pivot
# yields NaN there; fill those with 0.
counts_wide = (
    category_counts.pivot(index="LSOA21CD", columns=category_column, values="count")
    .fillna(0)
    .astype(int)  # the NaN fill upcasts counts to float; restore integer counts
    .add_prefix("count_")
)
pct_wide = (
    category_counts.pivot(index="LSOA21CD", columns=category_column, values="pct")
    .fillna(0)
    .add_prefix("pct_")
)

# Combine counts, percentages, and totals per LSOA (indexed by LSOA21CD)
lsoa_summary = total_counts.set_index("LSOA21CD").join([counts_wide, pct_wide])

# Sanity check - do the counts sum to total_images?
# Compute sums into local variables instead of creating temporary DataFrame columns
counts_sum = lsoa_summary.filter(like="count_").sum(axis=1)
assert (counts_sum == lsoa_summary["total_images"]).all(), "Counts do not sum to total_images for some LSOAs"
# And do the percentages sum to 100%? (floating point, so compare with a tolerance)
pct_sum = lsoa_summary.filter(like="pct_").sum(axis=1)
assert np.allclose(pct_sum, 100), "Percentages do not sum to 100% for some LSOAs"

lsoa_summary.head()

Unnamed: 0_level_0,total_images,count_1,count_2,count_3,count_4,pct_1,pct_2,pct_3,pct_4
LSOA21CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
E01004766,64,14.0,25.0,5.0,20.0,21.875,39.0625,7.8125,31.25
E01004767,72,19.0,20.0,8.0,25.0,26.388889,27.777778,11.111111,34.722222
E01004768,44,3.0,15.0,15.0,11.0,6.818182,34.090909,34.090909,25.0
E01004769,40,3.0,13.0,17.0,7.0,7.5,32.5,42.5,17.5
E01004770,40,4.0,18.0,9.0,9.0,10.0,45.0,22.5,22.5


### Find the mean/median/max embedding in each LSOA, overall and per cluster

In [6]:
def mean_embed(series: pd.Series) -> np.ndarray:
    """Return the element-wise mean of the embedding arrays held in `series`.

    The arrays are stacked into a 2-D (n_items, dim) array and averaged over
    the first axis, giving a single array of length `dim`.
    """
    stacked = np.stack(series.to_numpy())
    return stacked.mean(axis=0)

def max_embed(series: pd.Series) -> np.ndarray:
    """Return the element-wise maximum of the embedding arrays held in `series`.

    The arrays are stacked into a 2-D (n_items, dim) array and reduced over
    the first axis, giving a single array of length `dim`.
    """
    stacked = np.stack(series.to_numpy())
    return stacked.max(axis=0)

def median_embed(series: pd.Series) -> np.ndarray:
    """Return the element-wise median of the embedding arrays held in `series`.

    The arrays are stacked into a 2-D (n_items, dim) array and the median is
    taken over the first axis, giving a single array of length `dim`.
    """
    stacked = np.stack(series.to_numpy())
    return np.median(stacked, axis=0)

# Aggregations to compute; each key becomes the suffix of the resulting column names
agg_funcs = {"mean": mean_embed, "max": max_embed, "median": median_embed}
categories = sorted(expanded_gdf[category_column].unique())

# Build the groupings once, outside the aggregation loop: which images fall in which
# (cluster, LSOA) does not depend on the statistic being computed.
overall_groups = expanded_gdf.groupby("LSOA21CD")["embedding"]
cluster_groups = {
    cat: expanded_gdf[expanded_gdf[category_column] == cat].groupby("LSOA21CD")["embedding"]
    for cat in categories
}


def _merge_on_lsoa(left, right):
    """Outer-join two per-LSOA frames so LSOAs missing from one side are kept."""
    return pd.merge(left, right, on="LSOA21CD", how="outer")


all_dfs = []

for agg_name, func in agg_funcs.items():
    # Per-cluster embedding aggregation: one "<cluster>_<agg>" column per cluster.
    # An LSOA with no images in a given cluster ends up with NaN in that column
    # after the outer merge.
    dfs = [
        groups.apply(func)
        .reset_index()
        .rename(columns={"embedding": f"{cat}_{agg_name}"})
        for cat, groups in cluster_groups.items()
    ]
    merged = reduce(_merge_on_lsoa, dfs)

    # Overall aggregation across all images in the LSOA, regardless of cluster
    overall = (
        overall_groups.apply(func)
        .reset_index()
        .rename(columns={"embedding": f"overall_{agg_name}"})
    )
    # Every LSOA in `merged` also appears in `overall`, so a left join keeps all rows
    merged = merged.merge(overall, on="LSOA21CD", how="left")
    all_dfs.append(merged)

# Combine mean, max, and median into a single DataFrame
final_df = reduce(_merge_on_lsoa, all_dfs)
assert final_df["LSOA21CD"].is_unique, "Expected exactly one row per LSOA"
print(f"{len(final_df)} LSOAs, {len(final_df.columns) - 1} feature columns")

1695 LSOAs, 15 feature columns


In [7]:
# Preview the combined table: one row per LSOA, with one array-valued column per
# (cluster or overall, statistic) pair
final_df

Unnamed: 0,LSOA21CD,1_mean,2_mean,3_mean,4_mean,overall_mean,1_max,2_max,3_max,4_max,overall_max,1_median,2_median,3_median,4_median,overall_median
0,E01004766,"[0.022396863, -0.060344696, 0.0025673595, -0.0...","[0.04366791, -0.061499022, -0.007403755, -0.00...","[0.028472137, -0.056408692, -0.009085846, -0.0...","[0.03406372, -0.054208376, 0.015108442, -0.007...","[0.03482639, -0.058570504, 0.0016810745, -0.00...","[0.05166626, -0.014503479, 0.020477295, 0.0179...","[0.06555176, -0.036071777, 0.027816772, 0.0227...","[0.06604004, -0.025939941, 0.011779785, 0.0054...","[0.06878662, 0.02444458, 0.04058838, 0.0209045...","[0.06878662, 0.02444458, 0.04058838, 0.0227813...","[0.027572632, -0.06323242, 0.004722595, -0.008...","[0.046936035, -0.057922363, -0.009925842, -0.0...","[0.01940918, -0.055511475, -0.015129089, 4.172...","[0.029373169, -0.060531616, 0.015346527, -0.00...","[0.037399292, -0.060531616, 0.0037984848, -0.0..."
1,E01004767,"[0.025314808, -0.060883623, -0.0066822954, -0....","[0.041760635, -0.066438295, -0.006786442, -0.0...","[0.022837639, -0.06278992, -0.020924568, 0.008...","[0.030664215, -0.05010452, 0.013643112, -0.008...","[0.031465285, -0.058895655, -0.0012362666, -0....","[0.061798096, 0.0012311935, 0.020095825, 0.041...","[0.07910156, -0.040283203, 0.022415161, 0.0113...","[0.05206299, -0.046447754, -0.003540039, 0.024...","[0.0647583, 0.007045746, 0.057800293, 0.018951...","[0.07910156, 0.007045746, 0.057800293, 0.04144...","[0.02519226, -0.061187744, -0.005306244, -0.00...","[0.04458618, -0.06074524, -0.009237289, -0.010...","[0.024368286, -0.0566864, -0.016716003, 0.0066...","[0.026123047, -0.047607422, 0.012107849, -0.01...","[0.028137207, -0.05706787, -0.002474308, -0.00..."
2,E01004768,"[0.033966064, -0.067352295, -0.0005553563, -0....","[0.0387441, -0.065450035, -0.0010991414, -0.01...","[0.050431825, -0.06296793, -0.013619105, 0.008...","[0.029031234, -0.059611928, 0.018901478, -0.01...","[0.03997456, -0.06327403, -0.0003300797, -0.00...","[0.039978027, -0.029022217, 0.009536743, -0.00...","[0.059661865, -0.04296875, 0.023986816, 0.0077...","[0.08831787, -0.042388916, 0.014816284, 0.0340...","[0.066711426, -0.04043579, 0.035186768, 0.0102...","[0.08831787, -0.029022217, 0.035186768, 0.0340...","[0.034729004, -0.072265625, 0.0019273758, -0.0...","[0.036499023, -0.06744385, -0.0031776428, -0.0...","[0.048187256, -0.055847168, -0.014961243, 0.00...","[0.026473999, -0.060943604, 0.022583008, -0.01...","[0.036331177, -0.06199646, -0.0030584335, -0.0..."
3,E01004769,"[0.033533733, -0.05350749, -0.012964885, -0.00...","[0.031613275, -0.06352821, -0.007081692, -0.00...","[0.042486753, -0.05386173, -0.02130284, -0.003...","[0.04457528, -0.07030378, 0.012783595, -0.0155...","[0.038646888, -0.059854127, -0.0100904945, -0....","[0.044647217, -0.045562744, -0.009788513, 0.00...","[0.059448242, -0.03744507, 0.024093628, 0.0186...","[0.059020996, -0.026062012, 4.9591064e-05, 0.0...","[0.07623291, -0.04095459, 0.017303467, -0.0046...","[0.07623291, -0.026062012, 0.024093628, 0.0186...","[0.0446167, -0.04586792, -0.013145447, -0.0044...","[0.03062439, -0.06689453, -0.0074157715, -0.00...","[0.04067993, -0.053344727, -0.023239136, -0.00...","[0.046875, -0.07550049, 0.012901306, -0.015823...","[0.039871216, -0.058303833, -0.011016846, -0.0..."
4,E01004770,"[0.025318146, -0.04863739, -0.00014591217, -0....","[0.040929157, -0.055555556, -0.011239065, -0.0...","[0.02157296, -0.061513264, -0.014335632, 0.006...","[0.03234058, -0.06099786, 0.0149474675, -0.010...","[0.03308048, -0.05742874, -0.0049345074, -0.00...","[0.040405273, -0.021728516, 0.014060974, 0.006...","[0.06982422, -0.035095215, 0.022827148, 0.0055...","[0.04348755, -0.034942627, -0.0025405884, 0.02...","[0.045410156, -0.03326416, 0.039764404, 0.0095...","[0.06982422, -0.021728516, 0.039764404, 0.0282...","[0.023727417, -0.053451538, -0.00116539, -0.00...","[0.038406372, -0.053024292, -0.01209259, -0.01...","[0.022415161, -0.062316895, -0.005367279, 0.00...","[0.03692627, -0.06677246, 0.010536194, -0.0069...","[0.031814575, -0.05973816, -0.005092621, -0.00..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1690,E01034138,"[0.03387146, -0.05706787, -0.025102234, -0.002...","[0.046844482, -0.077819824, -0.032073975, -0.0...",,"[0.041215517, -0.050183106, 0.0038211525, 0.00...","[0.03927231, -0.05406189, -0.007460851, -0.002...","[0.04812622, -0.025817871, -0.00995636, 0.0117...","[0.046844482, -0.077819824, -0.032073975, -0.0...",,"[0.056030273, -0.033599854, 0.020553589, 0.013...","[0.056030273, -0.025817871, 0.020553589, 0.013...","[0.0309906, -0.053863525, -0.018844604, -0.007...","[0.046844482, -0.077819824, -0.032073975, -0.0...",,"[0.04788208, -0.048355103, 0.0020739138, 0.004...","[0.043319702, -0.05265808, -0.004934311, -0.00..."
1691,E01034232,"[0.004299164, -0.0635376, 0.0115356445, -0.006...","[0.008399963, -0.07885742, 0.014663696, 0.0036...",,"[0.01678276, -0.08734131, 0.027992249, 0.00157...","[0.011566162, -0.07926941, 0.02054596, 1.95503...","[0.004299164, -0.0635376, 0.0115356445, -0.006...","[0.008399963, -0.07885742, 0.014663696, 0.0036...",,"[0.03866577, -0.07043457, 0.036621094, 0.01350...","[0.03866577, -0.0635376, 0.036621094, 0.013504...","[0.004299164, -0.0635376, 0.0115356445, -0.006...","[0.008399963, -0.07885742, 0.014663696, 0.0036...",,"[0.01678276, -0.08734131, 0.027992249, 0.00157...","[0.0063495636, -0.074645996, 0.01701355, -0.00..."
1692,E01034233,"[0.03287589, -0.066659346, -0.0074954033, 1.87...","[0.03506887, -0.06964493, -0.008808136, 0.0032...","[0.053527832, -0.055196125, -0.026682535, -0.0...","[0.033545684, -0.062756345, 0.001057911, -0.00...","[0.034834925, -0.06553355, -0.0070983605, -0.0...","[0.06793213, -0.012077332, 0.025970459, 0.0277...","[0.06964111, -0.038085938, 0.015792847, 0.0196...","[0.068237305, -0.03982544, -0.017791748, 0.021...","[0.056884766, -0.037353516, 0.028182983, 0.013...","[0.06964111, -0.012077332, 0.028182983, 0.0277...","[0.037139893, -0.06915283, -0.0039367676, -0.0...","[0.027328491, -0.0664978, -0.0117836, 0.004141...","[0.055145264, -0.04776001, -0.029418945, -0.01...","[0.036193848, -0.06768799, 0.00023317337, -0.0...","[0.03717041, -0.0675354, -0.005422592, -0.0004..."
1693,E01034234,"[0.02936554, -0.06730652, -0.024953842, 0.0004...","[0.028117709, -0.06740994, -0.015690697, 0.001...","[0.03176117, -0.06893921, -0.02841425, 0.02110...","[0.03009935, -0.052584562, 0.019768624, -0.011...","[0.029433569, -0.060733795, -0.0012707338, -0....","[0.031051636, -0.05154419, -0.007671356, 0.013...","[0.056152344, -0.037963867, 0.002231598, 0.017...","[0.04135132, -0.053588867, -0.0018053055, 0.02...","[0.06439209, -0.02633667, 0.042877197, 0.01226...","[0.06439209, -0.02633667, 0.042877197, 0.02441...","[0.02936554, -0.06730652, -0.024953842, 0.0004...","[0.02810669, -0.069885254, -0.018310547, -0.00...","[0.03176117, -0.06893921, -0.02841425, 0.02110...","[0.027709961, -0.0574646, 0.015037537, -0.0108...","[0.027908325, -0.0625, -0.0042061806, -0.00431..."


### Save

In [8]:
# Attach the LSOA count/percentage summary and save.
# First drop any summary columns left over from a previous run of this cell, so that
# re-running it does not merge the same columns twice (which would produce _x/_y
# duplicates and remove the plain `count_*` / `pct_*` names).
n_lsoas = len(final_df)
final_df = final_df.drop(columns=lsoa_summary.columns, errors="ignore").merge(
    lsoa_summary, on="LSOA21CD", validate="one_to_one"
)
# The default inner join would silently drop LSOAs missing from either side
assert len(final_df) == n_lsoas, "Merging the LSOA summary changed the number of LSOAs"

output_path = os.path.join(data_dir, "per_lsoa_embedding_summaries", "median_embedding_per_cluster.pkl")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_df.to_pickle(output_path)
print(f"Saved to {output_path}")

Saved to ../../../../data/embeddings/per_lsoa_embedding_summaries/median_embedding_per_cluster.pkl


In [9]:
# Total number of images assigned to clusters 1 and 2, summed over all LSOAs
for count_col in ("count_1", "count_2"):
    print(final_df[count_col].sum())

14472.0
27936.0
