## FindMedianEmbeddings_ForEachOf7Clusters
This script:
- Reads the expanded dataframe (one row per image) with cluster assignments (output of script 5)
- For a chosen k, computes per-LSOA summary statistics:
    - Count and percentage of images in each cluster
    - Mean, max, and median embedding within each cluster
    - Overall mean, max, and median embedding (across all clusters)
- Saves a single pickle file combining all of the above

In [None]:
import os
import numpy as np
import pandas as pd
from functools import reduce

In [2]:
from directory_filepaths import *

In [3]:
# Chosen cluster count; selects which `scene_cluster_{k}` assignment column is summarised.
k = 7

### Get data

In [None]:
# Read the per-image table (one row per image, with cluster assignments) written by script 5
clustered_images_path = os.path.join(
    data_dir, "one_row_per_image_cleaned_with_cluster_numbers.pkl"
)
expanded_gdf = pd.read_pickle(clustered_images_path)
print(f"Loaded {len(expanded_gdf)} image rows")

# Create a dataframe with the count and % of images in each cluster, for each LSOA

In [None]:
# Column holding each image's cluster assignment for the chosen k
category_column = f"scene_cluster_{k}"

# Long format: one row per (LSOA, cluster) with the number of images in it
images_per_lsoa_cluster = expanded_gdf.groupby(["LSOA21CD", category_column]).size()
category_counts = images_per_lsoa_cluster.reset_index(name="count")

# One row per LSOA with its total number of images
images_per_lsoa = expanded_gdf.groupby("LSOA21CD").size()
total_counts = images_per_lsoa.reset_index(name="total_images")

# Attach the LSOA total to every (LSOA, cluster) row and derive the percentage share
category_counts = category_counts.merge(total_counts, on="LSOA21CD").assign(
    pct=lambda rows: rows["count"] / rows["total_images"] * 100
)


def _pivot_clusters_wide(value_column, prefix):
    """Pivot *value_column* to one column per cluster, indexed by LSOA.

    (LSOA, cluster) combinations with no rows are filled with 0, and every
    resulting column name is given *prefix*.
    """
    wide = category_counts.pivot(
        index="LSOA21CD", columns=category_column, values=value_column
    )
    return wide.fillna(0).add_prefix(prefix)


counts_wide = _pivot_clusters_wide("count", "count_")
pct_wide = _pivot_clusters_wide("pct", "pct_")

# Per-LSOA summary: total image count, then per-cluster counts and percentages
lsoa_totals = total_counts.set_index("LSOA21CD")
lsoa_summary = lsoa_totals.join([counts_wide, pct_wide])

lsoa_summary.head()

# Find the mean/median/max embedding in each LSOA, both overall and per cluster

In [None]:
def mean_embed(series):
    """Return the element-wise mean of the embedding vectors in *series*."""
    stacked = np.stack(series.values)  # one row per embedding
    return stacked.mean(axis=0)

def max_embed(series):
    """Return the element-wise maximum of the embedding vectors in *series*."""
    stacked = np.stack(series.values)  # one row per embedding
    return stacked.max(axis=0)

def median_embed(series):
    """Return the element-wise median of the embedding vectors in *series*."""
    stacked = np.stack(series.values)  # one row per embedding
    return np.median(stacked, axis=0)

# Aggregations to run, keyed by the suffix used in the output column names
agg_funcs = {"mean": mean_embed, "max": max_embed, "median": median_embed}
categories = sorted(expanded_gdf[category_column].unique())


def _embedding_by_lsoa(image_rows, func, column_name):
    """Aggregate the embeddings in *image_rows* per LSOA with *func*.

    Returns a frame with an "LSOA21CD" column and the aggregated embedding
    stored under *column_name*.
    """
    return (
        image_rows.groupby("LSOA21CD")["embedding"]
        .apply(func)
        .reset_index()
        .rename(columns={"embedding": column_name})
    )


def _outer_merge_on_lsoa(left, right):
    """Outer-merge two per-LSOA frames on "LSOA21CD" so no LSOA is dropped."""
    return pd.merge(left, right, on="LSOA21CD", how="outer")


all_dfs = []

for agg_name, func in agg_funcs.items():
    # One frame per cluster, built from only the images assigned to that cluster
    per_cluster = [
        _embedding_by_lsoa(
            expanded_gdf[expanded_gdf[category_column] == cat],
            func,
            f"{cat}_{agg_name}",
        )
        for cat in categories
    ]

    # LSOAs with no images in a given cluster end up with NaN in that cluster's column
    merged = reduce(_outer_merge_on_lsoa, per_cluster)

    # Same aggregation over every image in the LSOA, regardless of cluster
    overall = _embedding_by_lsoa(expanded_gdf, func, f"overall_{agg_name}")
    all_dfs.append(merged.merge(overall, on="LSOA21CD", how="left"))

# Combine mean, max, and median into a single DataFrame
final_df = reduce(_outer_merge_on_lsoa, all_dfs)
print(f"{len(final_df)} LSOAs, {len(final_df.columns) - 1} feature columns")

### Save

In [None]:
# Attach the per-LSOA count/percentage summary to the embedding table and save it.
#
# The merge is guarded so this cell can be re-run safely: merging lsoa_summary a
# second time would make pandas rename the overlapping columns with _x/_y
# suffixes, so columns such as "count_1" would disappear and the saved pickle
# would contain duplicated, mislabelled columns.
if "total_images" not in final_df.columns:
    # lsoa_summary is indexed by LSOA21CD; merge matches it against final_df's column
    final_df = final_df.merge(lsoa_summary, on="LSOA21CD")

output_path = os.path.join(data_dir, "per_lsoa_embedding_summaries", "median_embedding_per_cluster.pkl")
# Create the output folder on first use
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_df.to_pickle(output_path)
print(f"Saved to {output_path}")

In [8]:
# Sanity check: print the summed values of two of the per-cluster count columns
for count_column in ("count_1", "count_2"):
    print(final_df[count_column].sum())

6063.0
17571.0
