## FindMeanEmbeddings_ForEachOf7Clusters
This script:
- Reads in a pickle file containing a dataframe with one row per sampled image. Each image is associated with a location (lat, lon), a link to the image_file, an embedding, category_scores, and the cluster the image has been assigned to for each clustering, where the number of clusters ranges between 2 and 10
- Finds the percentage of images in each LSOA, within each of the clusters
- Finds the mean/max/median embedding within each cluster (and across all images), within each LSOA
- Saves a pickle file containing a dataframe containing this information

In [20]:
import pickle
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor

from joblib import Parallel, delayed

def is_missing_embedding(x):
    """Return a truthy value only when *x* is a float that is NaN.

    Anything that is not a ``float`` instance (arrays, lists, ``None``, ...)
    is reported as not missing without being passed to ``np.isnan``.
    """
    return np.isnan(x) if isinstance(x, float) else False

from functions import *

In [21]:
# Root folder for embedding data, relative to this notebook. The trailing
# slash matters: later cells build paths by plain string concatenation.
data_dir = "../../../../data/embeddings/"

In [22]:
# Number of clusters in the clustering analysed by this notebook; it selects
# the `scene_cluster_{k}` column further down.
k = 7

### Get data

In [23]:
# Load the cached one-row-per-image dataframe (with cluster assignments).
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
points_data_cache = data_dir + "embeddings/one_row_per_image_cleaned_with_cluster_numbers.pkl"
with open(points_data_cache, mode="rb") as f:
    expanded_gdf = pickle.load(f)

In [24]:
# Sanity check: print each cluster label followed by the number of images
# assigned to it. The column and label range are derived from `k` so this
# check follows the clustering selected above instead of being pinned to 7.
# Assumes cluster labels run from 1 to k (as the k = 7 output shows) —
# TODO confirm for other values of k.
cluster_column = f"scene_cluster_{k}"
for num in range(1, k + 1):
    print(num)
    print(len(expanded_gdf[expanded_gdf[cluster_column] == num]))

1
10758
2
12799
3
15354
4
13970
5
7517
6
7150
7
7928


# Create a dataframe with % of images in each category, in each LSOA 

In [25]:
df = expanded_gdf
category_column = f"scene_cluster_{k}"

# Images per LSOA: the denominator for the percentages below.
total_counts = df.groupby("LSOA21CD").size().reset_index(name="total_images")

# Images per (LSOA, cluster), with the LSOA total attached to every row.
category_counts = df.groupby(["LSOA21CD", category_column]).size().reset_index(name="count")
category_counts = category_counts.merge(total_counts, on="LSOA21CD")

# Share of the LSOA's images that fall in each cluster, as a percentage.
category_counts["pct"] = (category_counts["count"] / category_counts["total_images"] * 100)

# Reshape to one row per LSOA and one column per cluster, once for the raw
# counts and once for the percentages. Clusters absent from an LSOA have no
# row in `category_counts`, so the pivot leaves NaN there; fill those with 0.
wide_tables = []
for value_column, prefix in (("count", "count_"), ("pct", "pct_")):
    wide = category_counts.pivot(index="LSOA21CD", columns=category_column, values=value_column)
    wide_tables.append(wide.fillna(0).add_prefix(prefix))
counts_wide, pct_wide = wide_tables

# Total images per LSOA alongside the per-cluster counts and percentages.
lsoa_summary = total_counts.set_index("LSOA21CD").join([counts_wide, pct_wide])

# plt.hist(lsoa_summary['total_images'], bins=20)

lsoa_summary.head()

Unnamed: 0_level_0,total_images,count_1,count_2,count_3,count_4,count_5,count_6,count_7,pct_1,pct_2,pct_3,pct_4,pct_5,pct_6,pct_7
LSOA21CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
E01004766,64,8.0,10.0,13.0,14.0,6.0,3.0,10.0,12.5,15.625,20.3125,21.875,9.375,4.6875,15.625
E01004767,72,7.0,12.0,13.0,9.0,13.0,5.0,13.0,9.722222,16.666667,18.055556,12.5,18.055556,6.944444,18.055556
E01004768,44,0.0,15.0,14.0,0.0,5.0,9.0,1.0,0.0,34.090909,31.818182,0.0,11.363636,20.454545,2.272727
E01004769,40,1.0,10.0,9.0,1.0,9.0,9.0,1.0,2.5,25.0,22.5,2.5,22.5,22.5,2.5
E01004770,40,1.0,11.0,12.0,5.0,5.0,5.0,1.0,2.5,27.5,30.0,12.5,12.5,12.5,2.5


# Find mean/median/max embedding in each LSOA, also by category

In [30]:
# Aggregation functions
def mean_embed(series):
    """Return the element-wise mean of the embeddings held in *series*.

    The entries are stacked along a new leading axis (so they must all share
    one shape) and averaged over that axis.
    """
    stacked = np.stack(series.values)
    return stacked.mean(axis=0)

def max_embed(series):
    """Return the element-wise maximum of the embeddings held in *series*.

    The entries are stacked along a new leading axis (so they must all share
    one shape) and the maximum is taken over that axis.
    """
    stacked = np.stack(series.values)
    return stacked.max(axis=0)

def median_embed(series):
    """Return the element-wise median of the embeddings held in *series*.

    The entries are stacked along a new leading axis (so they must all share
    one shape) and the median is taken over that axis.
    """
    stacked = np.stack(series.values)
    return np.median(stacked, axis=0)

# Aggregations applied to the embeddings of each group, keyed by the suffix
# used in the resulting column names.
agg_funcs = {"mean": mean_embed, "max": max_embed, "median": median_embed}

# Cluster labels in order of first appearance; this order also fixes the
# order of the per-cluster columns within each aggregation.
categories = df[category_column].unique()

# The per-cluster subsets and their LSOA groupings do not depend on the
# aggregation, so build them once instead of re-filtering and re-grouping
# for every aggregation function.
embeddings_by_category = {
    cat: df[df[category_column] == cat].groupby("LSOA21CD")["embedding"]
    for cat in categories
}
# All images in each LSOA, regardless of cluster. Uses `df` like the rest of
# this cell so the overall and per-cluster figures share one source frame.
overall_embeddings = df.groupby("LSOA21CD")["embedding"]


def _outer_merge_on_lsoa(left, right):
    """Outer-join two per-LSOA tables so LSOAs missing from one side are kept."""
    return pd.merge(left, right, on="LSOA21CD", how="outer")


# One wide table per aggregation: a column per cluster plus an overall column.
all_dfs = []

for agg_name, func in agg_funcs.items():
    # Per-cluster embeddings; LSOAs with no images in a cluster end up as NaN
    # in that cluster's column after the outer merge.
    dfs = [
        grouped.apply(func).reset_index().rename(columns={"embedding": f"{cat}_{agg_name}"})
        for cat, grouped in embeddings_by_category.items()
    ]
    merged = reduce(_outer_merge_on_lsoa, dfs)

    # Overall embedding (all images in the LSOA).
    overall = overall_embeddings.apply(func).reset_index()
    overall = overall.rename(columns={"embedding": f"overall_{agg_name}"})

    merged = merged.merge(overall, on="LSOA21CD", how="left")

    all_dfs.append(merged)

# Merge mean, max, median into a single DataFrame
final_df = reduce(_outer_merge_on_lsoa, all_dfs)

# # Fill missing embeddings with zeros if needed
# final_df = final_df.fillna(0)

### Save

In [31]:
# Attach the per-LSOA image counts and percentages to the embedding
# summaries, then persist the combined table.
# NOTE: re-running this cell merges `lsoa_summary` into `final_df` again.
final_df = pd.merge(final_df, lsoa_summary, on="LSOA21CD")
output_path = data_dir + "per_lsoa_embedding_summaries/mean_embedding_per_cluster.pkl"
final_df.to_pickle(output_path)

In [39]:
# Sanity check: the per-LSOA counts summed over all LSOAs, for comparison
# with the per-cluster image totals printed earlier in the notebook.
for count_column in ("count_1", "count_2"):
    print(final_df[count_column].sum())

10758.0
12799.0
