## CalculateEmbeddings

This script:
- Loads list of sample points, each with 4 associated image files
- Calculates an image embedding for each image file
- Saves the embeddings into the `embeddings_clip` dataset of the HDF5 file, alongside each sample point and its images

In [1]:
import os
import io
import numpy as np
from tqdm import tqdm
#os.environ["HF_HOME"] = "/nfs/a319/gy17m2a/scratch/hf_cache"
import pickle
import geopandas as gpd
import torch
from PIL import Image
from tqdm import tqdm
import clip
import h5py

In [2]:
from directory_filepaths import *

### Load list of sample points

This contains points sampled along the road network in 1-SampleStreetNetwork.ipynb  

Each point has an ID, a latitude, a longitude, and 4 image files associated with it (these are sampled in each of the 4 cardinal directions from the sample point)  

This script will also create an `embeddings_clip` dataset in the HDF5 file and fill it with one embedding for each of the 4 images of every point

In [3]:
# Sanity check of the HDF5 store: report how many sample points it holds
# and which datasets are available.
with h5py.File(h5_filename, "r") as f:
    # Single quotes inside the f-string: re-using the enclosing double quote
    # is only valid syntax from Python 3.12 onwards.
    print(f"H5 file contains {len(f['point_id'])} items")
    print("Keys are: ", list(f.keys()))

#points_data_cache = data_dir + "sample_points_cache/points_data_cache.pkl"
#with open(points_data_cache, "rb") as f:
#        point_records = pickle.load(f)
#print(f"Cache currently has {len(point_records)} points.")

H5 file contains 18897 items
Keys are:  ['date', 'embeddings_clip', 'image_paths', 'images_jpeg', 'images_present', 'latitude', 'longitude', 'point_id']


# Compute the Embeddings

In [4]:
# Define model and device to run it

def get_device():
    """
    Choose the torch device to run the model on.

    Backends are probed in priority order - Apple MPS first, then CUDA -
    and the first available one is returned; CPU is the fallback.
    """
    backend_checks = (
        ("mps", torch.backends.mps.is_available),   # macs
        ("cuda", torch.cuda.is_available),          # for completeness if you ever run on CUDA
    )
    for backend_name, is_available in backend_checks:
        if is_available():
            return torch.device(backend_name)
    return torch.device("cpu")

# Pick the compute device once; `device` is read by later cells when moving tensors.
device = get_device()
print("Using device:", device)
# Load the CLIP ViT-B/32 model onto that device. `clip.load` returns two objects,
# bound here as `model` and `preprocess`; later cells call `preprocess(pil_image)`
# and `model.encode_image(...)`.
model, preprocess = clip.load("ViT-B/32", device=device)

Using device: mps


## Create embedding for each image and find similarity to categories 
- Create embedding for image
- Find similarity score to text embedding for each category
- Convert similarity score to a "probability-like number" using softmax

In [5]:
# Prepare the database (we need an embeddings dataset with one row per point)
dim = 512  # Length of CLIP embeddings

with h5py.File(h5_filename, "a") as f:
    N = f["point_id"].shape[0]
    target_shape = (N, 4, dim)  # 4 embeddings per point

    existing = f.get("embeddings_clip")
    reusable = (
        existing is not None
        and existing.shape == target_shape
        and existing.dtype == np.float32
    )

    if existing is not None:
        print("Embeddings exist, replacing them")

    if reusable:
        # Reset in place rather than delete + recreate: HDF5 does not hand the
        # space of an unlinked dataset back to the file, so deleting on every
        # re-run makes the file grow each time.
        existing[...] = np.nan
    else:
        # Shape or dtype changed (or nothing there yet): start from scratch.
        if existing is not None:
            del f["embeddings_clip"]

        f.create_dataset(
            "embeddings_clip",
            shape=target_shape,
            dtype="float32",
            fillvalue=np.nan,  # NaN marks "no embedding computed yet"
        )

print(f"Created embeddings_clip dataset with shape ({N}, 4, {dim})")

Embeddings exist, replacing them
Created embeddings_clip dataset with shape (18897, 4, 512)


In [6]:
def load_pil_from_h5(f, row_idx, slot):
    """
    Decode the JPEG stored for one (row, slot) pair into an RGB PIL image.

    Parameters
    ----------
    f : open HDF5 file (or mapping) exposing 'images_present' and 'images_jpeg'
    row_idx : row of the sample point
    slot : which of the point's images to read

    Returns
    -------
    PIL.Image converted to "RGB".

    Raises
    ------
    FileNotFoundError
        If 'images_present' is falsy for this row and slot.
    """
    has_image = bool(f["images_present"][row_idx, slot])
    if has_image:
        raw_jpeg = f["images_jpeg"][row_idx, slot].tobytes()
        buffer = io.BytesIO(raw_jpeg)
        return Image.open(buffer).convert("RGB")

    raise FileNotFoundError(f"No image stored at row {row_idx}, slot {slot}")


def embed_clip_pil(pil_image):
    """
    Compute a CLIP image embedding from a PIL image.

    Parameters
    ----------
    pil_image : PIL image
        Passed through the module-level `preprocess` transform, then encoded
        with the module-level `model` on `device`.

    Returns
    -------
    numpy.ndarray
        A (D,) float32 array, L2-normalised to unit length.
    """
    # Add a batch dimension and move the tensor to the model's device.
    image_tensor = preprocess(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Upcast to float32 *before* normalising. When the model runs in half
        # precision (CLIP's loader may leave it so on non-CPU devices), dividing
        # in fp16 and casting afterwards yields a vector that is only
        # approximately unit-norm in float32.
        raw = model.encode_image(image_tensor).float()
        emb = raw / raw.norm(dim=-1, keepdim=True)

    # Under no_grad the tensor carries no graph, so no detach is needed;
    # it is already float32 after the upcast above.
    return emb.squeeze(0).cpu().numpy()


# Walk every (point, slot) pair, embed the stored image with CLIP and write the
# vector into 'embeddings_clip'. Slots without a usable image are left as NaN.
with h5py.File(h5_filename, "a") as f:
    if "embeddings_clip" not in f:
        raise KeyError("No embeddings file, this should have been created above.")

    emb_ds = f["embeddings_clip"]    # shape: (N, n_slots, D)
    # Take the slot count from the dataset itself rather than hard-coding 4,
    # so the loop always matches what the dataset was created with.
    N, n_slots, D = emb_ds.shape
    print(f"Embeddings shape: {emb_ds.shape}")

    for i in tqdm(range(N), desc="Embedding CLIP from HDF5", unit="point"):
        for j in range(n_slots):
            try:
                # Decode image from HDF5
                pil_img = load_pil_from_h5(f, i, j)

                # Compute embedding and write it
                emb = embed_clip_pil(pil_img)  # (D,)
                if emb.shape[0] != D:
                    raise ValueError(f"Embedding dim mismatch: got {emb.shape[0]}, expected {D}")
                emb_ds[i, j, :] = emb
            except FileNotFoundError:
                # Missing image → leave NaNs
                tqdm.write(f"Missing image at {i}, {j}, not calculating the embedding")
                emb_ds[i, j, :] = np.nan
            except Exception as e:
                # Deliberate best-effort: one bad image (corrupt JPEG, dim mismatch, ...)
                # must not abort the whole run, so mark the slot as NaN and log it.
                emb_ds[i, j, :] = np.nan
                tqdm.write(f"⚠️ Error at row {i}, slot {j}: {e}")

print("Finished")

Embeddings shape: (18897, 4, 512)


Embedding CLIP from HDF5: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18897/18897 [28:44<00:00, 10.96point/s]


Done!