## Configuration and library loading

In [1]:
import os
os.environ["HF_HOME"] = "/nfs/a319/gy17m2a/scratch/hf_cache"
import pickle
import geopandas as gpd
import torch
from PIL import Image
from tqdm import tqdm
import clip

# Root folder for all embedding data, relative to this notebook.
# The trailing slash is kept because later cells append file names to it.
data_dir = "../../../data/embeddings/"

### Load list of sample points

In [2]:
# Location of the pickled sample-point records inside the data directory.
points_data_cache = os.path.join(
    data_dir, "sample_points_cache/points_data_cache_with_embeddings.pkl"
)

# Load the records; each one is later read via rec["image_files"].
# NOTE(review): pickle.load executes arbitrary code - only open trusted cache files.
with open(points_data_cache, "rb") as cache_file:
    point_records = pickle.load(cache_file)

print(f"Cache currently has {len(point_records)} points.")

Cache currently has 18897 points.


# Compute the Embeddings

In [3]:
# Load the CLIP model and its matching image preprocessing pipeline.
# `device` is reused by later cells when moving tensors.
CLIP_MODEL_NAME = "ViT-B/32"
device = "cpu"
model, preprocess = clip.load(CLIP_MODEL_NAME, device=device)

## Create text embedding for categories we want to match image embeddings to

- For each headline category, define several different prompts
- Convert each of the subprompts into a text embedding
- For each headline category, find the mean text embedding

In [None]:
# Two alternative prompt sets are defined here. Each maps a category name to
# several text prompts that are later averaged into one text embedding.
#
# Previously both dictionaries were assigned to `multi_prompts` in turn, so the
# first one was silently discarded. They now have distinct names and the active
# set is chosen explicitly at the bottom of the cell.

# Prompt set 1: four headline planning-use-class categories.
planning_use_class_prompts = {
    "C ‚Äì Accommodation": [
        "a photo of a house or home",
        "an apartment building on a street",
        "houses in a residential neighborhood",
        "front view of a suburban house",
        "a cozy home exterior with a garden"],
    "B ‚Äì Industrial / Storage": [
        "a warehouse or big industrial building",
        "a factory with chimneys or machinery",
        "storage containers outside a building",
        "a logistics yard with trucks and crates",
        "industrial buildings in an urban area"],
    "E ‚Äì Commercial / Business / Service": [
        "a shop or cafe on the street",
        "a busy high street with stores",
        "a restaurant or small business front",
        "office building in the city",
        "people outside a retail store or service"],
    "F ‚Äì Local Community / Learning": [
        "a school or university building",
        "library or community centre",
        "children playing at a sports field",
        "outdoor playground or swimming pool",
        "museum, gallery or exhibition space"]}

# Prompt set 2: nine scene-type categories.
scene_type_prompts = {
    "indoor": ["a photo taken indoors",
        "an indoor interior scene",
        "inside a building",
        "an indoor room photo"],
    "terraced house": ["a photo of a terraced house",
        "a row of terraced homes",
        "a UK terrace housing street",
        "a brick terraced house"],
    "semi-detached house": [
        "a photo of a detached or semi detached house",
        "a suburban detached house",
        "a single-family home",
        "a detached house on a residential street"],
    "road": ["a photo of a road",
        "a street with cars or buildings",
        "a roadway scene",
        "a street-level view outdoors"],
    "shop": ["a photo of a shop",
        "a store front",
        "a retail business on a street",
        "a commercial storefront"],
    "car": ["a photo dominated by the outside of a car",
        "a vehicle exterior close-up",
        "a photo of a parked car",
        "a car on the street"],
    "industrial": ["a photo of an industrial building",
        "a warehouse or factory building",
        "an industrial site",
        "a manufacturing facility"],
    "wasteland": ["a photo of wasteland or empty space",
        "an abandoned empty outdoor area",
        "a derelict vacant lot",
        "an unused or empty land area"],
    "greenery": ["a photo of nice green space",
        "a park or garden with trees",
        "green plants and nature",
        "a photo of natural greenery"]}

# Active prompt set used by the rest of the notebook. This keeps the previous
# effective behaviour (the scene-type set was the last assignment).
# NOTE(review): the output pickle written at the end of the notebook is named
# "...planninguseclasses.pkl" - confirm whether `planning_use_class_prompts`
# was the intended set, and switch this line if so.
multi_prompts = scene_type_prompts

In [10]:
# Build one averaged, unit-length text embedding per category in `multi_prompts`.
# `category_names` keeps the category labels in the same order as the rows of
# `final_text_features`, so row i can be mapped back to its label later.
category_names = list(multi_prompts)
category_embeddings = []

# Inference only: no gradients are needed, which saves time and memory.
with torch.no_grad():
    for prompt_list in multi_prompts.values():
        # Turn the prompt strings into CLIP token IDs on the target device.
        token_ids = clip.tokenize(prompt_list).to(device)

        # One text embedding per prompt, each scaled to unit length so that
        # dot products behave as cosine similarities.
        prompt_feats = model.encode_text(token_ids)
        prompt_feats = prompt_feats / prompt_feats.norm(dim=-1, keepdim=True)

        # Collapse the prompts into a single category vector, then re-normalise
        # because the mean of unit vectors is generally shorter than unit length.
        mean_feat = prompt_feats.mean(dim=0)
        mean_feat = mean_feat / mean_feat.norm()

        category_embeddings.append(mean_feat.cpu())

# Stack into a single tensor with one row per category.
final_text_features = torch.stack(category_embeddings)
print("Built improved category text embeddings:", final_text_features.shape)

## Create embedding for each image and find similarity to categories 
- Create embedding for image
- Find similarity score to text embedding for each category
- Convert similarity score to a "probability-like number" using softmax

In [None]:
def embed_and_score_clip(image_path, logit_scale=100.0):
    """
    Load an image, compute its CLIP embedding, and score it against the
    category text embeddings held in `final_text_features`.

    Parameters:
        image_path (str): Path to an image file that PIL can open.
        logit_scale (float): Factor applied to the cosine similarities before
            the softmax. Defaults to 100.0, the scale used in the OpenAI CLIP
            zero-shot example. Without it the similarities are so close
            together that the softmax output is almost uniform across
            categories. Pass 1.0 to reproduce the earlier unscaled behaviour.

    Returns:
        image_embedding (np.array): Unit-length CLIP image embedding
            (1-D; 512 values for the ViT-B/32 model).
        category_probabilities (np.array): Softmax score for each category,
            in the same order as the rows of `final_text_features`.

    Raises:
        Whatever PIL raises when the file is missing or cannot be decoded;
        the caller is expected to handle this.
    """

    # -----------------------------------------------------------
    # 1. LOAD AND PREPROCESS THE IMAGE
    # -----------------------------------------------------------
    # Open inside a context manager so the file handle is released straight
    # away; convert() returns an independent 3-channel RGB copy.
    with Image.open(image_path) as source_image:
        pil_image = source_image.convert("RGB")

    # Apply CLIP preprocessing (resize/crop, tensor conversion, normalisation),
    # add the leading batch dimension the model expects, and move the result
    # to the configured device.
    image_tensor = preprocess(pil_image).unsqueeze(0).to(device)

    # -----------------------------------------------------------
    # 2. RUN CLIP TO GET IMAGE EMBEDDING
    # -----------------------------------------------------------
    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():

        raw_image_embedding = model.encode_image(image_tensor)

        # Normalise to unit length so dot products are cosine similarities.
        image_embedding = raw_image_embedding / raw_image_embedding.norm(
            dim=-1, keepdim=True)

        # -----------------------------------------------------------
        # 3. COMPUTE SIMILARITIES TO TEXT CATEGORY EMBEDDINGS
        # -----------------------------------------------------------
        # Cosine similarity of the single image embedding to every category
        # embedding -> shape (1, num_categories).
        similarity_scores = image_embedding @ final_text_features.to(device).T

        # Scale, then softmax into a probability-like distribution.
        # These are NOT calibrated probabilities: a score of 0.75 for one
        # category only means its text embedding was much closer to the image
        # than the others, not that the image is 75% likely to be that category.
        category_probabilities = (logit_scale * similarity_scores).softmax(dim=-1)

    # -----------------------------------------------------------
    # 4. RETURN CLEAN CPU NUMPY ARRAYS
    # -----------------------------------------------------------
    return (
        image_embedding.cpu().numpy()[0],        # 1-D image embedding
        category_probabilities.cpu().numpy()[0]  # shape (num_categories,)
    )

# ------------------------------
# 5. Embed all images
# ------------------------------
# For every sample point, rebuild two parallel lists: one CLIP embedding and one
# vector of category scores per successfully processed image.
# Images that fail are skipped, so these lists can be shorter than
# rec["image_files"] and are then no longer index-aligned with it.
for rec in tqdm(point_records, desc="Embedding points", unit="point"):

    rec["embedding"] = []
    rec["category_scores"] = []

    for img_path in rec["image_files"]:

        # Use ORIGINAL images: rewrite the cached path so it points into the
        # embeddings data folder relative to this notebook.
        # NOTE(review): every "../" occurrence is expanded - assumes the cached
        # paths contain exactly one; confirm.
        img_path = img_path.replace("airbnb-manchester/", "embeddings/").replace("../", "../../../")

        # Only the embedding call is guarded, so unrelated errors are not hidden.
        try:
            embedding, scores = embed_and_score_clip(img_path)
        except Exception as e:
            # Best-effort: keep going, but report which file failed so it can
            # be investigated afterwards.
            tqdm.write(f"‚ö†Ô∏è Error embedding {img_path}: {e}")
            continue

        rec["embedding"].append(embedding)
        rec["category_scores"].append(scores)

## Save outputs to pickle file

In [None]:
# Destination for the enriched records (embeddings + category scores per point).
output_file = os.path.join(
    data_dir,
    "sample_points_cache/points_data_cache_with_CLIP_embeddings_and_scores_planninguseclasses.pkl",
)

# Serialise the full list of point records in one go.
with open(output_file, "wb") as out_handle:
    pickle.dump(point_records, out_handle)

print(f"\nüíæ Saved embeddings + category scores for {len(point_records)} points.")