In [None]:
# ============================================================
# 📦 Step 0: Setup
# ============================================================
# Install specific package versions to ensure compatibility
!pip install transformers sentence-transformers==2.2.2 huggingface_hub==0.14.1 networkx matplotlib torch -q

from io import BytesIO

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import requests
import torch
import torch.nn.functional as F
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import BlipProcessor, BlipForConditionalGeneration

In [None]:
# ============================================================
# 🖼️ Step 1: Load or input images
# ============================================================
# You can replace these URLs with your own image files or upload images in Colab.
urls = [
    "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg",  # example image
    "https://images.unsplash.com/photo-1507525428034-b723cf961d3e"                # beach image
]


def load_image(url, timeout=30):
    """Download one image and return it as an RGB PIL image.

    Args:
        url: HTTP(S) address of the image.
        timeout: Seconds to wait for the server before giving up, so a stalled
            download cannot hang the notebook indefinitely.

    Returns:
        A ``PIL.Image.Image`` converted to RGB.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to PIL.
    response.raise_for_status()
    # response.content is the fully downloaded, content-decoded body; wrapping
    # it in BytesIO gives PIL a seekable file object.
    return Image.open(BytesIO(response.content)).convert("RGB")


images = [load_image(u) for u in urls]



In [None]:
# ============================================================
# 🧾 Step 2: Generate captions using BLIP
# ============================================================
# Both the processor and the captioning model come from the same checkpoint.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# Caption every image in order; generation is capped at 20 new tokens.
captions = []
for position, picture in enumerate(images, start=1):
    encoded = processor(picture, return_tensors="pt")
    generated_ids = model.generate(**encoded, max_new_tokens=20)
    text = processor.decode(generated_ids[0], skip_special_tokens=True)
    captions.append(text)
    print(f"🖼️ Image {position} Caption → {text}")


In [None]:

# ============================================================
# 🧩 Step 3: Parse captions → Scene Graphs (spaCy)
# ============================================================
!pip install spacy==3.8.1 -q
import spacy
nlp = spacy.load("en_core_web_sm")

def robust_scene_parse(caption):
    """Extract (subject, relation, object) triples from a caption.

    The caption is parsed with the module-level spaCy pipeline ``nlp`` and
    every token is checked against three dependency patterns:

    * Verb: each ``nsubj``/``nsubjpass`` child to the left of a verb is paired
      with each ``dobj``/``pobj``/``attr``/``obl`` child to its right, using
      the verb lemma as the relation (pairs with identical text are skipped).
    * Preposition: a ``pobj`` whose head is an adposition yields
      (governor, preposition, object). The governor is the preposition's head;
      if that head is a verb its first left subject is used when present, and
      if that head is an ``acl`` modifier of a noun, the noun is used.
    * Adjectival clause: a verb with dependency ``acl`` yields
      (modified token, verb lemma, object) for each ``pobj``/``dobj`` child to
      its right.

    Args:
        caption: Text to parse.

    Returns:
        ``{"relations": [...]}`` where each item has the form
        ``{"subject": {"name": s}, "relation": r, "object": {"name": o}}``.
        Triples are whitespace-stripped, triples with an empty component are
        dropped, and duplicates are removed while keeping first-seen order so
        the result is identical on every run.
    """
    subject_deps = ("nsubj", "nsubjpass")
    doc = nlp(caption)
    triples = []

    for tok in doc:
        # Pattern 1: verb with explicit subject(s) and object(s).
        if tok.pos_ == "VERB":
            subjects = [w.text for w in tok.lefts if w.dep_ in subject_deps]
            objects = [w.text for w in tok.rights if w.dep_ in ("dobj", "pobj", "attr", "obl")]
            for subject in subjects:
                for obj in objects:
                    if subject != obj:
                        triples.append((subject, tok.lemma_, obj))

        # Pattern 2: object of a preposition.
        if tok.dep_ == "pobj" and tok.head.pos_ == "ADP":
            preposition = tok.head
            governor = preposition.head
            subject = governor.text
            if governor.pos_ == "VERB":
                verb_subjects = [w.text for w in governor.lefts if w.dep_ in subject_deps]
                subject = verb_subjects[0] if verb_subjects else governor.text
            # An acl verb modifying a noun: attribute the relation to the noun.
            if governor.dep_ == "acl" and governor.head.pos_ == "NOUN":
                subject = governor.head.text
            if subject and subject != tok.text:
                triples.append((subject, preposition.text, tok.text))

        # Pattern 3: adjectival clause, e.g. "woman sitting ...".
        if tok.dep_ == "acl" and tok.pos_ == "VERB":
            modified = tok.head.text
            for child in tok.rights:
                if child.dep_ in ("pobj", "dobj"):
                    triples.append((modified, tok.lemma_, child.text))

    # dict.fromkeys de-duplicates while preserving insertion order; a set would
    # make the relation order depend on per-process string hash randomisation.
    unique = dict.fromkeys((s.strip(), r.strip(), o.strip()) for s, r, o in triples)
    return {
        "relations": [
            {"subject": {"name": s}, "relation": r, "object": {"name": o}}
            for s, r, o in unique
            if s and r and o
        ]
    }


In [None]:
# ============================================================
# 🎨 Step 4: Convert to NetworkX + visualize
# ============================================================
def scenegraph_to_networkx(graph_data):
    """Build a directed NetworkX graph from a parsed scene graph.

    Each relation becomes an edge subject -> object whose ``label`` attribute
    holds the relation text. A ``DiGraph`` stores only one edge per ordered
    node pair, so when several distinct relations connect the same pair their
    labels are joined with ``", "`` (in first-seen order) instead of the later
    one silently overwriting the earlier one.

    Args:
        graph_data: Dict with a ``"relations"`` list whose items have the form
            ``{"subject": {"name": ...}, "relation": ..., "object": {"name": ...}}``.

    Returns:
        ``nx.DiGraph`` with one node per distinct subject/object name.
    """
    G = nx.DiGraph()
    labels_by_edge = {}
    for rel in graph_data["relations"]:
        s = rel["subject"]["name"]
        r = rel["relation"]
        o = rel["object"]["name"]
        G.add_node(s)
        G.add_node(o)
        # Accumulate every distinct relation seen for this ordered pair.
        labels = labels_by_edge.setdefault((s, o), [])
        if r not in labels:
            labels.append(r)
        # Re-adding an existing edge just refreshes its label attribute.
        G.add_edge(s, o, label=", ".join(labels))
    return G

def visualize_graph(G, title="Scene Graph"):
    """Render a scene graph with labelled nodes and red relation labels.

    Node positions come from a spring layout with a fixed seed (42). The graph
    is drawn on a new 6x4 figure, edge ``label`` attributes are drawn on top,
    and the figure is shown immediately.

    Args:
        G: Graph whose edges carry a ``label`` attribute.
        title: Figure title.
    """
    draw_options = dict(
        with_labels=True,
        node_color="skyblue",
        node_size=1800,
        edge_color="gray",
        font_size=10,
        font_weight="bold",
    )

    layout = nx.spring_layout(G, seed=42)
    plt.figure(figsize=(6, 4))
    nx.draw(G, layout, **draw_options)

    relation_labels = nx.get_edge_attributes(G, "label")
    nx.draw_networkx_edge_labels(
        G, layout, edge_labels=relation_labels, font_color="red"
    )

    plt.title(title)
    plt.show()

# Parse each caption into a scene graph, convert it to NetworkX, then plot it
scene_graphs = list(map(robust_scene_parse, captions))
nx_graphs = list(map(scenegraph_to_networkx, scene_graphs))
for number, graph in enumerate(nx_graphs, start=1):
    visualize_graph(graph, f"Scene Graph {number}")



In [None]:
# ============================================================
# 🔹 Step 5: Unsupervised Graph Embedding via Message Passing
# ============================================================
# Sentence encoder shared by the embedding helpers in this cell.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

def get_emb(text):
    """Encode ``text`` with the module-level ``embedder`` as a float tensor.

    The encoder is asked for normalized embeddings; its output is copied into
    a ``torch.float`` tensor.
    """
    encoded = embedder.encode(text, normalize_embeddings=True)
    return torch.tensor(encoded, dtype=torch.float)

def get_graph_matrices(G):
    """Return the normalized adjacency and node-feature matrices of ``G``.

    Args:
        G: NetworkX graph whose node names are embedded with ``get_emb``.

    Returns:
        Tuple ``(A_norm, X)`` of float tensors:

        * ``A_norm`` - unweighted adjacency of ``G`` plus self-loops, with
          every row divided by its sum.
        * ``X`` - the ``get_emb`` vector of each node stacked row-wise, in the
          same node order as ``A_norm``.

    Raises:
        ValueError: If ``G`` has no nodes (there is nothing to embed).
    """
    nodes = list(G.nodes())
    if not nodes:
        # torch.stack on an empty list would fail with a far less helpful error.
        raise ValueError("Cannot build graph matrices: the graph has no nodes.")

    A = nx.to_numpy_array(G, nodelist=nodes, weight=None)
    A = A + np.eye(len(nodes))                      # self-loops
    # Self-loops make every row sum >= 1, so the division is always defined.
    # Broadcasting yields the same result as diag(1 / rowsum) @ A without
    # materialising a dense diagonal matrix.
    A_norm = torch.tensor(A / A.sum(axis=1, keepdims=True), dtype=torch.float)
    X = torch.stack([get_emb(n) for n in nodes])
    return A_norm, X

def propagate(A_norm, X, K=2):
    """Run ``K`` rounds of neighbourhood averaging over node features.

    Every round multiplies the current features by ``A_norm`` and then
    L2-normalizes along the last dimension. The input ``X`` is cloned first,
    so it is never modified and ``K=0`` returns a copy of it.
    """
    hidden = X.clone()
    for _round in range(K):
        hidden = F.normalize(torch.matmul(A_norm, hidden), p=2, dim=-1)
    return hidden

def graph_embedding(G, K=2):
    """Embed a whole graph as a single unit-length vector.

    Node features are propagated for ``K`` rounds, averaged over the nodes,
    and the mean vector is L2-normalized.
    """
    adjacency, features = get_graph_matrices(G)
    pooled = propagate(adjacency, features, K).mean(dim=0)
    return F.normalize(pooled, p=2, dim=0)

# One embedding per scene graph, using two propagation rounds
graph_vecs = [graph_embedding(graph, K=2) for graph in nx_graphs]

# Cosine similarity between the first two graph embeddings
first_vec = graph_vecs[0].unsqueeze(0)
second_vec = graph_vecs[1].unsqueeze(0)
sim = F.cosine_similarity(first_vec, second_vec).item()
print(f"\n🔢 Semantic Graph Similarity (unsupervised): {sim:.4f}")