In [1]:
import json
import spacy
from tqdm import tqdm
from collections import defaultdict

In [2]:
# Load NLP model
nlp = spacy.load("en_core_web_sm")

# === STEP 1: Load object instances ===
with open("E:/Download/annotations/instances_train2017.json", "r") as f:
    instances = json.load(f)

# Tạo từ điển: {image_id: set(object_names)}
image_objects = defaultdict(set)

# Map từ category_id -> name
category_id2name = {cat["id"]: cat["name"] for cat in instances["categories"]}

for anno in instances["annotations"]:
    image_id = anno["image_id"]
    cat_id = anno["category_id"]
    object_name = category_id2name[cat_id]
    image_objects[image_id].add(object_name)

print("Số ảnh có object:", len(image_objects))

Số ảnh có object: 117266


In [3]:
# === STEP 2: Load captions & extract triplets ===
with open("E:/Download/annotations/captions_train2017.json", "r") as f:
    captions_data = json.load(f)

# Map: {image_id: [captions]}
image_captions = defaultdict(list)
for cap in captions_data["annotations"]:
    image_captions[cap["image_id"]].append(cap["caption"])

# Hàm NLP trích (subject, relation, object) từ caption
def extract_triplets(sentence):
    doc = nlp(sentence)
    triplets = []

    for token in doc:
        # Tìm các verb có subject và object
        if token.pos_ == "VERB":
            subj = [w.text.lower() for w in token.lefts if w.dep_ in ("nsubj", "nsubjpass")]
            obj = [w.text.lower() for w in token.rights if w.dep_ in ("dobj", "attr", "pobj")]

            for s in subj:
                for o in obj:
                    triplets.append((s, token.lemma_.lower(), o))
    return triplets

In [4]:
# === STEP 3: Ghép object và relation thành KG triplets ===
knowledge_graph = []

for image_id in tqdm(image_captions):
    captions = image_captions[image_id]
    objects = image_objects.get(image_id, set())

    for cap in captions:
        triplets = extract_triplets(cap)
        for subj, rel, obj in triplets:
            # Chỉ giữ nếu cả subject và object đều là object thực sự trong ảnh
            if subj in objects and obj in objects:
                knowledge_graph.append({
                    "image_id": image_id,
                    "subject": subj,
                    "relation": rel,
                    "object": obj,
                    "caption": cap
                })

print(f"Tổng số triplets hợp lệ: {len(knowledge_graph)}")

100%|████████████████████████████████████████████████████████████████████████| 118287/118287 [3:09:59<00:00, 10.38it/s]

Tổng số triplets hợp lệ: 1649





In [5]:
import json

with open("coco_kg_triplets.json", "w", encoding="utf-8") as f:
    json.dump(knowledge_graph, f, indent=2, ensure_ascii=False)

print("✅ Đã lưu file coco_kg_triplets.json")

✅ Đã lưu file coco_kg_triplets.json


In [6]:
import csv

with open("coco_kg_triplets.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["image_id", "subject", "relation", "object", "caption"])
    writer.writeheader()
    for row in knowledge_graph:
        writer.writerow(row)

print("✅ Đã lưu file coco_kg_triplets.csv")

✅ Đã lưu file coco_kg_triplets.csv
