In [9]:
import spacy
# Load the spaCy pipeline "en_core_web_sm" once at module level;
# extract_multiple_triplets() calls this shared `nlp` object on every caption.
nlp = spacy.load("en_core_web_sm")

# Adverbs that are treated as a location when they modify a verb
# (e.g. "playing outside"). Built once instead of on every call.
_SPATIAL_ADVERBS = frozenset({
    "outside", "inside", "nearby", "abroad", "indoors", "outdoors", "underground",
    "overhead", "upstairs", "downstairs", "somewhere", "anywhere", "nowhere",
    "back", "ahead", "overseas", "home", "away"
})

_NOUN_POS = ("NOUN", "PROPN")


def _append_object_triplets(verb_token, subj, verb, direct_deps, triplets, objects):
    """Append ``(subj, verb, obj)`` for every object governed by ``verb_token``.

    Direct children whose dependency label is in ``direct_deps`` and whose
    part of speech is NOUN/PROPN are used first. Only when none is found does
    the search fall back to the ``pobj`` children of the verb's ``prep``
    children. Each object phrase is also recorded in ``objects``.
    """
    found_obj = False
    for child in verb_token.children:
        if child.dep_ in direct_deps and child.pos_ in _NOUN_POS:
            obj = get_full_noun_phrase(child)
            triplets.append((subj, verb, obj))
            objects[obj] = None
            found_obj = True

    if found_obj:
        return

    # Fallback: "play with a frisbee" -> (subj, play, a frisbee)
    for prep in verb_token.children:
        if prep.dep_ != "prep":
            continue
        for pobj in prep.children:
            if pobj.dep_ == "pobj":
                obj = get_full_noun_phrase(pobj)
                triplets.append((subj, verb, obj))
                objects[obj] = None


def extract_multiple_triplets(caption):
    """Extract (subject, predicate, object) triplets from one caption.

    The caption is parsed with the module-level ``nlp`` pipeline and four
    heuristics are applied to the dependency parse:

    1. subject - verb - object for every ``nsubj`` / ``nsubjpass`` token;
    2. the same for verbs attached as ``acl`` / ``relcl`` to a noun;
    3. preposition relations ``(head, preposition, pobj)``;
    4. spatial adverbs, emitted as ``(entity, "none", adverb)``.

    Args:
        caption: The caption text to parse.

    Returns:
        A list of unique ``(subject, predicate, object)`` string tuples in
        first-seen order, so repeated runs on the same parse give the same
        sequence.
    """
    doc = nlp(caption)
    triplets = []
    # Dicts are used as insertion-ordered sets: iterating a plain set of
    # strings changes order between interpreter runs (hash randomisation),
    # which made the output order non-reproducible.
    subjects = {}
    objects = {}
    verb_subjects = {}

    # === 1. Subject - Verb - Object
    for token in doc:
        if token.dep_ in ("nsubj", "nsubjpass"):
            subj = token.text
            verb_token = token.head
            verb_subjects[verb_token] = subj
            subjects[subj] = None
            _append_object_triplets(
                verb_token, subj, verb_token.lemma_, ("dobj", "attr"), triplets, objects
            )

    # === 2. ACL / RELCL (relative clauses): resolve the correct subject
    for token in doc:
        if token.dep_ in ("acl", "relcl") and token.pos_ == "VERB":
            subj_token = token.head
            # NOTE(review): the head of an acl/relcl verb is presumably the
            # modified noun rather than the pronoun "that", so the first
            # branch may never fire — confirm against real parses.
            if subj_token.text.lower() == "that" and subj_token.dep_ == "nsubj":
                subj = subj_token.head.text
            else:
                subj = subj_token.text

            # May overwrite an entry written in step 1 for the same verb
            # (e.g. when the relative pronoun was recorded as its subject).
            verb_subjects[token] = subj
            subjects[subj] = None
            _append_object_triplets(
                token, subj, token.lemma_, ("dobj", "pobj"), triplets, objects
            )

    # === 3. Preposition relations headed by a noun or by a verb
    for token in doc:
        if token.dep_ == "pobj" and token.head.dep_ == "prep":
            prep = token.head
            pobj = get_full_noun_phrase(token)
            head = prep.head

            if head.pos_ in _NOUN_POS:
                triplets.append((head.text, prep.text, pobj))
            elif head.pos_ == "VERB" and head in verb_subjects:
                triplets.append((verb_subjects[head], prep.text, pobj))

            # NOTE(review): every collected object is paired with every
            # prepositional phrase regardless of syntactic attachment, which
            # yields triplets such as ('a white top', 'with', 'a white frisbee')
            # — confirm this cross-product is intended.
            for obj in objects:
                if obj != pobj:
                    triplets.append((obj, prep.text, pobj))

    # === 4. Spatial adverb -> (X, "none", outside)
    for token in doc:
        if token.dep_ == "advmod" and token.text.lower() in _SPATIAL_ADVERBS:
            loc = token.text
            for s in subjects:
                triplets.append((s, "none", loc))
            for o in objects:
                triplets.append((o, "none", loc))

    # De-duplicate while keeping first-seen order.
    return list(dict.fromkeys(triplets))

def get_full_noun_phrase(token):
    """Return ``token`` together with its direct noun modifiers as one string.

    Only immediate children whose dependency label is ``amod``, ``compound``,
    ``det`` or ``nummod`` are included. The words are joined with single
    spaces in document order (by token index ``.i``), so a modifier that
    follows the noun stays after it instead of being moved in front of it.

    Args:
        token: A token exposing ``text``, ``i`` and ``children`` (each child
            exposing ``text``, ``i`` and ``dep_``).

    Returns:
        The space-joined phrase, or just ``token.text`` when the token has no
        qualifying modifiers.
    """
    modifier_deps = ("amod", "compound", "det", "nummod")
    words = [child for child in token.children if child.dep_ in modifier_deps]
    words.append(token)
    # Sort by position in the document so post-nominal modifiers keep their place.
    words.sort(key=lambda word: word.i)
    return " ".join(word.text for word in words)

In [10]:
# Quick sanity check: run the extractor on a few hand-written captions and
# print every triplet found for each one.
captions = [
    "A woman is throwing a frisbee in a park.",
    "A female wearing a white top is playing with a white frisbee.",
    "A person in a white shirt throws a frisbee.",
    "A woman throws a white frisbee in a grassy area.",
    "A woman playing with a frisbee outside on a sunny day."
]

for idx, sentence in enumerate(captions, start=1):
    # Header line: "<number>. <caption>"
    print(f"{idx}. {sentence}")
    triplets = extract_multiple_triplets(sentence)
    for triplet in triplets:
        print("   →", triplet)
    # Blank line between captions.
    print()

1. A woman is throwing a frisbee in a park.
   → ('woman', 'throw', 'a frisbee')
   → ('a frisbee', 'in', 'a park')
   → ('woman', 'in', 'a park')

2. A female wearing a white top is playing with a white frisbee.
   → ('a white top', 'with', 'a white frisbee')
   → ('female', 'wear', 'a white top')
   → ('female', 'play', 'a white frisbee')
   → ('female', 'with', 'a white frisbee')

3. A person in a white shirt throws a frisbee.
   → ('person', 'in', 'a white shirt')
   → ('a frisbee', 'in', 'a white shirt')
   → ('person', 'throw', 'a frisbee')

4. A woman throws a white frisbee in a grassy area.
   → ('woman', 'in', 'a grassy area')
   → ('woman', 'throw', 'a white frisbee')
   → ('a white frisbee', 'in', 'a grassy area')

5. A woman playing with a frisbee outside on a sunny day.
   → ('woman', 'play', 'a frisbee')
   → ('a frisbee', 'none', 'outside')
   → ('a sunny day', 'with', 'a frisbee')
   → ('a sunny day', 'none', 'outside')
   → ('woman', 'on', 'a sunny day')
   → ('woman',

In [11]:
# Larger sanity check: the five frisbee captions plus ten more varied ones
# (noun-only captions, relative clauses, participial modifiers).
captions = [
    "A woman is throwing a frisbee in a park.",
    "A female wearing a white top is playing with a white frisbee.",
    "A person in a white shirt throws a frisbee.",
    "A woman throws a white frisbee in a grassy area.",
    "A woman playing with a frisbee outside on a sunny day.",
    "A customized motorcycle with a large rear and skinny front tire",
    "A view of a bathroom that is clean.",
    "a garbage bag in a white lighted bathroom",
    "An old man standing in a kitchen posing for a picture.",
    "A long haired cat eating a dead bird.",
    "A close up of oranges and apples in a bowl.",
    "A person sitting on a stool on the street.",
    "A desk with two monitors depicting security cameras.",
    "A display of vintage animal toys on the floor.",
    "A person that is sitting in a bed facing out."
]

for number, text in enumerate(captions, start=1):
    # Header line: "<number>. <caption>"
    print(f"{number}. {text}")
    triplets = extract_multiple_triplets(text)
    for found in triplets:
        print("   →", found)
    # Blank line between captions.
    print()

1. A woman is throwing a frisbee in a park.
   → ('woman', 'throw', 'a frisbee')
   → ('a frisbee', 'in', 'a park')
   → ('woman', 'in', 'a park')

2. A female wearing a white top is playing with a white frisbee.
   → ('a white top', 'with', 'a white frisbee')
   → ('female', 'wear', 'a white top')
   → ('female', 'play', 'a white frisbee')
   → ('female', 'with', 'a white frisbee')

3. A person in a white shirt throws a frisbee.
   → ('person', 'in', 'a white shirt')
   → ('a frisbee', 'in', 'a white shirt')
   → ('person', 'throw', 'a frisbee')

4. A woman throws a white frisbee in a grassy area.
   → ('woman', 'in', 'a grassy area')
   → ('woman', 'throw', 'a white frisbee')
   → ('a white frisbee', 'in', 'a grassy area')

5. A woman playing with a frisbee outside on a sunny day.
   → ('woman', 'play', 'a frisbee')
   → ('a frisbee', 'none', 'outside')
   → ('a sunny day', 'with', 'a frisbee')
   → ('a sunny day', 'none', 'outside')
   → ('woman', 'on', 'a sunny day')
   → ('woman',

In [13]:
import json
import pandas as pd

# Caption annotations file to read.
# NOTE(review): absolute, machine-specific path — point this at your local copy.
CAPTIONS_JSON_PATH = "E:/Download/annotations/captions_val2017.json"

# Column order of the exported table; passing it to the DataFrame constructor
# keeps the columns present even when no triplet was extracted.
TRIPLET_COLUMNS = ["subject", "predicate", "object", "image_id", "caption"]

# Read as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform default, which is not UTF-8 on every system.
with open(CAPTIONS_JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# One (image_id, caption) pair per annotation entry.
caption_rows = [(ann["image_id"], ann["caption"]) for ann in data["annotations"]]

# Build all records first and create the DataFrame once at the end.
rows = []
for image_id, caption in caption_rows:
    for s, p, o in extract_multiple_triplets(caption):
        rows.append({
            "subject": s,
            "predicate": p,
            "object": o,
            "image_id": image_id,
            "caption": caption
        })

# Explicit columns avoid a KeyError in sort_values() when `rows` is empty.
df = pd.DataFrame(rows, columns=TRIPLET_COLUMNS)
df = df.sort_values(by=["image_id", "caption"])
df.to_csv("triplets_from_captions_val2017.csv", index=False)
print("Đã lưu file triplets_from_captions_val2017.csv")

Đã lưu file triplets_from_captions_val2017.csv
