In [2]:
import json
import spacy
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

# === Load NLP ===
nlp = spacy.load("en_core_web_sm")

# === Load COCO Captions ===
with open("E:/Download/annotations/captions_val2017.json", "r") as f:
    captions_data = json.load(f)["annotations"]

# === Load COCO Instances ===
with open("E:/Download/annotations/instances_val2017.json", "r") as f:
    instances_all = json.load(f)
    instances_data = instances_all["annotations"]
    category_map = {cat["id"]: cat["name"] for cat in instances_all["categories"]}

# === Build image_id → set of object names ===
image_objects = defaultdict(set)
for inst in instances_data:
    cat_name = category_map[inst["category_id"]]
    image_objects[inst["image_id"]].add(cat_name.lower())

# === Hàm trích triplet từ caption bằng spaCy ===
def extract_triplets(caption):
    doc = nlp(caption)
    triplets = []
    for sent in doc.sents:
        subject, predicate, obj = None, None, None
        for token in sent:
            if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
                subject = token.text.lower()
                predicate = token.head.text.lower()
                for child in token.head.children:
                    if child.dep_ in ("dobj", "attr", "prep", "pobj") and child.pos_ in ("NOUN", "PRON"):
                        obj = child.text.lower()
                        break
        if subject and predicate:
            triplets.append((subject, predicate, obj))
    return triplets

# === Xử lý toàn bộ caption để lọc triplet hợp lệ ===
filtered_rows = []
for item in tqdm(captions_data, desc="Processing captions"):
    img_id = item["image_id"]
    caption = item["caption"]
    triplets = extract_triplets(caption)
    valid_objects = image_objects.get(img_id, set())
    
    for subj, pred, obj in triplets:
        if subj in valid_objects or obj in valid_objects:
            filtered_rows.append({
                "subject": subj,
                "predicate": pred,
                "object": obj,
                "image_id": img_id,
                "caption": caption
            })

# === Xuất ra file CSV ===
df = pd.DataFrame(filtered_rows)
df.to_csv("filtered_triplets.csv", index=False)
print(f"Đã lưu {len(df)} dòng triplet vào filtered_triplets.csv")

Processing captions: 100%|██████████████████████████████████████████████████████| 25014/25014 [04:01<00:00, 103.72it/s]

Đã lưu 2161 dòng triplet vào filtered_triplets.csv





In [6]:
# Danh sách ảnh có caption
caption_ids = set([cap["image_id"] for cap in captions_data])

# Danh sách ảnh có object (annotation)
instance_ids = set(image_objects.keys())

# Lấy ảnh có cả caption và object
valid_ids = caption_ids & instance_ids
print(f"Tổng số ảnh có cả caption và object: {len(valid_ids)}")

# Lấy một ví dụ hợp lệ (có thể thay đổi)
test_image_id = list(valid_ids)[0]
print(f"Chọn image_id hợp lệ: {test_image_id}")

Tổng số ảnh có cả caption và object: 4952
Chọn image_id hợp lệ: 458755


In [8]:
caption = "Young woman with sheep on straw covered floor"
doc = nlp(caption)

print("Dependency parsing result:\n")
print(f"{'Token':<12}{'POS':<8}{'Dep':<12}{'Head':<12}")
print("-" * 44)
for token in doc:
    print(f"{token.text:<12}{token.pos_:<8}{token.dep_:<12}{token.head.text:<12}")

Dependency parsing result:

Token       POS     Dep         Head        
--------------------------------------------
Young       ADJ     amod        woman       
woman       NOUN    ROOT        woman       
with        ADP     prep        woman       
sheep       NOUN    pobj        with        
on          ADP     prep        sheep       
straw       NOUN    npadvmod    covered     
covered     VERB    amod        floor       
floor       NOUN    pobj        on          


In [12]:
caption = "A child places his hands on the head and neck of a sheep while another sheep looks at his face"
doc = nlp(caption)

print("Dependency parsing result:\n")
print(f"{'Token':<12}{'POS':<8}{'Dep':<12}{'Head':<12}")
print("-" * 44)
for token in doc:
    print(f"{token.text:<12}{token.pos_:<8}{token.dep_:<12}{token.head.text:<12}")

Dependency parsing result:

Token       POS     Dep         Head        
--------------------------------------------
A           DET     det         child       
child       NOUN    nsubj       places      
places      VERB    ROOT        places      
his         PRON    poss        hands       
hands       NOUN    dobj        places      
on          ADP     prep        places      
the         DET     det         head        
head        NOUN    pobj        on          
and         CCONJ   cc          head        
neck        NOUN    conj        head        
of          ADP     prep        head        
a           DET     det         sheep       
sheep       NOUN    pobj        of          
while       SCONJ   mark        looks       
another     DET     det         sheep       
sheep       NOUN    nsubj       looks       
looks       VERB    advcl       places      
at          ADP     prep        looks       
his         PRON    poss        face        
face        NOUN    pobj   

In [25]:
caption = "A person petting the head of a cute fluffy sheep"
doc = nlp(caption)

print("Dependency parsing result:\n")
print(f"{'Token':<12}{'POS':<8}{'Dep':<12}{'Head':<12}")
print("-" * 44)
for token in doc:
    print(f"{token.text:<12}{token.pos_:<8}{token.dep_:<12}{token.head.text:<12}")

Dependency parsing result:

Token       POS     Dep         Head        
--------------------------------------------
A           DET     det         person      
person      NOUN    ROOT        person      
petting     VERB    acl         person      
the         DET     det         head        
head        NOUN    dobj        petting     
of          ADP     prep        head        
a           DET     det         sheep       
cute        ADJ     amod        sheep       
fluffy      ADJ     amod        sheep       
sheep       NOUN    pobj        of          


In [26]:
caption = "A child is petting a sheep while another sheep watches"
doc = nlp(caption)

print("Dependency parsing result:\n")
print(f"{'Token':<12}{'POS':<8}{'Dep':<12}{'Head':<12}")
print("-" * 44)
for token in doc:
    print(f"{token.text:<12}{token.pos_:<8}{token.dep_:<12}{token.head.text:<12}")

Dependency parsing result:

Token       POS     Dep         Head        
--------------------------------------------
A           DET     det         child       
child       NOUN    nsubj       petting     
is          AUX     aux         petting     
petting     VERB    ROOT        petting     
a           DET     det         sheep       
sheep       NOUN    dobj        petting     
while       SCONJ   mark        watches     
another     DET     det         sheep       
sheep       NOUN    nsubj       watches     
watches     VERB    advcl       petting     


In [27]:
caption = "A woman kneeling to pet animals while others wait"
doc = nlp(caption)

print("Dependency parsing result:\n")
print(f"{'Token':<12}{'POS':<8}{'Dep':<12}{'Head':<12}")
print("-" * 44)
for token in doc:
    print(f"{token.text:<12}{token.pos_:<8}{token.dep_:<12}{token.head.text:<12}")

Dependency parsing result:

Token       POS     Dep         Head        
--------------------------------------------
A           DET     det         woman       
woman       NOUN    ROOT        woman       
kneeling    VERB    acl         woman       
to          ADP     prep        kneeling    
pet         ADJ     amod        animals     
animals     NOUN    pobj        to          
while       SCONJ   mark        wait        
others      NOUN    nsubj       wait        
wait        VERB    advcl       woman       


In [30]:
def extract_triplets(caption):
    doc = nlp(caption)
    triplets = []

    for sent in doc.sents:
        # 1. subject - verb - object
        for token in sent:
            if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
                subject = token.text.lower()
                predicate = token.head.text.lower()
                obj = None
                for child in token.head.children:
                    if child.dep_ in ("dobj", "attr") and child.pos_ in ("NOUN", "PRON"):
                        obj = child.text.lower()
                        triplets.append((subject, predicate, obj))
                    elif child.dep_ == "prep":
                        pobj = next((t for t in child.children if t.dep_ == "pobj"), None)
                        if pobj:
                            if obj:
                                triplets.append((obj, child.text.lower(), pobj.text.lower()))
                                for conj in pobj.children:
                                    if conj.dep_ == "conj":
                                        triplets.append((obj, child.text.lower(), conj.text.lower()))
                            else:
                                triplets.append((subject, predicate, pobj.text.lower()))

        # 2. noun - prep - noun
        for token in sent:
            if token.dep_ == "prep" and token.head.pos_ == "NOUN":
                pobj = next((child for child in token.children if child.dep_ == "pobj"), None)
                if pobj:
                    triplets.append((token.head.text.lower(), token.text.lower(), pobj.text.lower()))
                    for conj in pobj.children:
                        if conj.dep_ == "conj":
                            triplets.append((token.head.text.lower(), token.text.lower(), conj.text.lower()))

        # 3. noun - and - noun
        for token in sent:
            if token.dep_ == "conj" and token.head.pos_ == "NOUN":
                for parent in token.head.children:
                    if parent.dep_ in ("det", "amod"):
                        triplets.append((token.head.text.lower(), "and", token.text.lower()))

        # 4. amod verb + npadvmod
        for token in sent:
            if token.dep_ == "amod" and token.head.pos_ == "NOUN" and token.pos_ == "VERB":
                modifier = next((child for child in token.children if child.dep_ == "npadvmod"), None)
                if modifier:
                    triplets.append((token.head.text.lower(), token.text.lower(), modifier.text.lower()))

        # 5. tính từ bổ nghĩa cho danh từ (e.g., fluffy sheep)
        for token in sent:
            if token.dep_ == "amod" and token.head.pos_ == "NOUN":
                triplets.append((token.head.text.lower(), "amod", token.text.lower()))

        # 6. advcl (e.g., while others wait)
        for token in sent:
            if token.dep_ == "advcl" and token.pos_ == "VERB":
                verb = token.text.lower()
                subject = None
                obj = None
                for child in token.children:
                    if child.dep_ == "nsubj":
                        subject = child.text.lower()
                    elif child.dep_ in ("dobj", "attr", "pobj") and child.pos_ == "NOUN":
                        obj = child.text.lower()
                if subject:
                    triplets.append((subject, verb, obj))

        # 7. acl + to + amod/xcomp → chuyển động từ bổ sung
        for token in sent:
            if token.pos_ == "VERB" and token.dep_ == "acl":
                subj = token.head.text.lower()
                verb1 = token.text.lower()
                triplets.append((subj, verb1, None))

                for child in token.children:
                    if child.dep_ == "prep" and child.text.lower() == "to":
                        for grandchild in child.children:
                            if grandchild.dep_ in ("amod", "xcomp") and grandchild.head.pos_ == "NOUN":
                                triplets.append((subj, grandchild.text.lower(), grandchild.head.text.lower()))

    return triplets

In [31]:
import json

# Đường dẫn tới file caption COCO (bạn chỉnh nếu cần)
caption_file = "E:/Download/annotations/captions_val2017.json"

# Load caption từ file JSON
with open(caption_file, "r") as f:
    caption_data = json.load(f)["annotations"]

# ID cần kiểm thử
test_image_id = 458755

# Lấy 5 caption của ảnh này
captions = [item["caption"] for item in caption_data if item["image_id"] == test_image_id]

# Kiểm thử bằng extract_triplets()
print(f"\n📸 Testing image_id = {test_image_id}")
for i, caption in enumerate(captions, 1):
    print(f"\n📌 Caption {i}: {caption}")
    triplets = extract_triplets(caption)
    if not triplets:
        print("   → ❌ Không trích được triplet nào.")
    else:
        for trip in triplets:
            print("   → Triplet:", trip)


📸 Testing image_id = 458755

📌 Caption 1: Young woman with sheep on straw covered floor.
   → Triplet: ('woman', 'with', 'sheep')
   → Triplet: ('sheep', 'on', 'floor')
   → Triplet: ('floor', 'covered', 'straw')
   → Triplet: ('woman', 'amod', 'young')
   → Triplet: ('floor', 'amod', 'covered')

📌 Caption 2: A child places his hands on the head and neck of a sheep while another sheep looks at his face.
   → Triplet: ('child', 'places', 'hands')
   → Triplet: ('hands', 'on', 'head')
   → Triplet: ('hands', 'on', 'neck')
   → Triplet: ('sheep', 'looks', 'face')
   → Triplet: ('head', 'of', 'sheep')
   → Triplet: ('head', 'and', 'neck')
   → Triplet: ('sheep', 'looks', None)

📌 Caption 3: A person petting the head of a cute fluffy sheep.
   → Triplet: ('head', 'of', 'sheep')
   → Triplet: ('sheep', 'amod', 'cute')
   → Triplet: ('sheep', 'amod', 'fluffy')
   → Triplet: ('person', 'petting', None)

📌 Caption 4: A child is petting a sheep while another sheep watches.
   → Triplet: ('child

In [33]:
def extract_triplets(caption):
    doc = nlp(caption)
    triplets = []

    for sent in doc.sents:
        # 1. subject - verb - object
        for token in sent:
            if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
                subject = token.text.lower()
                predicate = token.head.text.lower()
                obj = None
                for child in token.head.children:
                    if child.dep_ in ("dobj", "attr") and child.pos_ in ("NOUN", "PRON"):
                        obj = child.text.lower()
                        triplets.append((subject, predicate, obj))
                    elif child.dep_ == "prep":
                        pobj = next((t for t in child.children if t.dep_ == "pobj"), None)
                        if pobj:
                            if obj:
                                triplets.append((obj, child.text.lower(), pobj.text.lower()))
                                for conj in pobj.children:
                                    if conj.dep_ == "conj":
                                        triplets.append((obj, child.text.lower(), conj.text.lower()))
                            else:
                                triplets.append((subject, predicate, pobj.text.lower()))

        # 2. noun - prep - noun
        for token in sent:
            if token.dep_ == "prep" and token.head.pos_ == "NOUN":
                pobj = next((child for child in token.children if child.dep_ == "pobj"), None)
                if pobj:
                    triplets.append((token.head.text.lower(), token.text.lower(), pobj.text.lower()))
                    for conj in pobj.children:
                        if conj.dep_ == "conj":
                            triplets.append((token.head.text.lower(), token.text.lower(), conj.text.lower()))

        # 3. noun - and - noun
        for token in sent:
            if token.dep_ == "conj" and token.head.pos_ == "NOUN":
                for parent in token.head.children:
                    if parent.dep_ in ("det", "amod"):
                        triplets.append((token.head.text.lower(), "and", token.text.lower()))

        # 4. amod verb + npadvmod
        for token in sent:
            if token.dep_ == "amod" and token.head.pos_ == "NOUN" and token.pos_ == "VERB":
                modifier = next((child for child in token.children if child.dep_ == "npadvmod"), None)
                if modifier:
                    triplets.append((token.head.text.lower(), token.text.lower(), modifier.text.lower()))

        # 5. tính từ bổ nghĩa cho danh từ
        for token in sent:
            if token.dep_ == "amod" and token.head.pos_ == "NOUN":
                triplets.append((token.head.text.lower(), "amod", token.text.lower()))
                # hỗ trợ nhiều amod như: black and white photo
                for sibling in token.children:
                    if sibling.dep_ == "conj" and sibling.pos_ == "ADJ":
                        triplets.append((token.head.text.lower(), "amod", sibling.text.lower()))

        # 6. advcl (while others wait)
        for token in sent:
            if token.dep_ == "advcl" and token.pos_ == "VERB":
                verb = token.text.lower()
                subject = None
                obj = None
                for child in token.children:
                    if child.dep_ == "nsubj":
                        subject = child.text.lower()
                    elif child.dep_ in ("dobj", "attr", "pobj") and child.pos_ == "NOUN":
                        obj = child.text.lower()
                if subject:
                    triplets.append((subject, verb, obj))

        # 7. acl + to + amod/xcomp (e.g., kneeling to pet animals)
        for token in sent:
            if token.pos_ == "VERB" and token.dep_ == "acl":
                subj = token.head.text.lower()
                verb1 = token.text.lower()
                triplets.append((subj, verb1, None))

                for child in token.children:
                    if child.dep_ == "prep" and child.text.lower() == "to":
                        for grandchild in child.children:
                            if grandchild.dep_ in ("amod", "xcomp") and grandchild.head.pos_ == "NOUN":
                                triplets.append((subj, grandchild.text.lower(), grandchild.head.text.lower()))

        # 8. conj động từ (e.g., standing and sitting)
        for token in sent:
            if token.dep_ == "conj" and token.head.pos_ == "VERB":
                subject = None
                for sibling in token.head.children:
                    if sibling.dep_ == "nsubj":
                        subject = sibling.text.lower()
                if subject:
                    triplets.append((subject, token.text.lower(), None))

    return triplets

In [34]:
import json

# Đường dẫn tới file caption COCO (bạn chỉnh nếu cần)
caption_file = "E:/Download/annotations/captions_val2017.json"

# Load caption từ file JSON
with open(caption_file, "r") as f:
    caption_data = json.load(f)["annotations"]

# ID cần kiểm thử
test_image_id = 2299

# Lấy 5 caption của ảnh này
captions = [item["caption"] for item in caption_data if item["image_id"] == test_image_id]

# Kiểm thử bằng extract_triplets()
print(f"\n📸 Testing image_id = {test_image_id}")
for i, caption in enumerate(captions, 1):
    print(f"\n📌 Caption {i}: {caption}")
    triplets = extract_triplets(caption)
    if not triplets:
        print("   → ❌ Không trích được triplet nào.")
    else:
        for trip in triplets:
            print("   → Triplet:", trip)


📸 Testing image_id = 2299

📌 Caption 1: Many small children are posing together in the black and white photo. 
   → Triplet: ('children', 'posing', 'photo')
   → Triplet: ('children', 'amod', 'many')
   → Triplet: ('children', 'amod', 'small')
   → Triplet: ('photo', 'amod', 'black')
   → Triplet: ('photo', 'amod', 'white')

📌 Caption 2: A vintage school picture of grade school aged children.
   → Triplet: ('picture', 'of', 'children')
   → Triplet: ('school', 'amod', 'vintage')
   → Triplet: ('children', 'amod', 'aged')

📌 Caption 3: A black and white photo of a group of kids.
   → Triplet: ('photo', 'of', 'group')
   → Triplet: ('group', 'of', 'kids')
   → Triplet: ('photo', 'amod', 'black')
   → Triplet: ('photo', 'amod', 'white')

📌 Caption 4: A group of children standing next to each other.
   → Triplet: ('group', 'of', 'children')
   → Triplet: ('group', 'standing', None)

📌 Caption 5: A group of children standing and sitting beside each other. 
   → Triplet: ('group', 'of', 'ch

In [36]:
def extract_triplets(caption):
    doc = nlp(caption)
    triplets = []

    for sent in doc.sents:
        # 1. subject - verb - object
        for token in sent:
            if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
                subject = token.text.lower()
                predicate = token.head.text.lower()
                for child in token.head.children:
                    if child.dep_ in ("dobj", "attr") and child.pos_ in ("NOUN", "PRON"):
                        obj = child.text.lower()
                        triplets.append((subject, predicate, obj))
                    elif child.dep_ == "prep":
                        pobj = next((t for t in child.children if t.dep_ == "pobj"), None)
                        if pobj:
                            triplets.append((subject, predicate, pobj.text.lower()))
                if not any(t[0] == subject and t[1] == predicate for t in triplets):
                    triplets.append((subject, predicate, None))

        # 2. noun - prep - noun
        for token in sent:
            if token.dep_ == "prep" and token.head.pos_ == "NOUN":
                pobj = next((child for child in token.children if child.dep_ == "pobj"), None)
                if pobj:
                    triplets.append((token.head.text.lower(), token.text.lower(), pobj.text.lower()))

        # 3. conj nouns (e.g., "head and neck")
        for token in sent:
            if token.dep_ == "conj" and token.head.pos_ == "NOUN":
                for parent in token.head.children:
                    if parent.dep_ == "prep":
                        pobj = next((child for child in parent.children if child.dep_ == "pobj"), None)
                        if pobj:
                            triplets.append((token.text.lower(), parent.text.lower(), pobj.text.lower()))
                    elif parent.dep_ in ("det", "amod", "compound"):
                        triplets.append((token.head.text.lower(), "and", token.text.lower()))

        # 4. amod (e.g., "white cake")
        for token in sent:
            if token.dep_ == "amod" and token.head.pos_ == "NOUN":
                triplets.append((token.head.text.lower(), "amod", token.text.lower()))

        # 5. compound (e.g., "close-up photo")
        for token in sent:
            if token.dep_ == "compound" and token.head.pos_ == "NOUN":
                compound_word = f"{token.text.lower()}-{token.head.text.lower()}"
                triplets.append((compound_word, "compound", token.head.text.lower()))

        # 6. acl (e.g., "cake topped with berries")
        for token in sent:
            if token.dep_ == "acl" and token.head.pos_ == "NOUN":
                subj = token.head.text.lower()
                verb = token.text.lower()
                triplets.append((subj, verb, None))
                for child in token.children:
                    if child.dep_ == "prep":
                        pobj = next((t for t in child.children if t.dep_ == "pobj"), None)
                        if pobj:
                            triplets.append((subj, verb, pobj.text.lower()))

        # 7. verb with "to" + amod (e.g., "kneeling to pet animals")
        for token in sent:
            if token.pos_ == "VERB" and token.dep_ == "acl":
                subj = token.head.text.lower()
                verb1 = token.text.lower()
                triplets.append((subj, verb1, None))
                for child in token.children:
                    if child.dep_ == "prep" and child.text.lower() == "to":
                        for grandchild in child.children:
                            if grandchild.dep_ in ("amod", "xcomp") and grandchild.head.pos_ == "NOUN":
                                triplets.append((subj, grandchild.text.lower(), grandchild.head.text.lower()))

        # 8. advcl (e.g., "others wait" in "while others wait")
        for token in sent:
            if token.dep_ == "advcl" and token.pos_ == "VERB":
                subject = next((t for t in token.children if t.dep_ == "nsubj"), None)
                if subject:
                    triplets.append((subject.text.lower(), token.text.lower(), None))

    return triplets

In [37]:
import json

# Đường dẫn tới file caption COCO (bạn chỉnh nếu cần)
caption_file = "E:/Download/annotations/captions_val2017.json"

# Load caption từ file JSON
with open(caption_file, "r") as f:
    caption_data = json.load(f)["annotations"]

# ID cần kiểm thử
test_image_id = 2157

# Lấy 5 caption của ảnh này
captions = [item["caption"] for item in caption_data if item["image_id"] == test_image_id]

# Kiểm thử bằng extract_triplets()
print(f"\n📸 Testing image_id = {test_image_id}")
for i, caption in enumerate(captions, 1):
    print(f"\n📌 Caption {i}: {caption}")
    triplets = extract_triplets(caption)
    if not triplets:
        print("   → ❌ Không trích được triplet nào.")
    else:
        for trip in triplets:
            print("   → Triplet:", trip)


📸 Testing image_id = 2157

📌 Caption 1: A plate of finger foods next to a blue and raspberry topped cake.
   → Triplet: ('plate', 'of', 'foods')
   → Triplet: ('cake', 'amod', 'blue')
   → Triplet: ('cake', 'amod', 'topped')
   → Triplet: ('finger-foods', 'compound', 'foods')

📌 Caption 2: A nicely set dining table filled with food and a cake topped with berries.
   → Triplet: ('table', 'amod', 'set')
   → Triplet: ('dining-table', 'compound', 'table')
   → Triplet: ('table', 'filled', None)
   → Triplet: ('table', 'filled', 'food')
   → Triplet: ('cake', 'topped', None)
   → Triplet: ('cake', 'topped', 'berries')
   → Triplet: ('table', 'filled', None)
   → Triplet: ('cake', 'topped', None)

📌 Caption 3: a close up of a table with many plates of food
   → Triplet: ('close', 'with', 'plates')
   → Triplet: ('plates', 'of', 'food')
   → Triplet: ('plates', 'amod', 'many')

📌 Caption 4: A table topped with a cake covered in berries next to a plate of sandwiches.
   → Triplet: ('plate', 

In [41]:
def extract_triplets(caption):
    doc = nlp(caption)
    triplets = []

    for sent in doc.sents:
        # 1. subject - verb - object
        for token in sent:
            if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
                subject = token.text.lower()
                predicate = token.head.text.lower()
                object_ = None
                for child in token.head.children:
                    if child.dep_ in ("dobj", "attr") and child.pos_ in ("NOUN", "PRON", "PROPN"):
                        object_ = child.text.lower()
                        triplets.append((subject, predicate, object_))
                    elif child.dep_ == "prep":
                        pobj = next((t for t in child.children if t.dep_ == "pobj"), None)
                        if pobj:
                            object_ = pobj.text.lower()
                            triplets.append((subject, predicate, object_))
                if object_ is None:
                    triplets.append((subject, predicate, None))

        # 2. noun – prep – pobj
        for token in sent:
            if token.dep_ == "prep" and token.head.pos_ == "NOUN":
                pobj = next((child for child in token.children if child.dep_ == "pobj"), None)
                if pobj:
                    triplets.append((token.head.text.lower(), token.text.lower(), pobj.text.lower()))

        # 3. amod modifiers (multiple)
        for token in sent:
            if token.dep_ == "amod" and token.head.pos_ == "NOUN":
                triplets.append((token.head.text.lower(), "amod", token.text.lower()))

        # 4. compound (e.g. "finger foods", "dining table")
        for token in sent:
            if token.dep_ == "compound" and token.head.pos_ == "NOUN":
                triplets.append((token.text.lower() + "-" + token.head.text.lower(), "compound", token.head.text.lower()))

        # 5. conj (e.g., "red and yellow apple and bananas")
        for token in sent:
            if token.dep_ == "conj" and token.head.pos_ == "NOUN":
                triplets.append((token.head.text.lower(), "and", token.text.lower()))
                for child in token.children:
                    if child.dep_ == "amod":
                        triplets.append((token.text.lower(), "amod", child.text.lower()))

        # 6. acl/advcl: noun with verb modifier
        for token in sent:
            if token.dep_ in ("acl", "advcl") and token.pos_ == "VERB" and token.head.pos_ == "NOUN":
                subj = token.head.text.lower()
                pred = token.text.lower()
                obj = None
                for child in token.children:
                    if child.dep_ in ("dobj", "pobj", "attr") and child.pos_ in ("NOUN", "PROPN", "PRON"):
                        obj = child.text.lower()
                        triplets.append((subj, pred, obj))
                triplets.append((subj, pred, obj))

    return triplets

In [42]:
import json

# Đường dẫn tới file caption COCO (bạn chỉnh nếu cần)
caption_file = "E:/Download/annotations/captions_val2017.json"

# Load caption từ file JSON
with open(caption_file, "r") as f:
    caption_data = json.load(f)["annotations"]

# ID cần kiểm thử
test_image_id = 6012

# Lấy 5 caption của ảnh này
captions = [item["caption"] for item in caption_data if item["image_id"] == test_image_id]

# Kiểm thử bằng extract_triplets()
print(f"\n📸 Testing image_id = {test_image_id}")
for i, caption in enumerate(captions, 1):
    print(f"\n📌 Caption {i}: {caption}")
    triplets = extract_triplets(caption)
    if not triplets:
        print("   → ❌ Không trích được triplet nào.")
    else:
        for trip in triplets:
            print("   → Triplet:", trip)


📸 Testing image_id = 6012

📌 Caption 1: There are bananas around another piece of fruit. 
   → Triplet: ('bananas', 'around', 'piece')
   → Triplet: ('piece', 'of', 'fruit')

📌 Caption 2: a yellow and red apple and some bananas
   → Triplet: ('apple', 'amod', 'yellow')
   → Triplet: ('apple', 'and', 'bananas')

📌 Caption 3: An apple sits in between a bunch of bananas
   → Triplet: ('apple', 'sits', 'bunch')
   → Triplet: ('bunch', 'of', 'bananas')

📌 Caption 4: A bunch of bananas with an apple sitting in the middle.
   → Triplet: ('bunch', 'of', 'bananas')
   → Triplet: ('bunch', 'with', 'apple')
   → Triplet: ('apple', 'sitting', None)

📌 Caption 5: Six bananas and a red fruit make an artistic sight paired with a blue background.
   → Triplet: ('bananas', 'make', 'sight')
   → Triplet: ('fruit', 'amod', 'red')
   → Triplet: ('sight', 'amod', 'artistic')
   → Triplet: ('background', 'amod', 'blue')
   → Triplet: ('bananas', 'and', 'fruit')
   → Triplet: ('fruit', 'amod', 'red')
   → T

In [43]:
import json

# Đường dẫn tới file caption COCO (bạn chỉnh nếu cần)
caption_file = "E:/Download/annotations/captions_val2017.json"

# Load caption từ file JSON
with open(caption_file, "r") as f:
    caption_data = json.load(f)["annotations"]

# ID cần kiểm thử
test_image_id = 2157

# Lấy 5 caption của ảnh này
captions = [item["caption"] for item in caption_data if item["image_id"] == test_image_id]

# Kiểm thử bằng extract_triplets()
print(f"\n📸 Testing image_id = {test_image_id}")
for i, caption in enumerate(captions, 1):
    print(f"\n📌 Caption {i}: {caption}")
    triplets = extract_triplets(caption)
    if not triplets:
        print("   → ❌ Không trích được triplet nào.")
    else:
        for trip in triplets:
            print("   → Triplet:", trip)


📸 Testing image_id = 2157

📌 Caption 1: A plate of finger foods next to a blue and raspberry topped cake.
   → Triplet: ('plate', 'of', 'foods')
   → Triplet: ('cake', 'amod', 'blue')
   → Triplet: ('cake', 'amod', 'topped')
   → Triplet: ('finger-foods', 'compound', 'foods')

📌 Caption 2: A nicely set dining table filled with food and a cake topped with berries.
   → Triplet: ('table', 'amod', 'set')
   → Triplet: ('dining-table', 'compound', 'table')
   → Triplet: ('food', 'and', 'cake')
   → Triplet: ('table', 'filled', None)
   → Triplet: ('cake', 'topped', None)

📌 Caption 3: a close up of a table with many plates of food
   → Triplet: ('close', 'with', 'plates')
   → Triplet: ('plates', 'of', 'food')
   → Triplet: ('plates', 'amod', 'many')

📌 Caption 4: A table topped with a cake covered in berries next to a plate of sandwiches.
   → Triplet: ('plate', 'of', 'sandwiches')
   → Triplet: ('table', 'topped', None)
   → Triplet: ('cake', 'covered', None)

📌 Caption 5: A white cake 

In [44]:
def extract_triplets(caption):
    doc = nlp(caption)
    triplets = []

    for sent in doc.sents:
        # 1. subject - verb - object
        for token in sent:
            if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
                subject = token.text.lower()
                predicate = token.head.text.lower()
                object_ = None
                for child in token.head.children:
                    if child.dep_ in ("dobj", "attr") and child.pos_ in ("NOUN", "PRON", "PROPN"):
                        object_ = child.text.lower()
                        triplets.append((subject, predicate, object_))
                    elif child.dep_ == "prep":
                        pobj = next((t for t in child.children if t.dep_ == "pobj"), None)
                        if pobj:
                            object_ = pobj.text.lower()
                            triplets.append((subject, predicate, object_))
                if object_ is None:
                    triplets.append((subject, predicate, None))

        # 2. noun – prep – pobj
        for token in sent:
            if token.dep_ == "prep" and token.head.pos_ == "NOUN":
                pobj = next((child for child in token.children if child.dep_ == "pobj"), None)
                if pobj:
                    triplets.append((token.head.text.lower(), token.text.lower(), pobj.text.lower()))
                    for conj in pobj.children:
                        if conj.dep_ == "conj":
                            triplets.append((token.head.text.lower(), token.text.lower(), conj.text.lower()))

        # 3. amod modifiers (and chained)
        for token in sent:
            if token.dep_ == "amod" and token.head.pos_ == "NOUN":
                triplets.append((token.head.text.lower(), "amod", token.text.lower()))
                for sibling in token.children:
                    if sibling.dep_ == "conj" and sibling.pos_ == "ADJ":
                        triplets.append((token.head.text.lower(), "amod", sibling.text.lower()))

        # 4. compound noun (e.g., dining-table)
        for token in sent:
            if token.dep_ == "compound" and token.head.pos_ == "NOUN":
                compound_word = f"{token.text.lower()}-{token.head.text.lower()}"
                triplets.append((compound_word, "compound", token.head.text.lower()))

        # 5. conj nouns (apple and bananas)
        for token in sent:
            if token.dep_ == "conj" and token.head.pos_ == "NOUN":
                triplets.append((token.head.text.lower(), "and", token.text.lower()))

        # 6. acl / advcl
        for token in sent:
            if token.dep_ in ("acl", "advcl") and token.pos_ == "VERB" and token.head.pos_ == "NOUN":
                subj = token.head.text.lower()
                pred = token.text.lower()
                obj = None
                for child in token.children:
                    if child.dep_ in ("dobj", "pobj", "attr") and child.pos_ in ("NOUN", "PROPN", "PRON"):
                        obj = child.text.lower()
                        triplets.append((subj, pred, obj))
                    if child.dep_ == "prep":
                        pobj = next((t for t in child.children if t.dep_ == "pobj"), None)
                        if pobj:
                            triplets.append((subj, pred, pobj.text.lower()))
                triplets.append((subj, pred, obj))

        # 7. acl + to + amod/xcomp (e.g., kneeling to pet animals)
        for token in sent:
            if token.pos_ == "VERB" and token.dep_ == "acl":
                subj = token.head.text.lower()
                verb1 = token.text.lower()
                triplets.append((subj, verb1, None))
                for child in token.children:
                    if child.dep_ == "prep" and child.text.lower() == "to":
                        for grandchild in child.children:
                            if grandchild.dep_ in ("amod", "xcomp") and grandchild.head.pos_ == "NOUN":
                                triplets.append((subj, grandchild.text.lower(), grandchild.head.text.lower()))

        # 8. conj verb (e.g., standing and sitting)
        for token in sent:
            if token.dep_ == "conj" and token.head.pos_ == "VERB":
                subject = None
                for sibling in token.head.children:
                    if sibling.dep_ == "nsubj":
                        subject = sibling.text.lower()
                if subject:
                    triplets.append((subject, token.text.lower(), None))

    return triplets

In [45]:
import json

# Đường dẫn tới file caption COCO (bạn chỉnh nếu cần)
caption_file = "E:/Download/annotations/captions_val2017.json"

# Load caption từ file JSON
with open(caption_file, "r") as f:
    caption_data = json.load(f)["annotations"]

# ID cần kiểm thử
test_image_id = 2157

# Lấy 5 caption của ảnh này
captions = [item["caption"] for item in caption_data if item["image_id"] == test_image_id]

# Kiểm thử bằng extract_triplets()
print(f"\n📸 Testing image_id = {test_image_id}")
for i, caption in enumerate(captions, 1):
    print(f"\n📌 Caption {i}: {caption}")
    triplets = extract_triplets(caption)
    if not triplets:
        print("   → ❌ Không trích được triplet nào.")
    else:
        for trip in triplets:
            print("   → Triplet:", trip)


📸 Testing image_id = 2157

📌 Caption 1: A plate of finger foods next to a blue and raspberry topped cake.
   → Triplet: ('plate', 'of', 'foods')
   → Triplet: ('cake', 'amod', 'blue')
   → Triplet: ('cake', 'amod', 'topped')
   → Triplet: ('finger-foods', 'compound', 'foods')

📌 Caption 2: A nicely set dining table filled with food and a cake topped with berries.
   → Triplet: ('table', 'amod', 'set')
   → Triplet: ('dining-table', 'compound', 'table')
   → Triplet: ('food', 'and', 'cake')
   → Triplet: ('table', 'filled', 'food')
   → Triplet: ('table', 'filled', None)
   → Triplet: ('cake', 'topped', 'berries')
   → Triplet: ('cake', 'topped', None)
   → Triplet: ('table', 'filled', None)
   → Triplet: ('cake', 'topped', None)

📌 Caption 3: a close up of a table with many plates of food
   → Triplet: ('close', 'with', 'plates')
   → Triplet: ('plates', 'of', 'food')
   → Triplet: ('plates', 'amod', 'many')

📌 Caption 4: A table topped with a cake covered in berries next to a plate o

In [46]:
import json

# Đường dẫn tới file caption COCO (bạn chỉnh nếu cần)
caption_file = "E:/Download/annotations/captions_val2017.json"

# Load caption từ file JSON
with open(caption_file, "r") as f:
    caption_data = json.load(f)["annotations"]

# ID cần kiểm thử
test_image_id = 6012

# Lấy 5 caption của ảnh này
captions = [item["caption"] for item in caption_data if item["image_id"] == test_image_id]

# Kiểm thử bằng extract_triplets()
print(f"\n📸 Testing image_id = {test_image_id}")
for i, caption in enumerate(captions, 1):
    print(f"\n📌 Caption {i}: {caption}")
    triplets = extract_triplets(caption)
    if not triplets:
        print("   → ❌ Không trích được triplet nào.")
    else:
        for trip in triplets:
            print("   → Triplet:", trip)


📸 Testing image_id = 6012

📌 Caption 1: There are bananas around another piece of fruit. 
   → Triplet: ('bananas', 'around', 'piece')
   → Triplet: ('piece', 'of', 'fruit')

📌 Caption 2: a yellow and red apple and some bananas
   → Triplet: ('apple', 'amod', 'yellow')
   → Triplet: ('apple', 'amod', 'red')
   → Triplet: ('apple', 'and', 'bananas')

📌 Caption 3: An apple sits in between a bunch of bananas
   → Triplet: ('apple', 'sits', 'bunch')
   → Triplet: ('bunch', 'of', 'bananas')

📌 Caption 4: A bunch of bananas with an apple sitting in the middle.
   → Triplet: ('bunch', 'of', 'bananas')
   → Triplet: ('bunch', 'with', 'apple')
   → Triplet: ('apple', 'sitting', 'middle')
   → Triplet: ('apple', 'sitting', None)
   → Triplet: ('apple', 'sitting', None)

📌 Caption 5: Six bananas and a red fruit make an artistic sight paired with a blue background.
   → Triplet: ('bananas', 'make', 'sight')
   → Triplet: ('fruit', 'amod', 'red')
   → Triplet: ('sight', 'amod', 'artistic')
   → Tr

In [47]:
# === Xử lý file caption_val2017.json ===
with open("E:/Download/annotations/captions_val2017.json", "r") as f:
    coco_data = json.load(f)

results = []

# Mỗi ảnh sẽ lưu triplet không trùng lặp riêng
image_triplet_set = {}

for item in tqdm(coco_data["annotations"]):
    image_id = item["image_id"]
    caption = item["caption"]
    triplets = extract_triplets(caption)

    if image_id not in image_triplet_set:
        image_triplet_set[image_id] = set()

    for triplet in triplets:
        if triplet not in image_triplet_set[image_id]:
            image_triplet_set[image_id].add(triplet)
            subject, predicate, object_ = triplet
            results.append({
                "subject": subject,
                "predicate": predicate,
                "object": object_ if object_ else "",
                "image_id": image_id
            })

# === Lưu file CSV để dùng huấn luyện ===
df = pd.DataFrame(results)
df.to_csv("triplets_cleaned.csv", index=False)
print("Đã lưu thành công.")

100%|███████████████████████████████████████████████████████████████████████████| 25014/25014 [03:05<00:00, 134.74it/s]


Đã lưu thành công.


In [48]:
import pandas as pd

# 1. Đọc file CSV
df = pd.read_csv('triplets_cleaned.csv')

# 2. Đếm số triplet ban đầu
total_before = len(df)
print(f"Số lượng triplet ban đầu: {total_before}")

# 3. Loại bỏ các triplet bị thiếu subject, predicate hoặc object
df_cleaned = df.dropna(subset=['subject', 'predicate', 'object'])
df_cleaned = df_cleaned[(df_cleaned['subject'] != '') &
                        (df_cleaned['predicate'] != '') &
                        (df_cleaned['object'] != '')]

# 4. Đếm lại số lượng triplet sau khi làm sạch
total_after = len(df_cleaned)
print(f"Số lượng triplet sau khi loại bỏ triplet thiếu: {total_after}")

# 5. Xuất ra file mới
df_cleaned.to_csv('triplets.csv', index=False)
print("Đã lưu file triplets.csv")

Số lượng triplet ban đầu: 81618
Số lượng triplet sau khi loại bỏ triplet thiếu: 70810
Đã lưu file triplets.csv
