In [1]:
import pandas as pd

# Load COCO
coco = pd.read_csv("f_coco_triplets.csv")

# Chỉ giữ cột cần thiết
coco = coco[["subject", "predicate", "object"]]

# Chuẩn hoá text
for col in ["subject", "predicate", "object"]:
    coco[col] = coco[col].str.lower().str.strip()

# Lưu ra file txt format cho PyKEEN
coco.to_csv("transE_triplets.tsv", sep="\t", index=False, header=False)

print(f"✅ Đã ghi {len(coco)} triplet vào transE_triplets.tsv")

✅ Đã ghi 3316 triplet vào transE_triplets.tsv


In [2]:
import random

# Đọc toàn bộ dòng từ file gốc
with open('transE_triplets.tsv', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Xáo trộn thứ tự dòng để tránh thiên lệch
random.shuffle(lines)

# Tính số lượng cho từng tập
n_total = len(lines)
n_train = int(0.8 * n_total)
n_valid = int(0.1 * n_total)
n_test = n_total - n_train - n_valid  # phần còn lại

# Cắt theo tỷ lệ
train_lines = lines[:n_train]
valid_lines = lines[n_train:n_train + n_valid]
test_lines = lines[n_train + n_valid:]

# Ghi ra các file riêng
with open('train.tsv', 'w', encoding='utf-8') as f:
    f.writelines(train_lines)

with open('valid.tsv', 'w', encoding='utf-8') as f:
    f.writelines(valid_lines)

with open('test.tsv', 'w', encoding='utf-8') as f:
    f.writelines(test_lines)

print(f'Đã chia thành: {len(train_lines)} train | {len(valid_lines)} valid | {len(test_lines)} test')

Đã chia thành: 2652 train | 331 valid | 333 test


In [3]:
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory

# Tạo các triples factory từ file
train_tf = TriplesFactory.from_path('train.tsv', separator='\t')
valid_tf = TriplesFactory.from_path('valid.tsv', separator='\t')
test_tf = TriplesFactory.from_path('test.tsv', separator='\t')

# Khởi chạy pipeline huấn luyện
result = pipeline(
    training=train_tf,
    validation=valid_tf,
    testing=test_tf,

    model='TransE',
    model_kwargs=dict(embedding_dim=100),

    training_loop='slcwa',
    optimizer='Adam',
    optimizer_kwargs=dict(lr=1e-3),

    stopper='early',
    stopper_kwargs=dict(frequency=5, patience=10, metric='mean_rank'),

    random_seed=42,
    device='cpu' 
)

Training epochs on cpu:   0%|          | 0/5 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/9 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/9 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/9 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/9 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/9 [00:00<?, ?batch/s]

Evaluating on cpu:   0%|          | 0.00/316 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.90s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 5: 538.302978515625. Saved model weights to C:\Users\admin\.data\pykeen\checkpoints\best-model-weights-1ee243a9-8c4c-4bc5-bdca-8350c262ea9d.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 5.


Evaluating on cpu:   0%|          | 0.00/312 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.95s seconds


In [5]:
# Entity embeddings
entity_embeddings = result.model.entity_representations[0]().detach().cpu().numpy()
entities = result.training.entity_to_id

# Relation embeddings
relation_embeddings = result.model.relation_representations[0]().detach().cpu().numpy()
relations = result.training.relation_to_id

# Lưu entity embedding
import numpy as np
np.save("entity_embeddings.npy", entity_embeddings)
with open("entities.txt", "w", encoding='utf-8') as f:
    for k, v in entities.items():
        f.write(f"{v}\t{k}\n")

# Lưu relation embedding
np.save("relation_embeddings.npy", relation_embeddings)
with open("relations.txt", "w", encoding='utf-8') as f:
    for k, v in relations.items():
        f.write(f"{v}\t{k}\n")

print("✅ Đã lưu embedding")

✅ Đã lưu embedding


In [7]:
entity2id = {}
with open("entities.txt", encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split("\t")
        if len(parts) != 2:
            continue  # bỏ qua dòng không hợp lệ
        idx, name = parts
        entity2id[name] = int(idx)

relation2id = {}
with open("relations.txt", encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split("\t")
        if len(parts) != 2:
            continue
        idx, name = parts
        relation2id[name] = int(idx)

In [9]:
import pandas as pd

# Load và xử lý COCO
triplet_map = pd.read_csv("f_coco_triplets.csv").copy()
triplet_map["source"] = "coco"

# Chuẩn hóa text
triplet_map[["subject", "predicate", "object"]] = triplet_map[["subject", "predicate", "object"]].apply(
    lambda x: x.str.lower().str.strip()
)

# Nếu bạn vẫn muốn dùng tên biến là `all_map`
all_map = triplet_map

# Kiểm tra nhanh
print(f"✅ Tổng số triplet COCO: {len(all_map)}")
print(all_map.head())

✅ Tổng số triplet COCO: 3316
     subject predicate  object  image_id source
0        man      wear     hat     93437   coco
1  telephone      have  banana     12667   coco
2      group     drive    tree    314251   coco
3        man     enjoy     nap    223747   coco
4        man      wear     hat     93437   coco


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Nhúng triplet truy vấn
def embed_triplet(head, rel, tail):
    try:
        h_id = entity2id[head]
        r_id = relation2id[rel]
        t_id = entity2id[tail]
    except KeyError:
        return None

    h_vec = entity_embeddings[h_id]
    r_vec = relation_embeddings[r_id]
    t_vec = entity_embeddings[t_id]

    score_vec = h_vec + r_vec - t_vec  # TransE scoring
    return score_vec

# Tạo embedding cho tất cả triplet trong KG
def build_triplet_embedding_matrix(df):
    triplet_vecs = []
    valid_triplets = []
    for _, row in df.iterrows():
        trip = embed_triplet(row["subject"], row["predicate"], row["object"])
        if trip is not None:
            triplet_vecs.append(trip)
            valid_triplets.append((row["subject"], row["predicate"], row["object"], row["image_id"], row["source"]))
    return np.stack(triplet_vecs), valid_triplets

print("📌 Đang tạo vector cho tất cả triplet từ KG...")
triplet_matrix, triplet_info = build_triplet_embedding_matrix(all_map)

📌 Đang tạo vector cho tất cả triplet từ KG...


In [11]:
def find_nearest_image_from_query(query_triplet, top_k=5):
    query_vec = embed_triplet(*query_triplet)
    if query_vec is None:
        return []

    sims = cosine_similarity(query_vec.reshape(1, -1), triplet_matrix)[0]
    top_k_idx = sims.argsort()[::-1][:top_k]
    results = [triplet_info[i] for i in top_k_idx]
    return results

In [12]:
query = ("man", "riding", "horse")
results = find_nearest_image_from_query(query)

for s, p, o, img_id, src in results:
    print(f"[{src}] image {img_id}: ({s}, {p}, {o})")

In [13]:
# Dùng 500 triplet test từ COCO để đánh giá
test_queries = all_map.drop_duplicates(subset=["subject", "predicate", "object"]).sample(500, random_state=42)

In [14]:
from collections import defaultdict

# Tạo dict: (s,p,o) → tập các image_id thực sự chứa triplet đó
gt_dict = defaultdict(set)

for _, row in all_map.iterrows():
    key = (row["subject"], row["predicate"], row["object"])
    gt_dict[key].add(str(row["image_id"]))

In [15]:
def evaluate_single_triplet(query, top_k=5):
    results = find_nearest_image_from_query(query, top_k)
    predicted_ids = {str(r[3]) for r in results}
    true_ids = gt_dict.get(query, set())

    if not true_ids:
        return None  # không có ground truth → bỏ qua

    tp = len(predicted_ids & true_ids)
    fp = len(predicted_ids - true_ids)
    fn = len(true_ids - predicted_ids)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall    = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1        = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

In [17]:
from tqdm import tqdm

total_p, total_r, total_f1 = 0, 0, 0
count = 0

for _, row in tqdm(test_queries.iterrows(), total=len(test_queries)):
    query = (row["subject"], row["predicate"], row["object"])
    result = evaluate_single_triplet(query, top_k=5)
    if result:
        p, r, f1 = result
        total_p += p
        total_r += r
        total_f1 += f1
        count += 1

if count > 0:
    print("📊 Evaluation Results (top-5):")
    print(f"→ Precision@5: {total_p/count:.4f}")
    print(f"→ Recall@5:    {total_r/count:.4f}")
    print(f"→ F1-score@5:  {total_f1/count:.4f}")
else:
    print("❌ Không có truy vấn nào có ground-truth")

100%|███████████████████████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 471.60it/s]

📊 Evaluation Results (top-5):
→ Precision@5: 0.2349
→ Recall@5:    0.9328
→ F1-score@5:  0.3602



