In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

In [2]:
# TransH Model Definition
# ------------------------
class TransH(nn.Module):
    def __init__(self, num_entities, num_relations, embedding_dim=100, margin=1.0):
        super(TransH, self).__init__()
        self.margin = margin
        self.embedding_dim = embedding_dim

        self.ent_embeddings = nn.Embedding(num_entities, embedding_dim)
        self.rel_embeddings = nn.Embedding(num_relations, embedding_dim)
        self.norm_embeddings = nn.Embedding(num_relations, embedding_dim)  # vector pháp tuyến

        nn.init.xavier_uniform_(self.ent_embeddings.weight.data)
        nn.init.xavier_uniform_(self.rel_embeddings.weight.data)
        nn.init.xavier_uniform_(self.norm_embeddings.weight.data)

    def _project(self, e, norm):
        norm = F.normalize(norm, p=2, dim=-1)
        return e - torch.sum(e * norm, dim=-1, keepdim=True) * norm

    def forward(self, pos_triples, neg_triples):
        ph = self.ent_embeddings(pos_triples[:, 0])
        pr = self.rel_embeddings(pos_triples[:, 1])
        pt = self.ent_embeddings(pos_triples[:, 2])
        pn = self.norm_embeddings(pos_triples[:, 1])

        nh = self.ent_embeddings(neg_triples[:, 0])
        nr = self.rel_embeddings(neg_triples[:, 1])
        nt = self.ent_embeddings(neg_triples[:, 2])
        nn_ = self.norm_embeddings(neg_triples[:, 1])

        ph_proj = self._project(ph, pn)
        pt_proj = self._project(pt, pn)
        nh_proj = self._project(nh, nn_)
        nt_proj = self._project(nt, nn_)

        pos_score = torch.norm(ph_proj + pr - pt_proj, p=2, dim=1)
        neg_score = torch.norm(nh_proj + nr - nt_proj, p=2, dim=1)

        return pos_score, neg_score

In [3]:
# Negative Sampling Helper
# ------------------------
def corrupt_batch(batch, num_entities):
    corrupted = batch.copy()
    for i in range(len(batch)):
        if np.random.rand() < 0.5:
            corrupted[i][0] = np.random.randint(0, num_entities)
        else:
            corrupted[i][2] = np.random.randint(0, num_entities)
    return corrupted

In [9]:
# Training Function
# ------------------------
def train_model(model, train_data, num_entities, num_relations, num_epochs=100, batch_size=512, lr=0.001, patience=10):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MarginRankingLoss(margin=model.margin)
    best_loss = float('inf')
    wait = 0

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        np.random.shuffle(train_data)

        for i in range(0, len(train_data), batch_size):
            batch_pos = train_data[i:i+batch_size]
            batch_neg = corrupt_batch(batch_pos.copy(), num_entities)

            batch_pos = torch.tensor(batch_pos, dtype=torch.long)
            batch_neg = torch.tensor(batch_neg, dtype=torch.long)
            y = torch.ones(batch_pos.size(0))

            pos_score, neg_score = model(batch_pos, batch_neg)
            loss = criterion(pos_score, neg_score, y)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / (len(train_data) / batch_size)
        print(f"Epoch {epoch+1} | Avg Loss: {avg_loss:.4f}")

        # Early stopping
        if avg_loss < best_loss:
            best_loss = avg_loss
            wait = 0
            torch.save(model.state_dict(), "best_transh_model.pth")
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1} (best loss: {best_loss:.4f})")
                break

In [10]:
# Ví dụ: DataFrame chứa triplets
df_triplet = pd.read_csv("f_coco_triplets.csv")
entities = pd.concat([df_triplet["subject"], df_triplet["object"]]).unique()
entity2id = {entity: idx for idx, entity in enumerate(entities)}
relation2id = {rel: idx for idx, rel in enumerate(df_triplet["predicate"].unique())}

df_triplet["head_id"] = df_triplet["subject"].map(entity2id)
df_triplet["tail_id"] = df_triplet["object"].map(entity2id)
df_triplet["relation_id"] = df_triplet["predicate"].map(relation2id)

triplets = df_triplet[["head_id", "relation_id", "tail_id"]].values

In [11]:
model = TransH(num_entities=len(entity2id), num_relations=len(relation2id), embedding_dim=100, margin=1.0)
train_model(model, triplets, len(entity2id), len(relation2id), num_epochs=300, batch_size=512, lr=0.001, patience=10)

Epoch 1 | Avg Loss: 1.0728
Epoch 2 | Avg Loss: 1.0439
Epoch 3 | Avg Loss: 1.0144
Epoch 4 | Avg Loss: 0.9841
Epoch 5 | Avg Loss: 0.9533
Epoch 6 | Avg Loss: 0.9201
Epoch 7 | Avg Loss: 0.8886
Epoch 8 | Avg Loss: 0.8548
Epoch 9 | Avg Loss: 0.8242
Epoch 10 | Avg Loss: 0.7881
Epoch 11 | Avg Loss: 0.7539
Epoch 12 | Avg Loss: 0.7160
Epoch 13 | Avg Loss: 0.6819
Epoch 14 | Avg Loss: 0.6562
Epoch 15 | Avg Loss: 0.6266
Epoch 16 | Avg Loss: 0.5988
Epoch 17 | Avg Loss: 0.5767
Epoch 18 | Avg Loss: 0.5497
Epoch 19 | Avg Loss: 0.5221
Epoch 20 | Avg Loss: 0.5062
Epoch 21 | Avg Loss: 0.4836
Epoch 22 | Avg Loss: 0.4744
Epoch 23 | Avg Loss: 0.4638
Epoch 24 | Avg Loss: 0.4559
Epoch 25 | Avg Loss: 0.4421
Epoch 26 | Avg Loss: 0.4330
Epoch 27 | Avg Loss: 0.4272
Epoch 28 | Avg Loss: 0.4176
Epoch 29 | Avg Loss: 0.4015
Epoch 30 | Avg Loss: 0.3864
Epoch 31 | Avg Loss: 0.3894
Epoch 32 | Avg Loss: 0.3795
Epoch 33 | Avg Loss: 0.3721
Epoch 34 | Avg Loss: 0.3769
Epoch 35 | Avg Loss: 0.3616
Epoch 36 | Avg Loss: 0.3679
E

In [12]:
# Embedding Saving
# ------------------------
def save_embeddings(model, entity2id, relation2id):
    entity_embeddings = model.ent_embeddings.weight.detach().cpu().numpy()
    relation_embeddings = model.rel_embeddings.weight.detach().cpu().numpy()

    id2entity = {v: k for k, v in entity2id.items()}
    id2relation = {v: k for k, v in relation2id.items()}

    entity_df = pd.DataFrame(entity_embeddings)
    entity_df.insert(0, "entity", [id2entity[i] for i in range(len(id2entity))])
    entity_df.to_csv("entity_embeddings_transh.csv", index=False)

    relation_df = pd.DataFrame(relation_embeddings)
    relation_df.insert(0, "relation", [id2relation[i] for i in range(len(id2relation))])
    relation_df.to_csv("relation_embeddings_transh.csv", index=False)

In [18]:
import numpy as np
from collections import defaultdict
from tqdm import tqdm

# Hàm đánh giá 1 triplet
def evaluate_single_triplet_transH(head_id, rel_id, tail_id, entity_embeddings, relation_embeddings, image_embeddings, image_ids, img_entity_map, top_k=5):
    # Truy xuất vector embedding
    h_emb = entity_embeddings[head_id]
    r_emb = relation_embeddings[rel_id]
    t_emb = entity_embeddings[tail_id]

    # Vector dự đoán: h + r
    pred_vec = h_emb + r_emb

    # Tính khoảng cách tới các entity của từng ảnh
    scores = []
    for img_idx, img_id in enumerate(image_ids):
        entities_in_image = img_entity_map.get(img_id, set())
        if not entities_in_image:
            continue
        dists = [np.linalg.norm(pred_vec - entity_embeddings[e_id]) for e_id in entities_in_image]
        score = min(dists)  # Lấy khoảng cách ngắn nhất
        scores.append((img_id, score))

    # Lấy top-K ảnh gần nhất (khoảng cách nhỏ nhất)
    scores = sorted(scores, key=lambda x: x[1])[:top_k]
    predicted_ids = set(str(x[0]) for x in scores)
    
    return predicted_ids

In [19]:
def evaluate_transH_model(entity_embeddings, relation_embeddings, image_embeddings, image_ids, test_triplets, entity2id, relation2id, img_entity_map, all_map, top_k=5):
    gt_dict = defaultdict(set)
    for _, row in all_map.iterrows():
        key = (row["subject"], row["predicate"], row["object"])
        gt_dict[key].add(str(row["image_id"]))

    total_p, total_r, total_f1 = 0, 0, 0
    count = 0

    for _, row in tqdm(test_triplets.iterrows(), total=len(test_triplets)):
        query_key = (row["subject"], row["predicate"], row["object"])
        h = entity2id.get(row["subject"])
        r = relation2id.get(row["predicate"])
        t = entity2id.get(row["object"])
        if h is None or r is None or t is None:
            continue

        pred_ids = evaluate_single_triplet_transH(h, r, t, entity_embeddings, relation_embeddings, image_embeddings, image_ids, img_entity_map, top_k)
        true_ids = gt_dict.get(query_key, set())

        if not true_ids:
            continue

        tp = len(pred_ids & true_ids)
        fp = len(pred_ids - true_ids)
        fn = len(true_ids - pred_ids)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall    = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1        = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        total_p += precision
        total_r += recall
        total_f1 += f1
        count += 1

    if count > 0:
        print(f"\nEvaluation Results (top-{top_k}):")
        print(f"→ Precision@{top_k}: {total_p/count:.4f}")
        print(f"→ Recall@{top_k}:    {total_r/count:.4f}")
        print(f"→ F1-score@{top_k}:  {total_f1/count:.4f}")
    else:
        print("No valid triplets for evaluation.")

In [21]:
# Load ảnh và feature (đã trích xuất từ mô hình CNN)
image_embeddings = np.load("image_features.npy")  # shape (num_images, d)
image_ids = np.load("image_ids.npy")  # danh sách ID ảnh, dạng str hoặc int

In [22]:
from collections import defaultdict

# Load triplet mapping
df_all = pd.read_csv("f_coco_triplets.csv")  # phải có cột: subject, predicate, object, image_id

# Ánh xạ entity
entities = pd.concat([df_all["subject"], df_all["object"]]).unique()
entity2id = {entity: idx for idx, entity in enumerate(entities)}
relation2id = {rel: idx for idx, rel in enumerate(df_all["predicate"].unique())}

# Tạo ánh xạ image_id → tập entity_id
img_entity_map = defaultdict(set)
for _, row in df_all.iterrows():
    img_id = row["image_id"]
    subj = row["subject"]
    obj = row["object"]
    if subj in entity2id:
        img_entity_map[str(img_id)].add(entity2id[subj])
    if obj in entity2id:
        img_entity_map[str(img_id)].add(entity2id[obj])

In [32]:
# Lưu embedding sau khi huấn luyện
save_embeddings(model, entity2id, relation2id)

In [30]:
# Nếu bạn vừa train model:
entity_embeddings = model.ent_embeddings.weight.detach().cpu().numpy()
relation_embeddings = model.rel_embeddings.weight.detach().cpu().numpy()

In [33]:
entity_df = pd.read_csv("entity_embeddings_transh.csv")
relation_df = pd.read_csv("relation_embeddings_transh.csv")

# Tách ID và embedding
entity_embeddings = entity_df.drop("entity", axis=1).values
relation_embeddings = relation_df.drop("relation", axis=1).values

In [34]:
from sklearn.model_selection import train_test_split

# Chia tập test từ file gốc
df_test = df_all.drop_duplicates(subset=["subject", "predicate", "object", "image_id"])
test_triplets = df_test.sample(frac=0.1, random_state=42)  # 10% để test

# Chạy đánh giá
evaluate_transH_model(
    entity_embeddings=entity_embeddings,
    relation_embeddings=relation_embeddings,
    image_embeddings=image_embeddings,
    image_ids=[str(i) for i in image_ids],  # ép sang str nếu chưa phải str
    test_triplets=test_triplets,
    entity2id=entity2id,
    relation2id=relation2id,
    img_entity_map=img_entity_map,
    all_map=df_all,
    top_k=5
)

100%|██████████████████████████████████████████████████████████████████████████████| 320/320 [00:00<00:00, 4609.49it/s]


Evaluation Results (top-5):
→ Precision@5: 0.0000
→ Recall@5:    0.0000
→ F1-score@5:  0.0000





In [35]:
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

# Load embedding từ file CSV đã lưu
def load_embeddings(entity_path, relation_path):
    entity_df = pd.read_csv(entity_path)
    relation_df = pd.read_csv(relation_path)

    entity_names = entity_df["entity"].tolist()
    entity_vectors = entity_df.drop(columns=["entity"]).values

    relation_names = relation_df["relation"].tolist()
    relation_vectors = relation_df.drop(columns=["relation"]).values

    entity2id = {name: i for i, name in enumerate(entity_names)}
    relation2id = {name: i for i, name in enumerate(relation_names)}

    return entity_vectors, relation_vectors, entity2id, relation2id

# Load ảnh và embedding ảnh từ CNN
def load_image_embeddings(img_feat_path, img_id_path):
    image_embeddings = np.load(img_feat_path)
    image_ids = np.load(img_id_path)
    image_ids = [str(i) for i in image_ids]
    return image_embeddings, image_ids

# Tạo ánh xạ ảnh → entity
def build_img_entity_map(df_all, entity2id):
    img_entity_map = defaultdict(set)
    for _, row in df_all.iterrows():
        img_id = str(row["image_id"])
        for ent in [row["subject"], row["object"]]:
            if ent in entity2id:
                img_entity_map[img_id].add(entity2id[ent])
    return img_entity_map

# Đánh giá 1 triplet
def evaluate_single_triplet_transH(head_id, rel_id, tail_id, entity_embeddings, relation_embeddings, image_embeddings, image_ids, img_entity_map, top_k=5):
    pred_vec = entity_embeddings[head_id] + relation_embeddings[rel_id]
    scores = []

    for img_idx, img_id in enumerate(image_ids):
        entities_in_img = img_entity_map.get(img_id, set())
        if not entities_in_img:
            continue
        dists = [np.linalg.norm(pred_vec - entity_embeddings[eid]) for eid in entities_in_img]
        score = min(dists)
        scores.append((img_id, score))

    scores = sorted(scores, key=lambda x: x[1])[:top_k]
    return set([img_id for img_id, _ in scores])

# Hàm đánh giá toàn bộ
def evaluate_transH_model(entity_embeddings, relation_embeddings, image_embeddings, image_ids, test_triplets, entity2id, relation2id, img_entity_map, all_map, top_k=5):
    gt_dict = defaultdict(set)
    for _, row in all_map.iterrows():
        key = (row["subject"], row["predicate"], row["object"])
        gt_dict[key].add(str(row["image_id"]))

    total_p, total_r, total_f1 = 0, 0, 0
    count = 0

    for _, row in tqdm(test_triplets.iterrows(), total=len(test_triplets)):
        query_key = (row["subject"], row["predicate"], row["object"])
        h = entity2id.get(row["subject"])
        r = relation2id.get(row["predicate"])
        t = entity2id.get(row["object"])
        if h is None or r is None or t is None:
            continue

        pred_ids = evaluate_single_triplet_transH(h, r, t, entity_embeddings, relation_embeddings, image_embeddings, image_ids, img_entity_map, top_k)
        true_ids = gt_dict.get(query_key, set())

        if not true_ids:
            continue

        tp = len(pred_ids & true_ids)
        fp = len(pred_ids - true_ids)
        fn = len(true_ids - pred_ids)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall    = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1        = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        total_p += precision
        total_r += recall
        total_f1 += f1
        count += 1

    if count > 0:
        print(f"\nEvaluation Results (top-{top_k}):")
        print(f"→ Precision@{top_k}: {total_p/count:.4f}")
        print(f"→ Recall@{top_k}:    {total_r/count:.4f}")
        print(f"→ F1-score@{top_k}:  {total_f1/count:.4f}")
    else:
        print("Không có triplet hợp lệ để đánh giá.")

# ========== CHẠY ==========
if __name__ == "__main__":
    # Đường dẫn
    ENTITY_CSV = "entity_embeddings_transh.csv"
    REL_CSV = "relation_embeddings_transh.csv"
    IMAGE_FEAT = "image_features.npy"
    IMAGE_IDS = "image_ids.npy"
    TRIPLET_FILE = "f_coco_triplets.csv"

    # Tải dữ liệu
    entity_embeddings, relation_embeddings, entity2id, relation2id = load_embeddings(ENTITY_CSV, REL_CSV)
    image_embeddings, image_ids = load_image_embeddings(IMAGE_FEAT, IMAGE_IDS)
    df_all = pd.read_csv(TRIPLET_FILE)
    img_entity_map = build_img_entity_map(df_all, entity2id)

    # Tách tập test
    df_test = df_all.sample(frac=0.2, random_state=42)  # hoặc dùng tập riêng nếu có

    # Đánh giá
    evaluate_transH_model(
        entity_embeddings=entity_embeddings,
        relation_embeddings=relation_embeddings,
        image_embeddings=image_embeddings,
        image_ids=image_ids,
        test_triplets=df_test,
        entity2id=entity2id,
        relation2id=relation2id,
        img_entity_map=img_entity_map,
        all_map=df_all,
        top_k=5
    )


100%|██████████████████████████████████████████████████████████████████████████████| 663/663 [00:00<00:00, 3500.37it/s]


Evaluation Results (top-5):
→ Precision@5: 0.0000
→ Recall@5:    0.0000
→ F1-score@5:  0.0000



