In [1]:
import pandas as pd
import networkx as nx
import dgl
import torch
import numpy as np
import pickle
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
from dgl.nn import GraphConv
import torch.nn.functional as F
import os

# 1. Load the triplets and build the NetworkX graph.
# Rows with any missing field are dropped and every column is cast to str.
df = pd.read_csv('f_coco_triplets.csv').dropna().astype(str)
triplets = list(zip(df['subject'], df['predicate'], df['object'], df['image_id']))

# Each triplet becomes a path  subject -> "<s>_<p>_<o>" -> object, with both
# edges tagged by the predicate and the middle node tagged by the image id.
# NOTE(review): the middle node name depends only on (s, p, o), so when the
# same triplet occurs in several images the node is shared and its 'image_id'
# attribute keeps only the value from the last row processed.
G = nx.DiGraph()
for s, p, o, img_id in triplets:
    triplet_node = f"{s}_{p}_{o}"
    G.add_edges_from(
        [(s, triplet_node), (triplet_node, o)],
        relation=p,
    )
    G.nodes[triplet_node]['image_id'] = img_id

In [2]:
# 2. Convert the NetworkX graph into a DGL graph.
# Node names are mapped to integer ids with a LabelEncoder (kept in `le`).
all_nodes = list(G.nodes)
le = LabelEncoder().fit(all_nodes)
node_ids = le.transform(all_nodes)
node_id_map = {name: node_id for name, node_id in zip(all_nodes, node_ids)}

# Translate every edge endpoint to its integer id.
src, dst = [], []
for u, v in G.edges():
    src.append(node_id_map[u])
    dst.append(node_id_map[v])

# Self-loops are added on top of the original edges.
g = dgl.add_self_loop(dgl.graph((src, dst)))

# One-hot node features: a dense identity matrix with one row per node.
# NOTE(review): this is len(all_nodes) x len(all_nodes) floats in memory.
features = torch.eye(len(all_nodes))

In [6]:
# 3. Assign labels: every middle (triplet) node gets its image_id as class label.
# Nodes without an 'image_id' attribute keep the sentinel label -1.
node_labels = -1 * np.ones(len(all_nodes), dtype=np.int32)
image_id_set = sorted({G.nodes[n]['image_id'] for n in G.nodes if 'image_id' in G.nodes[n]})
image_id_encoder = LabelEncoder().fit(image_id_set)

# Build the image_id -> class index lookup once. Calling
# image_id_encoder.transform([...]) per node repeats the encoder's lookup work
# for every node; the position in classes_ is the same value transform returns.
image_id_to_class = {
    image_id: class_idx
    for class_idx, image_id in enumerate(image_id_encoder.classes_.tolist())
}

for node, attrs in G.nodes(data=True):
    if 'image_id' in attrs:
        node_labels[node_id_map[node]] = image_id_to_class[attrs['image_id']]

labels = torch.tensor(node_labels, dtype=torch.long)
mask = labels != -1  # only nodes that carry an image_id take part in training

In [9]:
# 4. Define the GCN model
class GCN(nn.Module):
    """Two stacked DGL ``GraphConv`` layers with a ReLU in between.

    Args:
        in_feats: size of each input node feature vector.
        h_feats: size of the hidden representation after the first layer.
        out_feats: size of the output produced by the second layer.
    """

    def __init__(self, in_feats, h_feats, out_feats):
        super().__init__()
        # Attribute names are part of the saved state_dict keys.
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, out_feats)

    def forward(self, g, x):
        """Apply conv1 -> ReLU -> conv2 on graph ``g`` with node features ``x``.

        No activation is applied to the second layer's output.
        """
        hidden = F.relu(self.conv1(g, x))
        return self.conv2(g, hidden)

# Hyper-parameters for the model and optimizer, named so they are easy to tune.
SEED = 42
HIDDEN_DIM = 64
LEARNING_RATE = 0.01

# Seed PyTorch before the layers are created so that weight initialisation
# (and therefore the training run) is repeatable across kernel restarts.
torch.manual_seed(SEED)

# One output unit per distinct image id seen on the labelled nodes.
num_classes = len(image_id_encoder.classes_)
model = GCN(features.shape[1], HIDDEN_DIM, num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [11]:
# 5. Train the model (full-graph training, loss restricted to labelled nodes).
# NOTE(review): the recorded output under this cell lists epochs up to 190,
# while this loop runs range(100) -- the output looks stale or the cell was
# executed more than once on the same `model`; re-run from a fresh kernel to confirm.
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    logits = model(g, features)
    # Only nodes selected by `mask` contribute to the cross-entropy loss.
    loss = F.cross_entropy(logits[mask], labels[mask])
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch} | Loss: {loss.item():.4f}")

Epoch 0 | Loss: 0.0356
Epoch 10 | Loss: 0.0257
Epoch 20 | Loss: 0.0205
Epoch 30 | Loss: 0.0172
Epoch 40 | Loss: 0.0149
Epoch 50 | Loss: 0.0131
Epoch 60 | Loss: 0.0117
Epoch 70 | Loss: 0.0105
Epoch 80 | Loss: 0.0095
Epoch 90 | Loss: 0.0087
Epoch 100 | Loss: 0.0080
Epoch 110 | Loss: 0.0073
Epoch 120 | Loss: 0.0068
Epoch 130 | Loss: 0.0063
Epoch 140 | Loss: 0.0059
Epoch 150 | Loss: 0.0055
Epoch 160 | Loss: 0.0051
Epoch 170 | Loss: 0.0048
Epoch 180 | Loss: 0.0045
Epoch 190 | Loss: 0.0043


In [12]:
# 6. Extract node embeddings from the trained model.
# The values kept are the model's final-layer outputs (one row per node),
# computed in eval mode with gradient tracking disabled.
model.eval()
with torch.no_grad():
    final_layer_output = model(g, features)
node_embeddings = final_layer_output

In [13]:
# 7. Compute the mean embedding of each image.
embeddings = node_embeddings.numpy()

# Collect the embedding rows of all nodes tagged with each image id.
vectors_by_image = defaultdict(list)
for node, idx in node_id_map.items():
    attrs = G.nodes[node]
    if 'image_id' in attrs:
        vectors_by_image[attrs['image_id']].append(embeddings[idx])

# Store the averages in a plain dict. Previously the same defaultdict(list)
# was reused for the results, so the object (which is pickled later) would
# silently insert an empty list when looked up with an unknown image id
# instead of raising KeyError.
image_embeddings = {
    img: np.mean(np.stack(vecs), axis=0)
    for img, vecs in vectors_by_image.items()
}

In [14]:
# 8. Persist the results: model weights, raw node embeddings, per-image
# embeddings and the node-name LabelEncoder.
os.makedirs("saved_model_GCN", exist_ok=True)

torch.save(model.state_dict(), "saved_model_GCN/gcn_model_weights.pt")
np.save("saved_model_GCN/node_embeddings.npy", node_embeddings.numpy())

# Objects that are stored with pickle, as (destination, object) pairs.
pickle_targets = (
    ("saved_model_GCN/image_embeddings.pkl", image_embeddings),
    ("saved_model_GCN/node_label_encoder.pkl", le),
)
for pkl_path, payload in pickle_targets:
    with open(pkl_path, "wb") as f:
        pickle.dump(payload, f)

print("Đã huấn luyện xong GCN và lưu toàn bộ embedding.")

Đã huấn luyện xong GCN và lưu toàn bộ embedding.


In [19]:
# === Build and save entity_encoder (used for image search by triplet caption) ===
entities = pd.unique(df[['subject', 'object']].values.ravel())
entity_encoder = LabelEncoder().fit(entities)

# Encode every known entity once instead of calling transform() per triplet
# element; the values are exactly what entity_encoder.transform returns.
entity_to_idx = dict(zip(
    entity_encoder.classes_.tolist(),
    entity_encoder.transform(entity_encoder.classes_),
))

# Build entity_idx_to_images: encoded entity -> list of integer image ids
# (one entry per triplet occurrence, duplicates are kept).
entity_idx_to_images = defaultdict(list)
skipped_entries = 0
for s, p, o, img_id in triplets:
    try:
        image_number = int(img_id)
    except ValueError:
        # image_id is not an integer literal (e.g. "12.0"); the row is skipped
        # but counted, rather than being swallowed silently by a bare except.
        skipped_entries += 2
        continue
    for e in (s, o):
        idx = entity_to_idx.get(e)
        if idx is None:
            # Entity unknown to the encoder (triplets and df out of sync).
            skipped_entries += 1
            continue
        entity_idx_to_images[idx].append(image_number)

if skipped_entries:
    print(f"Warning: skipped {skipped_entries} entity/image entries while building entity_idx_to_images")

# === Save the additional artifacts needed for caption search (two files are written here) ===
with open("saved_model_GCN/entity_encoder.pkl", "wb") as f:
    pickle.dump(entity_encoder, f)

with open("saved_model_GCN/entity_idx_to_images.pkl", "wb") as f:
    pickle.dump(entity_idx_to_images, f)

In [15]:
import pandas as pd
from collections import defaultdict
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the triplet file (rows with missing fields dropped, all values as str).
df = pd.read_csv("f_coco_triplets.csv").dropna().astype(str)
triplet_rows = list(zip(df["subject"], df["predicate"], df["object"], df["image_id"]))

# Index: term -> set of images in which it appears. Subjects, predicates and
# objects all share this one key space.
entity_to_images = defaultdict(set)
for subj, pred, obj, image in triplet_rows:
    for term in (subj, pred, obj):
        entity_to_images[term].add(image)

# Ground truth: two images count as "related" when they share a subject,
# a predicate or an object.
# NOTE(review): sharing only a predicate is enough to be related, so a
# frequent predicate can make the ground-truth sets very large.
ground_truth = defaultdict(set)
for subj, pred, obj, image in triplet_rows:
    related = entity_to_images[subj] | entity_to_images[pred] | entity_to_images[obj]
    related.discard(image)  # an image is not related to itself
    ground_truth[image].update(related)

In [16]:
import pickle

# Load the per-image embeddings written by the save step.
with open("saved_model_GCN/image_embeddings.pkl", "rb") as f:
    image_embeddings = pickle.load(f)

# Row i of embedding_matrix belongs to image_ids[i] (dict insertion order).
image_ids = list(image_embeddings)
embedding_matrix = np.stack(list(image_embeddings.values()))

In [17]:
# Number of neighbours retrieved per query image.
TOP_K = 5

predicted = {}

# Pairwise cosine similarity between all image embeddings (N x N).
similarity_matrix = cosine_similarity(embedding_matrix)

for i, query_id in enumerate(image_ids):
    # Indices sorted by decreasing similarity to the query.
    ranked = np.argsort(-similarity_matrix[i])
    # image_ids are unique, so the query itself is exactly position i. It can
    # take up at most one of the first TOP_K + 1 ranks, so looking at that
    # prefix is enough and avoids building an N-element list for every query.
    neighbours = [j for j in ranked[:TOP_K + 1] if j != i][:TOP_K]
    predicted[query_id] = [image_ids[j] for j in neighbours]

In [18]:
def evaluate_image_retrieval(ground_truth, predicted, top_k=5):
    """Compute macro-averaged precision, recall and F1 at ``top_k``.

    Args:
        ground_truth: mapping query id -> set of relevant ids.
        predicted: mapping query id -> ranked list of retrieved ids. Queries
            missing from this mapping are scored as if nothing was retrieved.
        top_k: only the first ``top_k`` predictions of each query are used.

    Returns:
        Tuple ``(precision, recall, f1)``, each averaged over all queries in
        ``ground_truth``. Returns ``(0.0, 0.0, 0.0)`` when ``ground_truth``
        is empty instead of dividing by zero.
    """
    if not ground_truth:
        return 0.0, 0.0, 0.0

    precision_list = []
    recall_list = []
    f1_list = []

    for query_id, gt_set in ground_truth.items():
        pred_list = predicted.get(query_id, [])[:top_k]
        true_positive = len(gt_set & set(pred_list))

        # Precision is relative to what was actually returned (may be < top_k).
        precision = true_positive / len(pred_list) if pred_list else 0.0
        recall = true_positive / len(gt_set) if gt_set else 0.0
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    num_queries = len(precision_list)
    precision_avg = sum(precision_list) / num_queries
    recall_avg = sum(recall_list) / num_queries
    f1_avg = sum(f1_list) / num_queries

    return precision_avg, recall_avg, f1_avg

# Run the evaluation and report the macro-averaged metrics at k = 5.
retrieval_scores = evaluate_image_retrieval(ground_truth, predicted, top_k=5)
p, r, f1 = retrieval_scores
print(
    f"Precision@5: {p:.4f}",
    f"Recall@5:    {r:.4f}",
    f"F1-score@5:  {f1:.4f}",
    sep="\n",
)

Precision@5: 0.6493
Recall@5:    0.0364
F1-score@5:  0.0594
