In [1]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# === 1. Load the data from the JSON file ===
# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's locale default (e.g. cp1252 on Windows).
# NOTE(review): absolute local path -- adjust for your machine.
with open("E:/Download/annotations/captions_val2017.json", 'r', encoding="utf-8") as f:
    coco_data = json.load(f)

# === 2. Build a DataFrame from the annotations ===
annotations = coco_data['annotations']
df = pd.DataFrame(annotations)
df['image_id'] = df['image_id'].astype(str)
# File name = image id left-padded with zeros to 12 characters, plus ".jpg".
# Vectorized string ops instead of a per-row Python lambda.
df['image_file'] = df['image_id'].str.zfill(12) + ".jpg"

# === 3. Keep the first caption of each image (to avoid duplicates) ===
# One row per image_id; groupby orders the groups by the (string) image_id.
df_unique = df.groupby('image_id').first().reset_index()

# === 4. TF-IDF vectorization ===
# Fit the vocabulary on the retained captions, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df_unique['caption'])

In [2]:
# === 5. User query ===
query = "man riding horse"
# Project the query into the TF-IDF space fitted on the captions.
query_vec = vectorizer.transform([query])

# === 6. Cosine similarity between the query and every caption ===
similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

# === 7. Return the top-K matching images ===
top_k = 5
# Rank by descending score and take the K best candidates.
ranked = similarities.argsort()[::-1][:top_k]
# A score of 0 means the caption shares no term with the query; without this
# filter such captions would be reported as "top results" whenever fewer than
# K captions match (e.g. a query made only of stop words or unseen words).
top_indices = ranked[similarities[ranked] > 0]
print("Top kết quả truy vấn:")
if len(top_indices) == 0:
    print("- (không tìm thấy ảnh phù hợp)")
for idx in top_indices:
    row = df_unique.iloc[idx]  # look the row up once per hit
    print(f"- {row['image_file']} | Caption: {row['caption']} | Score: {similarities[idx]:.4f}")

Top kết quả truy vấn:
- 000000382111.jpg | Caption: A man riding on the back of a brown horse. | Score: 0.8222
- 000000023034.jpg | Caption: A man riding on the back of a brown horse. | Score: 0.8222
- 000000183675.jpg | Caption: A man is riding a horse in an open field. | Score: 0.7352
- 000000492282.jpg | Caption: A man who is riding a horse down a brick street. | Score: 0.6923
- 000000454798.jpg | Caption: there is a young boy that is riding a horse | Score: 0.6335


In [4]:
# === 5. User query ===
query = "A man is riding a pony"
# Project the query into the TF-IDF space fitted on the captions.
query_vec = vectorizer.transform([query])

# === 6. Cosine similarity between the query and every caption ===
similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

# === 7. Return the top-K matching images ===
top_k = 5
# Rank by descending score and take the K best candidates.
ranked = similarities.argsort()[::-1][:top_k]
# A score of 0 means the caption shares no term with the query; without this
# filter such captions would be reported as "top results" whenever fewer than
# K captions match (e.g. a query made only of stop words or unseen words).
top_indices = ranked[similarities[ranked] > 0]
print("Top kết quả truy vấn:")
if len(top_indices) == 0:
    print("- (không tìm thấy ảnh phù hợp)")
for idx in top_indices:
    row = df_unique.iloc[idx]  # look the row up once per hit
    print(f"- {row['image_file']} | Caption: {row['caption']} | Score: {similarities[idx]:.4f}")

Top kết quả truy vấn:
- 000000050380.jpg | Caption: A  man guiding a pony with a boy riding on it. | Score: 0.6792
- 000000546823.jpg | Caption: a small pony a with two smaller ponies standing in front of it | Score: 0.4480
- 000000546976.jpg | Caption: A man riding on the back of a motorcycle. | Score: 0.3849
- 000000396863.jpg | Caption: The young girl wearing red and black rides atop the white pony. | Score: 0.3795
- 000000348243.jpg | Caption: A chubby black pony in a pasture looking ahead. | Score: 0.3757
