In [23]:
import json
from pathlib import Path
import os
import re

def ignore_sentence_contain_char(
    sentence,
    conditions=(
        ('logo', 'htv'),
        ('thời gian phát sóng',),
        ('dưới', 'dòng chữ'),
        ('bên phải', 'htv'),
    ),
    regex_char=(r'([01]\d|2[0-3]):[0-5]\d:[0-5]\d',),
):
    """Decide whether a sentence should be kept.

    Args:
        sentence: Text to inspect.
        conditions: Iterable of term groups. A group matches when EVERY term
            in it is found in ``sentence``. Terms are passed to ``re.search``,
            so they are interpreted as regular expressions and matched
            case-sensitively. Empty groups are ignored.
        regex_char: Iterable of regular expressions; any single match rejects
            the sentence.

    Returns:
        ``False`` if any condition group or any pattern matches (the sentence
        should be ignored), ``True`` otherwise.
    """
    for condition in conditions:
        # Skip empty groups: all() over nothing is True and would otherwise
        # reject every sentence.
        if condition and all(re.search(term, sentence) for term in condition):
            return False

    # The patterns are checked even when ``conditions`` is empty; an empty
    # condition list must not bypass the regex filter.
    return not any(re.search(pattern, sentence) for pattern in regex_char)

def read_image_captions(folder_path):
    """Load every ``*.jsonl`` file located directly inside ``folder_path``.

    Returns:
        A dict that maps each file's name without its final extension to the
        list returned by ``read_jsonl_to_list`` for that file. The dict is
        empty when no ``*.jsonl`` file is found.
    """
    return {
        jsonl_path.stem: read_jsonl_to_list(jsonl_path)
        for jsonl_path in Path(folder_path).glob("*.jsonl")
    }

def read_jsonl_to_list(file_path):
    """Read a JSON Lines file into a flat list of ``[key, value]`` pairs.

    Every line is expected to hold one JSON object. Each of its items becomes
    one ``[key, value]`` entry, where the key is cut at its first ``.`` (for
    example ``"0001.jpg"`` becomes ``"0001"``). Values are stored unchanged;
    no filtering is applied here.

    Blank lines are skipped silently. Lines that are not valid JSON, or whose
    top-level JSON value is not an object, are reported on stdout and skipped.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        List of ``[key, value]`` lists in file order.
    """
    result = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # A trailing or separating blank line is not an error.
                continue
            try:
                json_obj = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON line: {line}")
                continue
            if not isinstance(json_obj, dict):
                # Valid JSON such as a list or number has no .items().
                print(f"Skipping non-object JSON line: {line}")
                continue
            for key, value in json_obj.items():
                result.append([key.split('.')[0], value])
    return result

# Load all caption files once. Keys are the .jsonl file names without their
# extension; values are lists of [key, caption] pairs. The path is relative to
# the notebook's working directory.
image_captions = read_image_captions("data/image-caption")

In [None]:
from transformers import PhobertTokenizer, BertModel
import torch
import numpy as np

# PhoBERT checkpoints use the RoBERTa architecture, so the encoder is loaded
# through AutoModel, which selects the model class from the checkpoint config.
# BertModel does not match this checkpoint's architecture.
from transformers import AutoModel

tokenizer = PhobertTokenizer.from_pretrained("vinai/phobert-base")
model = AutoModel.from_pretrained("vinai/phobert-base")

# Fall back to CPU so the notebook still runs on machines without a GPU, and
# use the same `device` object for the model and for the inputs moved later.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Inference only: make sure dropout is disabled.
model.eval()

In [None]:
# Rank the captions of one caption file by cosine similarity to a query sentence.
text_input = "Một cánh đồng cây xanh tươi tốt. Các cây được trồng theo hàng dọc, tạo thành những hình dạng độc đáo."

# Inference only: no_grad avoids building an autograd graph for each forward pass.
with torch.no_grad():
    inputs_sample = tokenizer(text_input, return_tensors="pt", padding=True, truncation=True)
    inputs_sample = {k: v.to(device) for k, v in inputs_sample.items()}
    outputs_sample = model(**inputs_sample)
# Mean-pool the token states into a single sentence vector (one row per input).
embeddings_sample = outputs_sample.last_hidden_state.mean(dim=1).detach().cpu().numpy()
# The query norm does not change inside the loop, so compute it once.
sample_norm = np.linalg.norm(embeddings_sample)

caption_scores = []

for caption in image_captions["L13_V001"]:
    # Each entry is a [key, caption_text] pair from read_jsonl_to_list. Only the
    # text is embedded: passing the whole pair would encode a batch of two
    # strings and score the key instead of the caption.
    caption_text = caption[1]
    with torch.no_grad():
        inputs_candidate = tokenizer(caption_text, return_tensors="pt", padding=True, truncation=True)
        inputs_candidate = {k: v.to(device) for k, v in inputs_candidate.items()}
        outputs_candidate = model(**inputs_candidate)
    embeddings_candidate = outputs_candidate.last_hidden_state.mean(dim=1).detach().cpu().numpy()
    # Cosine similarity between the two (1, hidden) sentence vectors.
    similarity = np.dot(embeddings_sample, embeddings_candidate.T) / (sample_norm * np.linalg.norm(embeddings_candidate))
    caption_scores.append([caption_text, float(similarity[0][0])])
# Best match first.
caption_scores = sorted(caption_scores, key = lambda item: item[-1], reverse=True)
print(caption_scores)