In [5]:
import csv
from pathlib import Path

from ns_vfs.common.utility import save_frames
from ns_vfs.config.loader import load_config
from ns_vfs.data.frame import BenchmarkLTLFrame, FramesofInterest
from ns_vfs.frame_searcher import FrameSearcher
from ns_vfs.model.vision.grounding_dino import GroundingDino
from ns_vfs.processor.benchmark_video_processor import BenchmarkVideoFrameProcessor
from ns_vfs.video_to_automaton import VideotoAutomaton
from common import get_available_benchmark_video
from ns_vfs.model.vision.yolo import Yolo
from common import get_available_benchmark_video, get_precision_recall_f1_score
from ns_vfs.common.utility import save_dict_to_pickle

**Global Variable**

In [6]:
config = load_config()
benchmark_frame_video_root_dir = Path(
    "/opt/Neuro-Symbolic-Video-Frame-Search/artifacts/test_benchmark_frame_video/"
)
benchmark_image_set_dir = [x for x in benchmark_frame_video_root_dir.iterdir() if x.is_dir()]
cv_model_list = ["grounding_dino", "yolo"]

**Local Variable for the experiment**

In [7]:
"""
It will go over all available benchmark video and search for frame of interest for each cv detection model.
+ No manual confidence score
"""
result = {}
for benchmark_name_dir in benchmark_image_set_dir:
    ltl_video_dir_set = [x for x in benchmark_name_dir.iterdir() if x.is_dir()]
    if len(ltl_video_dir_set) > 0:
        print(f"--processing {benchmark_name_dir.name}--")
        print(f"number of ltl rule: {len(ltl_video_dir_set)}")
        result[benchmark_name_dir.name] = {}
        for ltl_video_dir in ltl_video_dir_set:
            result[benchmark_name_dir.name][ltl_video_dir] = {}
            benchmark_video_file_list = get_available_benchmark_video(ltl_video_dir)
            print(f"number of examples of {ltl_video_dir.name}: {len(benchmark_video_file_list)}")

            for benchmark_video_file in benchmark_video_file_list:
                ltl_formula = benchmark_video_file.name.split(".")[0].split("_ltl_")[-1]
                # result[benchmark_name_dir.name][ltl_video_dir.name][ltl_formula] = {}
                search_result_per_video = {}

                for cv_model in cv_model_list:
                    if cv_model == "yolo":
                        cv_detection_model = Yolo(config=config.YOLO,
                                                  weight_path=config.YOLO.YOLO_CHECKPOINT_PATH)
                    elif cv_model == "grounding_dino":
                        cv_detection_model = GroundingDino(
                                config=config.GROUNDING_DINO,
                                weight_path=config.GROUNDING_DINO.GROUNDING_DINO_CHECKPOINT_PATH,
                                config_path=config.GROUNDING_DINO.GROUNDING_DINO_CONFIG_PATH,
                            )
                    benchmark_video_processor = BenchmarkVideoFrameProcessor(
                        video_path=benchmark_video_file,
                        artifact_dir=config.VERSION_AND_PATH.ARTIFACTS_PATH)

                    benchmark_video: BenchmarkLTLFrame = benchmark_video_processor.benchmark_image_frames

                    video_automata_builder = VideotoAutomaton(
                        detector=cv_detection_model,
                        video_processor=benchmark_video_processor,
                        artifact_dir=config.VERSION_AND_PATH.ARTIFACTS_PATH,
                        proposition_set=benchmark_video.proposition,
                        save_annotation=False,  # TODO: Debug only
                        save_image=False,  # TODO: Debug only
                        ltl_formula=f"P>=0.80 [{benchmark_video.ltl_formula}]",
                        verbose=False,
                    )
                    frame_sercher = FrameSearcher(
                        video_automata_builder=video_automata_builder,
                        video_processor=benchmark_video_processor,
                    )

                    frame_of_interest = frame_sercher.search()
                    # search_result_per_video 
                    search_result_per_video["benchmark_video"] = benchmark_video
                    search_result_per_video[cv_model] = frame_of_interest

                # classification_metrics
                search_result_per_video = get_precision_recall_f1_score(search_result_per_video)
                result[ltl_formula] = search_result_per_video
save_dict_to_pickle(result, path="/opt/Neuro-Symbolic-Video-Frame-Search/artifacts/", file_name="all_test_result.pkl")

--processing cifar100--
number of ltl rule: 1
number of examples of Fprop1: 12
final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 6.3ms
Speed: 5.3ms preprocess, 6.3ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 1.9ms preprocess, 6.3ms inference, 7.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 8.7ms
Speed: 2.7ms preprocess, 8.7ms inference, 11.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.1ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.1ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 3.2ms preprocess, 6.3ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.2ms
Speed: 1.7ms preprocess, 4.2ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.4ms
Speed: 1.7ms preprocess, 4.4ms inference, 0

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 6.2ms
Speed: 2.3ms preprocess, 6.2ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.8ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 11.0ms
Speed: 4.8ms preprocess, 11.0ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.9ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.1ms
Speed: 3.0ms preprocess, 4.1ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.2ms
Speed: 1.7ms preprocess, 4.2ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.5ms
Speed: 1.8ms preprocess, 6.5ms inference, 11.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.2ms
Speed: 3.1ms preprocess, 6.2ms inference,

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 6.3ms
Speed: 6.5ms preprocess, 6.3ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.6ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 8.7ms
Speed: 3.0ms preprocess, 8.7ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 5.2ms
Speed: 1.8ms preprocess, 5.2ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 5.7ms
Speed: 1.9ms preprocess, 5.7ms inference, 12.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 3.0ms preprocess, 6.3ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 7.0ms
Speed: 1.8ms preprocess, 7.0ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.0ms preprocess, 6.4ms inference, 

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 6.3ms
Speed: 2.3ms preprocess, 6.3ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 8.8ms
Speed: 3.7ms preprocess, 8.8ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.8ms preprocess, 6.4ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.0ms preprocess, 6.4ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 2.9ms preprocess, 6.3ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 5.7ms
Speed: 1.7ms preprocess, 5.7ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 5.8ms
Speed: 1.8ms preprocess, 5.8ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 3.8ms preprocess, 6.3ms inference, 1

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 6.3ms
Speed: 2.3ms preprocess, 6.3ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.1ms
Speed: 2.7ms preprocess, 6.1ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.9ms preprocess, 6.4ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.1ms
Speed: 3.1ms preprocess, 4.1ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.1ms
Speed: 3.0ms preprocess, 4.1ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.1ms preprocess, 6.4ms inference, 8.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.0ms
Speed: 1.6ms preprocess, 4.0ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.8ms
Speed: 1.7ms preprocess, 4.8ms inference, 1

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 6.3ms
Speed: 2.3ms preprocess, 6.3ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 2.7ms preprocess, 6.3ms inference, 5.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.7ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.1ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.1ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.0ms preprocess, 6.4ms inference, 11.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.2ms
Speed: 3.2ms preprocess, 4.2ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.3ms
Speed: 1.8ms preprocess, 4.3ms inference, 0

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 6.4ms
Speed: 5.9ms preprocess, 6.4ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.2ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 8.7ms
Speed: 3.0ms preprocess, 8.7ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.9ms preprocess, 6.4ms inference, 11.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.5ms
Speed: 2.8ms preprocess, 4.5ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.6ms
Speed: 1.7ms preprocess, 4.6ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 10.6ms
Speed: 2.0ms preprocess, 10.6ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 2.8ms preprocess, 6.3ms inference,

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 6.3ms
Speed: 2.3ms preprocess, 6.3ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 5.2ms
Speed: 2.9ms preprocess, 5.2ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 8.8ms
Speed: 5.0ms preprocess, 8.8ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.0ms preprocess, 6.4ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.1ms
Speed: 3.2ms preprocess, 4.1ms inference, 11.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.2ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.2ms
Speed: 1.7ms preprocess, 4.2ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.4ms
Speed: 1.7ms preprocess, 4.4ms inference, 0

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 6.4ms
Speed: 2.3ms preprocess, 6.4ms inference, 11.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.8ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 8.8ms
Speed: 2.7ms preprocess, 8.8ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.1ms
Speed: 3.1ms preprocess, 4.1ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.3ms
Speed: 1.7ms preprocess, 4.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 1.9ms preprocess, 6.3ms inference, 11.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.0ms
Speed: 3.2ms preprocess, 4.0ms inference, 7.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.1ms
Speed: 2.9ms preprocess, 4.1ms inference, 11

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 10.9ms
Speed: 7.9ms preprocess, 10.9ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.0ms preprocess, 6.4ms inference, 13.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.9ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.2ms preprocess, 6.4ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 3.0ms preprocess, 6.3ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.2ms
Speed: 1.8ms preprocess, 4.2ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.7ms
Speed: 1.9ms preprocess, 4.7ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 2.9ms preprocess, 6.3ms inference,

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 6.3ms
Speed: 2.3ms preprocess, 6.3ms inference, 11.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.2ms
Speed: 2.2ms preprocess, 6.2ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.0ms preprocess, 6.4ms inference, 11.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.1ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 3.2ms preprocess, 6.3ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.2ms preprocess, 6.4ms inference, 3.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.4ms
Speed: 1.7ms preprocess, 4.4ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.7ms
Speed: 1.9ms preprocess, 4.7ms inference, 7

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 5.2ms
Speed: 2.3ms preprocess, 5.2ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 5.3ms
Speed: 1.6ms preprocess, 5.3ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 7.0ms
Speed: 1.7ms preprocess, 7.0ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 3.0ms preprocess, 6.3ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.3ms
Speed: 2.6ms preprocess, 6.3ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.0ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 3.0ms preprocess, 6.4ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.8ms preprocess, 6.4ms inference, 1

--processing coco--
number of ltl rule: 4
number of examples of Gprop2: 10
final text_encoder_type: bert-base-uncased



0: 448x640 (no detections), 161.4ms
Speed: 6.5ms preprocess, 161.4ms inference, 14.8ms postprocess per image at shape (1, 3, 448, 640)

0: 640x640 (no detections), 13.4ms
Speed: 3.2ms preprocess, 13.4ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 640)

0: 480x640 (no detections), 143.0ms
Speed: 2.8ms preprocess, 143.0ms inference, 18.8ms postprocess per image at shape (1, 3, 480, 640)

0: 640x640 (no detections), 13.4ms
Speed: 2.6ms preprocess, 13.4ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 640)

0: 448x640 2 forks, 13.3ms
Speed: 2.7ms preprocess, 13.3ms inference, 2.6ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 2 forks, 4.4ms
Speed: 1.0ms preprocess, 4.4ms inference, 1.0ms postprocess per image at shape (1, 3, 448, 640)

0: 640x480 (no detections), 131.1ms
Speed: 1.2ms preprocess, 131.1ms inference, 11.3ms postprocess per image at shape (1, 3, 640, 480)

0: 480x640 1 fork, 5.0ms
Speed: 1.7ms preprocess, 5.0ms inference, 1.1ms po

final text_encoder_type: bert-base-uncased



0: 448x640 (no detections), 5.2ms
Speed: 1.2ms preprocess, 5.2ms inference, 5.6ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.1ms
Speed: 2.7ms preprocess, 6.1ms inference, 18.6ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.7ms
Speed: 1.4ms preprocess, 4.7ms inference, 18.7ms postprocess per image at shape (1, 3, 448, 640)

0: 480x640 (no detections), 13.1ms
Speed: 3.1ms preprocess, 13.1ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 3 apples, 13.3ms
Speed: 3.1ms preprocess, 13.3ms inference, 21.9ms postprocess per image at shape (1, 3, 448, 640)

0: 480x640 (no detections), 4.6ms
Speed: 1.4ms preprocess, 4.6ms inference, 0.6ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 5.7ms
Speed: 1.2ms preprocess, 5.7ms inference, 0.9ms postprocess per image at shape (1, 3, 448, 640)

0: 640x640 (no detections), 13.2ms
Speed: 3.9ms preprocess, 13.2ms inference, 18.

final text_encoder_type: bert-base-uncased



0: 480x640 (no detections), 13.1ms
Speed: 2.3ms preprocess, 13.1ms inference, 14.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.3ms
Speed: 1.4ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 13.2ms
Speed: 3.0ms preprocess, 13.2ms inference, 18.8ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.2ms
Speed: 2.8ms preprocess, 6.2ms inference, 18.8ms postprocess per image at shape (1, 3, 448, 640)

0: 640x480 (no detections), 5.0ms
Speed: 1.5ms preprocess, 5.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 480)

0: 224x640 1 airplane, 136.1ms
Speed: 1.0ms preprocess, 136.1ms inference, 15.7ms postprocess per image at shape (1, 3, 224, 640)

0: 640x640 2 airplanes, 4.8ms
Speed: 2.4ms preprocess, 4.8ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 416x640 (no detections), 13.2ms
Speed: 2.6ms preprocess, 13.2ms inference, 

final text_encoder_type: bert-base-uncased



0: 448x640 (no detections), 13.1ms
Speed: 6.5ms preprocess, 13.1ms inference, 10.5ms postprocess per image at shape (1, 3, 448, 640)

0: 640x512 (no detections), 13.3ms
Speed: 2.5ms preprocess, 13.3ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 512)

0: 640x416 (no detections), 13.2ms
Speed: 2.9ms preprocess, 13.2ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 416)

0: 448x640 (no detections), 13.2ms
Speed: 3.0ms preprocess, 13.2ms inference, 10.4ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.3ms
Speed: 1.3ms preprocess, 4.3ms inference, 0.6ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.5ms
Speed: 0.9ms preprocess, 4.5ms inference, 0.7ms postprocess per image at shape (1, 3, 448, 640)

0: 352x640 (no detections), 13.0ms
Speed: 3.0ms preprocess, 13.0ms inference, 18.6ms postprocess per image at shape (1, 3, 352, 640)

0: 480x640 (no detections), 13.2ms
Speed: 1.3ms preprocess, 13.2ms 

final text_encoder_type: bert-base-uncased



0: 480x640 1 sandwich, 13.1ms
Speed: 2.3ms preprocess, 13.1ms inference, 15.2ms postprocess per image at shape (1, 3, 480, 640)

0: 640x480 (no detections), 13.3ms
Speed: 2.6ms preprocess, 13.3ms inference, 11.7ms postprocess per image at shape (1, 3, 640, 480)

0: 640x448 (no detections), 13.3ms
Speed: 3.0ms preprocess, 13.3ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 448)

0: 448x640 (no detections), 13.3ms
Speed: 2.9ms preprocess, 13.3ms inference, 15.1ms postprocess per image at shape (1, 3, 448, 640)

0: 640x512 (no detections), 4.9ms
Speed: 1.4ms preprocess, 4.9ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 512)

0: 448x640 (no detections), 4.8ms
Speed: 1.2ms preprocess, 4.8ms inference, 0.7ms postprocess per image at shape (1, 3, 448, 640)

0: 480x640 (no detections), 13.4ms
Speed: 1.3ms preprocess, 13.4ms inference, 18.6ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 13.3ms
Speed: 3.0ms preprocess, 13.3ms infer

final text_encoder_type: bert-base-uncased



0: 448x640 (no detections), 5.4ms
Speed: 1.3ms preprocess, 5.4ms inference, 10.3ms postprocess per image at shape (1, 3, 448, 640)

0: 480x640 1 sandwich, 13.1ms
Speed: 2.9ms preprocess, 13.1ms inference, 10.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.3ms
Speed: 2.2ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 640x640 (no detections), 13.4ms
Speed: 2.9ms preprocess, 13.4ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 640)

0: 448x640 1 sandwich, 13.2ms
Speed: 2.2ms preprocess, 13.2ms inference, 15.0ms postprocess per image at shape (1, 3, 448, 640)

0: 480x640 (no detections), 5.1ms
Speed: 1.3ms preprocess, 5.1ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 4.7ms
Speed: 1.5ms preprocess, 4.7ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

0: 544x640 (no detections), 13.1ms
Speed: 6.6ms preprocess, 13.1ms inference, 18.

final text_encoder_type: bert-base-uncased



0: 480x640 (no detections), 13.1ms
Speed: 6.5ms preprocess, 13.1ms inference, 8.2ms postprocess per image at shape (1, 3, 480, 640)

0: 640x448 (no detections), 13.2ms
Speed: 2.3ms preprocess, 13.2ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 6.2ms
Speed: 2.2ms preprocess, 6.2ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 448)

0: 640x640 (no detections), 13.4ms
Speed: 2.4ms preprocess, 13.4ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 640)

0: 448x640 (no detections), 4.6ms
Speed: 1.1ms preprocess, 4.6ms inference, 1.2ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.4ms
Speed: 1.1ms preprocess, 4.4ms inference, 0.6ms postprocess per image at shape (1, 3, 448, 640)

0: 480x640 (no detections), 7.6ms
Speed: 1.4ms preprocess, 7.6ms inference, 18.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 zebra, 6.2ms
Speed: 2.9ms preprocess, 6.2ms inference, 6.4m

final text_encoder_type: bert-base-uncased



0: 448x640 (no detections), 11.6ms
Speed: 1.6ms preprocess, 11.6ms inference, 18.6ms postprocess per image at shape (1, 3, 448, 640)

0: 640x480 (no detections), 5.2ms
Speed: 2.7ms preprocess, 5.2ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 480)

0: 448x640 1 tv, 13.3ms
Speed: 2.8ms preprocess, 13.3ms inference, 25.7ms postprocess per image at shape (1, 3, 448, 640)

0: 480x640 1 tv, 13.3ms
Speed: 3.0ms preprocess, 13.3ms inference, 9.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 tvs, 4.5ms
Speed: 1.3ms preprocess, 4.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 4.4ms
Speed: 1.3ms preprocess, 4.4ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

0: 512x640 2 tvs, 13.1ms
Speed: 1.7ms preprocess, 13.1ms inference, 25.5ms postprocess per image at shape (1, 3, 512, 640)

0: 480x640 (no detections), 13.2ms
Speed: 1.6ms preprocess, 13.2ms inference, 18.7ms postprocess per image at sh

final text_encoder_type: bert-base-uncased



0: 480x640 (no detections), 13.3ms
Speed: 2.6ms preprocess, 13.3ms inference, 11.7ms postprocess per image at shape (1, 3, 480, 640)

0: 640x576 (no detections), 139.1ms
Speed: 4.0ms preprocess, 139.1ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 576)

0: 640x448 (no detections), 13.2ms
Speed: 1.9ms preprocess, 13.2ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 6.2ms
Speed: 2.3ms preprocess, 6.2ms inference, 17.6ms postprocess per image at shape (1, 3, 640, 448)

0: 480x640 (no detections), 6.1ms
Speed: 1.6ms preprocess, 6.1ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

0: 384x640 (no detections), 6.9ms
Speed: 1.0ms preprocess, 6.9ms inference, 18.6ms postprocess per image at shape (1, 3, 384, 640)

0: 640x480 (no detections), 13.1ms
Speed: 2.4ms preprocess, 13.1ms inference, 3.9ms postprocess per image at shape (1, 3, 640, 480)

0: 384x640 (no detections), 13.2ms
Speed: 2.0ms preprocess, 13.2ms 

final text_encoder_type: bert-base-uncased



0: 640x448 (no detections), 9.5ms
Speed: 6.5ms preprocess, 9.5ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 448)

0: 448x640 2 sinks, 13.3ms
Speed: 1.8ms preprocess, 13.3ms inference, 25.7ms postprocess per image at shape (1, 3, 448, 640)

0: 640x448 (no detections), 13.2ms
Speed: 1.2ms preprocess, 13.2ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 448)

0: 480x640 (no detections), 5.5ms
Speed: 2.9ms preprocess, 5.5ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 4.9ms
Speed: 1.1ms preprocess, 4.9ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 14.2ms
Speed: 1.3ms preprocess, 14.2ms inference, 18.7ms postprocess per image at shape (1, 3, 448, 640)

0: 480x640 (no detections), 5.4ms
Speed: 3.0ms preprocess, 5.4ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 640x480 (no detections), 13.3ms
Speed: 1.9ms preprocess, 13.3ms inference, 18.

number of examples of prop1Uprop2: 12
final text_encoder_type: bert-base-uncased



0: 448x640 (no detections), 4.7ms
Speed: 1.1ms preprocess, 4.7ms inference, 0.8ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.6ms
Speed: 1.1ms preprocess, 4.6ms inference, 18.6ms postprocess per image at shape (1, 3, 448, 640)

0: 640x448 (no detections), 13.1ms
Speed: 1.6ms preprocess, 13.1ms inference, 3.1ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 6.3ms
Speed: 3.0ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 448)

0: 640x480 (no detections), 13.2ms
Speed: 3.1ms preprocess, 13.2ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 6.2ms
Speed: 2.9ms preprocess, 6.2ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 480)

0: 448x640 (no detections), 5.0ms
Speed: 1.2ms preprocess, 5.0ms inference, 0.8ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.3ms
Speed: 0.9ms preprocess, 4.3ms inference

final text_encoder_type: bert-base-uncased



0: 480x640 (no detections), 13.1ms
Speed: 2.2ms preprocess, 13.1ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.2ms
Speed: 2.8ms preprocess, 6.2ms inference, 18.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 mouses, 6.2ms
Speed: 2.8ms preprocess, 6.2ms inference, 25.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 5.6ms
Speed: 1.3ms preprocess, 5.6ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 5.9ms
Speed: 1.6ms preprocess, 5.9ms inference, 18.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.2ms
Speed: 3.1ms preprocess, 6.2ms inference, 8.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.3ms
Speed: 1.4ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.2ms
Speed: 3.0ms preprocess, 6.2ms inference, 18.8ms

final text_encoder_type: bert-base-uncased



0: 640x480 (no detections), 13.1ms
Speed: 6.5ms preprocess, 13.1ms inference, 10.5ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 6.3ms
Speed: 2.9ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 6.2ms
Speed: 2.2ms preprocess, 6.2ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 6.3ms
Speed: 2.4ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 480)

0: 480x640 (no detections), 5.4ms
Speed: 2.9ms preprocess, 5.4ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 4.5ms
Speed: 1.0ms preprocess, 4.5ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

0: 512x640 (no detections), 5.3ms
Speed: 1.1ms preprocess, 5.3ms inference, 1.7ms postprocess per image at shape (1, 3, 512, 640)

0: 512x640 (no detections), 6.1ms
Speed: 1.8ms preprocess, 6.1ms inference, 

final text_encoder_type: bert-base-uncased



0: 640x448 (no detections), 13.2ms
Speed: 2.3ms preprocess, 13.2ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 6.2ms
Speed: 2.8ms preprocess, 6.2ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 6.2ms
Speed: 3.0ms preprocess, 6.2ms inference, 10.3ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 4.2ms
Speed: 0.9ms preprocess, 4.2ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 448)

0: 384x640 (no detections), 5.5ms
Speed: 1.1ms preprocess, 5.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 6.0ms
Speed: 2.0ms preprocess, 6.0ms inference, 18.6ms postprocess per image at shape (1, 3, 384, 640)

0: 480x640 (no detections), 5.3ms
Speed: 3.1ms preprocess, 5.3ms inference, 8.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.2ms
Speed: 2.8ms preprocess, 6.2ms inference, 

final text_encoder_type: bert-base-uncased



0: 480x640 1 horse, 15.2ms
Speed: 6.5ms preprocess, 15.2ms inference, 25.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.2ms
Speed: 2.4ms preprocess, 6.2ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 13.3ms
Speed: 2.8ms preprocess, 13.3ms inference, 4.3ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.6ms
Speed: 1.3ms preprocess, 6.6ms inference, 0.8ms postprocess per image at shape (1, 3, 448, 640)

0: 480x640 (no detections), 15.9ms
Speed: 2.3ms preprocess, 15.9ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 7.3ms
Speed: 2.5ms preprocess, 7.3ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 13.2ms
Speed: 2.9ms preprocess, 13.2ms inference, 18.8ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.2ms
Speed: 3.0ms preprocess, 6.2ms inference, 1

final text_encoder_type: bert-base-uncased



0: 480x640 (no detections), 5.5ms
Speed: 1.2ms preprocess, 5.5ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 4.5ms
Speed: 1.0ms preprocess, 4.5ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 6.7ms
Speed: 1.2ms preprocess, 6.7ms inference, 0.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 5.1ms
Speed: 1.4ms preprocess, 5.1ms inference, 0.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 5.0ms
Speed: 1.3ms preprocess, 5.0ms inference, 0.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.8ms
Speed: 1.3ms preprocess, 4.8ms inference, 0.8ms postprocess per image at shape (1, 3, 448, 640)

0: 640x480 (no detections), 5.6ms
Speed: 1.3ms preprocess, 5.6ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 4.9ms
Speed: 1.0ms preprocess, 4.9ms inference, 0.7ms 

final text_encoder_type: bert-base-uncased



0: 640x640 (no detections), 6.4ms
Speed: 2.6ms preprocess, 6.4ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.4ms
Speed: 2.3ms preprocess, 6.4ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x480 (no detections), 13.2ms
Speed: 2.4ms preprocess, 13.2ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 4.6ms
Speed: 2.8ms preprocess, 4.6ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 480)

0: 480x640 (no detections), 5.5ms
Speed: 1.3ms preprocess, 5.5ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 5.0ms
Speed: 1.3ms preprocess, 5.0ms inference, 18.6ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 11.6ms
Speed: 2.9ms preprocess, 11.6ms inference, 0.8ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.2ms
Speed: 1.5ms preprocess, 6.2ms inference

final text_encoder_type: bert-base-uncased



0: 448x640 (no detections), 9.6ms
Speed: 6.5ms preprocess, 9.6ms inference, 6.2ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.3ms
Speed: 1.8ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.3ms
Speed: 6.1ms preprocess, 6.3ms inference, 18.8ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.2ms
Speed: 2.0ms preprocess, 6.2ms inference, 18.8ms postprocess per image at shape (1, 3, 448, 640)

0: 640x640 (no detections), 5.0ms
Speed: 2.2ms preprocess, 5.0ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.5ms
Speed: 1.5ms preprocess, 4.5ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 640)

0: 480x640 (no detections), 5.5ms
Speed: 1.3ms preprocess, 5.5ms inference, 15.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.1ms
Speed: 2.2ms preprocess, 6.1ms inference, 14

final text_encoder_type: bert-base-uncased



0: 480x640 1 scissors, 13.9ms
Speed: 4.1ms preprocess, 13.9ms inference, 41.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.2ms
Speed: 4.2ms preprocess, 6.2ms inference, 36.6ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 24.2ms
Speed: 6.4ms preprocess, 24.2ms inference, 12.3ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.7ms
Speed: 1.5ms preprocess, 4.7ms inference, 2.0ms postprocess per image at shape (1, 3, 448, 640)

0: 640x640 (no detections), 13.4ms
Speed: 3.0ms preprocess, 13.4ms inference, 23.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 9.4ms
Speed: 5.5ms preprocess, 9.4ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x608 (no detections), 20.3ms
Speed: 2.9ms preprocess, 20.3ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 608)

0: 640x608 1 chair, 6.4ms
Speed: 2.5ms preprocess, 6.4ms inference, 13.7m

final text_encoder_type: bert-base-uncased



0: 416x640 (no detections), 11.8ms
Speed: 6.5ms preprocess, 11.8ms inference, 1.1ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 (no detections), 6.3ms
Speed: 2.5ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 416, 640)

0: 480x640 (no detections), 15.5ms
Speed: 2.9ms preprocess, 15.5ms inference, 18.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.2ms
Speed: 2.9ms preprocess, 6.2ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 640x448 (no detections), 5.1ms
Speed: 1.2ms preprocess, 5.1ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 4.5ms
Speed: 1.2ms preprocess, 4.5ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 448)

0: 448x640 (no detections), 13.1ms
Speed: 2.6ms preprocess, 13.1ms inference, 18.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 5.3ms
Speed: 1.4ms preprocess, 5.3ms inferen

final text_encoder_type: bert-base-uncased



0: 480x640 (no detections), 13.1ms
Speed: 2.3ms preprocess, 13.1ms inference, 10.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.2ms
Speed: 2.6ms preprocess, 6.2ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 10 horses, 6.2ms
Speed: 3.0ms preprocess, 6.2ms inference, 25.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.2ms
Speed: 1.3ms preprocess, 6.2ms inference, 12.4ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 5.0ms
Speed: 1.4ms preprocess, 5.0ms inference, 0.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 bird, 4.7ms
Speed: 1.3ms preprocess, 4.7ms inference, 24.3ms postprocess per image at shape (1, 3, 448, 640)

0: 640x448 (no detections), 12.4ms
Speed: 0.9ms preprocess, 12.4ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 6.2ms
Speed: 3.0ms preprocess, 6.2ms inference, 18.7ms post

final text_encoder_type: bert-base-uncased



0: 448x640 (no detections), 13.1ms
Speed: 2.3ms preprocess, 13.1ms inference, 12.8ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.2ms
Speed: 3.0ms preprocess, 6.2ms inference, 18.8ms postprocess per image at shape (1, 3, 448, 640)

0: 512x640 (no detections), 13.3ms
Speed: 3.0ms preprocess, 13.3ms inference, 18.7ms postprocess per image at shape (1, 3, 512, 640)

0: 512x640 (no detections), 6.3ms
Speed: 3.1ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 512, 640)

0: 448x640 (no detections), 4.8ms
Speed: 1.2ms preprocess, 4.8ms inference, 1.0ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.7ms
Speed: 0.9ms preprocess, 4.7ms inference, 0.7ms postprocess per image at shape (1, 3, 448, 640)

0: 480x640 (no detections), 11.6ms
Speed: 1.3ms preprocess, 11.6ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 4.3ms
Speed: 3.1ms preprocess, 4.3ms infere

number of examples of (prop1&prop2)Uprop3: 13
final text_encoder_type: bert-base-uncased



0: 448x640 4 persons, 13.1ms
Speed: 6.5ms preprocess, 13.1ms inference, 8.5ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.2ms
Speed: 3.3ms preprocess, 6.2ms inference, 18.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.3ms
Speed: 2.9ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.3ms
Speed: 3.0ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.5ms
Speed: 1.0ms preprocess, 4.5ms inference, 1.1ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.6ms
Speed: 1.2ms preprocess, 4.6ms inference, 0.8ms postprocess per image at shape (1, 3, 448, 640)

0: 480x640 8 persons, 13.1ms
Speed: 2.8ms preprocess, 13.1ms inference, 24.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.3ms
Speed: 1.2ms preprocess, 6.3ms inference, 18.7ms pos

final text_encoder_type: bert-base-uncased



0: 640x448 (no detections), 12.8ms
Speed: 6.5ms preprocess, 12.8ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 6.2ms
Speed: 2.8ms preprocess, 6.2ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 6.2ms
Speed: 3.0ms preprocess, 6.2ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 448)

0: 640x480 (no detections), 5.6ms
Speed: 3.1ms preprocess, 5.6ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 4.4ms
Speed: 1.3ms preprocess, 4.4ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 4.8ms
Speed: 1.2ms preprocess, 4.8ms inference, 3.7ms postprocess per image at shape (1, 3, 640, 480)

0: 448x640 (no detections), 13.1ms
Speed: 3.1ms preprocess, 13.1ms inference, 17.1ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.2ms
Speed: 1.3ms preprocess, 6.2ms inference

final text_encoder_type: bert-base-uncased



0: 640x512 (no detections), 13.3ms
Speed: 2.3ms preprocess, 13.3ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 (no detections), 6.2ms
Speed: 2.3ms preprocess, 6.2ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 (no detections), 5.3ms
Speed: 1.6ms preprocess, 5.3ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 512)

0: 480x640 (no detections), 5.8ms
Speed: 1.5ms preprocess, 5.8ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.1ms
Speed: 3.6ms preprocess, 6.1ms inference, 18.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 5.0ms
Speed: 2.3ms preprocess, 5.0ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 13.3ms
Speed: 3.2ms preprocess, 13.3ms inference, 18.8ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.2ms
Speed: 2.3ms preprocess, 6.2ms inference

final text_encoder_type: bert-base-uncased



0: 640x480 (no detections), 13.2ms
Speed: 6.5ms preprocess, 13.2ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 6.3ms
Speed: 2.9ms preprocess, 6.3ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 6.2ms
Speed: 2.0ms preprocess, 6.2ms inference, 8.8ms postprocess per image at shape (1, 3, 640, 480)

0: 480x640 (no detections), 9.7ms
Speed: 1.8ms preprocess, 9.7ms inference, 17.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.1ms
Speed: 2.2ms preprocess, 6.1ms inference, 9.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.3ms
Speed: 2.8ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 480, 640)

0: 640x384 (no detections), 13.2ms
Speed: 2.8ms preprocess, 13.2ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 (no detections), 6.2ms
Speed: 2.7ms preprocess, 6.2ms inferenc

final text_encoder_type: bert-base-uncased



0: 448x640 (no detections), 13.1ms
Speed: 5.9ms preprocess, 13.1ms inference, 18.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 bottle, 6.4ms
Speed: 1.8ms preprocess, 6.4ms inference, 25.6ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.3ms
Speed: 3.0ms preprocess, 6.3ms inference, 18.8ms postprocess per image at shape (1, 3, 448, 640)

0: 640x448 (no detections), 13.2ms
Speed: 3.0ms preprocess, 13.2ms inference, 15.1ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 4.3ms
Speed: 1.0ms preprocess, 4.3ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 4.5ms
Speed: 1.0ms preprocess, 4.5ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 448)

0: 448x640 (no detections), 13.0ms
Speed: 1.3ms preprocess, 13.0ms inference, 18.6ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 4.7ms
Speed: 3.0ms preprocess, 4.7ms inference, 0.

final text_encoder_type: bert-base-uncased



0: 480x640 3 persons, 13.0ms
Speed: 6.5ms preprocess, 13.0ms inference, 25.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 10 horses, 6.3ms
Speed: 2.6ms preprocess, 6.3ms inference, 25.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 4.4ms
Speed: 1.0ms preprocess, 4.4ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 5.6ms
Speed: 1.4ms preprocess, 5.6ms inference, 0.8ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.1ms
Speed: 1.0ms preprocess, 6.1ms inference, 18.6ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.2ms
Speed: 2.8ms preprocess, 6.2ms inference, 3.6ms postprocess per image at shape (1, 3, 448, 640)

0: 512x640 (no detections), 13.2ms
Speed: 3.5ms preprocess, 13.2ms inference, 18.7ms postprocess per image at shape (1, 3, 512, 640)

0: 512x640 4 horses, 6.2ms
Speed: 2.9ms preprocess, 6.2ms inference, 25.7ms postproces

final text_encoder_type: bert-base-uncased



0: 640x448 (no detections), 11.7ms
Speed: 2.3ms preprocess, 11.7ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 6.2ms
Speed: 1.6ms preprocess, 6.2ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 448)

0: 480x640 2 horses, 13.2ms
Speed: 2.7ms preprocess, 13.2ms inference, 25.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.2ms
Speed: 3.1ms preprocess, 6.2ms inference, 4.4ms postprocess per image at shape (1, 3, 480, 640)

0: 640x448 (no detections), 5.3ms
Speed: 1.3ms preprocess, 5.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 5.2ms
Speed: 1.2ms preprocess, 5.2ms inference, 18.6ms postprocess per image at shape (1, 3, 640, 448)

0: 448x640 (no detections), 9.6ms
Speed: 2.0ms preprocess, 9.6ms inference, 1.1ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.3ms
Speed: 1.8ms preprocess, 6.3ms inference, 18.7ms postpr

final text_encoder_type: bert-base-uncased



0: 576x640 (no detections), 13.3ms
Speed: 6.5ms preprocess, 13.3ms inference, 18.8ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 (no detections), 6.3ms
Speed: 3.3ms preprocess, 6.3ms inference, 18.7ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 (no detections), 6.3ms
Speed: 2.5ms preprocess, 6.3ms inference, 2.0ms postprocess per image at shape (1, 3, 576, 640)

0: 480x640 (no detections), 5.5ms
Speed: 2.1ms preprocess, 5.5ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.2ms
Speed: 3.4ms preprocess, 6.2ms inference, 18.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.3ms
Speed: 3.1ms preprocess, 6.3ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 13.2ms
Speed: 2.9ms preprocess, 13.2ms inference, 18.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.2ms
Speed: 2.7ms preprocess, 6.2ms inference

final text_encoder_type: bert-base-uncased



0: 384x640 1 frisbee, 13.0ms
Speed: 6.5ms preprocess, 13.0ms inference, 11.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 6.2ms
Speed: 1.4ms preprocess, 6.2ms inference, 25.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 6.1ms
Speed: 3.3ms preprocess, 6.1ms inference, 18.7ms postprocess per image at shape (1, 3, 384, 640)

0: 640x640 (no detections), 13.4ms
Speed: 2.9ms preprocess, 13.4ms inference, 2.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 4.5ms
Speed: 2.1ms preprocess, 4.5ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 6.2ms
Speed: 2.2ms preprocess, 6.2ms inference, 18.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x480 (no detections), 9.5ms
Speed: 3.0ms preprocess, 9.5ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 (no detections), 6.3ms
Speed: 2.6ms preprocess, 6.3ms inference, 11.7ms pos

final text_encoder_type: bert-base-uncased



0: 640x448 (no detections), 15.6ms
Speed: 7.0ms preprocess, 15.6ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 8.6ms
Speed: 2.8ms preprocess, 8.6ms inference, 18.8ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 (no detections), 6.2ms
Speed: 2.8ms preprocess, 6.2ms inference, 18.7ms postprocess per image at shape (1, 3, 640, 448)

0: 480x640 (no detections), 6.3ms
Speed: 1.3ms preprocess, 6.3ms inference, 1.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 5.7ms
Speed: 1.4ms preprocess, 5.7ms inference, 18.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 6.1ms
Speed: 3.0ms preprocess, 6.1ms inference, 5.2ms postprocess per image at shape (1, 3, 480, 640)

0: 448x640 (no detections), 13.3ms
Speed: 2.5ms preprocess, 13.3ms inference, 18.7ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 (no detections), 6.3ms
Speed: 3.0ms preprocess, 6.3ms inferenc

final text_encoder_type: bert-base-uncased


IndexError: list index out of range

In [8]:
result

{'cifar100': {PosixPath('/opt/Neuro-Symbolic-Video-Frame-Search/artifacts/test_benchmark_frame_video/cifar100/Fprop1'): {}},
 'F "keyboard"_100_2': {'benchmark_video': BenchmarkLTLFrame(ground_truth=True, ltl_formula='F "keyboard"', proposition=['keyboard'], number_of_frame=100, frames_of_interest=[[7], [14, 15], [43], [47], [49, 50], [56], [59], [61], [63], [81], [84], [86, 87, 88], [98]], labels_of_frames=['skunk', 'forest', 'snake', 'pear', 'willow_tree', 'shark', 'sea', 'keyboard', 'palm_tree', 'road', 'caterpillar', 'lion', 'shark', 'girl', 'keyboard', 'keyboard', 'cloud', 'orange', 'television', 'woman', 'wolf', 'leopard', 'rocket', 'flatfish', 'can', 'spider', 'turtle', 'bus', 'hamster', 'apple', 'lizard', 'snake', 'turtle', 'lizard', 'butterfly', 'lion', 'squirrel', 'cattle', 'lion', 'clock', 'skyscraper', 'elephant', 'caterpillar', 'keyboard', 'plate', 'kangaroo', 'whale', 'keyboard', 'castle', 'keyboard', 'keyboard', 'apple', 'mountain', 'chair', 'castle', 'dinosaur', 'keyboa

In [9]:
save_dict_to_pickle(result, path="/opt/Neuro-Symbolic-Video-Frame-Search/artifacts/", file_name="all_test_result.pkl")