In [None]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whichever pip is on PATH.
%pip install -q -U transformers

In [None]:
# Fetch the file with this ID via gdown; it is expected to be saved as
# L01.zip, which the next line extracts.
!gdown 14MeYV2WBWwldMDGRrpG9s7vz8triwbWr
# Extract quietly (-qq) into the working directory.
!unzip -qq L01.zip
# The archive is no longer needed once extracted.
!rm -rf L01.zip

In [None]:
import torch
import os
import json
import glob
from tqdm import tqdm
from transformers import AutoProcessor, AutoModelForImageTextToText

In [4]:
# Configuration - CHANGE THE BATCH HERE
BATCH_L = "L01"  # L01, L02, L03, ...

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# The processor and the weights are loaded from the same checkpoint; the
# weights are requested in half precision and then moved to `device`.
MODEL_ID = "OpenGVLab/InternVL3-2B-hf"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
)
model = model.to(device)

In [6]:
def _parse_yes_no(answer):
    """Map a free-form model reply to 1 (YES) or 0 (anything else).

    The reply is upper-cased and split into alphabetic words; the first word
    that is exactly ``YES`` or ``NO`` decides the result. Words that merely
    contain the letters "YES" (e.g. "EYES") are therefore not treated as a
    positive answer, and a reply with neither word counts as negative.
    """
    words = "".join(ch if ch.isalpha() else " " for ch in answer.upper()).split()
    verdict = next((word for word in words if word in ("YES", "NO")), "NO")
    return 1 if verdict == "YES" else 0


def classify_keyframe(image_path):
    """Ask the vision-language model whether a keyframe shows a news anchor.

    Relies on the module-level ``processor`` and ``model`` objects.

    Args:
        image_path: Path of the image, passed to the chat template as the
            ``image`` entry of the user message.

    Returns:
        1 if the model's reply is YES, 0 otherwise (including replies that
        contain neither YES nor NO).
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": (
                    "Instruction:\n"
                    "– Look at the image above.\n"
                    "– Decide if it depicts a news anchor presenting in a professional TV studio.\n\n"
                    "Output requirements:\n"
                    "– YES if it clearly shows a news anchor in a TV studio.\n"
                    "– NO otherwise.\n"
                    "– Respond with exactly one token (YES or NO), nothing else."
                )}
            ]
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    if "pixel_values" in inputs:
        # Match the image tensor to whatever precision the model was loaded
        # in, instead of assuming half precision.
        inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)

    # Greedy decoding; only a short reply is needed.
    output = model.generate(**inputs, max_new_tokens=10, do_sample=False)

    # generate() returns prompt + completion; keep only the newly generated part.
    prompt_length = inputs["input_ids"].shape[1]
    answer = processor.decode(output[0, prompt_length:], skip_special_tokens=True)
    return _parse_yes_no(answer)

In [None]:
# Every entry matching V* inside the batch folder is treated as one video.
videos = sorted(glob.glob(os.path.join(BATCH_L, "V*")))

os.makedirs("results", exist_ok=True)

for video_dir in videos:
    video_name = os.path.basename(video_dir)
    keyframe_paths = glob.glob(os.path.join(video_dir, "*.jpg"))
    keyframe_paths.sort()

    # Classify the keyframes in filename order, with a per-video progress bar.
    video_results = [
        {"keyframe": os.path.basename(path), "prediction": classify_keyframe(path)}
        for path in tqdm(keyframe_paths, desc=video_name)
    ]

    # Save each video's results to its own JSON file.
    output_file = f"results/{BATCH_L}_{video_name}_news_anchor.json"
    with open(output_file, 'w') as out:
        json.dump(video_results, out, indent=2)

In [None]:
# Remove the extracted keyframes of the batch selected in BATCH_L, so the
# cleanup follows the same setting as the processing loop.
!rm -rf "{BATCH_L}"