In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Strategy 2 pipeline:
1) Divide video into 10s scenes
2) Extract 1–2 frames per chunk
3) Generate global description: video type + video description
4) First chunk uses (global desc + prompt + frames) → Description/OCR/Entities/Chronological actions
5) Global context is updated for every scene (frequencies, confront facts)
6) Save results in files (CSV + metadata TXT + context JSON)
"""

import os
import re
import csv
import json
import math
import time
import datetime as dt
import logging
import sys
from threading import Thread
from typing import Dict, Any, List, Tuple

import numpy as np
import torch
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from PIL import Image
from decord import VideoReader, cpu
from transformers import AutoModel, AutoTokenizer, TextIteratorStreamer

In [2]:
# ----------------------------
# Config
# ----------------------------
VIDEO_PATH = "documentary_480p.mp4"  # change as needed
SCENE_SECONDS = 10
GLOBAL_SAMPLE_SEGMENTS = 16
GLOBAL_FRAMES_PER_SEGMENT = 1     # 1 frame per global segment
SCENE_SEGMENTS = 10
SCENE_FRAMES_PER_SEGMENT = 1      # 1–2 frames per chunk -> here we take 2
TILE_MAX_NUM = 1
INPUT_SIZE = 448
USE_CUDA = torch.cuda.is_available()

# Decoding settings for safer generations
GEN_MAX_NEW_TOKENS = 768
GEN_TEMPERATURE = 0.7
GEN_TOP_P = 0.9
GEN_TOP_K = 50
GEN_REP_PENALTY = 1.15
GEN_NO_REPEAT_NGRAM = 6

In [3]:
# ----------------------------
# Logging (verbose)
# ----------------------------
def _setup_logger(name: str = "video_rag", level: int = logging.INFO) -> logging.Logger:
    logger = logging.getLogger(name)
    if logger.handlers:
        return logger
    logger.setLevel(level)
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter("%(asctime)s | %(levelname)-7s | %(message)s", "%Y-%m-%d %H:%M:%S")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger

LOGGER = _setup_logger()

In [4]:
# ----------------------------
# Model init (InternVL 3.5-8B)
# ----------------------------
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

path = "OpenGVLab/InternVL3_5-8B"
LOGGER.info("Loading model...")
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    load_in_8bit=False,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map="auto"
).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
LOGGER.info("Model loaded.")

2025-09-13 15:39:47 | INFO    | Loading model...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-09-13 15:39:53 | INFO    | Model loaded.


In [10]:
# ----------------------------
# Image / Video utilities
# ----------------------------
def build_transform(input_size):
    return T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    ])

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf'); best_ratio = (1, 1); area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff; best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1)
        for i in range(1, n + 1) for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
    target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        processed_images.append(resized_img.crop(box))
    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images

def build_transform_and_stack(frames_as_pil, input_size=448, max_num=1):
    transform = build_transform(input_size=input_size)
    tiles = []
    for img in frames_as_pil:
        img_tiles = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
        tile_tensor = [transform(tile) for tile in img_tiles]
        tile_tensor = torch.stack(tile_tensor)
        tiles.append(tile_tensor)
    pixel_values = torch.cat(tiles)
    return pixel_values

def seconds_to_tc(t):
    hours = int(t // 3600); minutes = int((t % 3600) // 60); seconds = t % 60
    return f"{hours:02d}:{minutes:02d}:{seconds:06.3f}"

def get_frame_indices(bound, fps, max_frame, first_idx=0, num_segments=32, frames_per_segment=1):
    """
    Evenly spaced segments; optionally take multiple frames per segment.
    Deterministic (no randomness).
    """
    if bound:
        start, end = bound[0], bound[1]
    else:
        start, end = -100000, 100000
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    if end_idx <= start_idx:
        end_idx = min(start_idx + max(1, int(fps)), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    frame_indices = []
    for idx in range(num_segments):
        seg_start = start_idx + seg_size * idx
        # evenly spread frames inside the segment
        for j in range(frames_per_segment):
            pos = seg_start + (seg_size * (j + 0.5) / frames_per_segment)
            frame_indices.append(int(pos))
    frame_indices = np.clip(frame_indices, start_idx, end_idx)
    return frame_indices

def load_video_pixel_values(video_path, bound=None, input_size=448, max_num=1,
                            num_segments=32, frames_per_segment=1):
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())
    idxs = get_frame_indices(bound, fps, max_frame, first_idx=0,
                             num_segments=num_segments, frames_per_segment=frames_per_segment)
    frames_as_pil = [Image.fromarray(vr[int(i)].asnumpy()).convert('RGB') for i in idxs]
    pixel_values = build_transform_and_stack(frames_as_pil, input_size=input_size, max_num=max_num)
    width, height = vr[0].shape[1], vr[0].shape[0]
    return pixel_values, len(idxs), fps, width, height

# ----------------------------
# Generation helpers (Robust, Verbose, No duplicate use_cache)
# ----------------------------
import sys, time, re
from queue import Empty as _QueueEmpty
from threading import Thread

_REPEAT_CHAR_RE = re.compile(r"(.)\1{15,}", re.DOTALL)
_REPEAT_CHUNK_RE = re.compile(r"(.{3,60})\1{3,}", re.DOTALL)

def _looks_degenerate(s: str) -> bool:
    tail = s[-2000:]
    return bool(_REPEAT_CHAR_RE.search(tail) or _REPEAT_CHUNK_RE.search(tail))

def _trim_degenerate_tail(s: str) -> str:
    for cut in range(len(s), max(len(s) - 2000, 0), -1):
        tail = s[:cut]
        if not _looks_degenerate(tail):
            return tail.rstrip()
    return s.rstrip()

def chat_with_images(pixel_values, question, timeout=30):
    """
    Streamed verbose generation with safeguards against degenerate repetition and
    a robust thread/streamer loop that won't hang if model.chat raises.
    """
    t0 = time.perf_counter()
    if USE_CUDA:
        pixel_values = pixel_values.to(torch.bfloat16).cuda()
        LOGGER.info(f"chat_with_images | CUDA | tensor={tuple(pixel_values.shape)}")
    else:
        pixel_values = pixel_values.to(torch.bfloat16)
        LOGGER.info(f"chat_with_images | CPU | tensor={tuple(pixel_values.shape)}")

    question = question.strip() + (
        "\n\nCONSTRAINTS: Be concise. Do not repeat identical tokens/characters. "
        "If text is unreadable, write 'unknown' once."
    )
    LOGGER.info(f"Prompt length: {len(question)} chars")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=0.2)

    # IMPORTANT: do NOT pass use_cache here; InternVL passes it internally.
    generation_config = dict(
        max_new_tokens=GEN_MAX_NEW_TOKENS,
        do_sample=True,
        temperature=GEN_TEMPERATURE,
        top_p=GEN_TOP_P,
        top_k=GEN_TOP_K,
        repetition_penalty=GEN_REP_PENALTY,
        no_repeat_ngram_size=GEN_NO_REPEAT_NGRAM,
        renormalize_logits=True,
        streamer=streamer,
        # eos_token_id left to model defaults; avoid duplicating args InternVL may set.
    )

    worker_error = {"exc": None}

    def _run_chat():
        try:
            model.chat(
                tokenizer=tokenizer,
                pixel_values=pixel_values,
                question=question,
                history=None,
                return_history=False,
                generation_config=generation_config,
            )
        except Exception as e:
            worker_error["exc"] = e
            LOGGER.exception("model.chat raised an exception")

    thread = Thread(target=_run_chat, daemon=True)
    thread.start()

    chunks = []
    LOGGER.info(">>> Generation started...")
    while True:
        try:
            new_text = next(streamer)  # may raise StopIteration when done
            # Respect model separator if present
            if new_text == getattr(model, "conv_template", None).sep if hasattr(model, "conv_template") else False:
                LOGGER.debug("Separator reached, stopping.")
                break

            chunks.append(new_text)
            sys.stdout.write(new_text); sys.stdout.flush()

            if _looks_degenerate("".join(chunks)):
                LOGGER.warning("\n>>> Degenerate repetition detected — stopping early.")
                break

        except StopIteration:
            # Streamer signaled completion
            break
        except _QueueEmpty:
            # No token right now; check if worker died/crashed or finished
            if not thread.is_alive():
                # Worker finished (possibly with error) and no more tokens are coming
                break
            # else: keep waiting for tokens
            continue

    # Join the worker to surface any exception
    thread.join(timeout=1.0)
    if worker_error["exc"] is not None:
        # Surface a compact error into the text so upstream can still continue gracefully
        err_msg = f"[MODEL_ERROR] {type(worker_error['exc']).__name__}: {worker_error['exc']}"
        LOGGER.error(err_msg)
        if chunks:
            chunks.append("\n" + err_msg)
        else:
            return err_msg

    result = "".join(chunks).strip()
    if _looks_degenerate(result):
        clean = _trim_degenerate_tail(result)
        if clean != result:
            LOGGER.warning(">>> Trimmed trailing degenerate text.")
            result = clean + " [TRUNCATED]"

    LOGGER.info(f"\n>>> Generation finished in {time.perf_counter() - t0:.2f}s "
                f"({len(result)} chars).")
    return result

# ----------------------------
# Context state & parsing
# ----------------------------
def init_context_state() -> Dict[str, Any]:
    return {
        "video_type": None,
        "summary": None,
        "style": None,
        "entities_freq": {},  # name_or_value -> count
        "entity_types": {},   # name_or_value -> last type seen
        "actions_freq": {},   # action -> count
        "tags_freq": {},      # tag -> count
        "ocr_vocab": {},      # ocr token -> count
        "scenes_seen": 0
    }

def normalize_token(s: str) -> str:
    return re.sub(r"\s+", " ", s.strip()).lower()

def parse_scene_card(card: str) -> Dict[str, Any]:
    """
    Lightweight parser for the Scene Card sections.
    Robust to minor formatting variations.
    """
    sections = {
        "Meta": r"Meta:\s*(.*?)\n\n",
        "Description": r"Description:\s*(.*?)\n\n",
        "OnScreenText": r"OnScreenText:\s*(.*?)\n\n",
        "Entities": r"Entities:\s*(.*?)\n\n",
        "Tags": r"Tags:\s*(.*?)\n\n",
        "Shot": r"Shot:\s*(.*?)\n\n",
        "Confidence": r"Confidence:\s*(.*)$",
    }
    out = {}
    for key, pat in sections.items():
        m = re.search(pat, card, re.DOTALL | re.IGNORECASE)
        out[key] = m.group(1).strip() if m else ""
    return out

def parse_bullets(block: str) -> List[str]:
    items = []
    for line in block.splitlines():
        line = line.strip()
        if re.match(r"^[-•] ", line) or re.match(r"^\d+\.", line):
            items.append(re.sub(r"^[-•]\s*", "", line))
        elif line.startswith("  - "):
            items.append(line[4:])
        elif line.startswith("- "):
            items.append(line[2:])
    return [s.strip() for s in items if s.strip()]

def update_context_from_scene(state: Dict[str, Any], card_text: str, global_text_hint: str = ""):
    parsed = parse_scene_card(card_text)

    # Update OCR vocabulary
    ocr_block = parsed.get("OnScreenText", "")
    ocr_lines = [re.sub(r"^-\s*", "", l.strip()) for l in ocr_block.splitlines() if l.strip().startswith("-")]
    for ln in ocr_lines:
        for tok in re.findall(r"[A-Za-z0-9:#@\-_/]+", ln):
            nt = normalize_token(tok)
            state["ocr_vocab"][nt] = state["ocr_vocab"].get(nt, 0) + 1

    # Update actions
    desc_block = parsed.get("Description", "")
    # Extract "actions_chronological" sub-bullets
    act_match = re.search(r"actions_chronological:\s*(.*)", desc_block, re.DOTALL | re.IGNORECASE)
    if act_match:
        acts = parse_bullets(act_match.group(1))
        for a in acts:
            na = normalize_token(re.sub(r"^t≈\s*[^:]+:\s*", "", a))
            if na:
                state["actions_freq"][na] = state["actions_freq"].get(na, 0) + 1

    # Update entities
    ent_block = parsed.get("Entities", "")
    ent_items = []
    # naive parse: lines with "type:" or "name_or_value:"
    current = {}
    for line in ent_block.splitlines():
        l = line.strip()
        if l.startswith("- ") and "type:" in l:
            if current:
                ent_items.append(current); current = {}
            current = {"type": l.split("type:", 1)[1].strip()}
        elif "name_or_value:" in l:
            current["name_or_value"] = l.split("name_or_value:", 1)[1].strip()
        elif "attributes:" in l:
            attrs = re.findall(r"\[(.*)\]", l)
            current["attributes"] = attrs[0].split(",") if attrs else []
        elif l.startswith("- ") and current:
            # new bullet possibly starts; push previous
            ent_items.append(current); current = {}
    if current:
        ent_items.append(current)

    for ent in ent_items:
        name = normalize_token(ent.get("name_or_value", ""))
        typ = normalize_token(ent.get("type", ""))
        if name:
            state["entities_freq"][name] = state["entities_freq"].get(name, 0) + 1
            if typ:
                state["entity_types"][name] = typ

    # Update tags
    tags_block = parsed.get("Tags", "")
    # lines like: - scene_tags: [tag1, tag2, ...]
    for line in tags_block.splitlines():
        l = line.strip()
        m = re.search(r"scene_tags:\s*\[(.*)\]", l, re.IGNORECASE)
        if m:
            tags = [normalize_token(x) for x in m.group(1).split(",")]
            for t in tags:
                t = t.strip()
                if t:
                    state["tags_freq"][t] = state["tags_freq"].get(t, 0) + 1

    state["scenes_seen"] += 1

def confront_facts(state: Dict[str, Any], global_text: str) -> Dict[str, Any]:
    """
    Produce a simple, updated global context from current frequencies + global_text hints.
    """
    # Basic heuristics: top entities/actions/tags
    def top_k(d, k=10):
        return sorted(d.items(), key=lambda x: (-x[1], x[0]))[:k]

    top_entities = top_k(state["entities_freq"], 15)
    top_actions = top_k(state["actions_freq"], 15)
    top_tags = top_k(state["tags_freq"], 15)

    # Resolve a "likely_type" from global_text (very light heuristic)
    likely_type = None
    m = re.search(r"\[TYPE\]\s*(.*)", global_text, re.IGNORECASE)
    if m:
        likely_type = m.group(1).strip()

    # Build text block to feed next scene
    ctx_lines = []
    if likely_type:
        ctx_lines.append(f"Likely video type: {likely_type}")
    if state["summary"]:
        ctx_lines.append(f"Global summary: {state['summary']}")
    elif global_text:
        m2 = re.search(r"\[SUMMARY\]\s*(.*?)(?:\n\[|$)", global_text, re.IGNORECASE | re.DOTALL)
        if m2:
            ctx_lines.append("Global summary: " + " ".join(m2.group(1).split()))

    if top_entities:
        ents_str = ", ".join([f"{k} (x{v})" for k, v in top_entities])
        ctx_lines.append(f"Frequent entities: {ents_str}")
    if top_actions:
        acts_str = ", ".join([f"{k} (x{v})" for k, v in top_actions])
        ctx_lines.append(f"Frequent actions: {acts_str}")
    if top_tags:
        tags_str = ", ".join([f"{k} (x{v})" for k, v in top_tags])
        ctx_lines.append(f"Frequent tags: {tags_str}")

    return {
        "likely_type": likely_type,
        "context_text": "\n".join(ctx_lines) if ctx_lines else global_text
    }

# ----------------------------
# Strategy 2 pipeline
# ----------------------------
def chunk_bounds(duration_sec, chunk_sec):
    n = math.ceil(duration_sec / chunk_sec)
    return [(i * chunk_sec, min((i + 1) * chunk_sec, duration_sec)) for i in range(n) if min((i + 1) * chunk_sec, duration_sec) > i * chunk_sec]

def _filesize(path: str) -> str:
    try:
        sz = os.path.getsize(path)
        for unit in ["B", "KB", "MB", "GB", "TB"]:
            if sz < 1024.0:
                return f"{sz:,.2f} {unit}"
            sz /= 1024.0
    except Exception:
        return "n/a"
    return "n/a"

In [11]:
def process_strategy2(video_path: str):
    t0 = time.perf_counter()
    assert os.path.isfile(video_path), f"Video not found: {video_path}"
    LOGGER.info(f"Opening video: {video_path}")

    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    fps = float(vr.get_avg_fps()); total_frames = len(vr)
    duration = total_frames / fps if fps > 0 else 0.0
    width, height = vr[0].shape[1], vr[0].shape[0]
    LOGGER.info(f"Video | fps={fps:.3f} frames={total_frames} dur={duration:.3f}s res={width}x{height}")

    # outputs
    base = os.path.splitext(os.path.basename(video_path))[0]
    stamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_name = f"{base}_{stamp}.csv"
    meta_name = f"{base}_{stamp}_metadata.txt"
    ctx_name = f"{base}_{stamp}_context.json"

    # 3) Global description (whole video sampling)
    LOGGER.info("Sampling frames for GLOBAL description...")
    global_px, global_n, _, _, _ = load_video_pixel_values(
        video_path,
        bound=(0, duration),
        input_size=INPUT_SIZE,
        max_num=TILE_MAX_NUM,
        num_segments=GLOBAL_SAMPLE_SEGMENTS,
        frames_per_segment=GLOBAL_FRAMES_PER_SEGMENT
    )
    LOGGER.info(f"Global sampling: frames={global_n}, segments={GLOBAL_SAMPLE_SEGMENTS}, fps={fps:.3f}")

    LOGGER.info("Generating GLOBAL description...")
    global_text = chat_with_images(global_px, GLOBAL_PROMPT)

    # Init context state
    context_state = init_context_state()
    # Try extracting quick type/summary from global_text
    m_type = re.search(r"\[TYPE\]\s*(.*)", global_text, re.IGNORECASE)
    m_sum = re.search(r"\[SUMMARY\]\s*(.*?)(?:\n\[|$)", global_text, re.IGNORECASE | re.DOTALL)
    if m_type: context_state["video_type"] = " ".join(m_type.group(1).split())
    if m_sum: context_state["summary"] = " ".join(m_sum.group(1).split())

    # 1) Divide into 10s scenes
    LOGGER.info("Computing 10s scene bounds...")
    bounds = chunk_bounds(duration, SCENE_SECONDS)
    total_scenes = len(bounds)
    LOGGER.info(f"{total_scenes} scene(s) planned.")

    # CSV rows
    rows = []

    # 4 & 5) Loop over scenes: extract & update context
    for i, (start_s, end_s) in enumerate(bounds, start=1):
        pct = (i / total_scenes) * 100 if total_scenes else 100
        LOGGER.info(f"[{i}/{total_scenes}] ({pct:4.1f}%) Scene {seconds_to_tc(start_s)}–{seconds_to_tc(end_s)}")

        # Build evolving global context text
        confronted = confront_facts(context_state, global_text)
        evolving_context_text = confronted["context_text"]

        try:
            # Sample frames for this scene (1–2 per segment as configured)
            px, sampled, fps_local, w, h = load_video_pixel_values(
                video_path,
                bound=(start_s, end_s),
                input_size=INPUT_SIZE,
                max_num=TILE_MAX_NUM,
                num_segments=SCENE_SEGMENTS,
                frames_per_segment=SCENE_FRAMES_PER_SEGMENT
            )
            LOGGER.info(f"  • Scene frames sampled: {sampled} (segments={SCENE_SEGMENTS}, frames/seg={SCENE_FRAMES_PER_SEGMENT})")

            # Build scene prompt using evolving global context
            scene_prompt = build_scene_prompt(
                global_context_text=evolving_context_text,
                scene_id=i,
                start_s=start_s,
                end_s=end_s,
                video_name=base,
                start_tc=seconds_to_tc(start_s),
                end_tc=seconds_to_tc(end_s)
            )

            # Generate RAG Scene Card
            scene_text = chat_with_images(px, scene_prompt)

            # Update context from this scene
            update_context_from_scene(context_state, scene_text, global_text_hint=global_text)

            # Row
            row = {
                "video_name": base,
                "video_path": os.path.abspath(video_path),
                "scene_id": i,
                "start_sec": round(float(start_s), 3),
                "end_sec": round(float(end_s), 3),
                "start_timecode": seconds_to_tc(start_s),
                "end_timecode": seconds_to_tc(end_s),
                "scene_duration_sec": round(float(end_s - start_s), 3),
                "fps": fps_local,
                "width": w,
                "height": h,
                "sampled_frames_for_scene": sampled,
                "generated_text": scene_text
            }
            rows.append(row)
            LOGGER.info(f"  • Scene {i} processed.")

        except Exception as e:
            LOGGER.exception(f"  × Scene {i} failed: {e}")
            rows.append({
                "video_name": base,
                "video_path": os.path.abspath(video_path),
                "scene_id": i,
                "start_sec": round(float(start_s), 3),
                "end_sec": round(float(end_s), 3),
                "start_timecode": seconds_to_tc(start_s),
                "end_timecode": seconds_to_tc(end_s),
                "scene_duration_sec": round(float(end_s - start_s), 3),
                "fps": fps, "width": width, "height": height,
                "sampled_frames_for_scene": 0,
                "generated_text": f"[ERROR] {e}"
            })

    # 6) Save results
    # CSV
    LOGGER.info("Writing CSV...")
    fieldnames = list(rows[0].keys()) if rows else [
        "video_name","video_path","scene_id","start_sec","end_sec",
        "start_timecode","end_timecode","scene_duration_sec","fps","width","height",
        "sampled_frames_for_scene","generated_text"
    ]
    with open(csv_name, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames); writer.writeheader()
        for r in rows: writer.writerow(r)
    LOGGER.info(f"CSV saved: {csv_name} ({_filesize(csv_name)})")

    # Context JSON (frequencies + confronted facts snapshot)
    confronted_final = confront_facts(context_state, global_text)
    ctx_payload = {
        "video_name": base,
        "created_at": stamp,
        "scenes_seen": context_state["scenes_seen"],
        "video_type": context_state["video_type"],
        "summary": context_state["summary"],
        "confronted": confronted_final,
        "entities_freq": context_state["entities_freq"],
        "entity_types": context_state["entity_types"],
        "actions_freq": context_state["actions_freq"],
        "tags_freq": context_state["tags_freq"],
        "ocr_vocab_top": sorted(context_state["ocr_vocab"].items(), key=lambda x: (-x[1], x[0]))[:100]
    }
    with open(ctx_name, "w", encoding="utf-8") as f:
        json.dump(ctx_payload, f, ensure_ascii=False, indent=2)
    LOGGER.info(f"Context JSON saved: {ctx_name} ({_filesize(ctx_name)})")

    # Metadata TXT (technical + model-global + confronted facts)
    technical_meta = {
        "file_name": os.path.basename(video_path),
        "abs_path": os.path.abspath(video_path),
        "created_at": stamp,
        "video_length_sec": round(duration, 3),
        "fps": fps,
        "width": width,
        "height": height,
        "num_scenes_10s": len(bounds),
        "scene_seconds": SCENE_SECONDS,
        "global_sample_frames": global_n
    }
    top_ents_str = ", ".join([f"{k} (x{v})" for k, v in sorted(context_state["entities_freq"].items(), key=lambda x: (-x[1], x[0]))[:20]])
    top_acts_str = ", ".join([f"{k} (x{v})" for k, v in sorted(context_state["actions_freq"].items(), key=lambda x: (-x[1], x[0]))[:20]])
    top_tags_str = ", ".join([f"{k} (x{v})" for k, v in sorted(context_state["tags_freq"].items(), key=lambda x: (-x[1], x[0]))[:20]])

    meta_text = [
        "# GENERAL VIDEO METADATA",
        "",
        "## Technical",
        json.dumps(technical_meta, indent=2),
        "",
        "## Model-Inferred (Global Sampling)",
        global_text.strip(),
        "",
        "## Aggregated Context (Confronted Facts)",
        f"Likely type: {confronted_final.get('likely_type') or context_state['video_type']}",
        confronted_final.get("context_text", ""),
        "",
        "## Top Entities",
        top_ents_str or "(none)",
        "",
        "## Top Actions",
        top_acts_str or "(none)",
        "",
        "## Top Tags",
        top_tags_str or "(none)",
        ""
    ]
    with open(meta_name, "w", encoding="utf-8") as f:
        f.write("\n".join(meta_text))
    LOGGER.info(f"Metadata TXT saved: {meta_name} ({_filesize(meta_name)})")

    LOGGER.info(f"Done in {time.perf_counter() - t0:.2f}s | CSV: {csv_name} | TXT: {meta_name} | JSON: {ctx_name}")
    return csv_name, meta_name, ctx_name

In [12]:
GLOBAL_PROMPT = (
    "You are analyzing sampled frames from the WHOLE video. Provide:\n"
    "- Video type/genre (e.g., sports/soccer match highlight, vlog, screencast, ad, tutorial, etc.)\n"
    "- High-level summary (4-5 sentences)\n"
    "- Likely action types present (bulleted)\n"
    "- Prominent entities/teams/brands (bulleted)\n"
    "- Visual style cues (e.g., broadcast overlay, handheld, studio)\n"
    "Return sections: [TYPE], [SUMMARY], [ACTIONS], [ENTITIES], [STYLE]."
)

In [13]:
# ----------------------------
# Prompts
# ----------------------------
GLOBAL_PROMPT = (
    "You are analyzing sampled frames from the WHOLE video. Provide:\n"
    "- Video type/genre (e.g., sports/soccer match highlight, vlog, screencast, ad, tutorial, etc.)\n"
    "- High-level summary (4-5 sentences)\n"
    "Return sections: [TYPE], [SUMMARY]"
)

def build_scene_prompt(global_context_text: str,
                       scene_id: int,
                       start_s: float,
                       end_s: float,
                       video_name: str,
                       start_tc: str,
                       end_tc: str) -> str:
    """
    RAG-friendly SCENE CARD (text, not JSON), includes global context and asks for
    Description, OCR, Entities, Actions (chronological).
    """
    return f"""
You will receive frames from a ~10-second SCENE of a video.
Use the GLOBAL CONTEXT to disambiguate when helpful, but do not contradict the visible frames.

GLOBAL CONTEXT (aggregated; may include inferred type/summary/entities/actions):
{global_context_text.strip()}

TASK:
Return a RAG SCENE CARD in plain text (no code fences). Keep it factual and concise.

RAG SCENE CARD
---
Meta:
- video_name: {video_name}
- scene_id: {scene_id}
- start_sec: {start_s:.3f}
- end_sec: {end_s:.3f}
- start_timecode: {start_tc}
- end_timecode: {end_tc}

Description:
- description: <4-5 lines of who/what/where and context>
- actions_chronological:
  - <t≈ +0-2s: action phrase>
  - <t≈ +2-4s: action phrase>
  - <...>

OnScreenText:
- lines:
  - <exact OCR line 1>
  - <exact OCR line 2>
  - ...

Entities:
- items:
  - type: <person/team/logo/brand/location/object/number/other>
    name_or_value: <best guess or exact text>
    attributes: [<short attrs like jersey #, color, role, number>]
  - ...

Tags:
- scene_tags: [<short keywords like teams, brands, numbers, jersey ids, location>, ...]

Shot:
- camera_motion: [<pan/tilt/handheld/static/zoom>, ...]
- camera_angle: [<wide/close-up/top-down>, ...]
- cuts_or_transitions: [<if any>, ...]

Confidence:
- overall: <0.0-1.0>
- ocr: <0.0-1.0>
- entity_detection: <0.0-1.0>

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (0–2s, 2–4s, etc.).
- If unknown or not visible, write 'unknown' or use empty lists [].
- Return ONLY the card content above (no extra commentary).
""".strip()

In [None]:
process_strategy2(VIDEO_PATH)

2025-09-13 15:42:42 | INFO    | Opening video: documentary_480p.mp4
2025-09-13 15:42:42 | INFO    | Video | fps=25.000 frames=21676 dur=867.040s res=854x358
2025-09-13 15:42:42 | INFO    | Sampling frames for GLOBAL description...
2025-09-13 15:42:44 | INFO    | Global sampling: frames=16, segments=16, fps=25.000
2025-09-13 15:42:44 | INFO    | Generating GLOBAL description...
2025-09-13 15:42:44 | INFO    | chat_with_images | CUDA | tensor=(16, 3, 448, 448)
2025-09-13 15:42:44 | INFO    | Prompt length: 344 chars
2025-09-13 15:42:44 | INFO    | >>> Generation started...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


[TYPE]  
Documentary/Nature Film  

[SUMMARY]  
The video showcases a safari adventure in the wild, capturing scenes of vehicles navigating through open grasslands. It features close-ups of cheetahs interacting with humans and highlights various moments of wildlife observation at sunset. The footage emphasizes exploration and encounters with nature, focusing on both human activities and animal behavior in their natural habitat.2025-09-13 15:42:52 | INFO    | 
>>> Generation finished in 8.08s (431 chars).
2025-09-13 15:42:52 | INFO    | Computing 10s scene bounds...
2025-09-13 15:42:52 | INFO    | 87 scene(s) planned.
2025-09-13 15:42:52 | INFO    | [1/87] ( 1.1%) Scene 00:00:00.000–00:00:10.000
2025-09-13 15:42:53 | INFO    |   • Scene frames sampled: 10 (segments=10, frames/seg=1)
2025-09-13 15:42:53 | INFO    | chat_with_images | CUDA | tensor=(10, 3, 448, 448)
2025-09-13 15:42:53 | INFO    | Prompt length: 2135 chars
2025-09-13 15:42:53 | INFO    | >>> Generation started...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 1
- start_sec: 0.000
- end_sec: 10.000 
- start_timecode: 00:00:00.000  
- end_timecode: 01:00:10.001  

Description:
- description: Aerial view of a dirt road in an open grassland, with sparse vegetation and distant hills under a clear sky.
- actions_chronological:
  - <t≈ +0-2s>: Camera pans over the landscape from above
  - <t≈+2-4s>: The camera continues to move forward along the road

OnScreenText:
- lines:
  - "100 YEARS AGO... THERE WERE APPROXIMATELY 65,000 CHEETAHS ON THIS PLANET."

Entities:
- items:
  - type: text
    name_or_value: "100 YEARS ago.. there were approximately 65,0o0 cheetahs on this planet."
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [aerial]
- camera_angle: [top-down]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.95
- entity_detection: 0.85
2025-09-13 15:43:20 | INFO    | 
>>> Generation finished in 27.02s (910 chars).
2025-09-1

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 2  
- start_sec: 10.000  
- end_sec: 20.000   
- start_timecode: 00:00:10.00   
- end_timecode: 01:00:20.00  

Description:  
- description: Aerial view of a dirt road through an open grassland landscape, showcasing the vastness of the area during what appears to be sunset. The scene emphasizes wildlife and nature conservation themes.  
- actions_chronological:  
  - <t≈ +0-2s>: Camera continues moving forward along the road.  
  - <t≥ +2-4s>: No significant action changes observed.

OnScreenText:  
- lines: ["100 YEARS AGO, THERE WERE ABOUT 100,000 CHEETAHS ON THIS PLANET."]  

Entities:  
- items:  
  - type: text  
    name_or_value: "100 YEARS AGО, ТНЕRE WERE АВОUt 100,OOO СНЕЕТАНѕ ОN ТНІS РLАNΕ."  
    attributes: []  

Tags:  
- scene_tags: [nature, safari, wildlife]  

Shot:  
- camera_motion: [forward movement]  
- camera_angle: [wide]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.95  
- ocr

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 3
- start_sec: 20.000
- end_sec: 30.000 
- start_timecode: 00:00:20.001
- end_timecode: 01:00:30.001

Description:
- description: A dirt road runs through a vast grassland under a clear sky, leading toward distant hills. The area appears serene and largely untouched by human activity.
- actions_chronological:
  - <t≈ +0-2s>: Camera continues moving forward along the road.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines:
  - TODAY, THERE ARE FEWER THAN 8,000 LEFT.

Entities:
- items:
  []

Tags:
- scene_tags: [nature, safari, wildlife]

Shot:
- camera_motion: [forward]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.9
- entity_detection: 0.7

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
2025-09-13 15:44:15 | INFO    | 
>>> Generation finished in 24.46s (871 chars).
2025-09-13 15:44:15 | INFO    |   • Scene 3 proc

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 4
- start_sec: 30.000
- end_sec: 40.000 
- start_timecode: 00:00:30.00
- end_timecode: 01:00:40.00

Description:
- description: A man sits on a wooden porch, with grasslands visible in the background.
- actions_chronological:
  - <t≈ +0-2s>: The camera remains stationary as it captures the subject sitting calmly.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items: []
  
Tags:
- scene_tags: [nature, safari]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.75
- entity_detection: 0.852025-09-13 15:44:36 | INFO    | 
>>> Generation finished in 20.10s (679 chars).
2025-09-13 15:44:36 | INFO    |   • Scene 4 processed.
2025-09-13 15:44:36 | INFO    | [5/87] ( 5.7%) Scene 00:00:40.000–00:00:50.000
2025-09-13 15:44:36 | INFO    |   • Scene frames sampled: 10 (segments=10, frames/seg=1)
2025-09-13 15:44:3

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 5  
- start_sec: 40.000  
- end_sec: 50.000   
- start_timecode: 00:00:40.00  
- end_timecode: 01:00:50.00  

Description:  
- description: A man sits on a wooden deck, likely part of a safari adventure. The background shows an open grassland landscape. He appears to be speaking about wildlife or nature.  

Actions_chronological:  
  - <t≈ +0-2s>: the camera remains stationary as it captures the subject sitting calmly.  
  - <t≥+2-4s>: no significant action changes observed.  

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: unknown  
    attributes: [wearing gray shirt]  

Tags:  
- scene_tags: [nature, safari, wildlife]  

Shot:  
- camera_motion: [static]  
- camera_angle: [close-up]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.9  
- ocr: 0.7  
- entity_detection: 0.8  

NOTES:  
- For OnScreenText, list ALL legible text exactly as written (preserve ca

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 6
- start_sec: 50.000
- end_sec: 60.000 
- start_timecode: 00:00:50.00
- end_timecode: 01:00:00

Description:
- description: A person sits calmly indoors with a natural landscape visible through the window, while an outdoor shot captures a cheetah gazing into the distance during sunset.
- actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as it captures the subject sitting calmly.
  - <t≈+2-4s>: no significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: animal
    name_or_value： cheetah
    attributes: []

Tags:
- scene_tags: [nature, safari, wildlife]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 7
- start_sec: 60.000
- end_sec: 70.000 
- start_timecode: 00:01:00.000  
- end_timecode: 01:10.000  

Description:
- description: A person and a cheetah face each other in an open grassland during sunset, highlighting human-wildlife interaction.
- actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as the subjects sit calmly
  - <t≈+2-4s>: no significant action changes observed

OnScreenText:
- lines:
  - WILD CONNECTION

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap]
  - type: animal
    name_or_value： cheetah
    attributes: []

Tags:
- scene_tags: [nature, safari, wildlife]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.95
- entity_detection: 0.85

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
2025-09-13 15:46:06 | INFO    | 
>>> Generation finishe

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 8
- start_sec: 70.000
- end_sec: 80.000 
- start_timecode: 00:01:10.000  
- end_timecode: 01:20.000  

Description:
- description: A vehicle drives through an open grassland at sunset in Free State, South Africa. The landscape is vast and serene with distant hills.
- actions_chronological:
  - <t≈ +0-2s>: camera continues moving forward along the road
  - <t≈+2-4s>: no significant action changes observed

OnScreenText:
- lines:
  - FREE STATE, SOUTH AFRICA

Entities:
- items:
  - type: object
    name_or_value: vehicle
    attributes: []
  - type: location
    name_or_valueFree State, South Africa
    attributes: []

Tags:
- scene_tags: [nature, safari]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.9
- entity_detection: 0.85
2025-09-13 15:46:31 | INFO    | 
>>> Generation finished in 24.38s (876 chars).
2025-09-13 15:46:31 | INFO    |   • Scene 8

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 9  
- start_sec: 80.000  
- end_sec: 90.000   
- start_timecode: 00:01:20.000    
- end_timecode: 01:30.000   

Description:  
- description: A safari vehicle drives along a dirt path during sunset, with individuals observing the landscape. One person is using binoculars to view wildlife in an open grassland setting.

Actions_chronological:
  - <t≈ +0-2s>: The camera remains stationary as it captures subjects sitting calmly.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines: []  

Entities:
- items:
  - type: object
    name_or_value: Tiger Leader Vehicle
    attributes: [brand logo on side]
  - type: person
    name_or_value unknown
    attributes: [wearing cap]

Tags:
- scene_tags: [nature, safari, wildlife, observation] 

Shot:
- camera_motion: [static] 
- camera_angle: [wide] 
- cuts_or_transitions: []

Confidence:
- overall: 0.85
- ocr: 0.7
- entity_detection: 0.92025-09-13 15:4

[h264 @ 0x55fbd2316c40] mmco: unref short failure
[h264 @ 0x55fbd2316c40] mmco: unref short failure


2025-09-13 15:46:58 | INFO    |   • Scene frames sampled: 10 (segments=10, frames/seg=1)
2025-09-13 15:46:58 | INFO    | chat_with_images | CUDA | tensor=(10, 3, 448, 448)
2025-09-13 15:46:58 | INFO    | Prompt length: 3134 chars
2025-09-13 15:46:58 | INFO    | >>> Generation started...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 10
- start_sec: 90.000
- end_sec: 100.000 
- start_timecode: 00:01:30.000  
- end_timecode: 01:40.000  

Description:
- description: A man and a woman are standing in front of rustic backgrounds, likely on a safari adventure. The setting suggests wildlife observation.
- actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as subjects sit calmly
  - <t≈+2-4s>: no significant action changes observed

OnScreenText:
- lines: []

Entities:
- items: []
  
Tags:
- scene_tags: [nature, safari, wildlife]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.85
- ocr: 0.7
- entity_detection: 0.6
2025-09-13 15:47:18 | INFO    | 
>>> Generation finished in 20.67s (730 chars).
2025-09-13 15:47:18 | INFO    |   • Scene 10 processed.
2025-09-13 15:47:18 | INFO    | [11/87] (12.6%) Scene 00:01:40.000–00:01:50.000
2025-09-13 15:47:19 | INFO    |   • Scene frames samp

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


---
Meta:
- video_name: documentary_480p
- scene_id: 11
- start_sec: 100.000
- end_sec: 110.000 
- start_timecode: 00:01:40.000  
- end_timecode: 01:50.000  

Description:
A woman stands in a grassy field at sunset, observing the surroundings with her back to the camera. A lone cheetah is seen walking along a dirt path nearby.

Actions_chronological:
  - <t≈ +0-2s>: The camera remains stationary as it captures the subject standing calmly.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines: []
    
Entities:
- items:
  - type: person
    name_or_value: Kristina Perlerius
    attributes: [long hair]

Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.95
- entity_detection: 0.9

---2025-09-13 15:47:42 | INFO    | 
>>> Generation finished in 23.18s (843 chars).
2025-09-13 15:47:42 | INFO    |   • Scene 11 processed.
2025-09-13 15:47:42 |

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 12
- start_sec: 110.000
- end_sec: 120.000 
- start_timecode: 00:01:50.000  
- end_timecode: 02:00.000  

Description:
- description: A person is seated indoors against a brick wall, then transitions to walking through an open grassland at sunset during a safari adventure.
- actions_chronological:
  - <t≈ +0-2s>: The camera remains stationary as the subject sits calmly indoors.
  - <t≈+2-4s>: The individual walks across a field during sunset.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: location
    name_or_value： indoor/outdoor setting
    attributes: []

Tags:
- scene_tags: [nature, safari]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 13  
- start_sec: 120.000  
- end_sec: 130.000   
- start_timecode: 00:02:00.000     
- end_timecode: 05:02:10.000  

Description:  
- description: A man is sitting indoors with a scenic view of dry grasslands and hills in the background, suggesting a safari setting. The focus appears to be on wildlife observation or exploration.

Actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as the subject sits calmly indoors.
  - <t≈+2-4s>: no significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
    
Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets fo

[h264 @ 0x55fbcc14f080] mmco: unref short failure
[h264 @ 0x55fbcc14f080] mmco: unref short failure
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 14
- start_sec: 130.000
- end_sec: 140.000 
- start_timecode: 00:02:10.000  
- end_timecode: 01:02:20.000  

Description:
- description: A vehicle navigates through an open grassland during sunset, showcasing a safari adventure with emphasis on wildlife observation.
- actions_chronological:
  - <t≈ +0-2s>: camera continues moving forward along the road.
  - <t≈+2-4s>: no significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items: []
  
Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.8
- entity_detection: 0.7

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (0–2s, 2–4s, etc.).
- If unknown or not visible, write 'unknown' or use empty lists [].
- Return ONLY t

[h264 @ 0x55fbd230d980] mmco: unref short failure
[h264 @ 0x55fbd230d980] mmco: unref short failure
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 15
- start_sec: 140.000
- end_sec: 150.000 
- start_timecode: 00:02:20.000  
- end_timecode: 01:02:30.000  

Description:
- description: A person in a cap observes cheetahs in the grasslands during sunset as part of a safari adventure.
- actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as subject sits calmly
  - <t≈+2-4s>: individual walks across field

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [cap, sunglasses]
  - type: animal
    name_or_value： cheetah
    attributes: [multiple]

Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coa

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 16  
- start_sec: 150.000  
- end_sec: 160.000   
- start_timecode: 00:02:30.000    
- end_timecode: 01:02:40.000  

Description:  
- description: Two individuals walk across an open grassland during sunset, with a vehicle visible in the distance. They are likely on a safari adventure, observing wildlife and nature.

Actions_chronological:  
  - <t≈ +0-2s>: camera remains stationary as subjects sit calmly  
  - <t≥ +2-4s>: individual walks across field  

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: unknown  
    attributes: [wearing casual attire]  

Tags:  
- scene_tags: [nature, safari, observation]

Shot:  
- camera_motion: [static]  
- camera_angle: [wide]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.9  
- ocr: 0.7  
- entity_detection: 0.8  
2025-09-13 15:50:02 | INFO    | 
>>> Generation finished in 23.58s (891 chars).
2025-09-13 15:50:02 | INFO

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 17
- start_sec: 160.000
- end_sec: 170.000 
- start_timecode: 00:02:40.000  
- end_timecode: 05:02:50.000

Description:
- description: A couple sits on a rock, watching the sunset over an open grassland with their vehicle nearby.
- actions_chronological:
  - <t≈ +0-2s>: The camera remains stationary as subjects sit calmly.
  - <t≈+2-4s>: Individual walks across field.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: object
    name_or_value： vehicle
    attributes: [camping]

Tags:
- scene_tags: [nature, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.8
- entity_detection: 0.9

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
2025-09-13 15:50:28 | INFO    | 
>>> Generation finished in 25.71s (900 chars).
2025-09-13 15:50:28

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 18
- start_sec: 170.000
- end_sec: 180.000 
- start_timecode: 00:02:50.000  
- end_timecode: 01:03:00.000  

Description:
- description: A couple stands on a hilltop at sunset, overlooking a vast landscape with distant hills.
- actions_chronological:
  - <t≈ +0-2s>: Two individuals stand side by side and look out over the horizon.
  - <t≈+2-4s>: The camera captures their silhouettes against the colorful sky.

OnScreenText:
- lines:
  - TIGER CANYON WILDLIFE CONSERVATION 2013

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: location
    name_or_value：Tiger Canyon Wildlife Conservation
    attributes: []

Tags:
- scene_tags: [nature, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.95
- entity_detection: 0.85
---2025-09-13 15:50:56 | INFO    | 
>>> Generation finished in 26.86s (925 cha

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


---
Meta:
- video_name: documentary_480p
- scene_id: 19
- start_sec: 180.000
- end_sec: 190.000 
- start_timecode: 00:03:00.000  
- end_timecode: 01:03:10.000  

Description:
- description: A safari vehicle drives along a dirt road in an open grassland, surrounded by sparse vegetation under a clear sky.
- actions_chronological:
  - <t≈ +0-2s>: camera continues moving forward along the road
  - <t≈+2-4s>: no significant action changes observed

OnScreenText:
- lines: []

Entities:
- items:
  - type: object
    name_or_value: vehicle
    attributes: [type: SUV]

Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.8
- entity_detection: 0.9
---2025-09-13 15:51:19 | INFO    | 
>>> Generation finished in 22.30s (776 chars).
2025-09-13 15:51:19 | INFO    |   • Scene 19 processed.
2025-09-13 15:51:19 | INFO    | [20/87] (23.0%) Scene 00:03:10.000–00:03:20.000
2025-09-

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 20  
- start_sec: 190.000  
- end_sec: 200.000   
- start_timecode: 00:03:10.000    
- end_timecode: 01:03:20.000  

Description:  
- description: Two individuals in a safari vehicle are observing the landscape as they drive through grasslands, with one person using a radio device.

Actions_chronological:
  - <t≈ +0-2s>: camera continues moving forward along the road.
  - <t≈+2-4s>: individual walks across field.

OnScreenText:
- lines: []

Entities:
- items:
  - type: object
    name_or_value: vehicle
    attributes: [type: SUV]
  - type: person
    name_or_value : unknown
    attributes: [wearing cap]

Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullet

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


---
Meta:
- video_name: documentary_480p
- scene_id: 21
- start_sec: 200.000
- end_sec: 210.000 
- start_timecode: 00:03:20.000  
- end_timecode: 01:03:30.000  

Description:
- description: A person walks across an open grassland, with rolling hills in the background under a clear blue sky.
- actions_chronological:
  - <t≈ +0-2s>: individual walks across field

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [backpack, cap]

Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.7
- entity_detection: 0.85

---2025-09-13 15:52:09 | INFO    | 
>>> Generation finished in 19.96s (698 chars).
2025-09-13 15:52:09 | INFO    |   • Scene 21 processed.
2025-09-13 15:52:09 | INFO    | [22/87] (25.3%) Scene 00:03:30.000–00:03:40.000
2025-09-13 15:52:09 | INFO    |   • Scene frames sampled: 10 (segments=10, frames/seg=

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 22
- start_sec: 210.000
- end_sec: 220.000 
- start_timecode: 00:03:30.000  
- end_timecode: 01:03:40.000  

Description:
- description: Two cheetahs are sitting calmly on a dirt landscape under a clear blue sky, likely part of a safari adventure showcasing wildlife observation.
- actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as subjects sit calmly.

OnScreenText:
- lines:
  - RUNDE AND SABI
  - 2010-2019

Entities:
- items:
  - type: object
    name_or_value: cheetahs
    attributes: [spotted fur]

Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.9
- entity_detection: 0.92025-09-13 15:52:33 | INFO    | 
>>> Generation finished in 23.87s (801 chars).
2025-09-13 15:52:33 | INFO    |   • Scene 22 processed.
2025-09-13 15:52:33 | INFO    | [23/87] (26.4%) Scene 00:03:40.

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 23
- start_sec: 220.000
- end_sec: 230.000 
- start_timecode: 00:03:40.000  
- end_timecode: 01:03:50.000  

Description:
- description: Two cheetah cubs are interacting closely on a dirt hill under clear blue skies, likely part of a wildlife observation segment in a safari setting.
- actions_chronological:
  - <t≈ +0-2s: camera remains stationary as subjects sit calmly>
  - <t≈+2-4s: no significant action changes observed>

OnScreenText:
- lines: []

Entities:
- items:
  - type: animal
    name_or_value: cheetah cubs
    attributes: [young, spotted fur]

Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.7
- entity_detection: 0.9

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---
Meta:  
- video_name: documentary_480p  
- scene_id: 24  
- start_sec: 230.000  
- end_sec: 240.000   
- start_timecode: 00:03:50.000    
- end_timecode: 01:04:00.000   

Description:  
- description: A cheetah is captured walking in the grasslands during sunset, showcasing its spotted fur and majestic presence. The setting appears to be part of a safari adventure emphasizing wildlife observation.

Actions_chronological:  
  - <t≈ +0-2s>: camera remains stationary as subject sits calmly indoors.
  - <t≈+2-4s>: no significant action changes observed.

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: unknown  
    attributes: []  
  - type: animal  
    name_or_value : cheetah  
    attributes: [spotted fur]  

Tags:  
- scene_tags: [nature, safari, wildlife]

Shot:  
- camera_motion: [static]  
- camera_angle: [close-up]  
- cuts_or_transitions: []

Confidence:  
- overall: 0.9  
- ocr: 0.7  
- entity_detection: 0.8  
---20

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


---
Meta:
- video_name: documentary_480p
- scene_id: 25
- start_sec: 240.000
- end_sec: 250.000 
- start_timecode: 00:04:00.000  
- end_timecode: 01:04:10.000  

Description:
- description: A cheetah is silhouetted against a sunset in an open grassland, with tall grasses surrounding it.

Actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as the subject sits calmly

OnScreenText:
- lines: []

Entities:
- items:
  - type: object
    name_or_value: cheetah
    attributes: []
    
Tags:
- scene_tags: [nature, wildlife]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.7
- entity_detection: 0.9
---2025-09-13 15:53:53 | INFO    | 
>>> Generation finished in 20.67s (688 chars).
2025-09-13 15:53:53 | INFO    |   • Scene 25 processed.
2025-09-13 15:53:53 | INFO    | [26/87] (29.9%) Scene 00:04:10.000–00:04:20.000
2025-09-13 15:53:54 | INFO    |   • Scene frames sampled: 10 (segments=10, frames/seg=1)
2025-09

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 26
- start_sec: 250.000
- end_sec: 260.000 
- start_timecode: 00:04:10.000  
- end_timecode: 01:04:20.000  

Description:
- description: A person sits calmly indoors, with a wooden structure and natural landscape visible through the window. The setting suggests wildlife observation or exploration.
- actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as subject sits calmly

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
    
Tags:
- scene_tags: [nature, safari, wildlife]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8
2025-09-13 15:54:15 | INFO    | 
>>> Generation finished in 21.01s (767 chars).
2025-09-13 15:54:15 | INFO    |   • Scene 26 processed.
2025-09-13 15:54:15 | INFO    | [27/87] (31.0%) Scene 00:04:20.000–00:04:30.000
2025-09-13 15:54

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 27  
- start_sec: 260.000  
- end_sec: 270.000   
- start_timecode: 00:04:20.000     
- end_timecode: 01:04:30.000  

Description:  
- description: A close-up of a cheetah's face resting on the ground, emphasizing wildlife observation during a safari adventure. The setting suggests an outdoor environment with natural surroundings.  

Actions_chronological:  
- <t≈ +0-2s>: camera remains stationary as subject sits calmly  

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: object  
    name_or_value: cheetah  
    attributes: [resting]  

Tags:  
- scene_tags: [nature, safari, wildlife, observation]  

Shot:  
- camera_motion: [static]  
- camera_angle: [close-up]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.9  
- ocr: 0.8  
- entity_detection: 0.9  

NOTES:  
- For OnScreenText, list ALL legible text exactly as written (preserve case).  
- Prefer short bullets for actions/tags/entitie

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 28  
- start_sec: 270.000  
- end_sec: 280.000   
- start_timecode: 00:04:30.000    
- end_timecode: 01:04:40.000  

Description:  
- description: A man sits calmly indoors, speaking with a wooden wall and railing in the background. The setting suggests an outdoor or safari-themed environment. He gestures slightly while talking.  
- actions_chronological:  
  - <t≈ +0-2s>: Man remains stationary while sitting and talking.  
  - <t≡ +2-4s>: Man continues to gesture subtly as he speaks.  

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: unknown  
    attributes: [wearing gray shirt]  

Tags:  
- scene_tags: [nature, wildlife, observation]  

Shot:  
- camera_motion: [static]  
- camera_angle: [close-up]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.95  
- ocr: 0.90  
- entity_detection: 0.92  

NOTES:  
- For OnScreenText, list ALL legible text exactly as wr

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---
Meta:  
- video_name: documentary_480p  
- scene_id: 29  
- start_sec: 280.000  
- end_sec: 290.000   
- start_timecode: 00:04:40.000    
- end_timecode: 01:04:50.000  

Description:  
- description: A man interacts gently with a cheetah cub in an open grassland during sunset, showcasing human-animal connection on a safari adventure. The setting emphasizes nature and wildlife observation.

Actions_chronological:
  - <t≈ +0-2s>: Man sits calmly interacting with the cheetah.
  - <t≈+2-4s>: Cheetah nuzzles against the man as he pets it.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap, blue shirt]
  - type: animal
    name_or_value：cheetah cub
    attributes: [young, spotted fur]

Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [static] 
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

NOTES:
- F

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 30
- start_sec: 290.000
- end_sec: 300.000 
- start_timecode: 00:04:50.000  
- end_timecode: 01:05:00.00  

Description:
- description: A man in a safari outfit sits calmly outdoors with a cheetah cub, gently petting it as they interact.
- actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while sitting and talking.
  - <t≈+2-4s>: Cheetah approaches the man; he pets its head.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap, blue shirt]
  - type: animal
    name_or_value：cheetah cub
    attributes: [young, spotted fur]

Tags:
- scene_tags: [nature, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
2025-09-13 15:56:17 | INFO    | 
>>> 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 31
- start_sec: 300.000
- end_sec: 310.000 
- start_timecode: 00:05:00.00
- end_timecode: 01:05:10.00

Description:
- description: A man sits calmly interacting with a cheetah in an open grassland during sunset, highlighting the close bond between humans and wildlife.
- actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while sitting and petting the cheetah.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap]
  - type: animal
    name_or_value：cheetah
    attributes: [spot pattern]

Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.85

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology co

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 32
- start_sec: 310.000
- end_sec: 320.000 
- start_timecode: 00:05:10.00  
- end_timecode: 01:05:20.00  

Description:
- description: A vehicle drives along a dirt path in an expansive grassland at sunset, with vast open plains stretching into the distance.

- actions_chronological:
  - <t≈ +0-2s>: camera pans over the landscape from above.
  - <t≈+2-4s>: no significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items: []
  
Tags:
- scene_tags: [nature, safari, wildlife, observation]

Shot:
- camera_motion: [pan]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
2025-09-13 15:57:12 | INFO    | 
>>> Generation finished in 22.72s (807 chars).
2025-09-13 15:57:12 | INFO    |   • Scene 32 processed.
2025-09-13 15:57:12 | INFO    | [33/87] (37.9%) Scene 00

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 33
- start_sec: 320.000
- end_sec: 330.000 
- start_timecode: 00:05:20.00
- end_timecode: 01:05:30.00

Description:
- description: Aerial view of a dry, grassy landscape with patches of vegetation and small water channels.
- actions_chronological:
  - <t≈ +0-2s>: camera pans over the landscape from above

OnScreenText:
- lines: []

Entities:
- items: []
  
Tags:
- scene_tags: [nature, safari, wildlife]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
2025-09-13 15:57:33 | INFO    | 
>>> Generation finished in 20.11s (704 chars).
2025-09-13 15:57:33 | INFO    |   • Scene 33 processed.
2025-09-13 15:57:33 | INFO    | [34/87] (39.1%) Scene 00:05:30.000–00:05:40.000


[h264 @ 0x55fbd2316c40] mmco: unref short failure


2025-09-13 15:57:33 | INFO    |   • Scene frames sampled: 10 (segments=10, frames/seg=1)
2025-09-13 15:57:33 | INFO    | chat_with_images | CUDA | tensor=(10, 3, 448, 448)
2025-09-13 15:57:33 | INFO    | Prompt length: 3551 chars
2025-09-13 15:57:33 | INFO    | >>> Generation started...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 34
- start_sec: 330.000
- end_sec: 340.000 
- start_timecode: 00:05:30.00
- end_timecode: 01:05:40.00

Description:
- description: A herd of sheep grazing in a dry, grassy landscape with rocky patches.
- actions_chronological:
  - <t≈ +0-2s>: Sheep graze on the ground.

OnScreenText:
- lines: []

Entities:
- items:
  - type: animal
    name_or_value: sheep
    attributes: [herd]

Tags:
- scene_tags: [nature, wildlife]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.9

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (0–2s, 2–4s, etc.).
- If unknown or not visible, write 'unknown' or use empty lists [].
- Return ONLY the card content above (no extra commentary).2025-09-13 15:58:00 | INFO    | 
>>> Generation finished in

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 35
- start_sec: 340.000
- end_sec: 350.000 
- start_timecode: 00:05:40.00
- end_timecode: 01:05:50.00

Description:
- description: A woman speaks indoors while a cheetah is shown resting in the wild, observed by two individuals seated nearby.
- actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as subjects sit calmly.
  - <t≈+2-4s>: no significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [standing]
  - type: animal
    name_or_value： cheetah
    attributes: [resting]

Tags:
- scene_tags: [nature, wildlife, observation]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.9

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology c

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 36  
- start_sec: 350.000  
- end_sec: 360.000   
- start_timecode: 00:05:50.00  
- end_timecode: 01:06:00.00  

Description:  
- description: A cheetah is seen resting in the tall grass, with two individuals observing nearby. The setting appears to be a savanna landscape during sunset, typical of a safari adventure.

Actions_chronological:  
  - <t≈ +0-2s>: Cheetah rests calmly among the vegetation.  
  - <t≡ +2-4s>: Individuals remain stationary while sitting and talking.

OnScreenText:  
- lines:  
  - MARA

Entities:  
- items:  
  - type: person  
    name_or_value: unknown  
    attributes: []  
  - type: animal  
    name_or_value： cheetah  
    attributes: [resting]  
  - type: location  
    name_or_value：“Savanna”  
    attributes: []

Tags:  
- scene_tags: [nature, wildlife, observation]

Shot:  
- camera_motion: [static]  
- camera_angle: [close-up]  
- cuts_or_transitions: []

Confidence:  
- overa

[h264 @ 0x55fbcc14f080] mmco: unref short failure
[h264 @ 0x55fbcc14f080] mmco: unref short failure


2025-09-13 15:59:01 | INFO    |   • Scene frames sampled: 10 (segments=10, frames/seg=1)
2025-09-13 15:59:01 | INFO    | chat_with_images | CUDA | tensor=(10, 3, 448, 448)
2025-09-13 15:59:01 | INFO    | Prompt length: 3517 chars
2025-09-13 15:59:01 | INFO    | >>> Generation started...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 37
- start_sec: 360.000
- end_sec: 370.000 
- start_timecode: 00:06:00.000  
- end_timecode: 01:06:10.000  

Description:
- description: A man and woman sit beside a cheetah in a grassland setting during sunset, with the man speaking while gesturing.
- actions_chronological:
  - <t≈ +0-2s>: subjects sit calmly as they converse.
  - <t≈+2-4s>: no significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [man]
  - type: person 
    name_or_value: Kristina Perlerius
    attributes: []
  - type: animal
    name_or_value : cheetah
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.9
- entity_detection: 0.85

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer s

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 38
- start_sec: 370.000
- end_sec: 380.000 
- start_timecode: 00:06:10.000  
- end_timecode: 05:20.000

Description:
- description: A man is sitting calmly on a wooden porch, speaking directly to the camera, with dry grass and natural scenery in the background.
- actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while talking.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [shirt]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.9
- entity_detection: 0.85
2025-09-13 15:59:56 | INFO    | 
>>> Generation finished in 21.19s (735 chars).
2025-09-13 15:59:56 | INFO    |   • Scene 38 processed.
2025-09-13 15:59:56 | INFO    | [39/87] (44.8%) Scene 00:06:20.000–00:06:30.000
2025-09-13 15:59:57 | INFO    |   • Scene frames

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 39
- start_sec: 380.000
- end_sec: 390.000 
- start_timecode: 00:06:20.000  
- end_timecode: 01:06:30.000  

Description:
- description: A cheetah is standing in a grassy field, partially obscured by tall vegetation.
- actions_chronological:
  - <t≈ +0-2s>: Cheetah remains stationary in the grass.

OnScreenText:
- lines: []

Entities:
- items: []
  
Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (0–2s, 2–4s, etc.).
- If unknown or not visible, write 'unknown' or use empty lists [].
2025-09-13 16:00:22 | INFO    | 
>>> Generation finished in 24.82s (858 chars).
2025-09-13 16:00:22 | INFO    |   • Scene 39 processed.
2025-

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


---
Meta:
- video_name: documentary_480p
- scene_id: 40
- start_sec: 390.000
- end_sec: 400.000 
- start_timecode: 00:06:30.000  
- end_timecode: 01:06:40.000  

Description:
- description: A man is sitting on a wooden porch, speaking about wildlife observation in a grassland setting.
- actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while talking.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [shirt]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.7
- entity_detection: 0.8

---2025-09-13 16:00:42 | INFO    | 
>>> Generation finished in 20.09s (695 chars).
2025-09-13 16:00:42 | INFO    |   • Scene 40 processed.
2025-09-13 16:00:42 | INFO    | [41/87] (47.1%) Scene 00:06:40.000–00:06:50.000
2025-09-13 16:00:43 | INFO    |   • Scene frames sampled: 10 (segments=10, frames/seg=1)


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 41
- start_sec: 400.000
- end_sec: 410.000 
- start_timecode: 00:06:40.000  
- end_timecode: 01:06:50.000  

Description:
- description: A cheetah is shown in close-up against a backdrop of trees and sky, possibly interacting with humans or being observed.
- actions_chronological:
  - <t≈ +0-2s>: Cheetah remains stationary while looking around.
  - <t≈+2-4s>: Cheetah opens mouth wide, displaying teeth.

OnScreenText:
- lines: []

Entities:
- items:
  - type: animal
    name_or_value: cheetah
    attributes: [spots]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.9

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (0–2s, 2–4s, etc.).
- If unknown or not vi

[h264 @ 0x55fbcc14f080] mmco: unref short failure
[h264 @ 0x55fbcc14f080] mmco: unref short failure
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 42
- start_sec: 410.000
- end_sec: 420.000 
- start_timecode: 00:06:50.000  
- end_timecode: 01:07:00.000  

Description:
- description: A cheetah sits calmly in a grassy field under a clear blue sky, captured during a safari adventure.
- actions_chronological:
  - <t≈ +0-2s>: Cheetah remains stationary while looking around.

OnScreenText:
- lines: []

Entities:
- items: []
  
Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.9 
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (0–2s, 2–4s, etc.).
- If unknown or not visible, write 'unknown' or use empty lists [].
2025-09-13 16:01:39 | INFO    | 
>>> Generation finished in 25.32s (891 chars).
2025-09-13 16:01:39 | INFO  

[h264 @ 0x55fbd2311740] mmco: unref short failure
[h264 @ 0x55fbd2311740] mmco: unref short failure


2025-09-13 16:01:39 | INFO    |   • Scene frames sampled: 10 (segments=10, frames/seg=1)
2025-09-13 16:01:39 | INFO    | chat_with_images | CUDA | tensor=(10, 3, 448, 448)
2025-09-13 16:01:39 | INFO    | Prompt length: 3526 chars
2025-09-13 16:01:39 | INFO    | >>> Generation started...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 43  
- start_sec: 420.000  
- end_sec: 430.000   
- start_timecode: 00:07:00.000     
- end_timecode: 01:07:10.000  

Description:  
- description: A cheetah stands in a grassy field as Kristina Perlerius observes from nearby, highlighting human-animal interaction during a safari.  
- actions_chronological:  
  - <t≈ +0-2s>: camera remains stationary capturing the subjects.  
  - <t≥ +2-4s>: no significant action changes observed.  

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: Kristina Perlerius  
    attributes: [standing]  
  - type: animal  
    name_or_value： cheetah  
    attributes: [stationary, observing]  

Tags:  
- scene_tags: [nature, wildlife, safari, observation]  

Shot:  
- camera_motion: [static]  
- camera_angle: [wide]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.95  
- ocr: 0.90  
- entity_detection: 0.852025-09-13 16:02:07 | INFO  

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 44
- start_sec: 430.000
- end_sec: 440.000 
- start_timecode: 00:07:10.000  
- end_timecode: 01:07:20.000  

Description:
- description: Two individuals sit calmly in a grassland observing a cheetah standing nearby. The setting is serene and natural, capturing the essence of wildlife observation during sunset.
- actions_chronological:
  - <t≈ +0-2s>: Individuals remain stationary while sitting and watching the cheetah.
  - <t≈+2-4s>: No significant action changes observed as subjects continue to observe.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: Kristina Perlerius
    attributes: [female, sunglasses]
  - type: person 
    name_or_value: unknown
    attributes: [male, cap, sunglasses]
  -.type: animal
    name_or_value:: cheetah
    attributes: []
    
Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_tra

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 45  
- start_sec: 440.000  
- end_sec: 450.000   
- start_timecode: 00:07:20.000    
- end_timecode: 01:07:30.000  

Description:  
- description: A person sits calmly with a cheetah standing beside them in an open grassland during daylight, capturing human-animal interaction on a safari. The background reveals more cheetahs and expansive plains.

Actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as subjects sit calmly.
  - <t≈+2-4s>: no significant action changes observed.

OnScreenText:  
- lines: []  

Entities:  
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing sunglasses, cap]
  - type: animal
    name_or_value : cheetah
    attributes: [standing, spotted fur]

Tags:  
- scene_tags: [nature, wildlife, safari, observation]  

Shot:  
- camera_motion: [static]  
- camera_angle: [wide]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.9  
- ocr: 0.8  
- enti

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 46  
- start_sec: 450.000  
- end_sec: 460.000   
- start_timecode: 00:07:30.000    
- end_timecode: 01:07:40.000  

Description:  
- description: A man walks across a grassy plain with two cheetahs during sunset, emphasizing the peaceful coexistence between humans and wildlife in their natural habitat.

Actions_chronological:  
  - <t≈ +0-2s>: Man continues walking while cheetah remains stationary.
  - <t≥ +2-4s>: No significant action changes observed.

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: unknown  
    attributes: [wearing dark clothing]  
  - type: animal  
    name_or_value： cheetah  
    attributes: [two cheetahs accompanying the man]

Tags:  
- scene_tags: [nature, wildlife, safari, observation]

Shot:  
- camera_motion: [static]  
- camera_angle: [wide]  
- cuts_or_transitions: []

Confidence:  
- overall: 0.9  
- ocr: 0.8  
- entity_detection: 0.92

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 47
- start_sec: 460.000
- end_sec: 470.000 
- start_timecode: 00:07:40.000  
- end_timecode: 01:07:50.000  

Description:
- description: A person and a dog are walking through an open grassland during sunset, with distant hills in the background.
- actions_chronological:
  - <t≈ +0-2s>: Person walks with a dog across grassland
  - <t≈+2-4s>: No significant action changes observed

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: animal
    name_or_value：dog
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.95
- entity_detection: 0.85
2025-09-13 16:03:57 | INFO    | 
>>> Generation finished in 24.01s (813 chars).
2025-09-13 16:03:57 | INFO    |   • Scene 47 processed.
2025-09-13 16:03:57 | INFO    | [48/87] (55.2%) Sc

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 48  
- start_sec: 470.000  
- end_sec: 480.000   
- start_timecode: 00:07:50.000    
- end_timecode: 01:08:00.000  

Description:  
- description: A man is sitting outdoors, speaking calmly against a wooden backdrop with natural scenery visible in the background.

Actions_chronological:  
  - <t≈ +0-2s>: Man remains stationary while talking.  
  - <t≥ +2-4s>: No significant action changes observed.  

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: unknown  
    attributes: [wearing grey shirt]  

Tags:  
- scene_tags: [nature, wildlife, safari, observation]  

Shot:  
- camera_motion: [static]  
- camera_angle: [close-up]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.9  
- ocr: 0.8  
- entity_detection: 0.8  

NOTES:  
- For OnScreenText, list ALL legible text exactly as written (preserve case).  
- Prefer short bullets for actions/tags/entities; keep chr

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 49
- start_sec: 480.000
- end_sec: 490.000 
- start_timecode: 00:08:00.000  
- end_timecode: 01:08:10.000  

Description:
- description: Aerial view of a person interacting with an ostrich in a grassy, rocky landscape.
- actions_chronological:
  - <t≈ +0-2s>: camera pans over the landscape from above
  - <t≥ +2-4s>: no significant action changes observed

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: animal
    name_or_value：ostrich
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [pan]
- camera_angle: [top-down]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.92025-09-13 16:04:51 | INFO    | 
>>> Generation finished in 23.47s (790 chars).
2025-09-13 16:04:51 | INFO    |   • Scene 49 processed.
2025-09-13 16:04:51 | INFO    | [50/87] (57.5%) Scene 00:08:10.000–00:08:2

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 50
- start_sec: 490.000
- end_sec: 500.000 
- start_timecode: 00:08:10.000  
- end_timecode: 01:08:20.000  

Description:
- description: A man walks alongside a cheetah in an open grassland during sunset, highlighting human-wildlife interaction on a safari.
- actions_chronological:
  - <t≈ +0-2s>: Man and cheetah walk together along the dirt path.
  - <t≈+2-4s>: Cheetah sniffs the ground while walking beside the man.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap, dark clothing]
  - type: animal
    name_or_value： cheetah
    attributes: [adult]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

2025-09-13 16:05:17 | INFO    | 
>>> Generation finished in 25.52s (898 chars).
2025-09-13 16:05:17 

[h264 @ 0x55fbd0725f00] mmco: unref short failure
[h264 @ 0x55fbd0725f00] mmco: unref short failure


2025-09-13 16:05:18 | INFO    |   • Scene frames sampled: 10 (segments=10, frames/seg=1)
2025-09-13 16:05:18 | INFO    | chat_with_images | CUDA | tensor=(10, 3, 448, 448)
2025-09-13 16:05:18 | INFO    | Prompt length: 3541 chars
2025-09-13 16:05:18 | INFO    | >>> Generation started...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---
Meta:  
- video_name: documentary_480p  
- scene_id: 51  
- start_sec: 500.000  
- end_sec: 510.000   
- start_timecode: 00:08:20.000    
- end_timecode: 01:08:30.000  

Description:  
- description: A man walks through a grassland with a cheetah following him, set against an expansive savanna landscape.

Actions_chronological:
  - <t≈ +0-2s>: Man walking while holding something.
  - <t≈+2-4s>: Cheetah follows closely behind the man.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap]
  - type: animal
    name_or_value : cheetah
    attributes: []
    
Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static] 
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

---2025-09-13 16:05:42 | INFO    | 
>>> Generation finished in 23.84s (846 chars).
2025-09-13 16:05:42 | INFO    |   • Scene 51 processed.
2025-09-13 16:05:4

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 52
- start_sec: 510.000
- end_sec: 520.000 
- start_timecode: 00:08:30.000  
- end_timecode: 01:08:40.000

Description:
- description: A man and a cheetah stand in an open grassland during sunset, surrounded by bushes with distant hills on the horizon.
- actions_chronological:
  - <t≈ +0-2s>: Cheetah remains stationary while looking around.
  - <t≈+2-4s>: Man stands calmly nearby.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [standing]
  - type: animal
    name_or_value：cheetah
    attributes: [stationary]

Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.9

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (0–2s,

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 53  
- start_sec: 520.000  
- end_sec: 530.000   
- start_timecode: 00:08:40.000     
- end_timecode: 01:08:50.000     

Description:  
- description: A person sits in a grassy savannah with a cheetah standing beside them, observing the landscape during sunset.  

Actions_chronological:  
  - <t≈ +0-2s>: camera remains stationary as subjects sit calmly.  
  - <t≥ +2-4s>: no significant action changes observed.  

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: unknown  
    attributes: [wearing dark clothing]  
  - type: animal  
    name_or_value：cheetah  
    attributes: [standing nearby]  

Tags:  
- scene_tags: [nature, wildlife, safari, observation]  

Shot:  
- camera_motion: [static]  
- camera_angle: [wide]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.9  
- ocr: 0.7  
- entity_detection: 0.92025-09-13 16:06:40 | INFO    | 
>>> Generation finished 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 54
- start_sec: 530.000
- end_sec: 540.000 
- start_timecode: 00:08:50.000  
- end_timecode: 01:09:00.000  

Description:
- description: The camera captures a vast grassland landscape with two individuals standing amidst the tall grass, observing their surroundings during a safari adventure.

Actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while looking into the distance.
  - <t≥ +2-4s>: No significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: person 
    name_or_value: Kristina Perlerius 
    attributes: [inferred from global context]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.85
- ocr: 0.7
- entity_detection: 0.8
---2025-09-13 16:07:06 | INFO    | 
>>> Generation finished in 25.

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 55
- start_sec: 540.000
- end_sec: 550.000 
- start_timecode: 00:09:00.000  
- end_timecode: 01:09:10.000  

Description:
- description: A man stands in an open grassland, observing cheetahs nearby. The setting is a natural habitat with vast plains under a clear sky.
- actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while looking out at the landscape>
  - <t≈+2-4s>: No significant action changes observed>

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [standing, facing away]
  - type: animal
    name_or_value：cheetah
    attributes: [multiple]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

---2025-09-13 16:07:32 | INFO    | 
>>> Generation finished in 25.27s (902 chars).
2025-09-13 16:07:3

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 56
- start_sec: 550.000
- end_sec: 560.000 
- start_timecode: 00:09:10.000  
- end_timecode: 01:00:20.000  

Description:
- description: A man walks through tall grasslands with a dog, while a cheetah is seen in the background. The setting appears to be part of a safari adventure.
- actions_chronological:
  - <t≈ +0-2s>: Man and dog walking through grassland
  - <t≈+2-4s>: Cheetah remains stationary amidst the vegetation

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [walking]
  - type: animal
    name_or_value：dog
    attributes: []
  - type: animal 
    name_or_value: cheetah
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8
2025-09-13 16:07:59 | INFO    | 
>>> Generation finished in 26.27s (924 cha

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---
Meta:  
- video_name: documentary_480p  
- scene_id: 57  
- start_sec: 560.000  
- end_sec: 570.000   
- start_timecode: 00:09:20.000     
- end_timecode: 01:09:30.000      

Description:  
- description: A man sits calmly on a grassy plain facing a cheetah in the wild, with vast open fields surrounding them.

Actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as subjects sit calmly
  - <t≥ +2-4s>: no significant action changes observed

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap]
  - type: animal
    name_or_value：cheetah
    attributes: []
    
Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static] 
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.9 
- entity_detection: 0.92025-09-13 16:08:24 | INFO    | 
>>> Generation finished in 24.41s (857 chars).
2025-09-13 16:08:24 | INFO    |   • Scene 57 processed.
2025-09

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 58
- start_sec: 570.000
- end_sec: 580.000 
- start_timecode: 00:09:30.000  
- end_timecode: 01:09:40.000  

Description:
- description: A man sits calmly near a wooden structure while two cheetah cubs explore the grasslands nearby.
- actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while talking.
  - <t≈+2-4s>: Cheetah cub moves through tall grass.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [casual clothing, sitting]
  - type: animal
    name_or_value : cheetah cub
    attributes: [young, exploring]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.8
- entity_detection: 0.9

2025-09-13 16:08:48 | INFO    | 
>>> Generation finished in 24.00s (856 chars).
2025-09-13 16:08:48 | INFO    |   • Scene 58 processed.
2025-0

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 59  
- start_sec: 580.000  
- end_sec: 590.000   
- start_timecode: 00:09:40.000    
- end_timecode: 01:09:50.000  

Description:  
- description: A man is observing a cheetah cub in tall grass near water, capturing the serene interaction between humans and wildlife during a safari adventure.

Actions_chronological:
  - <t≈ +0-2s>: Cheetah cub drinks from water while being observed by a man.
  - <t≈+2-4s>: The man remains stationary as he watches the cheetah cub move through the grass.

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: unknown  
    attributes: [wearing cap and jacket]  
  - type: animal  
    name_or_value：cheetah cub  
    attributes: [drinking from water]

Tags:  
- scene_tags: [nature, wildlife, safari, observation]

Shot:  
- camera_motion: [static]  
- camera_angle: [close-up]  
- cuts_or_transitions: []

Confidence:  
- overall: 0.9  
- ocr: 0.7 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 60
- start_sec: 590.000
- end_sec: 600.000 
- start_timecode: 00:09:50.000  
- end_timecode: 01:10:00.00  

Description:
- description: A cheetah cub climbs a tree in an open grassland near water, showcasing its playful and exploratory behavior.

Actions_chronological:
  - <t≈ +0-2s>: Cheetah cub continues climbing the tree.
  - <t≈+2-4s>: No significant action changes observed as the cub reaches higher branches.

OnScreenText:
- lines: []

Entities:
- items: []
  
Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.0
- entity_detection: 0.9

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (0–2s, 2–4s, etc.).2025-09-13 16:09:49 | INFO    | 
>>> Generation finished in 25.52s (913 chars).
2025-09

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 61  
- start_sec: 600.000  
- end_sec: 610.000   
- start_timecode: 00:10:00.00  
- end_timecode: 01:00:10.00  

Description:  
- description: Aerial view of cheetah cubs and a person in an open grassland area during sunset, likely part of a safari exploration.

Actions_chronological:  
  - <t≈ +0-2s>: camera pans over the landscape from above.  
  - <t≥ +2-4s>: no significant action changes observed.  

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: object  
    name_or_value: cheetah cubs  
    attributes: [unknown]  
  - type: person/team/logo/brand/location/object/number  
    name_or_value:** person**  
    attributes: [standing]  

Tags:  
- scene_tags: [nature, wildlife, safari, observation]

Shot:  
- camera_motion: [static]  
- camera_angle: [top-down]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.9  
- ocr: 0.7  
- entity_detection: 0.82025-09-13 16:10:17 | INFO    | 
>>> 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 62  
- start_sec: 610.000  
- end_sec: 620.000   
- start_timecode: 00:10:10.00  
- end_timecode: 01:0:20.00  

Description:  
- description: A man observes a cheetah interacting with the remains of an animal in a grassland at sunset. The setting is part of a safari adventure, highlighting wildlife observation and human-animal interaction.

Actions_chronological:  
  - <t≈ +0-2s>: Cheetah interacts with carcass while man watches calmly.  
  - <t≥ +2-4s>: No significant action changes observed as both remain stationary.

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: unknown  
    attributes: [wearing cap, jacket]  
  - type: animal  
    name_or_value : cheetah  
    attributes: spotted fur  

Tags:  
- scene_tags: [nature, wildlife, safari, observation]

Shot:  
- camera_motion: [static]  
- camera_angle: [close-up]  
- cuts_or_transitions: []

Confidence:  
- overa

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 63
- start_sec: 620.000
- end_sec: 630.000 
- start_timecode: 00:10:20.00₀
- end_timecode: 0o:1o:3o.ooo

Description:
- description: A cheetah is interacting with a human in an open grassland at sunset, likely part of a wildlife observation moment.
- actions_chronological:
  - <t≈ +0-2s>: Cheetah remains stationary while observing the carcass.
  - <t≈+2-4s>: Human sits calmly and observes the cheetah.

OnScreenText:
- lines:
  - MASHAI

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: animal
    name_or_value： cheetah
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.95
- entity_detection: 0.85
2025-09-13 16:11:10 | INFO    | 
>>> Generation finished in 24.41s (861 chars).
2025-09-13 16:11:10 | INFO    |   • Scene 63 processed.
20

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 64  
- start_sec: 630.000  
- end_sec: 640.000   
- start_timecode: 00:10:30.00  
- end_timecode: 01:10:40.00  

Description:  
- description: A man is squatting in a grassy savanna, observing a cheetah that lies nearby. The setting appears to be during sunset with open fields stretching into the distance.

Actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while talking.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing jacket and cap]
  - type: animal
    name_or_value : cheetah
    attributes: [lying down]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static] 
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.7
- entity_detection: 0.92025-09-13 16:11:37 | INFO    | 
>>> Generation finished in 25.92s (93

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 65
- start_sec: 640.000
- end_sec: 650.000 
- start_timecode: 00:10:40.00
- end_timecode: 01:10:50.00

Description:
- description: A man sits in a grassy savannah observing a cheetah interacting with an animal carcass.
- actions_chronological:
  - <t≈ +0-2s>: Cheetah remains stationary while looking at the carcass.
  - <t≈+2-4s>: Man observes and holds something small.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap, dark clothing]
  - type: object
    name_or_value： cheetah
    attributes: []
  
Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (0–2s, 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 66
- start_sec: 650.000
- end_sec: 660.000 
- start_timecode: 00:10:50.00
- end_timecode: 01:11:00.00

Description:
- description: A man is observing a cheetah feeding on prey in an open grassland during sunset, highlighting the interaction between humans and wildlife.
- actions_chronological:
  - <t≈ +0-2s>: camera remains stationary as subjects sit calmly
  - <t≈+2-4s>: no significant action changes observed

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap]
  - type: animal
    name_or_value：cheetah
    attributes: [feeding]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.7
- entity_detection: 0.85

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 67
- start_sec: 660.000
- end_sec: 670.000 
- start_timecode: 00:11:00.000  
- end_timecode: 01:11:10.000  

Description:
- description: A man interacts with a cheetah in the grasslands during sunset, emphasizing wildlife observation and human-animal interaction.
- actions_chronological:
  - <t≈ +0-2s>: Cheetah remains stationary while looking around.
  - <t≈+2-4s>: Man sits calmly observing the cheetah.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap, jacket]
  - type: animal
    name_or_value： cheetah
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.5
- entity_detection: 0.9
2025-09-13 16:13:06 | INFO    | 
>>> Generation finished in 25.15s (874 chars).
2025-09-13 16:13:06 | INFO    |   • Scene 67 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 68
- start_sec: 670.000
- end_sec: 680.000 
- start_timecode: 00:11:10.000  
- end_timecode: 01:11:20.000  

Description:
- description: A man sits in a grassy savanna with a cheetah nearby, engaging in observation during sunset.
- actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while talking and gesturing towards the cheetah.
  - <t≈+2-4s>: Cheetah stays still, lying down in the grass.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing jacket and cap]
  - type: animal
    name_or_value：cheetah
    attributes: [lying in grass]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.9
- entity_detection: 0.92025-09-13 16:13:33 | INFO    | 
>>> Generation finished in 26.03s (886 chars).
2025-09-13 16:13:33 | INFO    |   

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 69
- start_sec: 680.000
- end_sec: 690.000 
- start_timecode: 00:11:20.000  
- end_timecode: 01:11:30.000  

Description:
- description: A man is observing cheetahs in a grassland area during the daytime. The interaction appears calm and observational.
- actions_chronological:
  - <t≈ +0-2s>: Cheetah remains stationary while looking around.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap, outdoor clothing]
  - type: animal
    name_or_value： cheetah
    attributes: [multiple]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.7
- entity_detection: 0.9
2025-09-13 16:13:58 | INFO    | 
>>> Generation finished in 24.20s (882 chars).
2025-09-13 16:13:58 | INFO    |   • S

[h264 @ 0x55fbcd94f000] mmco: unref short failure
[h264 @ 0x55fbcd94f000] mmco: unref short failure
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 70  
- start_sec: 690.000  
- end_sec: 700.000   
- start_timecode: 00:11:30.000    
- end_timecode: 01:11:40.000  

Description:  
- description: A man sits on a wooden structure, likely sharing insights about wildlife during a safari. The background features open grasslands, indicative of a natural habitat setting.

Actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while talking.
  - <t≥ +2-4s>: No significant action changes observed.

OnScreenText:  
- lines: []  

Entities:  
- items:
  - type: person
    name_or_value: unknown
    attributes: []

Tags:  
- scene_tags: [nature, wildlife, safari]

Shot:  
- camera_motion: [static]  
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:  
- overall: 0.9  
- ocr: 0.8  
- entity_detection: 0.82025-09-13 16:14:21 | INFO    | 
>>> Generation finished in 22.75s (853 chars).
2025-09-13 16:14:21 | INFO    |   • Scene 70 processed.
2025-09-13 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 71  
- start_sec: 700.000  
- end_sec: 710.000   
- start_timecode: 00:11:40.000     
- end_timecode: 01:11:50.000  

Description:  
- description: A man sits indoors, possibly reflecting on his safari experience while observing cheetahs in the wild outdoors.

Actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while talking.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: object
    name_or_value：cheetah
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [close-up]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.8
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronolo

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 72
- start_sec: 710.000
- end_sec: 720.000 
- start_timecode: 00:11:50.000  
- end_timecode: 01:12:00.000  

Description:
- description: A cheetah is standing in tall grasslands, observed during a safari adventure.
- actions_chronological:
  - <t≈ +0-2s>: Cheetah remains stationary while looking around.

OnScreenText:
- lines: []

Entities:
- items: []
  
Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.8
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (0–2s, 2–4s, etc.).
- If unknown or not visible, write 'unknown' or use empty lists [].
- Return ONLY the card content above (no extra commentary).2025-09-13 16:15:18 | INFO    | 
>>> Generation finished in 25.99s (924 char

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 73
- start_sec: 720.000
- end_sec: 730.000 
- start_timecode: 00:12:00.000  
- end_timecode: 01:12:10.000  

Description:
- description: A man sits calmly on a wooden deck overlooking grasslands, with cheetahs and other wildlife visible in the distance during sunset.
- actions_chronological:
  - <t≈ +0-2s>: Man remains stationary while talking.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: location
    name_or_value： savannah 
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.8
- entity_detection: 0.8

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse (

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 74
- start_sec: 730.000
- end_sec: 740.000 
- start_timecode: 00:12:10.000  
- end_timecode: 01:12:20.000  

Description:
- description: A man interacts with a cheetah in an open grassland during a safari adventure, highlighting wildlife observation and human-animal interaction.
- actions_chronological:
  - <t≈ +0-2s>: Man stands facing the cheetah as it remains stationary.
  - <t≈+2-4s>: Both subjects sit calmly while observing each other.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: animal
    name_or_value： cheetah
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.9
- entity_detection: 0.92025-09-13 16:16:15 | INFO    | 
>>> Generation finished in 24.85s (892 chars).
2025-09-13 16:16:15 | INFO  

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 75
- start_sec: 740.000
- end_sec: 750.000 
- start_timecode: 00:12:20.000  
- end_timecode: 01:12:30.000  

Description:
- description: A person stands in a grassy savanna with a cheetah nearby, capturing the serene wildlife interaction during a safari adventure.
- actions_chronological:
  - <t≈ +0-2s>: Person remains stationary while observing and photographing the cheetah.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [wearing cap]
  - type: animal
    name_or_value : cheetah
    attributes: []
    
Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.8
- entity_detection: 0.9

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets for actions/tags/entities; keep chronology coarse

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 76  
- start_sec: 750.000  
- end_sec: 760.000   
- start_timecode: 00:12:30.000    
- end_timecode: 01:12:40.000  

Description:  
- description: A woman sits calmly observing a cheetah cub in an open grassland during sunset, highlighting human-animal interaction on a safari.  

Actions_chronological:  
  - <t≈ +0-2s>: camera remains stationary as subjects sit calmly.  
  - <t≥ +2-4s>: no significant action changes observed.  

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: Kristina Perlerius  
    attributes: [standing/sitting]  
  - type: animal  
    name_or_value : cheetah cub  
    attributes: [young/adult]  

Tags:  
- scene_tags: [nature, wildlife, safari, observation]  

Shot:  
- camera_motion: [static]  
- camera_angle: [wide]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.9  
- ocr: 0.8  
- entity_detection: 0.9  

NOTES:  
- For OnScreenText, 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 77  
- start_sec: 760.000  
- end_sec: 770.000   
- start_timecode: 00:12:40.000    
- end_timecode: 01:12:50.000  

Description:  
- description: A person sits calmly observing cheetahs in a grassland environment during sunset, capturing the peaceful interaction between humans and wildlife.  
- actions_chronological:  
  - <t≈ +0-2s>: man remains stationary while talking.  
  - <t≥ +2-4s>: no significant action changes observed.  

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: person  
    name_or_value: unknown  
    attributes: [sitting]  
  - type: animal  
    name_or_value:** cheetah**  
    attributes: [wildlife]  

Tags:  
- scene_tags: [nature, wildlife, safari, observation]  

Shot:  
- camera_motion: [static]  
- camera_angle: [close-up]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.9  
- ocr: 0.8  
- entity_detection: 0.92025-09-13 16:17:46 | INFO    | 
>>> Generation f

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 78
- start_sec: 770.000
- end_sec: 780.000 
- start_timecode: 00:12:50.000  
- end_timecode: 013:00.000  

Description:
- description: Two individuals walk through a grassland at sunset with a vehicle parked nearby, capturing the essence of a safari adventure.
- actions_chronological:
  - <t≈ +0-2s>: Individuals continue walking forward through the field.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines: []

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: []
  - type: object
    name_or_value：vehicle
    attributes: [type: SUV]

Tags:
- scene_tags: [nature, wildlife, safari, observation]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.95
- entity_detection: 0.92025-09-13 16:18:10 | INFO    | 
>>> Generation finished in 23.99s (868 chars).
2025-09-13 16:18:10 | INFO    |   • Scene 78 process

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 79
- start_sec: 780.000
- end_sec: 790.000 
- start_timecode: 00:13:00.000  
- end_timecode: 01:13:10.000  

Description:
- description: A safari vehicle is parked in an open grassland at sunset with two people walking towards it, capturing the serene beauty of wildlife observation.
- actions_chronological:
  - <t≈ +0-2s>: Vehicle remains stationary while two individuals approach from the left side.
  - <t≈+2-4s>: No significant action changes observed as subjects reach the vehicle.

OnScreenText:
- lines: []

Entities:
- items:
  - type: object
    name_or_value: safari vehicle
    attributes: [parked]
  - type: person
    name_or_value： unknown
    attributes: []
  - type: location
    name_or_value : savanna/grassland

Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.5
- entity_detection: 0.8

NOTES:


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 80  
- start_sec: 790.000  
- end_sec: 800.000   
- start_timecode: 00:13:10.000     
- end_timecode: 01:13:20.000  

Description:  
- description: A safari vehicle drives through a grassy savannah during sunset, highlighting the serene landscape and wildlife exploration context.

Actions_chronological:
  - <t≈ +0-2s>: Vehicle continues moving forward along the road.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:  
- lines: []  

Entities:  
- items:  
  - type: object  
    name_or_value: safari vehicle  
    attributes: [color: dark]  

Tags:  
- scene_tags: [nature, wildlife, safari, observation]

Shot:  
- camera_motion: [static]  
- camera_angle: [wide]  
- cuts_or_transitions: []

Confidence:  
- overall: 0.95  
- ocr: 0.0  
- entity_detection: 0.92025-09-13 16:19:09 | INFO    | 
>>> Generation finished in 23.79s (863 chars).
2025-09-13 16:19:09 | INFO    |   • Scene 80 processed.
2

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 81
- start_sec: 800.000
- end_sec: 810.000 
- start_timecode: 00:13:20.000  
- end_timecode: 01:13:30.000  

Description:
- description: A cheetah sits calmly in an open grassland during sunset, observing its surroundings.

- actions_chronological:
  - <t≈ +0-2s>:  - <t≈ +0-2s>: Text displayed on screen
  - <t≈+2-4s>: Text continues to be displayed

OnScreenText:
- lines:
  - SINCE THE START OF THE CHEETAH PROJECT, TIGER CANYON HAS SUCCESSFULLY RELEASED 10 CHEETAHS INTO THE WILD.
  - SEVERAL OF THOSE CHEETAHS HAVE PRODUCED CUBS ON THEIR OWN.

Entities:
- items:
  - type: text
    name_or_value: Information about cheetah conservation project success
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.95
- ocr: 0.90
- entity_detection: 0.852025-09-13 16:20:06 | INFO    | 
>>> Generation finished in 2

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 83
- start_sec: 820.000
- end_sec: 830.000 
- start_timecode: 00:13:40.000  
- end_timecode: 05:00.000  

Description:
- description: The text on the screen provides information about a cheetah conservation project, highlighting releases and cubs born in the wild.
- actions_chronological:
  - <t≈ +0-2s>: No significant action changes observed.
  - <t≥ +2-4s>: No significant action change.

OnScreenText:
- lines:
  - SINCE THE START OF THE CHEETAH PROJECT, TIGER CANYON HAS SUCCESSFULLY RELEASED 10 CHEETAHS INTO THE WILD.
  - SEVERAL OF THOSE CHEETAHS HAVE PRODUCED CUBS ON THEIR OWN.
  - A JACK ZANKRAUSKAS FILM

Entities:
- items:
  - type: text
    name_or_value: unknown
    attributes: []

Tags:
- scene_tags: [nature, wildlife]

Shot:
- camera_motion: [static]
- camera_angle: [wide]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.95
- entity_detection: 0.7
2025-09-13 16:20:35 | INFO    | 
>>> Generation 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD  
---  
Meta:  
- video_name: documentary_480p  
- scene_id: 84  
- start_sec: 830.000  
- end_sec: 840.000   
- start_timecode: 00:13:50.000    
- end_timecode: 01:14:00.000  

Description:  
- description: Aerial view of a grassland area with sunlight casting shadows on vegetation, likely part of a safari reserve.

Actions_chronological:  
- <t≈ +0-2s>: Camera pans over the landscape from above.  

OnScreenText:  
- lines: []  

Entities:  
- items: []  

Tags:  
- scene_tags: [nature, wildlife, safari]  

Shot:  
- camera_motion: [pan]  
- camera_angle: [top-down]  
- cuts_or_transitions: []  

Confidence:  
- overall: 0.9  
- ocr: 0.7  
- entity_detection: 0.6  

NOTES:  
- For OnScreenText, list ALL legible text exactly as written (preserve case).  
- Prefer short bullets for actions/tags/entities; keep chronology coarse (0–2s, 2–4s, etc.).2025-09-13 16:21:00 | INFO    | 
>>> Generation finished in 25.12s (872 chars).
2025-09-13 16:21:00 | INFO    |   • Scene 84 pro

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 85
- start_sec: 840.000
- end_sec: 850.000 
- start_timecode: 00:14:00.000  
- end_timecode: 01:14:10.000  

Description:
- description: Aerial view of a grassy landscape with sparse vegetation, likely part of a safari setting.
- actions_chronological:
  - <t≈ +0-2s>: Camera pans over the landscape from above.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines:
  - EXECUTIVE PRODUCER ISAAC WRIGHT
  - DIRECTOR OF PHOTOGRAPHY RICARDO NEIVA
  - ADDITIONAL CAMERA ISAAC WRIGHT JACK ZAKRAJEK DRONE

Entities:
- items:
  - type: object
    name_or_value: tree
    attributes: [sparse branches]
  - type: text
    name_or_value:** Executive Producer**
    attributes: []
  - type: text 
    name_or_value:** Director of Photography** 
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [pan]
- camera_angle: [top-down]
- cuts_or_transitions: []

Confidence:
- overall:

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 86
- start_sec: 850.000
- end_sec: 860.000 
- start_timecode: 00:14:10.000  
- end_timecode: 01:14:20.000  

Description:
- description: A cheetah is perched on a tree in tall grasslands during sunset, likely part of a wildlife observation segment.
- actions_chronological:
  - <t≈ +0-2s>: Cheetah remains stationary while looking around.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines:
  - SOUND DESIGN RICARDO NEIVA
  - SOUND MIX DAN ARBUSCH LOYAL SOUND

Entities:
- items:
  - type: person/team/logo/brand/location/object
    name_or_value: unknown
    attributes: []
  - type: text
    name_or_value:**SOUND DESIGN** RICARDO NEIVALOGO
    attributes: []

Tags:
- scene_tags: [nature, wildlife, safari]

Shot:
- camera_motion: [static]
- camera_angle: [top-down]
- cuts_or_transitions: []

Confidence:
- overall: 0.9
- ocr: 0.95
- entity_detection: 0.7
2025-09-13 16:22:00 | INFO    | 
>>> Generati

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: documentary_480p
- scene_id: 87
- start_sec: 860.000
- end_sec: 867.040
- start_timecode: 00:14:20.000 
- end_timecode: 01:14:27.040

Description:
- description: A cheetah is perched on a tree branch in grasslands during sunset, observing its surroundings.
- actions_chronological:
  - <t≈ +0-2s>: Cheetah remains stationary while looking around.
  - <t≈+2-4s>: No significant action changes observed.

OnScreenText:
- lines:
  - CC COLORIST DELANO BORGES ROIRO
  - SPECIAL THANKS TO RICH ZAKRAUSEK AMY ZAKRAUSEK DYLAN ZAKRAUSEK

Entities:
- items:
  - type: animal
    name_or_value: cheetah
    attributes: []
  - type: text
    name_or_value:
      - "CC COLORIST DELANO BORGERO"
