In [None]:
# Strategy 2:
# 1. Generate one global summary of the video from frames sampled across the whole video.
# 2. Divide the video into scenes of 10 seconds.
# 3. For each scene, sample frames from 8 segments, then generate with global context + prompt + frames.
# 4. Save the results to files.

In [1]:
# pip install transformers decord einops timm accelerate flash-attn

In [1]:
import os
import csv
import math
import json
import time
import datetime as dt
from threading import Thread

import numpy as np
import torch
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from PIL import Image
from decord import VideoReader, cpu
from transformers import AutoModel, AutoTokenizer, TextIteratorStreamer

In [2]:
# ----------------------------
# Config
# ----------------------------
# Every tunable of the pipeline lives in this cell so it can be adjusted in one place.
VIDEO_PATH = "video_foot_highlights.mp4"   # <— change or pass from argv if you prefer
SCENE_SECONDS = 10                  # fixed 10-second chunks as requested
GLOBAL_SAMPLE_SEGMENTS = 16         # segments sampled across the whole video for "general metadata"
                                    # (load_video_pixel_values takes 2 frames per segment by default)
PER_SCENE_SAMPLE_SEGMENTS = 8       # segments sampled inside each scene (2 frames per segment by default)
TILE_MAX_NUM = 1                    # number of tiles per frame (keep 1 for speed)
INPUT_SIZE = 448                    # side length frames/tiles are resized to in build_transform
USE_CUDA = torch.cuda.is_available()  # selects the CUDA vs CPU branch in chat_with_images

# Per-channel mean/std handed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

In [3]:
# ----------------------------
# Model init (your settings)
# ----------------------------
# Loads the checkpoint identified by `path` plus its tokenizer; both are used as
# module-level globals by chat_with_images.
# NOTE(review): the recorded output of this cell warns "`torch_dtype` is deprecated! Use
# `dtype` instead!" — switch the keyword once every supported transformers version accepts it.
path = 'OpenGVLab/InternVL3_5-8B'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,  # chat_with_images casts pixel_values to bfloat16 to match
    load_in_8bit=False,
    low_cpu_mem_usage=True,
    use_flash_attn=True,  # presumably requires the flash-attn package from the pip line above — confirm
    trust_remote_code=True,
    device_map="auto"
).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# ----------------------------
# Prompts (Global + Scene Card for RAG)
# ----------------------------
# GLOBAL_PROMPT is sent once per video by process_video, together with frames sampled
# across the whole video; the model's reply is then passed to build_scene_prompt and
# embedded in every per-scene prompt as context.
GLOBAL_PROMPT = (
    "You are analyzing sampled frames from the WHOLE video. Provide:\n"
    "- Video type/genre (e.g., sports/soccer match highlight, vlog, screencast, ad, tutorial, etc.)\n"
    "- Global video summary with only IMPORTANT informations.\n"
    "- Likely action types present (bulleted)\n"
    "- Prominent entities/teams/brands (bulleted)\n"
    "- Visual style cues (e.g., broadcast overlay, handheld, studio)\n"
    "Return sections: [TYPE], [SUMMARY], [ACTIONS], [ENTITIES], [STYLE]."
)

def build_scene_prompt(global_summary_text: str,
                       scene_id: int,
                       start_s: float,
                       end_s: float,
                       video_name: str,
                       start_tc: str,
                       end_tc: str) -> str:
    """
    Build the per-scene "RAG SCENE CARD" prompt.

    The prompt embeds the model-inferred global summary as context, pre-fills the
    Meta section with the scene's identifiers and time range, and instructs the
    model to answer in plain text following a fixed card layout.

    Args:
        global_summary_text: Global summary produced for the whole video. Surrounding
            whitespace is stripped; ``None`` or an empty string becomes ``unknown``.
        scene_id: Identifier of the scene, echoed in the Meta section.
        start_s: Scene start in seconds.
        end_s: Scene end in seconds.
        video_name: Video name echoed in the Meta section.
        start_tc: Scene start as a timecode string.
        end_tc: Scene end as a timecode string.

    Returns:
        The prompt text with leading/trailing whitespace removed.
    """
    # Fall back to 'unknown' so the context section is never blank (and None does not crash).
    global_context = (global_summary_text or "").strip() or "unknown"
    # Announce the real scene length: the final scene of a video is usually shorter
    # than the nominal chunk size, and the chunk size itself is configurable.
    scene_len_s = max(0.0, float(end_s) - float(start_s))
    return f"""
You will receive frames from a ~{scene_len_s:.0f}-second SCENE in a larger video.
Use the GLOBAL CONTEXT below to disambiguate entities/actions when helpful,
but do not contradict the visible frames.

GLOBAL CONTEXT (model-inferred summary of the whole video):
{global_context}

TASK:
Describe ONLY this scene and produce a **RAG SCENE CARD** in plain text.
The output MUST follow the format below exactly. Keep it factual and concise.

FORMAT TO RETURN (plain text, no markdown code fences):
RAG SCENE CARD
---
Meta:
- video_name: {video_name}
- scene_id: {scene_id}
- start_sec: {start_s:.3f}
- end_sec: {end_s:.3f}
- start_timecode: {start_tc}
- end_timecode: {end_tc}

Context:
- global_type: <from context if relevant, else unknown>
- global_summary: <1-2 lines linking this scene to the whole video, or 'unknown'>

OnScreenText:
- lines:
  - <exact OCR line 1>
  - <exact OCR line 2>
  - ...

Description:
- dense: <who/what/where, key objects, visual context in 3-5 lines> in chronological order.
- actions:
  - <verb / short action phrase>
  - <...>

Entities:
- items:
  - type: <person/team/logo/brand/location/object/number/other>
    name_or_value: <best guess or exact text>
    attributes: [<short attrs like jersey #, color, role, number>]
  - ...

Tags:
- scene_tags: [<short keywords like teams, brands, numbers, clothes, location>, ...]

Confidence:
- overall: <0.0-1.0>
- ocr: <0.0-1.0>
- entity_detection: <0.0-1.0>

NOTES:
- For OnScreenText, list ALL legible text exactly as written (preserve case).
- Prefer short bullets and compact phrases for actions/tags/entities.
- If unknown or not visible, write 'unknown' or use empty lists [].
- Return ONLY the card content in this exact structure (no extra commentary).
""".strip()

In [None]:
# ----------------------------
# Main processing (VERBOSE)
# ----------------------------
import logging
import sys
import os

def _setup_logger(name: str = "video_rag", level: int = logging.INFO) -> logging.Logger:
    """
    Return the named logger, attaching a stdout handler the first time it is requested.

    A logger that already has handlers is returned untouched, so re-running the
    cell does not stack handlers (or change its level).
    """
    log = logging.getLogger(name)
    if not log.handlers:
        log.setLevel(level)
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.setFormatter(
            logging.Formatter(
                fmt="%(asctime)s | %(levelname)-7s | %(message)s",
                datefmt="%Y-%m-%d %H:%M:%S",
            )
        )
        log.addHandler(stdout_handler)
    return log

# Shared logger used by every helper below.
LOGGER = _setup_logger()

# ----------------------------
# Image / Video utilities
# ----------------------------
def build_transform(input_size):
    """
    Compose the per-image preprocessing pipeline.

    Steps, in order: convert to RGB when the image is in another mode, resize to
    ``input_size`` x ``input_size`` with bicubic interpolation, convert to a
    tensor, and normalise with IMAGENET_MEAN / IMAGENET_STD.
    """
    def _ensure_rgb(img):
        # Leave images that are already RGB untouched.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """
    Pick the (cols, rows) grid whose cols/rows ratio is closest to ``aspect_ratio``.

    Candidates are scanned in the order given. A strictly closer candidate always
    wins; a candidate that exactly ties the current best replaces it only when the
    source image area exceeds half the pixel area of that candidate's tile grid.
    Returns ``(1, 1)`` when ``target_ratios`` is empty.
    """
    pixel_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        # Exact tie: prefer this candidate only if the image is large enough for its grid.
        is_tie = gap == smallest_gap
        if is_tie and pixel_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate

    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """
    Split an image into a grid of ``image_size`` x ``image_size`` tiles.

    All (cols, rows) grids whose tile count lies in [min_num, max_num] are
    considered; the one chosen by find_closest_aspect_ratio decides the size the
    image is resized to before it is cropped tile by tile (row-major order).
    When ``use_thumbnail`` is true and more than one tile was produced, a resized
    copy of the whole image is appended as the final element.

    Returns:
        List of cropped tiles (plus the optional thumbnail).
    """
    src_width, src_height = image.size
    src_aspect = src_width / src_height

    # Enumerate candidate grids, smallest tile count first.
    grids = set(
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if cols * rows <= max_num and cols * rows >= min_num
    )
    grids = sorted(grids, key=lambda grid: grid[0] * grid[1])

    best_grid = find_closest_aspect_ratio(src_aspect, grids, src_width, src_height, image_size)

    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    blocks = best_grid[0] * best_grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    processed_images = []
    for tile_no in range(blocks):
        row, col = divmod(tile_no, tiles_per_row)
        box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        processed_images.append(resized_img.crop(box))
    assert len(processed_images) == blocks

    wants_thumbnail = use_thumbnail and len(processed_images) != 1
    if wants_thumbnail:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images

def build_transform_and_stack(frames_as_pil, input_size=448, max_num=1):
    """
    Tile and transform every frame, then concatenate everything into one tensor.

    Each frame goes through dynamic_preprocess (thumbnail enabled, at most
    ``max_num`` tiles); its tiles are transformed with build_transform and
    stacked. The per-frame stacks are concatenated along the first dimension.
    """
    to_tensor = build_transform(input_size=input_size)
    per_frame_stacks = [
        torch.stack([
            to_tensor(tile)
            for tile in dynamic_preprocess(
                frame, image_size=input_size, use_thumbnail=True, max_num=max_num
            )
        ])
        for frame in frames_as_pil
    ]
    return torch.cat(per_frame_stacks)

def seconds_to_tc(t):
    """
    Format a time in seconds as an ``HH:MM:SS.mmm`` timecode.

    The value is rounded to whole milliseconds *before* it is split into fields,
    so a value such as 59.9996 carries into the minutes ("00:01:00.000") instead
    of rendering an invalid "00:00:60.000". Negative values are rendered as the
    timecode of their magnitude with a leading "-".

    Args:
        t: Time in seconds (int or float).

    Returns:
        The timecode string; hours are zero-padded to at least two digits.
    """
    total_ms = int(round(float(t) * 1000))
    sign = "-" if total_ms < 0 else ""
    total_seconds, millis = divmod(abs(total_ms), 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"

def get_frame_indices(bound, fps, max_frame, first_idx=0, num_segments=32, frames_per_segment=1):
    """
    Choose frame indices spread evenly over a time window.

    ``bound`` is a (start_sec, end_sec) pair; a falsy bound means "the whole
    video". The window is converted to frame indices, split into
    ``num_segments`` equal segments, and ``frames_per_segment`` positions are
    taken inside each one (the centre when a single frame is requested).
    Positions are truncated to integers and clipped to the window.

    Returns:
        numpy array of frame indices.
    """
    start, end = (bound[0], bound[1]) if bound else (-100000, 100000)

    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    if end_idx <= start_idx:
        # Degenerate window: widen it to roughly one second of frames.
        end_idx = min(start_idx + max(1, int(fps)), max_frame)

    seg_size = float(end_idx - start_idx) / num_segments
    segment_starts = [start_idx + seg_size * k for k in range(num_segments)]

    if frames_per_segment == 1:
        # One frame per segment: take the segment centre.
        picks = [int(seg_start + (seg_size / 2)) for seg_start in segment_starts]
    else:
        # Several frames per segment: centres of equal sub-slices.
        picks = [
            int(seg_start + (seg_size * (j + 0.5) / frames_per_segment))
            for seg_start in segment_starts
            for j in range(frames_per_segment)
        ]

    return np.clip(picks, start_idx, end_idx)


def load_video_pixel_values(video_path, bound=None, input_size=448, max_num=1, num_segments=32,
                            frames_per_segment=2):
    """
    Sample frames from a video window and turn them into a model-ready tensor.

    Args:
        video_path: Path of the video file to open.
        bound: Optional (start_sec, end_sec) window; ``None`` samples the whole video.
        input_size: Side length passed to the tiling/transform helpers.
        max_num: Maximum number of tiles per frame.
        num_segments: Number of equal segments the window is divided into.
        frames_per_segment: Frames taken inside each segment. Defaults to 2, the
            value that used to be hard-coded, so ``num_segments * 2`` frames are
            sampled unless the caller overrides it.

    Returns:
        Tuple ``(pixel_values, num_sampled_frames, fps, width, height)``.
    """
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())
    frame_indices = get_frame_indices(
        bound, fps, max_frame, num_segments=num_segments, frames_per_segment=frames_per_segment
    )
    frames_as_pil = [Image.fromarray(vr[int(idx)].asnumpy()).convert('RGB') for idx in frame_indices]
    pixel_values = build_transform_and_stack(frames_as_pil, input_size=input_size, max_num=max_num)
    # Read the resolution from an already decoded frame instead of decoding frame 0
    # twice more; PIL reports size as (width, height).
    width, height = frames_as_pil[0].size
    return pixel_values, len(frame_indices), fps, width, height

# ----------------------------
# Generation helpers (VERBOSE)
# ----------------------------
def chat_with_images(pixel_values, question, timeout=30):
    """
    Run ``model.chat`` on the given frames and prompt, streaming the reply.

    Generation runs in a background thread while this function consumes the
    streamer, echoing each chunk to stdout and collecting it.

    Args:
        pixel_values: Tensor of preprocessed frames/tiles; cast to bfloat16 here
            and moved to CUDA when USE_CUDA is true.
        question: Prompt text sent with the frames.
        timeout: Seconds the streamer waits for the next chunk; also the upper
            bound on how long this function waits for the worker thread to exit.

    Returns:
        The concatenated generated text, stripped of surrounding whitespace.

    Raises:
        Exception: whatever ``model.chat`` raised in the worker thread, re-raised
            here so the caller sees the real failure rather than a streamer timeout.
    """
    t0 = time.perf_counter()
    # Move tensor to GPU/CPU
    if USE_CUDA:
        pixel_values = pixel_values.to(torch.bfloat16).cuda()
        LOGGER.info(f"chat_with_images | Using CUDA | tensor shape={tuple(pixel_values.shape)}")
    else:
        pixel_values = pixel_values.to(torch.bfloat16)
        LOGGER.info(f"chat_with_images | Using CPU | tensor shape={tuple(pixel_values.shape)}")

    LOGGER.info(f"Prompt length: {len(question)} chars")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=timeout)
    generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)

    worker_errors = []

    def _run_chat():
        # Capture failures: an exception inside a Thread would otherwise only be
        # printed, leaving the consumer loop blocked until the streamer times out.
        try:
            model.chat(
                tokenizer=tokenizer, pixel_values=pixel_values, question=question,
                history=None, return_history=False, generation_config=generation_config,
            )
        except Exception as exc:
            worker_errors.append(exc)
            # Push the end-of-stream signal so the consumer loop stops immediately.
            streamer.end()

    # Run chat in background thread
    thread = Thread(target=_run_chat)
    thread.start()

    chunks = []
    LOGGER.info(">>> Model generation started (streaming)...")
    try:
        for new_text in streamer:
            template = getattr(model, "conv_template", None)
            if template is not None and new_text == template.sep:
                LOGGER.debug(">>> Separator token reached, stopping stream.")
                break
            chunks.append(new_text)
            # Print live output as it's generated
            sys.stdout.write(new_text)
            sys.stdout.flush()
    finally:
        # Wait for the worker so two generations never overlap on the model.
        thread.join(timeout=timeout)

    if worker_errors:
        raise worker_errors[0]

    result = "".join(chunks).strip()
    total_time = time.perf_counter() - t0
    LOGGER.info(f"\n>>> Model generation finished in {total_time:.2f}s "
                f"({len(result)} chars total).")
    return result

def chunk_bounds(duration_sec, chunk_sec):
    """
    Split ``[0, duration_sec]`` into consecutive windows of ``chunk_sec`` seconds.

    The last window is cut at ``duration_sec``; zero-length windows are dropped.

    Returns:
        List of (start_sec, end_sec) tuples in chronological order.
    """
    LOGGER.info(f"Computing chunk bounds (duration={duration_sec:.3f}s, chunk={chunk_sec}s)")
    scene_count = math.ceil(duration_sec / chunk_sec)
    candidates = (
        (k * chunk_sec, min((k + 1) * chunk_sec, duration_sec))
        for k in range(scene_count)
    )
    bounds = [(lo, hi) for lo, hi in candidates if hi > lo]
    LOGGER.info(f"Planned {len(bounds)} scene(s) of ~{chunk_sec}s each")
    return bounds

def _filesize(path: str) -> str:
    try:
        sz = os.path.getsize(path)
        for unit in ["B", "KB", "MB", "GB", "TB"]:
            if sz < 1024.0:
                return f"{sz:,.2f} {unit}"
            sz /= 1024.0
    except Exception:
        return "n/a"
    return "n/a"

def _scene_row(base, video_path, scene_id, start_s, end_s, fps, width, height, sampled, text):
    """Build one CSV row for a scene; shared by the success and the error path."""
    return {
        "video_name": base,
        "video_path": os.path.abspath(video_path),
        "scene_id": scene_id,
        "start_sec": round(float(start_s), 3),
        "end_sec": round(float(end_s), 3),
        "start_timecode": seconds_to_tc(start_s),
        "end_timecode": seconds_to_tc(end_s),
        "scene_duration_sec": round(float(end_s - start_s), 3),
        "fps": fps,
        "width": width,
        "height": height,
        "sampled_frames_for_scene": sampled,
        "generated_text": text,
    }


def process_video(video_path, verbose: bool = True):
    """
    Run the whole pipeline on one video and write the results to disk.

    Steps:
      1. Open the video and read fps, frame count, duration and resolution.
      2. Sample frames across the whole video and ask the model for a global summary.
      3. Split the video into SCENE_SECONDS-long scenes; for each one sample frames
         and ask the model for a scene card, using the global summary as context.
         A failing scene is logged and recorded as an "[ERROR] ..." row so scene
         ids stay aligned.
      4. Write one CSV row per scene and a metadata text file (technical facts as
         JSON plus the global summary). Both files are created relative to the
         current working directory and named after the video and a timestamp.

    Args:
        video_path: Path of the video file; must exist.
        verbose: Currently unused; kept so existing callers keep working.

    Returns:
        Tuple ``(csv_name, meta_name)`` with the names of the two files written.

    Raises:
        AssertionError: if ``video_path`` is not an existing file.
    """
    t0 = time.perf_counter()
    assert os.path.isfile(video_path), f"Video not found: {video_path}"
    LOGGER.info(f"Opening video: {video_path}")

    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    fps = float(vr.get_avg_fps())
    total_frames = len(vr)
    duration = total_frames / fps if fps > 0 else 0.0
    # Decode the first frame once; its shape starts with (height, width).
    first_frame_shape = vr[0].shape
    height, width = first_frame_shape[0], first_frame_shape[1]
    LOGGER.info(f"Video loaded | fps={fps:.3f}, frames={total_frames}, "
                f"duration={duration:.3f}s, resolution={width}x{height}")

    # file naming
    base = os.path.splitext(os.path.basename(video_path))[0]
    stamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_name = f"{base}_{stamp}.csv"
    meta_name = f"{base}_{stamp}_metadata.txt"
    LOGGER.info(f"Output files -> CSV: {csv_name} | Metadata: {meta_name}")

    # Global metadata (model-aided)
    LOGGER.info("Sampling whole-video frames for global metadata...")
    tg0 = time.perf_counter()
    global_px, global_sampled, _, _, _ = load_video_pixel_values(
        video_path,
        bound=(0, duration),
        input_size=INPUT_SIZE,
        max_num=TILE_MAX_NUM,
        num_segments=GLOBAL_SAMPLE_SEGMENTS
    )
    LOGGER.info(f"Global sampling complete | sampled_frames={global_sampled}, "
                f"segments={GLOBAL_SAMPLE_SEGMENTS}")
    LOGGER.info("Generating model-inferred global metadata...")
    global_text = chat_with_images(global_px, GLOBAL_PROMPT)
    LOGGER.info(f"Global metadata generation done in {time.perf_counter() - tg0:.2f}s "
                f"({len(global_text)} chars)")

    # Per-scene extraction
    LOGGER.info(f"Processing scenes (per {SCENE_SECONDS}s chunk)...")
    bounds = chunk_bounds(duration, SCENE_SECONDS)
    total_scenes = len(bounds)
    rows = []

    for i, (start_s, end_s) in enumerate(bounds, start=1):
        ts_scene_start = time.perf_counter()
        pct = (i / total_scenes) * 100 if total_scenes else 100
        LOGGER.info(f"[{i}/{total_scenes}] ({pct:5.1f}%) Scene timecodes "
                    f"{seconds_to_tc(start_s)} → {seconds_to_tc(end_s)} "
                    f"({end_s - start_s:.3f}s)")
        try:
            # Load frames for the scene
            tl0 = time.perf_counter()
            px, sampled, fps_local, w, h = load_video_pixel_values(
                video_path,
                bound=(start_s, end_s),
                input_size=INPUT_SIZE,
                max_num=TILE_MAX_NUM,
                num_segments=PER_SCENE_SAMPLE_SEGMENTS
            )
            LOGGER.info(f"  • Frames sampled for scene: {sampled} (segments={PER_SCENE_SAMPLE_SEGMENTS}) "
                        f"| res={w}x{h} | load_time={time.perf_counter() - tl0:.2f}s")

            # Model generation
            tg0 = time.perf_counter()
            LOGGER.info("  • Generating scene text with model...")
            # The scene prompt embeds the global summary and the plain-text scene-card layout.
            scene_prompt = build_scene_prompt(
                global_summary_text=global_text,
                scene_id=i,
                start_s=start_s,
                end_s=end_s,
                video_name=base,
                start_tc=seconds_to_tc(start_s),
                end_tc=seconds_to_tc(end_s)
            )
            scene_text = chat_with_images(px, scene_prompt)
            LOGGER.info(f"  • Generation done in {time.perf_counter() - tg0:.2f}s "
                        f"({len(scene_text)} chars)")

            rows.append(_scene_row(
                base, video_path, i, start_s, end_s,
                fps_local, w, h, sampled, scene_text
            ))
            LOGGER.info(f"  • Scene {i} completed in {time.perf_counter() - ts_scene_start:.2f}s")

        except Exception as e:
            LOGGER.exception(f"  × Scene {i} failed due to error: {e}. Continuing with next scene.")
            # still record a stub row to keep alignment of scene ids;
            # fall back to the video-level fps/resolution probed above
            rows.append(_scene_row(
                base, video_path, i, start_s, end_s,
                fps, width, height, 0, f"[ERROR] {e}"
            ))

    # Write CSV
    LOGGER.info("Writing CSV...")
    tcsv0 = time.perf_counter()
    fieldnames = list(rows[0].keys()) if rows else [
        "video_name","video_path","scene_id","start_sec","end_sec",
        "start_timecode","end_timecode","scene_duration_sec","fps","width","height",
        "sampled_frames_for_scene","generated_text"
    ]
    with open(csv_name, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    LOGGER.info(f"CSV saved to {csv_name} ({_filesize(csv_name)}) in {time.perf_counter() - tcsv0:.2f}s")

    # Write general metadata text
    LOGGER.info("Writing general metadata text file...")
    tmeta0 = time.perf_counter()
    technical_meta = {
        "file_name": os.path.basename(video_path),
        "abs_path": os.path.abspath(video_path),
        "created_at": stamp,
        "video_length_sec": round(duration, 3),
        "fps": fps,
        "width": width,
        "height": height,
        # Key name kept for compatibility with existing consumers of the metadata file,
        # even though the scene length actually comes from SCENE_SECONDS.
        "num_scenes_10s": len(bounds),
        "scene_seconds": SCENE_SECONDS
    }
    meta_header = [
        "# GENERAL VIDEO METADATA",
        "",
        "## Technical",
        json.dumps(technical_meta, indent=2),
        "",
        "## Model-Inferred (Global Sampling)",
        global_text.strip(),
        ""
    ]
    with open(meta_name, "w", encoding="utf-8") as f:
        f.write("\n".join(meta_header))
    LOGGER.info(f"Metadata saved to {meta_name} ({_filesize(meta_name)}) in {time.perf_counter() - tmeta0:.2f}s")

    total_time = time.perf_counter() - t0
    LOGGER.info(f"Done in {total_time:.2f}s | CSV: {csv_name} | Metadata: {meta_name}")
    return csv_name, meta_name

In [None]:
# Run the full pipeline on the configured video; the call returns (csv_name, meta_name).
process_video(VIDEO_PATH)

2025-09-12 17:03:47 | INFO    | Opening video: video_foot_highlights.mp4
2025-09-12 17:03:47 | INFO    | Video loaded | fps=25.000, frames=22684, duration=907.360s, resolution=640x360
2025-09-12 17:03:47 | INFO    | Output files -> CSV: video_foot_highlights_20250912_170347.csv | Metadata: video_foot_highlights_20250912_170347_metadata.txt
2025-09-12 17:03:47 | INFO    | Sampling whole-video frames for global metadata...
2025-09-12 17:03:49 | INFO    | Global sampling complete | sampled_frames=32, segments=16
2025-09-12 17:03:49 | INFO    | Generating model-inferred global metadata...
2025-09-12 17:03:49 | INFO    | chat_with_images | Using CUDA | tensor shape=(32, 3, 448, 448)
2025-09-12 17:03:49 | INFO    | Prompt length: 413 chars
2025-09-12 17:03:49 | INFO    | >>> Model generation started (streaming)...


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


[TYPE]
Soccer match highlight

[SUMMARY]
The video captures key moments from a soccer match between two teams, showing the progression of the game, goals scored, and reactions from players and fans.

[ ACTIONS ]
- Players running and dribbling the ball
- Goal attempts and shots on goal
- Celebrations after scoring
- Defensive actions and tackles
- Audience cheering and reactions

[ ENTITIES ]
- Team in blue and red uniforms
- Team in white uniforms
- Referee in yellow
- Soccer ball
- Stadium and audience
- Broadcast overlays showing scores and time

[ STYLE ]
- Broadcast overlay with score and time information
- Professional sports broadcast quality
- Dynamic camera angles capturing both gameplay and audience reactions2025-09-12 17:03:56 | INFO    | 
>>> Model generation finished in 7.52s (728 chars total).
2025-09-12 17:03:56 | INFO    | Global metadata generation done in 8.89s (728 chars)
2025-09-12 17:03:56 | INFO    | Processing scenes (per 10s chunk)...
2025-09-12 17:03:56 | INFO 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: video_foot_highlights
- scene_id: 1
- start_sec: 0.000
- end_sec: 10.000
- start_timecode: 00:00:00.000
- end_timecode: 00:00:10.000

Context:
- global_type: Soccer match highlight
- global_summary: Captures key moments from a soccer match, showing gameplay and audience reactions.

OnScreenText:
- lines:
  - VIVEZ LE FOOTBALL SUR DAZN SPORTS
  - LIGUE: 1-2 LIGUE: 2
  - LIGUE: 1 LIGUE: 2
  - LIGUE: 1 LIGUE: 2
  - LIGUE: 1 LIGUE: 2
  - LIGUE: 1 LIGUE: 2
  - LIGUE: 1 LIGUE: 2
  - DAZN SPORTS
  - 15€/MOIS sans engagement

Description:
- dense: Soccer match in progress with players in white and blue/red uniforms on the field. Referee in yellow is adjusting his earpiece. Stadium filled with spectators.
- actions:
  - Players positioning on the field
  - Referee adjusting earpiece

Entities:
- items:
  - type: person
    name_or_value: Referee
    attributes: [role: referee, uniform: yellow]
  - type: team
    name_or_value: Team in white uniforms
    at

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: video_foot_highlights
- scene_id: 2
- start_sec: 10.000
- end_sec: 20.000
- start_timecode: 00:00:10.000
- end_timecode: 00:00:20.000

Context:
- global_type: Soccer match highlight
- global_summary: This scene captures a moment from a soccer match, showing players on the field and reactions from the sidelines.

OnScreenText:
- lines:
  - DIGNITY
  - DON SPORTS

Description:
- dense: Players in white and blue uniforms are on the field, with a referee in yellow nearby. Coaches and staff are seen on the sidelines, some standing and others seated.
- actions:
  - Players walking on the field
  - Coaches conversing and observing the game

Entities:
- items:
  - type: Person/Team
    name_or_value: Team in white uniforms
    attributes: [jersey color: white]
  - type: Person/Team
    name_or_value: Team in blue uniforms
    attributes: [jersey color: blue]
  - type: Person/Role
    name_or_value: Referee
    attributes: [uniform: yellow]
  - type: Perso

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: video_foot_highlights
- scene_id: 3
- start_sec: 20.000
- end_sec: 30.000
- start_timecode: 00:00:20.000
- end_timecode: 00:00:30.000

Context:
- global_type: Soccer match highlight
- global_summary: Captures key moments from a soccer match, including player actions and audience reactions.

OnScreenText:
- lines:
  - Lamine
  - Confinado de la曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心曜心

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: video_foot_highlights
- scene_id: 4
- start_sec: 30.000
- end_sec: 40.000
- start_timecode: 00:00:30.000
- end_timecode: 00:00:40.000

Context:
- global_type: Soccer match highlight
- global_summary: This scene captures gameplay from a soccer match, showing players in action and the ongoing score.

OnScreenText:
- lines:
  - RAVI VALLECANO 0 - 0 FC BARCELONA
  - LALIGA

Description:
- dense: Players in blue and red uniforms are attacking the goal defended by players in white uniforms. The stadium is filled with spectators.
- actions:
  - Players running and dribbling the ball
  - Goal attempts and defensive actions

Entities:
- items:
  - type: Team
    name_or_value: RAVI VALLECANO
    attributes: [blue and red uniforms]
  - type: Team
    name_or_value: FC BARCELONA
    attributes: [white uniforms]
  - type: Referee
    name_or_value: unknown
    attributes: [yellow uniform]
  - type: Object
    name_or_value: Soccer ball
    attributes: []
  - 

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: video_foot_highlights
- scene_id: 5
- start_sec: 40.000
- end_sec: 50.000
- start_timecode: 00:00:40.000
- end_timecode: 00:00:50.000

Context:
- global_type: Soccer match highlight
- global_summary: This scene captures a moment from a soccer match, showing the progression of play and defensive actions.

OnScreenText:
- lines:
  - RAYO 0 - 0 LA LIGA
  - 03:21 - 0
  - BETLAND
  - #RayBarça

Description:
- dense: The team in white is attacking near the goal, while the team in blue and red defends. The goalkeeper is positioned near the goalpost.
- actions:
  - Players running and positioning
  - Defensive block by the team in blue and red
  - Goalkeeper preparing for a potential save

Entities:
- items:
  - type: Team
    name_or_value: Team in white uniforms
    attributes: [color: white]
  - type: Team
    name_or_value: Team in blue and red uniforms
    attributes: [color: blue and red]
  - type: Person
    name_or_value: Referee
    attributes: [

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: video_foot_highlights
- scene_id: 6
- start_sec: 50.000
- end_sec: 60.000
- start_timecode: 00:00:50.000
- end_timecode: 00:01:00.000

Context:
- global_type: Soccer match highlight
- global_summary: Captures key moments from a soccer match, focusing on gameplay and audience reactions.

OnScreenText:
- lines:
  - BAR 0
  - RMA 0
  - 03:53: 0

Description:
- dense: A player in a blue and red uniform dribbles the ball near the opponent's goal, with the goalkeeper in yellow preparing to defend. The stadium is filled with spectators.
- actions:
  - Dribbling
  - Goal attempt

Entities:
- items:
  - type: Team
    name_or_value: Team in blue and red uniforms
    attributes: [blue and red jerseys]
  - type: Team
    name_or_value: Team in white uniforms
    attributes: [white jerseys]
  - type: Person
    name_or_value: Goalkeeper
    attributes: [yellow jersey, gloves]
  - type: Object
    name_or_value: Soccer ball
    attributes: []
  - type: Locatio

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: video_foot_highlights
- scene_id: 7
- start_sec: 60.000
- end_sec: 70.000
- start_timecode: 00:01:00.000
- end_timecode: 00:01:10.000

Context:
- global_type: Soccer match highlight
- global_summary: Captures key moments from a soccer match, showing gameplay and audience reactions.

OnScreenText:
- lines:
  - 05:39 0
  - 05:40 0
  - 05:41 0
  - 05:42 0
  - 05:43 0
  - 05:44 0
  - 05:45 0
  - 05:46 0
  - 05:47 0
  - 05:48 0
  - betway
  - CONMEBOL
  - #RayBarça

Description:
- dense: Soccer players in blue and red uniforms are on the field, with a player in white raising his arms. The stadium is filled with spectators.
- actions:
  - Players running and dribbling the ball
  - Player in white raising arms

Entities:
- items:
  - type: person
    name_or_value: unknown
    attributes: [blue and red uniform]
  - type: person
    name_or_value: unknown
    attributes: [white uniform]
  - type: object
    name_or_value: soccer ball
    attributes: []
  

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


RAG SCENE CARD
---
Meta:
- video_name: video_foot_highlights
- scene_id: 8
- start_sec: 70.000
- end_sec: 80.000
- start_timecode: 