0) Setup (Colab) — installs and environment check

In [21]:
# Install system OCR tool (Tesseract)
!apt-get -y install tesseract-ocr >/dev/null

# Install all required Python packages
!pip -q install opencv-python pillow pytesseract torch torchvision matplotlib pandas tqdm ultralytics

import torch, platform
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("Python:", platform.python_version())


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m0.6/1.1 MB[0m [31m18.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hTorch: 2.8.0+cu126 | CUDA available: False
Python: 3.12.11


In [25]:
# ======= IMPORTS =======
from pathlib import Path
import cv2
from datetime import timedelta
import pandas as pd
import pytesseract

# ======= USER INPUTS =======
# Video to sample frames from, and how many frames per second to keep.
VIDEO_PATH = Path("demovidlos.mp4")
DESIRED_FPS = 2.0

# Windows only: pytesseract needs the full path to the Tesseract executable
# (the .exe itself, not its install folder). Uncomment and adjust if required.
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# ======= OUTPUT FOLDERS =======
# Everything the pipeline produces lives under OUT_ROOT.
OUT_ROOT = Path("/content/dataset_pipeline")
FRAMES_DIR = OUT_ROOT.joinpath("images")      # extracted frames (JPEG)
LABELS_DIR = OUT_ROOT.joinpath("labels")      # reserved for future use
META_CSV = OUT_ROOT.joinpath("metadata.csv")  # per-frame metadata incl. OCR text

# Create the output tree up front; existing folders are left untouched.
for folder in (OUT_ROOT, FRAMES_DIR, LABELS_DIR):
    folder.mkdir(parents=True, exist_ok=True)

# Echo the effective configuration so the run is self-describing.
print("Video:", VIDEO_PATH)
print("Output root:", OUT_ROOT)

# ======= FRAME EXTRACTION + OCR (ORIGINAL SHAPE) =======
def extract_frames_with_ocr(video_path, out_dir):
    """
    Sample frames from a video at DESIRED_FPS, OCR each one, and record metadata.

    The video is sampled by seeking to the timestamps 0, 1/DESIRED_FPS,
    2/DESIRED_FPS, ... Each sampled frame is written unscaled as a JPEG into
    ``out_dir``, converted to grayscale and passed to Tesseract. One row per
    frame (image name, time in seconds, timecode, OCR text) is written to
    META_CSV when at least one frame was extracted.

    Args:
        video_path: Path (or string) of the input video.
        out_dir: Directory (pathlib.Path or string) that receives the JPEGs.

    Raises:
        ValueError: If DESIRED_FPS is not positive.
        RuntimeError: If the video cannot be opened or a frame image cannot
            be written.
    """
    if DESIRED_FPS <= 0:
        raise ValueError(f"DESIRED_FPS must be positive, got {DESIRED_FPS}")
    interval = 1.0 / DESIRED_FPS
    out_dir = Path(out_dir)

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        raise RuntimeError(f"Cannot open video: {video_path}")

    rows = []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 0.0
        print("Original FPS",fps)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Duration is only trusted when both FPS and frame count are reported;
        # otherwise the loop simply runs until a read fails.
        duration = (frame_count / fps) if (fps > 0 and frame_count > 0) else None

        idx = 0
        while True:
            # Derive the timestamp from the sample index instead of repeatedly
            # adding the interval, so rounding error does not accumulate.
            t = idx * interval
            if duration is not None and t > duration:
                break

            cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000.0)
            ret, frame = cap.read()
            if not ret:
                break

            # File name carries the timestamp in milliseconds, zero-padded.
            fname = f"sec_{int(round(t * 1000)):010d}.jpg"
            fpath = out_dir / fname
            # cv2.imwrite reports failure through its return value, not an exception.
            if not cv2.imwrite(str(fpath), frame):
                raise RuntimeError(f"Could not write frame image: {fpath}")

            # Apply OCR to the full frame
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            ocr_text = pytesseract.image_to_string(gray).strip()

            rows.append({
                "image": fname,
                "time_seconds": round(t, 3),
                "timecode": str(timedelta(seconds=float(t))),
                "ocr_text": ocr_text,
            })

            idx += 1
    finally:
        # Always free the capture handle, even if OCR or writing failed.
        cap.release()

    saved = len(rows)
    print(f"✅ Saved {saved} frames → {out_dir}")
    if rows:
        pd.DataFrame(rows).to_csv(META_CSV, index=False)
        print(f"📄 Wrote metadata with OCR → {META_CSV}")
    else:
        # Nothing was extracted, so no CSV is produced; say so instead of
        # claiming a file was written.
        print(f"⚠️ No frames extracted; metadata not written → {META_CSV}")

# ======= RUN EXTRACTION =======
# Sample VIDEO_PATH into FRAMES_DIR; the function also writes the metadata CSV.
extract_frames_with_ocr(video_path=VIDEO_PATH, out_dir=FRAMES_DIR)


Video: demovidlos.mp4
Output root: /content/dataset_pipeline
Original FPS 23.976023976023978
✅ Saved 28 frames → /content/dataset_pipeline/images
📄 Wrote metadata with OCR → /content/dataset_pipeline/metadata.csv


In [26]:
from ultralytics import YOLO
import cv2, time, os
from tqdm import tqdm

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [32]:
# Source and destination of the annotation pass.
# The input is the same clip that was used for frame extraction.
INPUT_VIDEO = VIDEO_PATH
OUTPUT_VIDEO = "output_annotated.mp4"

# Detector weights: yolov8n.pt is the smallest variant; yolov8s.pt / m / l / x
# are the larger ones. The file is downloaded automatically on first run.
MODEL_NAME = "yolov8n.pt"

model = YOLO(model=MODEL_NAME)


In [33]:
# Run the detector on every frame of INPUT_VIDEO and write the annotated
# frames to OUTPUT_VIDEO at the source frame rate and resolution.

# str(): INPUT_VIDEO is a pathlib.Path; converting it matches how the frame
# extraction step opens the video and avoids relying on Path support in OpenCV.
cap = cv2.VideoCapture(str(INPUT_VIDEO))
if not cap.isOpened():
    raise RuntimeError(f"Could not open {INPUT_VIDEO}")

out = None
pbar = None
frame_idx = 0
try:
    # Get video properties (fall back to 30 FPS when the container reports none)
    fps     = cap.get(cv2.CAP_PROP_FPS) or 30.0
    width   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height  = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    length  = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Video writer
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out    = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps, (width, height))
    # A writer that failed to open drops frames silently, so fail loudly here.
    if not out.isOpened():
        raise RuntimeError(f"Could not open video writer for {OUTPUT_VIDEO}")

    # Process
    pbar = tqdm(total=length if length > 0 else None, desc="Processing")
    t0 = time.time()

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Inference (you can set conf, iou, device="0" for GPU, etc.)
        results = model.predict(source=frame, conf=0.25, iou=0.45, verbose=False)
        # results is a list; get first result and draw
        annotated = results[0].plot()  # draws boxes, labels, confidences

        out.write(annotated)

        frame_idx += 1
        pbar.update(1)

    elapsed = time.time() - t0
finally:
    # Release the handles even if inference raised, so the partially written
    # output file is finalized and the input is not left open.
    cap.release()
    if out is not None:
        out.release()
    if pbar is not None:
        pbar.close()

print(f"Saved: {OUTPUT_VIDEO} | Frames: {frame_idx} | Avg FPS: {frame_idx/max(elapsed,1e-6):.2f}")


Processing: 100%|██████████| 332/332 [01:11<00:00,  4.67it/s]

Saved: output_annotated.mp4 | Frames: 332 | Avg FPS: 4.67



