***Setup (Colab) — installs and environment check***

In [18]:
# Install system OCR tool (Tesseract)
!apt-get -y install tesseract-ocr >/dev/null

# Install all required Python packages
!pip -q install opencv-python pillow pytesseract torch torchvision matplotlib pandas tqdm ultralytics

import torch, platform
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("Python:", platform.python_version())


Torch: 2.8.0+cu126 | CUDA available: False
Python: 3.12.11


**📦 Library Imports – Load Required Packages**

In [19]:
# ======= IMPORTS =======
from pathlib import Path
import cv2
import time
import os
from datetime import timedelta
import pandas as pd
import pytesseract
from tqdm import tqdm
from ultralytics import YOLO
from PIL import Image

**User Configuration – Set Paths and Parameters**

In [20]:
# ======= USER INPUTS =======
# --- Video sampling ---
VIDEO_PATH = "demovidlos.mp4"    # video file to process
DESIRED_FPS = 2.0                # frames to extract per second of video

# --- Image inputs ---
IMAGE_PATH = "catimage.jpg"                  # image used for object detection
IMAGE_PATH_For_ocr = "ocrtestingimage.png"   # image used for OCR
image_path = IMAGE_PATH_For_ocr              # default OCR input; the upload cell may replace it

# --- Annotated outputs ---
OUTPUT_VIDEO = "output_Video_annotated.mp4"  # NOTE: reassigned later in the model-loading cell
OUTPUT_IMAGE = "annotated_detected_image.jpg"

**🎞️ Extract Video Frames + 🧾 Apply OCR + 📁 Save Metadata**

In [21]:
# On Windows, uncomment the next line and point it at the tesseract executable itself
# (the .exe file, not its containing folder):
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# ======= OUTPUT FOLDERS =======
# Everything produced by the frame-extraction step lives under OUT_ROOT.
OUT_ROOT = Path("/content/dataset_pipeline")
FRAMES_DIR = OUT_ROOT.joinpath("images")   # extracted frames
LABELS_DIR = OUT_ROOT.joinpath("labels")   # reserved for future use if needed
META_CSV = OUT_ROOT.joinpath("metadata.csv")

# Create the folder tree up front; already-existing folders are left untouched.
for folder in (OUT_ROOT, FRAMES_DIR, LABELS_DIR):
    folder.mkdir(parents=True, exist_ok=True)

print("Video:", VIDEO_PATH)
print("Output root:", OUT_ROOT)
# ======= FRAME EXTRACTION + OCR (ORIGINAL SHAPE) =======
def extract_frames_with_ocr(video_path, out_dir):
    """
    Extract frames every (1 / DESIRED_FPS) seconds at original resolution.
    Apply OCR and save metadata (image, timestamp, OCR text).

    Args:
        video_path: Path (str or Path) of the video to sample.
        out_dir: Existing directory (str or Path) that receives the JPEG frames.

    Side effects:
        Writes one JPEG per sampled frame into ``out_dir`` and, when at least
        one frame was extracted, one row per frame to the module-level
        ``META_CSV`` file.

    Raises:
        ValueError: If the module-level ``DESIRED_FPS`` is not positive.
        RuntimeError: If the video cannot be opened or a frame cannot be written.
    """
    if DESIRED_FPS <= 0:
        raise ValueError(f"DESIRED_FPS must be positive, got {DESIRED_FPS}")

    out_dir = Path(out_dir)  # accept plain strings as well as Path objects

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        raise RuntimeError(f"Cannot open video: {video_path}")

    interval = 1.0 / DESIRED_FPS
    rows = []

    # try/finally guarantees the capture handle is released even if
    # writing a frame or running OCR raises.
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 0.0
        print("Original FPS",fps)
        frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
        # Duration is only trusted when both FPS and frame count are reported;
        # otherwise the loop ends on the first failed read.
        duration = (frames / fps) if (fps > 0 and frames > 0) else None

        sample_idx = 0
        while True:
            # Derive the timestamp from the sample index instead of repeatedly
            # adding the interval, so float error does not accumulate.
            t = sample_idx * interval
            if duration is not None and t > duration:
                break

            cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000.0)
            ret, frame = cap.read()
            if not ret:
                break

            # File name encodes the sample timestamp in milliseconds, zero-padded.
            fname = f"sec_{int(round(t * 1000)):010d}.jpg"
            fpath = out_dir / fname
            # cv2.imwrite reports failure through its return value.
            if not cv2.imwrite(str(fpath), frame):
                raise RuntimeError(f"Failed to write frame: {fpath}")

            # Apply OCR to the full frame
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            ocr_text = pytesseract.image_to_string(gray).strip()

            rows.append({
                "image": fname,
                "time_seconds": round(t, 3),
                "timecode": str(timedelta(seconds=float(t))),
                "ocr_text": ocr_text,
            })

            sample_idx += 1
    finally:
        cap.release()

    print(f"✅ Saved {len(rows)} frames → {out_dir}")

    # Only claim the metadata file was written when it actually was.
    if rows:
        pd.DataFrame(rows).to_csv(META_CSV, index=False)
        print(f"📄 Wrote metadata with OCR → {META_CSV}")
    else:
        print("⚠️ No frames extracted; metadata file was not written")

# ======= RUN EXTRACTION =======
# Sample frames from VIDEO_PATH into FRAMES_DIR and OCR each saved frame.
extract_frames_with_ocr(VIDEO_PATH, FRAMES_DIR)


Video: demovidlos.mp4
Output root: /content/dataset_pipeline
Original FPS 23.976023976023978
✅ Saved 28 frames → /content/dataset_pipeline/images
📄 Wrote metadata with OCR → /content/dataset_pipeline/metadata.csv


**🧠 Load YOLOv8 Model + 🎥 Define I/O Paths**

In [22]:
# Detection weights. yolov8n.pt is the smallest variant; others: yolov8s.pt / m / l / x.
# The weights file downloads automatically on first run.
MODEL_NAME = "yolov8n.pt"
model = YOLO(MODEL_NAME)

# Video I/O for the detection step: read the configured video, write an annotated copy.
# This assignment replaces the OUTPUT_VIDEO value set in the user-configuration cell.
INPUT_VIDEO = VIDEO_PATH
OUTPUT_VIDEO = "output_video_after_detected.mp4"


**🎯 Run Object Detection on Image & Video using YOLOv8**

In [23]:
# CPU (no GPU): inference device passed to model.predict by the functions below
DEVICE = "cpu"

# `model` is already created once in the model-loading cell above;
# it is intentionally not reloaded here to avoid a redundant second load.

# ---------------- IMAGE DETECTION ----------------
def run_image_inference():
    """Detect objects in IMAGE_PATH, save the annotated image, and list each detection.

    Prints a notice and returns without running the model when IMAGE_PATH
    does not exist. Otherwise writes the annotated image to OUTPUT_IMAGE and
    prints one line per detected box (class name, confidence, pixel corners).
    """
    source_image = Path(IMAGE_PATH)
    if not source_image.exists():
        print(f"[IMAGES] Not found: {IMAGE_PATH}")
        return

    # predict() returns a list of results; a single image yields one entry.
    predictions = model.predict(source=IMAGE_PATH, device=DEVICE, verbose=False)
    result = predictions[0]

    cv2.imwrite(OUTPUT_IMAGE, result.plot())
    print(f"[IMAGES] Saved annotated -> {OUTPUT_IMAGE}")

    for detection in result.boxes:
        label = model.names[int(detection.cls[0])]
        score = float(detection.conf[0])
        # Corner coordinates (x1, y1, x2, y2) truncated to whole pixels.
        corners = ",".join(str(int(value)) for value in detection.xyxy[0].tolist())
        print(f" - {label} (conf {score:.2f}) at [{corners}]")

# ---------------- VIDEO DETECTION ----------------
def run_video_inference():
    """Run detection on every frame of INPUT_VIDEO and write an annotated copy.

    Reads INPUT_VIDEO frame by frame, runs ``model.predict`` on each frame with
    the configured DEVICE, and writes the annotated frames to OUTPUT_VIDEO
    using the source video's FPS and frame size (FPS falls back to 30.0 when
    the container reports none).

    Prints a message and returns early when the input cannot be opened or the
    output writer cannot be created. The capture, writer and progress bar are
    always released, even if inference raises.
    """
    cap = cv2.VideoCapture(INPUT_VIDEO)
    if not cap.isOpened():
        print(f"[VIDEO] Could not open {INPUT_VIDEO}")
        return

    out = None
    pbar = None
    frame_idx = 0

    try:
        fps    = cap.get(cv2.CAP_PROP_FPS) or 30.0
        width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total  = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0

        out = cv2.VideoWriter(
            OUTPUT_VIDEO, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
        )
        # Without this check a writer that failed to open would drop every
        # frame silently and the function would still report success.
        if not out.isOpened():
            print(f"[VIDEO] Could not create {OUTPUT_VIDEO}")
            return

        # total=None gives an open-ended progress bar when the frame count is unknown.
        pbar = tqdm(total=total if total > 0 else None, desc="Video")
        t0 = time.time()

        while True:
            ok, frame = cap.read()
            if not ok:
                break
            res = model.predict(source=frame, device=DEVICE, verbose=False)[0]
            annotated = res.plot()
            out.write(annotated)
            frame_idx += 1
            pbar.update(1)
    finally:
        cap.release()
        if out is not None:
            out.release()
        if pbar is not None:
            pbar.close()

    elapsed = time.time() - t0
    # max(..., 1e-6) guards against division by zero on an instantaneous run.
    print(f"[VIDEO] Saved -> {OUTPUT_VIDEO} | Frames: {frame_idx} | Avg FPS: {frame_idx/max(elapsed,1e-6):.2f}")

# ---------------- RUN BOTH ----------------
# Run the image detection first, then the (much slower) per-frame video detection.
if __name__ == "__main__":
    run_image_inference()
    run_video_inference()
    print("✅ Done")


[IMAGES] Saved annotated -> annotated_detected_image.jpg
 - cat (conf 0.89) at [162,13,453,211]


Video: 100%|██████████| 332/332 [00:52<00:00,  6.31it/s]

[VIDEO] Saved -> output_video_after_detected.mp4 | Frames: 332 | Avg FPS: 6.31
✅ Done





**📤 Upload an Image File from Your Local Device**

In [26]:
from google.colab import files

# Upload image file (png, jpg, etc.)
uploaded = files.upload()

if uploaded:
    # Take the first uploaded file; keys are the names the files were saved under
    image_path = next(iter(uploaded))
    print("Uploaded:", image_path)
else:
    # Upload was cancelled or empty: keep the image_path set in the configuration
    # cell instead of failing with an IndexError.
    print("No file uploaded; keeping image_path =", image_path)


Saving ocrtestingimage.png to ocrtestingimage (1).png
Uploaded: ocrtestingimage (1).png


**🔍 Perform OCR on Uploaded Image & 💾 Save Extracted Text**

In [25]:

# Open the image and extract its text; the context manager closes the
# underlying file handle as soon as OCR is finished.
with Image.open(image_path) as img:
    text = pytesseract.image_to_string(img, lang="eng")

# Save text next to the image, using the same file name with a .txt suffix
out_txt = Path(image_path).with_suffix(".txt")
out_txt.write_text(text, encoding="utf-8")

print("✅ OCR complete. Saved text to:", out_txt)
print("\n--- Extracted Text ---\n")
print(text)


✅ OCR complete. Saved text to: ocrtestingimage.txt

--- Extracted Text ---

rt cv2

<M

_ It was the best of

: times, it was the worst

of times, it was the age
of wisdom, it was the
age of foolishness...

 

 

 

 



In [None]:
# Standalone video detection pass: annotate every frame of INPUT_VIDEO and
# write the result to OUTPUT_VIDEO (explicit conf/iou thresholds, default device).
cap = cv2.VideoCapture(INPUT_VIDEO)
if not cap.isOpened():
    raise RuntimeError(f"Could not open {INPUT_VIDEO}")

out = None
pbar = None
frame_idx = 0

# try/finally releases the capture, writer and progress bar even if inference raises.
try:
    # Get video properties (FPS falls back to 30.0 when the container reports none)
    fps     = cap.get(cv2.CAP_PROP_FPS) or 30.0
    width   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height  = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    length  = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Video writer
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out    = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps, (width, height))
    # A writer that failed to open would otherwise drop every frame silently.
    if not out.isOpened():
        raise RuntimeError(f"Could not create {OUTPUT_VIDEO}")

    # Process
    pbar = tqdm(total=length if length > 0 else None, desc="Processing")
    t0 = time.time()

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Inference (you can set conf, iou, device="0" for GPU, etc.)
        results = model.predict(source=frame, conf=0.25, iou=0.45, verbose=False)
        # results is a list; get first result and draw
        annotated = results[0].plot()  # draws boxes, labels, confidences

        out.write(annotated)

        frame_idx += 1
        pbar.update(1)
finally:
    cap.release()
    if out is not None:
        out.release()
    if pbar is not None:
        pbar.close()

elapsed = time.time() - t0
# max(..., 1e-6) guards against division by zero on an instantaneous run.
print(f"Saved: {OUTPUT_VIDEO} | Frames: {frame_idx} | Avg FPS: {frame_idx/max(elapsed,1e-6):.2f}")
