In [None]:
pip install opencv-python numpy pytesseract notebook ipykernel

In [1]:
import json
import os

import cv2
import numpy as np
import pytesseract

# Location of the Tesseract executable used by pytesseract.
# Resolution order:
#   1. the TESSERACT_CMD environment variable, when set;
#   2. the Windows install path this notebook was written against, when that
#      file actually exists on the current machine;
#   3. otherwise pytesseract's own default is left untouched, so a
#      "tesseract" binary available on PATH is used.
_WINDOWS_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
_tesseract_cmd = os.environ.get("TESSERACT_CMD")
if _tesseract_cmd is None and os.path.isfile(_WINDOWS_TESSERACT_CMD):
    _tesseract_cmd = _WINDOWS_TESSERACT_CMD
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd


def detect_shot_cuts(video_path, threshold=0.3):
    """Count abrupt histogram changes between consecutive frames of a video.

    Every frame is converted to grayscale and summarised as a normalized
    256-bin intensity histogram. That histogram is compared with the previous
    frame's histogram using the Bhattacharyya distance, and each comparison
    whose distance exceeds ``threshold`` is counted as one cut.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        threshold: Distance above which two consecutive frames count as a cut.

    Returns:
        int: Number of consecutive-frame pairs whose distance exceeded
        ``threshold``. 0 when no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    prev_hist = None
    cut_count = 0

    # try/finally guarantees the capture handle is released even if one of the
    # per-frame OpenCV calls raises.
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # End of stream (or the video could not be opened/decoded).
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
            hist = cv2.normalize(hist, hist).flatten()

            # The first frame has nothing to be compared against.
            if prev_hist is not None:
                diff = cv2.compareHist(prev_hist, hist, cv2.HISTCMP_BHATTACHARYYA)
                if diff > threshold:
                    cut_count += 1

            prev_hist = hist
    finally:
        cap.release()

    return cut_count


def compute_average_motion(video_path):
    """Average the Farneback optical-flow magnitude over a whole video.

    For every pair of consecutive frames a dense optical-flow field is
    computed on the grayscale images; the mean magnitude of that field is
    recorded, and the function returns the mean of those per-pair values.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.

    Returns:
        float: Mean of the per-frame-pair mean flow magnitudes, or ``0.0``
        when fewer than two frames could be read.
    """
    cap = cv2.VideoCapture(video_path)
    motion_values = []

    # try/finally releases the capture on every exit path, including the early
    # return below and any exception raised while processing a frame.
    try:
        ret, prev_frame = cap.read()
        if not ret:
            return 0.0

        prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            flow = cv2.calcOpticalFlowFarneback(
                prev_gray, gray, None,
                pyr_scale=0.5,
                levels=3,
                winsize=15,
                iterations=3,
                poly_n=5,
                poly_sigma=1.2,
                flags=0
            )

            # Only the magnitude of each flow vector is used; the angle is
            # discarded.
            magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
            motion_values.append(np.mean(magnitude))
            prev_gray = gray
    finally:
        cap.release()

    return float(np.mean(motion_values)) if motion_values else 0.0


def compute_text_presence_ratio(video_path, frame_interval=10):
    """Estimate the fraction of sampled frames that contain readable text.

    Every ``frame_interval``-th frame (the 10th, 20th, ... with the default)
    is converted to grayscale and passed to Tesseract OCR. A sampled frame
    counts as containing text when the recognised string, stripped of
    surrounding whitespace, is longer than 5 characters.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        frame_interval: Sample one frame out of this many. Must be non-zero.

    Returns:
        float: ``text_frames / sampled_frames``, or ``0.0`` when no frame was
        sampled (e.g. the video has fewer than ``frame_interval`` frames).
    """
    cap = cv2.VideoCapture(video_path)
    sampled_frames = 0
    text_frames = 0
    frame_count = 0

    # try/finally guarantees the capture handle is released even if OCR fails
    # (for example when the Tesseract executable cannot be found).
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            if frame_count % frame_interval != 0:
                continue

            sampled_frames += 1
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            text = pytesseract.image_to_string(gray)

            # Very short results are treated as OCR noise rather than text.
            if len(text.strip()) > 5:
                text_frames += 1
    finally:
        cap.release()

    if sampled_frames == 0:
        return 0.0
    return text_frames / sampled_frames


def extract_video_features(video_path):
    """Compute the shot-cut, motion and text features for one video file.

    The video is read three times, once by each feature function.

    Args:
        video_path: Path to an existing video file.

    Returns:
        dict: ``video_path`` (as given), ``shot_cut_count`` (int),
        ``average_motion`` (float) and ``text_present_ratio`` (float).

    Raises:
        FileNotFoundError: If ``video_path`` does not refer to an existing
            regular file.
    """
    # isfile (rather than exists) also rejects directories, which would
    # otherwise fail to open and silently produce all-zero features.
    if not os.path.isfile(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    features = {
        "video_path": video_path,
        "shot_cut_count": detect_shot_cuts(video_path),
        "average_motion": compute_average_motion(video_path),
        "text_present_ratio": compute_text_presence_ratio(video_path)
    }

    return features


# Demo driver: extract the features of a sample clip and print them as JSON.
if __name__ == "__main__":
    # Relative path, resolved against the current working directory.
    video_path = "small.mp4"  # Video sample path
    # Raises FileNotFoundError if the sample clip is not present.
    features = extract_video_features(video_path)

    # Pretty-print the feature dictionary with 4-space indentation.
    print(json.dumps(features, indent=4))


{
    "video_path": "small.mp4",
    "shot_cut_count": 21,
    "average_motion": 3.7163584232330322,
    "text_present_ratio": 0.0
}
