In [8]:
import os
import cv2
import numpy as np


### 1. Extract Frames from Video


In [9]:
def extract_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames


### 2. Remove Duplicate Frames


In [10]:

def mse(imageA, imageB):
    # Compute the mean squared error between the two images
    err = np.sum((imageA.astype("float") - imageB.astype("float")) ** 2)
    err /= float(imageA.shape[0] * imageA.shape[1])
    return err

def remove_duplicates(frames, threshold=1000):
    unique_frames = []
    for i in range(len(frames) - 1):
        if mse(frames[i], frames[i + 1]) > threshold:
            unique_frames.append(frames[i])
    unique_frames.append(frames[-1])  # Always add the last frame
    return unique_frames


### 

### 3. Object Detection on Frames


In [11]:
def perform_object_detection(frames, yolo_model, conf_threshold=0.5, nms_threshold=0.4):
    detections = []
    for frame in frames:
        # YOLO model requires input images in a specific format
        blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=True, crop=False)
        yolo_model.setInput(blob)

        # Getting the names of all layers
        layer_names = yolo_model.getLayerNames()
        # Extracting the names of the output layers
        output_layers = [layer_names[i[0] - 1] for i in yolo_model.getUnconnectedOutLayers()]

        # Forward pass
        layer_outputs = yolo_model.forward(output_layers)

        # Initialization for each frame's detection
        frame_detections = []

        # Each layer's output
        for output in layer_outputs:
            # Each detection in the output
            for detection in output:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]

                # Filtering out weak predictions
                if confidence > conf_threshold:
                    # Object detected
                    center_x, center_y, width, height = detection[0:4] * np.array([frame.shape[1], frame.shape[0], frame.shape[1], frame.shape[0]])
                    x = int(center_x - (width / 2))
                    y = int(center_y - (height / 2))

                    # Adding the valid detections to the frame's list
                    frame_detections.append({'class_id': class_id, 'confidence': float(confidence), 'box': [x, y, int(width), int(height)]})

        # Non-Max Suppression
        indices = cv2.dnn.NMSBoxes([d['box'] for d in frame_detections], [d['confidence'] for d in frame_detections], conf_threshold, nms_threshold)

        # Final filtered detections for the frame
        final_detections = [frame_detections[i[0]] for i in indices]
        detections.append(final_detections)

    return detections


### 4. Generate Annotation File


In [12]:
import pandas as pd

def generate_annotation_file(detections, output_dir, video_id):
    annotations = []
    for frame_index, frame_detections in enumerate(detections):
        for det in frame_detections:
            # det might include: class, confidence, bbox coordinates
            annotations.append([video_id, frame_index, det['class'], det['confidence'], det['bbox']])
    df = pd.DataFrame(annotations, columns=['video_id', 'frame', 'class', 'confidence', 'bbox'])
    file_path = os.path.join(output_dir, f"{video_id}_annotations.csv")
    df.to_csv(file_path, index=False)
    return file_path


### 5.Generate Information File


In [13]:
def generate_information_file(video_path, output_dir, video_id):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = total_frames / fps
    cap.release()

    info = {
        'video_id': video_id,
        'fps': fps,
        'total_frames': total_frames,
        'duration': duration
    }
    df = pd.DataFrame([info])
    file_path = os.path.join(output_dir, f"{video_id}_info.csv")
    df.to_csv(file_path, index=False)
    return file_path


## Proccess Video Function

In [14]:
def process_video(video_path, yolo_model, output_dir, video_id):
    """
    Process a video file to perform object detection and generate annotation and information files.

    :param video_path: Path to the video file.
    :param yolo_model: Pre-loaded YOLO model for object detection.
    :param output_dir: Directory to save the output files.
    :param video_id: A unique identifier for the video.
    :return: Paths to the generated annotation and information files.
    """
    # Step 1: Extract Frames from Video
    frames = extract_frames(video_path)

    # Step 2: Remove Duplicate Frames
    unique_frames = remove_duplicates(frames)

    # Step 3: Object Detection on Frames
    detections = perform_object_detection(unique_frames, yolo_model)

    # Step 4: Generate Annotation File
    annotation_file_path = generate_annotation_file(detections, output_dir, video_id)

    # Step 5: Generate Information File
    info_file_path = generate_information_file(video_path, output_dir, video_id)

    return annotation_file_path, info_file_path


In [15]:
yolo_model = cv2.dnn.readNetFromDarknet('yolo/yolov3.cfg', 'yolo/yolov3.weights')

output_dir = 'datasets/summary_videos'
video_path = 'datasets/ydata-tvsum50-v1_1/video/_xMr-HKMfVA.mp4'
video_id = '1'

annotation_path, info_path = process_video(video_path, yolo_model, output_dir, video_id)

IndexError: invalid index to scalar variable.