In [1]:
import os
import cv2
import tensorflow as tf
import numpy as np
import math
from collections import deque
from moviepy.editor import VideoFileClip, AudioFileClip

To run YOLOv3 on your machine, download yolov3.cfg, yolov3.txt, yolov3.weights and coco.names from
https://github.com/arunponnusamy/object-detection-opencv

Capturing videos

In [2]:
def read_video(video_path):
    """Open a video file and report its basic properties.

    Parameters
    ----------
    video_path : str
        Path handed to ``cv2.VideoCapture``.

    Returns
    -------
    tuple or None
        ``(cap, frame_width, frame_height, fps, total_frames)`` where the four
        properties are truncated to ``int`` (so a fractional frame rate loses
        its decimals). ``None`` is returned, after printing an error message,
        when the capture cannot be opened. The returned capture is left open;
        releasing it is the caller's job.
    """
    capture = cv2.VideoCapture(video_path)

    if capture.isOpened():
        # Query the properties in a fixed order and truncate each to an integer.
        width, height, rate, n_frames = (
            int(capture.get(prop))
            for prop in (
                cv2.CAP_PROP_FRAME_WIDTH,
                cv2.CAP_PROP_FRAME_HEIGHT,
                cv2.CAP_PROP_FPS,
                cv2.CAP_PROP_FRAME_COUNT,
            )
        )
        print(f"Video properties: Width={width}, Height={height}, FPS={rate}, Total Frames={n_frames}")
        return capture, width, height, rate, n_frames

    print("Error: Could not open video.")
    return None


Iterating over each video in the folder

In [3]:
def read_videos_from_folder(folder_path):
    """Open every ``.mp4`` file found directly inside ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive). Only names ending in ``.mp4`` are
        considered.

    Returns
    -------
    list of tuple
        One ``(cap, frame_width, frame_height, fps, total_frames, file_name)``
        entry per video that could be opened, in ``os.listdir`` order. Files
        that ``read_video`` fails to open are skipped.
    """
    videos = []

    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".mp4"):  # Assuming all videos are in .mp4 format
            continue

        video_path = os.path.join(folder_path, file_name)

        # read_video returns a bare None on failure, so the result must be
        # checked *before* unpacking; unpacking None raises TypeError.
        result = read_video(video_path)
        if result is None:
            continue

        cap, frame_width, frame_height, fps, total_frames = result
        if cap is not None:
            videos.append((cap, frame_width, frame_height, fps, total_frames, file_name))

    return videos

# Dataset folder, resolved relative to the current working directory.
folder_path = "ds1"
# Each entry holds an open capture plus its properties and file name.
# NOTE(review): these captures are not released in the cells shown here — confirm
# whether `videos` is used later or should be cleaned up.
videos = read_videos_from_folder(folder_path)

Video properties: Width=1280, Height=720, FPS=23, Total Frames=625
Video properties: Width=1920, Height=1080, FPS=25, Total Frames=1066
Video properties: Width=1920, Height=1080, FPS=23, Total Frames=2072
Video properties: Width=3840, Height=1634, FPS=23, Total Frames=1630
Video properties: Width=3840, Height=2160, FPS=29, Total Frames=4807
Video properties: Width=1920, Height=1080, FPS=25, Total Frames=1987
Video properties: Width=1920, Height=1080, FPS=23, Total Frames=1440
Video properties: Width=3840, Height=2160, FPS=29, Total Frames=6050
Video properties: Width=1920, Height=1080, FPS=59, Total Frames=4213
Video properties: Width=3840, Height=2160, FPS=29, Total Frames=1799
Video properties: Width=1920, Height=1080, FPS=25, Total Frames=2959


Initializing the pretrained YOLOv3 (You Only Look Once) model for object detection

Download the dependencies listed above and set their paths in the cell below.

In [4]:
# Locations of the YOLO network definition, its pre-trained weights and the
# class-name list (one name per line). All are relative to the working directory.
config_path = 'yolov3.cfg'
weights_path = 'yolov3.weights'
classes_path = 'yolov3.txt'

# Build the network from the weights and the config file
net = cv2.dnn.readNet(weights_path, config_path)

# Read the class names, stripping surrounding whitespace from every line
with open(classes_path, 'r') as f:
    classes = [line.strip() for line in f]

# Function to get output layers
def get_output_layers(net):
    """Return the names of the network's unconnected output layers.

    Parameters
    ----------
    net
        Object exposing ``getLayerNames()`` and ``getUnconnectedOutLayers()``
        (here, the network returned by ``cv2.dnn.readNet``).

    Returns
    -------
    list of str
        Layer names, in the order reported by ``getUnconnectedOutLayers()``.
    """
    layer_names = net.getLayerNames()

    # The ids are 1-based. They may arrive either as a flat sequence or as an
    # Nx1 array (this appears to depend on the OpenCV version), so flatten to
    # handle both layouts explicitly instead of relying on a bare `except`,
    # which would also hide unrelated errors.
    layer_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()

    return [layer_names[int(layer_id) - 1] for layer_id in layer_ids]

# Function to draw predictions
def draw_prediction(img, class_id, confidence, x, y, x_plus_w, y_plus_h):
    """Placeholder for drawing a detection on ``img``; currently does nothing.

    The arguments describe one detection: its class index, its confidence and
    the two corners ``(x, y)`` / ``(x_plus_w, y_plus_h)`` of its bounding box.
    The drawing code is disabled, so ``img`` is left untouched and ``None`` is
    returned.
    """
    # Disabled drawing code, kept for reference:
    #   label = str(classes[class_id])
    #   color = (255, 0, 0)  # You can customize the color here if needed
    #   cv2.rectangle(img, (x, y), (x_plus_w, y_plus_h), color, 2)
    #   cv2.putText(img, label, (x - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    return None


Function for object detection: takes a single frame as input and returns the top-left x, top-left y, width and height of each detected object

In [5]:
def detect_objects(image):
    """Run the module-level YOLO network on one frame and return candidate boxes.

    Parameters
    ----------
    image : numpy.ndarray
        Frame whose ``shape[0]`` is the height and ``shape[1]`` the width.

    Returns
    -------
    list of [x, y, w, h]
        Integer pixel boxes (top-left corner, width, height) for every
        detection whose best class score exceeds the confidence threshold.
        This is the full candidate list: the non-maximum-suppression result is
        only used for the (currently no-op) ``draw_prediction`` calls and does
        not filter the returned boxes.
    """
    # Frame dimensions, used to scale the network's normalised outputs to pixels
    frame_height, frame_width = image.shape[0], image.shape[1]

    # Create blob from image (scale factor 0.00392 is approximately 1/255)
    blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)

    # Set blob as input to the network and run the forward pass
    net.setInput(blob)
    outs = net.forward(get_output_layers(net))

    # Parallel lists describing the detected objects
    class_ids = []
    confidences = []
    boxes = []

    # Confidence and NMS thresholds
    # NOTE(review): an NMS threshold of 1 presumably suppresses nothing — confirm
    # that this is intended.
    conf_threshold = 0.995
    nms_threshold = 1

    # Post-processing: each detection row is
    # [center_x, center_y, width, height, <unused here>, class scores...]
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > conf_threshold:
                center_x = int(detection[0] * frame_width)
                center_y = int(detection[1] * frame_height)
                w = int(detection[2] * frame_width)
                h = int(detection[3] * frame_height)
                x = center_x - w // 2
                y = center_y - h // 2
                class_ids.append(class_id)
                confidences.append(float(confidence))
                boxes.append([x, y, w, h])

    # Nothing passed the confidence threshold: nothing to suppress or draw.
    if not boxes:
        return boxes

    # Non-maximum suppression
    indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)

    # The kept indices may come back flat or as an Nx1 array; flatten so both
    # layouts are handled explicitly rather than through a bare `except`.
    for idx in np.asarray(indices).flatten():
        idx = int(idx)
        x, y, w, h = boxes[idx]
        draw_prediction(image, class_ids[idx], confidences[idx],
                        round(x), round(y), round(x + w), round(y + h))
    return boxes

Function for video processing: takes an input video, crops every frame around the largest detected object, and writes the result

In [6]:
def process_video(input_video_path, output_video_path, arr, final_output_path="output_video1.mp4"):
    """Crop a video to a 720x1280 window that follows the largest detected object.

    For every frame the largest-area box from ``detect_objects`` is selected and
    its centre is compared with a smoothed centre (the mean of up to the last
    10 accepted centres). If the two are closer than 15 pixels the smoothed
    centre is kept, which steadies the crop; otherwise the crop jumps to the
    new centre and the smoothing history is reset. The crop is resized to
    720x1280 and written to ``output_video_path``. Finally the audio track of
    the input video is attached and the result is written to
    ``final_output_path``.

    Parameters
    ----------
    input_video_path : str
        Source video (also the source of the audio track).
    output_video_path : str
        Destination of the silent, cropped intermediate video.
    arr : list
        Receives one distance value (current centre to smoothed centre) for
        every frame in which an object was detected.
    final_output_path : str, optional
        Destination of the final video with audio. Defaults to the previously
        hard-coded ``"output_video1.mp4"``.

    Notes
    -----
    * Frames without any detection are not written, so the cropped video can be
      shorter than the source and the attached audio may drift out of sync.
    * Near the frame border, or when the frame is smaller than the crop window,
      the crop is smaller than 720x1280 and is stretched by the resize.
    """
    out_width, out_height = 720, 1280
    jump_threshold = 15   # pixels; found by trial and error
    history_length = 10   # number of recent centres that are averaged

    # Open input video
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        print("Error: Could not open video.")
        return

    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Create VideoWriter object to write output video
    out = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (out_width, out_height))

    total = 0  # frames written
    i = 0      # frames read

    # Smoothed crop centre; starts at the middle of the frame.
    prevx = frame_width // 2
    prevy = frame_height // 2

    # Recent accepted centres. These must live outside the frame loop;
    # re-creating them per frame would leave them permanently empty.
    qx = deque(maxlen=history_length)
    qy = deque(maxlen=history_length)

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            j = 0  # 1 when this frame was written, else 0

            # Detect objects and keep the first box with the largest positive area
            boxes = detect_objects(frame) or []
            max_area_box = max(boxes, key=lambda b: b[2] * b[3], default=None)
            if max_area_box is not None and max_area_box[2] * max_area_box[3] <= 0:
                max_area_box = None

            if max_area_box is not None:
                x, y, w, h = max_area_box
                x1 = x + w // 2
                y1 = y + h // 2

                # Smoothed centre = centroid of the recent accepted centres
                if qx:
                    prevx = sum(qx) // len(qx)
                    prevy = sum(qy) // len(qy)

                dis = math.hypot(x1 - prevx, y1 - prevy)
                arr.append(dis)

                if dis < jump_threshold:
                    # Small movement: keep the smoothed centre and remember this one
                    center_x, center_y = prevx, prevy
                    qx.append(x1)
                    qy.append(y1)
                else:
                    # Large movement: jump to the new centre and reset the history
                    center_x, center_y = x1, y1
                    prevx, prevy = x1, y1
                    qx.clear()
                    qy.clear()

                # Crop the window around the centre, clipped to the frame
                cropped_frame = frame[max(0, center_y - out_height // 2):min(frame_height, center_y + out_height // 2),
                                      max(0, center_x - out_width // 2):min(frame_width, center_x + out_width // 2)]

                if cropped_frame.size == 0:
                    print("Cropped frame is empty for box:", max_area_box)
                else:
                    # Write resized frame to output video
                    out.write(cv2.resize(cropped_frame, (out_width, out_height)))
                    j += 1
                    total += 1

            print(f'{i} {j}')
            i += 1
    finally:
        # Release video capture and writer objects even if a frame fails
        cap.release()
        out.release()

    print(total)

    # Attach the audio of the source video to the cropped (silent) video
    video_with_audio = VideoFileClip(input_video_path)
    try:
        video_without_audio = VideoFileClip(output_video_path)
        try:
            final_video = video_without_audio.set_audio(video_with_audio.audio)

            # Export the final video
            final_video.write_videofile(final_output_path, codec="libx264", audio_codec="aac")
        finally:
            video_without_audio.close()
    finally:
        video_with_audio.close()

Build the paths of the input and output videos

In [7]:
# Directory the notebook kernel is currently running in
current_dir = os.getcwd()

# Construct the path to the video file relative to the current directory
input_video_path = os.path.join(current_dir, 'ds1', '2FgBOgck_K0.mp4')

# Destination of the cropped video written by process_video (relative path)
output_video_path = 'output_video.mp4'

Code block to execute the transformation

In [8]:
# process_video appends its per-frame distance values to this list
arr = []
process_video(input_video_path, output_video_path, arr)

0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 1
22 1
23 1
24 1
25 1
26 1
27 1
28 1
29 1
30 1
31 1
32 1
33 1
34 1
35 1
36 1
37 1
38 1
39 1
40 1
41 1
42 1
43 1
44 1
45 1
46 1
47 1
48 1
49 1
50 1
51 1
52 1
53 1
54 1
55 1
56 1
57 1
58 1
59 1
60 1
61 1
62 1
63 1
64 1
65 1
66 1
67 1
68 1
69 1
70 1
71 1
72 1
73 1
74 1
75 1
76 1
77 1
78 1
79 1
80 1
81 1
82 1
83 1
84 1
85 1
86 1
87 1
88 1
89 1
90 1
91 1
92 1
93 1
94 1
95 1
96 1
97 1
98 1
99 1
100 1
101 1
102 1
103 1
104 1
105 1
106 1
107 1
108 1
109 1
110 1
111 1
112 1
113 1
114 1
115 1
116 1
117 1
118 1
119 1
120 1
121 1
122 1
123 1
124 1
125 1
126 1
127 1
128 1
129 1
130 1
131 1
132 1
133 1
134 1
135 1
136 1
137 1
138 1
139 1
140 1
141 1
142 1
143 1
144 1
145 1
146 1
147 1
148 1
149 1
150 1
151 1
152 1
153 1
154 1
155 1
156 1
157 1
158 1
159 1
160 1
161 1
162 1
163 1
164 1
165 1
166 1
167 1
168 1
169 1
170 1
171 1
172 1
173 1
174 1
175 1
176 1
177 1
178 1
179 1
180 1
181 1
182 1
183 1
184 1


                                                                   

MoviePy - Done.
Moviepy - Writing video output_video1.mp4



                                                              

Moviepy - Done !
Moviepy - video ready output_video1.mp4
