Develop an object detection application, built on the YOLO architecture, that takes as input three (3) videos of your favorite locations at PAU. The application should have a GUI that enables the user to interact with the program. Each video should be no longer than one minute.

Hint: 
Use your creative design knowledge to enhance the user experience.


In [19]:
import cv2 as cv
import numpy as np
import tkinter as tk
from tkinter import filedialog
from PIL import Image, ImageTk

# Load the YOLOv3 network from its weights and configuration files.
net = cv.dnn.readNet('cfg/yolov3.weights', 'cfg/yolov3.cfg')

# One class name per line of coco.names; a detection's class id indexes this list.
with open('cfg/coco.names', 'r') as names_file:
    classes = names_file.read().splitlines()

# One random 3-channel colour (each channel in [0, 255)) per class, used for boxes.
colors = np.random.uniform(low=0, high=255, size=(len(classes), 3))

# Main application window: 1000x600 with a black background.
root = tk.Tk()
root.title("Object Detection Application")
root.geometry("1000x600")
root.configure(background='Black')

# Label that play_video() draws each annotated frame into.
video_label = tk.Label(master=root)
video_label.pack()

def upload_video():
    """Let the user pick a video file and start playing it.

    Opens a file dialog filtered to ``*.mp4`` and ``*.avi``. When the dialog
    returns an empty path (nothing chosen), the function does nothing.
    """
    video_types = [("Video files", "*.mp4 *.avi")]
    chosen_path = filedialog.askopenfilename(filetypes=video_types)
    if not chosen_path:
        return
    play_video(chosen_path)

# Capture and pending Tk timer id of the video currently on screen. Kept at
# module level so a later play_video() call can stop the earlier playback.
_active_playback = {"cap": None, "job": None}


def play_video(path):
    """Play the video at ``path`` in ``video_label`` with YOLO detections drawn.

    Each frame is read from the file, passed through ``net``, annotated with
    the boxes that survive non-maximum suppression, and shown in the label.
    The next frame is scheduled with ``root.after`` so the Tk event loop keeps
    running between frames.

    Only one video plays at a time: starting a new one cancels the pending
    frame callback of the previous one and releases its capture, so two frame
    loops never draw into the label at once.

    Args:
        path: Path of the video file, passed to ``cv.VideoCapture``.
    """
    # Stop whatever is currently playing before taking over the label.
    pending_job = _active_playback["job"]
    if pending_job is not None:
        root.after_cancel(pending_job)
    previous_cap = _active_playback["cap"]
    if previous_cap is not None:
        previous_cap.release()

    cap = cv.VideoCapture(path)
    _active_playback["cap"] = cap
    _active_playback["job"] = None

    # The output layer names do not depend on the frame, so look them up once
    # per video instead of once per frame.
    output_layers_names = net.getUnconnectedOutLayersNames()

    def show_frame():
        """Read, annotate and display one frame, then schedule the next."""
        # This callback is running now, so it is no longer pending.
        _active_playback["job"] = None

        ret, frame = cap.read()
        if not ret:
            # End of video (or unreadable file): free the capture and stop.
            cap.release()
            if _active_playback["cap"] is cap:
                _active_playback["cap"] = None
            return

        height, width, _ = frame.shape

        # YOLO preprocessing: scale pixels by 1/255, resize to 416x416 and
        # swap the R and B channels.
        blob = cv.dnn.blobFromImage(frame, 1/255.0, (416, 416), swapRB=True, crop=False)
        net.setInput(blob)
        layer_outputs = net.forward(output_layers_names)

        boxes = []
        confidences = []
        class_ids = []

        # Parse detections. Entries 0-3 are used as box centre x/y and box
        # width/height expressed as fractions of the frame size; entries from
        # index 5 onwards are the per-class scores.
        for output in layer_outputs:
            for detection in output:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                if confidence > 0.5:
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)
                    # Convert centre coordinates to the top-left corner.
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)
                    boxes.append([x, y, w, h])
                    confidences.append(float(confidence))
                    class_ids.append(class_id)

        # Non-maximum suppression: score threshold 0.5, NMS threshold 0.4.
        indexes = cv.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)

        # Normalise the result to a flat array so that an empty result or a
        # nested index layout is handled the same way.
        for i in np.asarray(indexes).reshape(-1):
            x, y, w, h = boxes[i]
            label = f"{classes[class_ids[i]]} {round(confidences[i], 2)}"
            # Hand OpenCV a plain list of Python floats for the colour.
            color = colors[class_ids[i]].tolist()
            cv.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            # Keep the label inside the frame when the box touches the top edge.
            label_y = max(y - 10, 20)
            cv.putText(frame, label, (x, label_y), cv.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)

        # Convert to Tkinter format and show.
        frame_rgb = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
        img = ImageTk.PhotoImage(Image.fromarray(frame_rgb))
        video_label.config(image=img)
        # Keep a reference on the widget so the image is not garbage-collected.
        video_label.image = img

        # Schedule the next frame and remember the timer id so that a later
        # play_video() call can cancel it.
        _active_playback["job"] = root.after(1, show_frame)

    show_frame()
    
# Button that opens the file picker through upload_video().
upload_button = tk.Button(
    master=root,
    text="Upload Video",
    command=upload_video,
)
upload_button.pack(pady=10)

# Start the Tk event loop.
root.mainloop()