# Preparación del entorno

In [None]:
!pip install -q gdown mediapy ultralytics transformers timm
import mediapy as media
import gdown
import ultralytics
from ultralytics import YOLO
ultralytics.checks()

from IPython.display import Image, display
import cv2
import matplotlib.pyplot as plt
import numpy as np
from google.colab.patches import cv2_imshow
from transformers import pipeline
from PIL import Image, ImageDraw, ImageFont
import timm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.8/750.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%%capture
# Download the MobileNet-SSD Caffe network definition (.prototxt) and weights
# (.caffemodel), plus the Roboto TrueType font loaded later for drawing labels.
!wget -O 'MobileNetSSD_deploy.prototxt' https://raw.githubusercontent.com/TheNsBhasin/DNN_Object_Detection/master/MobileNetSSD_deploy.prototxt.txt
!wget -O 'MobileNetSSD_deploy.caffemodel' https://github.com/TheNsBhasin/DNN_Object_Detection/blob/master/MobileNetSSD_deploy.caffemodel?raw=true
!wget "https://github.com/openmaptiles/fonts/raw/master/roboto/Roboto-Regular.ttf" -O "Roboto-Regular.ttf"

# Download the input video from Google Drive and save it as 1.mp4.
gdown.download('https://drive.google.com/uc?id=1ZBgd3ubeQdzf6Ztuv3LqgrFmp7VsrdwG', '1.mp4', quiet=True)

# Re-encode into 1e.mp4: scaled to 600 px wide (height keeps the aspect ratio),
# audio removed (-an), limited to the first 30 seconds (-t 30), overwriting any
# existing output (-y).
# NOTE(review): scale=600:-1 can produce an odd height, which some encoders
# reject; scale=600:-2 would force an even value - confirm for other inputs.
!ffmpeg -y -i 1.mp4 -vf "scale=600:-1" -an -t 30 1e.mp4

# Función de procesamiento de video

In [None]:
def process_video(filename_in, filename_out, process_frame, *args, **kwargs):
    """Apply ``process_frame`` to every frame of a video and save the result.

    Args:
        filename_in: Path of the video to read.
        filename_out: Path of the video to write.
        process_frame: Callable invoked as ``process_frame(frame, *args,
            **kwargs)`` for each frame; its return value is written to the
            output video.
        *args: Extra positional arguments forwarded to ``process_frame``.
        **kwargs: Extra keyword arguments forwarded to ``process_frame``.
    """
    with media.VideoReader(filename_in) as reader:
        # The output reuses the input's frame shape, frame rate and bit rate.
        writer_options = dict(shape=reader.shape, fps=reader.fps, bps=reader.bps)
        with media.VideoWriter(filename_out, **writer_options) as writer:
            for frame in reader:
                writer.add_image(process_frame(frame, *args, **kwargs))

#Yolo

In [None]:
# YOLOv8 nano model shared by procesar_frame_Yolo.
model_yolo = YOLO('yolov8n')

In [None]:
def procesar_frame_Yolo(fotograma):
  """Detect cars in one frame with ``model_yolo`` and return the annotated frame.

  Only detections labelled "car" with confidence above 0.5 are kept and drawn.

  Args:
    fotograma: Frame as a 3-channel image array.
      NOTE(review): assumed to be RGB (mediapy readers yield RGB) - confirm
      for any other caller.

  Returns:
    The annotated frame, in the same channel order as the input.
  """
  # The red and blue channels are swapped before inference and swapped back
  # afterwards. RGB2BGR and BGR2RGB perform the same swap; the names used here
  # describe the assumed RGB input and the BGR array the model works with.
  fotograma = cv2.cvtColor(fotograma, cv2.COLOR_RGB2BGR)

  resultado = model_yolo(fotograma)[0]
  cajas = resultado.boxes

  # Class ids are non-negative, so -1 matches nothing if "car" is not a class.
  id_auto = next(
      (idx for idx, nombre in model_yolo.names.items() if nombre == "car"), -1)

  # Filter through the named `cls` / `conf` accessors instead of raw column
  # positions, and keep a real Boxes object (mask indexing) rather than
  # replacing it with a plain Python list.
  mascara = (cajas.cls == id_auto) & (cajas.conf > 0.5)
  resultado.boxes = cajas[mascara]

  anotado = resultado.plot()

  return cv2.cvtColor(anotado, cv2.COLOR_BGR2RGB)

In [None]:
# Input clip produced by the ffmpeg step; `entrada` is reused by the later
# MobileNet and DETR cells.
entrada = "1e.mp4"
salida_yolo = "yolo.mp4"

# Annotate every frame with YOLO and write the result to `salida_yolo`.
process_video(entrada, salida_yolo, procesar_frame_Yolo)

# Read the written file back and display it with fps=30.
media.show_video(media.read_video(salida_yolo), fps=30)

#MobileNetSSD

In [None]:
# Paths of the Caffe weights and network definition downloaded earlier.
modelo_mobilenet = 'MobileNetSSD_deploy.caffemodel'
configuracion_mobilenet = 'MobileNetSSD_deploy.prototxt'
# Class labels, looked up by the class id found in each detection row
# (see procesar_frame_mobilenet), so the order of this list matters.
clases = ["background", "aeroplane", "bicycle", "bird", "boat",
          "bottle", "bus", "car", "cat", "chair", "cow", "diningtable",
          "dog", "horse", "motorbike", "person", "pottedplant", "sheep",
          "sofa", "train", "tvmonitor"]

# Network shared by procesar_frame_mobilenet.
net = cv2.dnn.readNetFromCaffe(configuracion_mobilenet, modelo_mobilenet)

In [None]:
def procesar_frame_mobilenet(imagen, umbral_confianza=0.1, swap_rb=False):
  """Detect cars in one frame with the MobileNet-SSD network and draw them.

  Args:
    imagen: Frame as an image array whose first two dimensions are height and
      width. It is not modified; drawing happens on a copy.
    umbral_confianza: Detections with confidence at or below this value are
      ignored. Defaults to 0.1, the value that was previously hard-coded.
    swap_rb: Forwarded to ``cv2.dnn.blobFromImage`` as ``swapRB``. Defaults to
      False (no channel swap), matching the previous behaviour.
      NOTE(review): if the frames are RGB (as mediapy readers yield) and the
      network expects BGR, callers probably want ``swap_rb=True`` - confirm.

  Returns:
    A copy of ``imagen`` with a green rectangle and a "car: NN.NN%" label for
    every accepted detection.
  """
  alto, ancho = imagen.shape[:2]
  blob = cv2.dnn.blobFromImage(cv2.resize(imagen, (300, 300)), 0.007843,
                               (300, 300), 127.5, swapRB=swap_rb)

  net.setInput(blob)
  detecciones = net.forward()

  # Draw on a copy so the caller's array is left untouched.
  salida = imagen.copy()
  # Box coordinates come back as fractions of the frame size.
  escala = np.array([ancho, alto, ancho, alto])

  for deteccion in detecciones[0, 0]:
    confianza = deteccion[2]
    idx = int(deteccion[1])

    if not (confianza > umbral_confianza and clases[idx] == "car"):
      continue

    # Plain Python ints: OpenCV drawing calls can reject NumPy scalar points.
    startX, startY, endX, endY = (deteccion[3:7] * escala).astype("int").tolist()

    etiqueta = f"{clases[idx]}: {confianza * 100:.2f}%"
    cv2.rectangle(salida, (startX, startY), (endX, endY), (0, 255, 0), 2)
    # Put the label above the box, or just inside it when the box is too
    # close to the top edge.
    y = startY - 15 if startY - 15 > 15 else startY + 15
    cv2.putText(salida, etiqueta, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                (0, 255, 0), 2)

  return salida


In [None]:
salida_mobilenet = "mobilenet.mp4"

# Annotate every frame of `entrada` (defined in the YOLO cell) with
# MobileNet-SSD and write the result to `salida_mobilenet`.
process_video(entrada, salida_mobilenet, procesar_frame_mobilenet)

# Read the written file back and display it with fps=30.
media.show_video(media.read_video(salida_mobilenet), fps=30)

#DETR

In [None]:
# 40-point Roboto font used by draw_bounding_box for the detection labels.
font = ImageFont.truetype("Roboto-Regular.ttf", 40)

# Object-detection pipeline backed by the facebook/detr-resnet-50 checkpoint;
# shared by procesar_frame_detr.
model_detr = pipeline("object-detection", model="facebook/detr-resnet-50")

In [None]:
def draw_bounding_box(im, score, label, xmin, ymin, xmax, ymax):
    """Draw a red rounded box and a "label (score)" caption on ``im``.

    Args:
        im: PIL image to draw on; it is modified in place.
        score: Detection confidence, rendered with two decimals.
        label: Class name shown in the caption.
        xmin, ymin, xmax, ymax: Box corners in pixel coordinates.

    Returns:
        The same ``im`` object, after drawing.
    """
    draw = ImageDraw.Draw(im)
    draw.rounded_rectangle([xmin, ymin, xmax, ymax], outline="red", width=5, radius=10)

    # The caption normally sits 40 px above the box. For boxes near the top
    # edge that position is off-image, so the caption is placed just inside
    # the box instead (same idea as the MobileNet drawing code).
    text_y = ymin - 40 if ymin - 40 >= 0 else ymin + 10
    draw.text((xmin + 10, text_y), f"{label} ({score:.2f})", fill="white", font=font)

    return im

def procesar_frame_detr(imagen):
  """Detect cars in one frame with ``model_detr`` and return the annotated frame.

  Args:
    imagen: Frame as an array accepted by ``Image.fromarray``.

  Returns:
    A NumPy array of the frame with a box and caption drawn for every
    detection labelled "car" whose score is above 0.5.
  """
  lienzo = Image.fromarray(imagen)
  detecciones = model_detr(lienzo)

  for deteccion in detecciones:
    es_auto_confiable = deteccion['label'] == "car" and deteccion['score'] > 0.5
    if not es_auto_confiable:
      continue

    caja = deteccion['box']
    xmin, ymin, xmax, ymax = (
        int(caja[clave]) for clave in ('xmin', 'ymin', 'xmax', 'ymax'))
    lienzo = draw_bounding_box(
        lienzo, deteccion['score'], deteccion['label'], xmin, ymin, xmax, ymax)

  return np.array(lienzo)

In [None]:
salida_detr = "detr.mp4"

# Annotate every frame of `entrada` (defined in the YOLO cell) with DETR and
# write the result to `salida_detr`.
process_video(entrada, salida_detr, procesar_frame_detr)

# Read the written file back and display it with fps=30.
media.show_video(media.read_video(salida_detr), fps=30)

#Conclusiones

Según los resultados obtenidos, podemos concluir que los modelos más eficaces a la hora de identificar los autos en este problema son YOLO y DETR. Para la ejecución en tiempo real, posiblemente los más indicados sean MobileNet SSD y YOLO; sin embargo, si se desea analizar los videos, utilizaría DETR, ya que identifica los objetos con gran eficacia.