# Entrega YOLO + OCR

In [2]:
from ultralytics import YOLO
import cv2
import csv
import pytesseract

## Parte 1: Detectar personas, bicicletas y coches con su matrícula y su texto de matrícula

In [21]:
# Output switches: which artefacts the detection cell produces.
WRITE_CSV = True
WRITE_VIDEO = True
SHOW_VIDEO = False

# Input video processed by the detection loop.
TEST_VIDEO_PATH = "test-video.MP4"

# Object class indices; they index both `classNames` and the per-frame counters.
PERSON_CLASS_ID, BICYCLE_CLASS_ID, CAR_CLASS_ID, LICENSE_PLATE_CLASS_ID = range(4)

classNames = ["person", "bicycle", "car", "license plate"]

# Vision models
yolo_11_model = YOLO('yolo11n.pt')

# The license-plate model was trained with image size 640
LICENSE_PLATE_SIZE = 640
license_plate_model = YOLO('license-plate-model/weights/best.pt')

# The CSV handle stays open on purpose: the detection loop appends rows to it
# and closes it once the video has been processed.
csv_file = open('detections.csv', 'w', newline='', encoding='utf-8') if WRITE_CSV else None
csv_writer = csv.writer(csv_file) if WRITE_CSV else None

if csv_writer is not None:
    # Header row: object columns first, then the (optional) license-plate columns.
    object_columns = [
        'fotograma', 'tipo_objeto', 'confianza', 'identificador_tracking',
        'x1', 'y1', 'x2', 'y2',
    ]
    plate_columns = [
        'matrícula_en_su_caso', 'confianza_matricula',
        'mx1', 'my1', 'mx2', 'my2', 'texto_matricula',
    ]
    csv_writer.writerow(object_columns + plate_columns)

# Color (white) used for the text overlay drawn on every annotated frame.
ANNOTATION_COLOR = (255, 255, 255)

test_video = cv2.VideoCapture(TEST_VIDEO_PATH)

# Fail early with a clear message when the input cannot be read, instead of
# silently producing empty outputs.
if not test_video.isOpened():
    print(f"Error: Could not open video {TEST_VIDEO_PATH}")
    test_video.release()
    if csv_file is not None:
        csv_file.close()
    raise SystemExit(1)

# Get video properties
output_video = None
if WRITE_VIDEO:
    # Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes the
    # written video play at a different speed than the source.
    fps = test_video.get(cv2.CAP_PROP_FPS)
    width = int(test_video.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(test_video.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Create VideoWriter object
    codec = cv2.VideoWriter.fourcc(*'mp4v')
    output_video = cv2.VideoWriter('output_video.mp4', codec, fps, (width, height))

    # Check if VideoWriter opened successfully
    if not output_video.isOpened():
        print("Error: Could not open VideoWriter")
        # Release everything opened so far before aborting; SystemExit stops
        # the script/cell without relying on the interactive `exit` helper.
        output_video.release()
        test_video.release()
        if csv_file is not None:
            csv_file.close()
        raise SystemExit(1)

# Main per-frame pipeline:
#   1. track people, bicycles and cars with the general YOLO model,
#   2. run the license-plate model on every car crop,
#   3. OCR each detected plate with Tesseract,
#   4. write CSV rows and the annotated frame.
# Everything runs inside try/finally so the capture, the writer and the CSV
# file are released even if a frame fails.

# Trailing CSV fields for a row that has no license plate attached.
NO_PLATE_FIELDS = ['No', '', '', '', '', '', '']

frame_count = 0
try:
    has_frame, frame = test_video.read()
    while has_frame:
        frame_count += 1
        class_count = [0] * len(classNames)
        results = yolo_11_model.track(source=frame, persist=True, verbose=False, classes=[0,1,2], device=0, conf=0.2)
        if results and len(results) > 0:
            frame_h, frame_w = frame.shape[:2]
            annotated_frame = results[0].plot()
            boxes = results[0].boxes.xyxy.cpu().numpy().astype(int)
            confs = results[0].boxes.conf.cpu().numpy()
            classes = results[0].boxes.cls.cpu().numpy().astype(int)

            # The tracker may not have assigned ids; use -1 as "no id".
            if results[0].boxes.id is not None:
                track_ids = results[0].boxes.id.cpu().numpy().astype(int)
            else:
                track_ids = [-1] * len(boxes)

            car_crops = []
            offsets = []
            car_info = []

            for box, conf, class_id, track_id in zip(boxes, confs, classes, track_ids):
                x1, y1, x2, y2 = box
                # Clamp the box to the frame so slicing never goes out of bounds.
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(frame_w, x2), min(frame_h, y2)
                object_type = classNames[class_id]
                class_count[class_id] += 1

                if WRITE_CSV and class_id in (PERSON_CLASS_ID, BICYCLE_CLASS_ID):
                    csv_writer.writerow([
                        frame_count, object_type, f"{conf:.3f}", track_id,
                        x1, y1, x2, y2,
                        *NO_PLATE_FIELDS
                    ])
                if class_id == CAR_CLASS_ID:
                    if x2 <= x1 or y2 <= y1:
                        # Degenerate box after clamping: an empty crop would make
                        # cv2.cvtColor raise, so record the car without a plate.
                        if WRITE_CSV:
                            csv_writer.writerow([
                                frame_count, 'car', f"{conf:.3f}", track_id,
                                x1, y1, x2, y2,
                                *NO_PLATE_FIELDS
                            ])
                        continue

                    original_crop = frame[y1:y2, x1:x2]
                    # NOTE(review): Ultralytics documents numpy inputs as BGR; this
                    # converts to RGB before inference. Kept as in the original;
                    # confirm it matches how the plate model was validated.
                    rgb_crop = cv2.cvtColor(original_crop, cv2.COLOR_BGR2RGB)
                    car_crops.append(cv2.resize(rgb_crop, (LICENSE_PLATE_SIZE, LICENSE_PLATE_SIZE)))

                    offsets.append((x1, y1, x2, y2))
                    car_info.append({
                        'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2,
                        'conf': conf,
                        'id': track_id
                    })

            if car_crops:
                # NOTE(review): track() is called on independent crops; predict()
                # may be what is intended here. Kept as in the original.
                lp_results = license_plate_model.track(source=car_crops, verbose=False, classes=[0], device=0, conf=0.25, batch=8)

                for car, (x1, y1, x2, y2), res in zip(car_info, offsets, lp_results):
                    if len(res.boxes) == 0:
                        # No plates detected
                        if WRITE_CSV:
                            csv_writer.writerow([
                                frame_count, 'car', f"{car['conf']:.3f}", car['id'],
                                car['x1'], car['y1'], car['x2'], car['y2'],
                                *NO_PLATE_FIELDS
                            ])
                        continue

                    lp_boxes = res.boxes.xyxy.cpu().numpy()
                    # Confidences must come from THIS car's result; reading them
                    # from lp_results[0] mislabels plates and drops them entirely
                    # whenever the first car has no detection.
                    lp_confs = res.boxes.conf.cpu().numpy()
                    # Counted once per car that has at least one plate.
                    class_count[LICENSE_PLATE_CLASS_ID] += 1

                    # Scale factors that map plate boxes from the resized crop
                    # back to the size and position in the original frame.
                    scale_x = (x2 - x1) / LICENSE_PLATE_SIZE
                    scale_y = (y2 - y1) / LICENSE_PLATE_SIZE

                    for lp_box, lp_conf in zip(lp_boxes, lp_confs):
                        lx1, ly1, lx2, ly2 = lp_box
                        abs_lx1 = int(lx1 * scale_x + x1)
                        abs_ly1 = int(ly1 * scale_y + y1)
                        abs_lx2 = int(lx2 * scale_x + x1)
                        abs_ly2 = int(ly2 * scale_y + y1)

                        abs_lx1 = max(0, min(abs_lx1, frame_w))
                        abs_ly1 = max(0, min(abs_ly1, frame_h))
                        abs_lx2 = max(0, min(abs_lx2, frame_w))
                        abs_ly2 = max(0, min(abs_ly2, frame_h))

                        cv2.rectangle(annotated_frame, (abs_lx1, abs_ly1), (abs_lx2, abs_ly2), (0, 0, 255), 2)

                        # OCR reads from the clean frame, not the annotated one.
                        plate_crop = frame[abs_ly1:abs_ly2, abs_lx1:abs_lx2]
                        if plate_crop.size > 0:
                            plate_crop_rgb = cv2.cvtColor(plate_crop, cv2.COLOR_BGR2RGB)
                            plate_text = pytesseract.image_to_string(plate_crop_rgb).strip()
                        else:
                            # A zero-area plate box cannot be converted or read.
                            plate_text = ''

                        if WRITE_CSV:
                            csv_writer.writerow([
                                frame_count, 'car', f"{car['conf']:.3f}", car['id'],
                                car['x1'], car['y1'], car['x2'], car['y2'],
                                'Yes', f"{lp_conf:.3f}",
                                abs_lx1, abs_ly1, abs_lx2, abs_ly2,
                                plate_text
                            ])

            # Per-frame summary overlay.
            cv2.putText(annotated_frame, "Detections in current frame:", (50,50), cv2.FONT_HERSHEY_SIMPLEX, .5, ANNOTATION_COLOR, 2)
            cv2.putText(annotated_frame, f"- People: {class_count[PERSON_CLASS_ID]}", (50, 75), cv2.FONT_HERSHEY_SIMPLEX, .5, ANNOTATION_COLOR, 2)
            cv2.putText(annotated_frame, f"- Bicycles: {class_count[BICYCLE_CLASS_ID]}", (50, 100), cv2.FONT_HERSHEY_SIMPLEX, .5, ANNOTATION_COLOR, 2)
            cv2.putText(annotated_frame, f"- Cars: {class_count[CAR_CLASS_ID]}", (50, 125), cv2.FONT_HERSHEY_SIMPLEX, .5, ANNOTATION_COLOR, 2)
            cv2.putText(annotated_frame, f"  - License plates: {class_count[LICENSE_PLATE_CLASS_ID]}", (50, 150), cv2.FONT_HERSHEY_SIMPLEX, .5, ANNOTATION_COLOR, 2)
            cv2.putText(annotated_frame, f"FRAME: {frame_count}", (frame_w - 225,50), cv2.FONT_HERSHEY_SIMPLEX, 1, ANNOTATION_COLOR, 2)
            if SHOW_VIDEO:
                cv2.imshow("License Plate YOLO", annotated_frame)
            if WRITE_VIDEO:
                output_video.write(annotated_frame)
            # Keyboard polling only makes sense (and is only guaranteed to be
            # available) when a preview window is being shown.
            if SHOW_VIDEO and cv2.waitKey(1) & 0xFF == ord("q"):
                break

        has_frame, frame = test_video.read()
finally:
    test_video.release()
    if WRITE_VIDEO:
        output_video.release()
    if WRITE_CSV:
        csv_file.close()
    cv2.destroyAllWindows()

## Parte 2: Comparativa entre modelos de OCR para leer matrículas

Esta celda solo sirve para recortar las matrículas y guardarlas en el directorio `ocr-test/crops`

In [None]:
import os
from ultralytics import YOLO
import cv2

# License-plate detector weights; used by image_and_labels_loader below to
# locate the plate inside each test image.
license_plate_model = YOLO('license-plate-model/weights/best.pt')

# Generator that yields each detected license-plate crop together with its text label
def image_and_labels_loader(directory):
    """Yield ``(crop, label, name)`` for every plate detected in *directory*.

    For each image file in *directory* the first line of the label file
    ``<name>.txt`` in ``OCR_TEST_LABELS_DIR`` is read, the license-plate model
    is run on the image, and one tuple is yielded per detected box.

    Args:
        directory: Path of the folder that contains the test images.

    Yields:
        Tuples ``(crop, label, name)``: the plate region sliced from the image,
        the label text, and the image file name without extension.

    Raises:
        FileNotFoundError: If an image has no matching label file.
    """
    valid_extensions = {".jpg", ".png", ".jpeg"}
    for entry in os.scandir(directory):
        if not entry.is_file():
            continue
        name, ext = os.path.splitext(entry.name)
        # Case-insensitive match so files such as "IMG_01.JPG" are not
        # silently skipped.
        if ext.lower() not in valid_extensions:
            continue

        label_path = os.path.join(OCR_TEST_LABELS_DIR, f"{name}.txt")
        with open(label_path, "r", encoding="utf-8") as f:
            image_label = f.readline().rstrip("\n")

        image = cv2.imread(entry.path)
        if image is None:
            # Unreadable or corrupt image file.
            continue

        # NOTE(review): Ultralytics documents numpy inputs as BGR; the RGB
        # conversion is kept as in the original. Confirm it is intended.
        results = license_plate_model.predict(source=cv2.cvtColor(image, cv2.COLOR_BGR2RGB), verbose=False, classes=[0], device=0)
        if not results:
            # Covers both None and an empty result list.
            continue

        for box in results[0].boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            crop = image[y1:y2, x1:x2]
            if crop.size == 0:
                # A zero-area box is useless for OCR and cannot be saved.
                continue
            yield crop, image_label, name

# Save the cropped license plates straight to disk
def crop_all_images():
    """Write every detected plate crop to ``OCR_TEST_CROPS_DIR`` as ``<image_name>.jpg``.

    Crops come from ``image_and_labels_loader(OCR_TEST_IMAGES_DIR)``. When an
    image yields several plates they share one file name, so the last crop
    written wins.
    """
    # cv2.imwrite does not create missing directories, so make sure the
    # target folder exists before writing.
    os.makedirs(OCR_TEST_CROPS_DIR, exist_ok=True)
    for crop, _, image_name in image_and_labels_loader(OCR_TEST_IMAGES_DIR):
        if crop is None or crop.size == 0:
            # Nothing to encode; writing an empty image would raise.
            continue
        cv2.imwrite(os.path.join(OCR_TEST_CROPS_DIR, f"{image_name}.jpg"), crop)

Esta celda define todo lo necesario para realizar el test, de modo que sea fácil cambiar de entorno para cada modelo.

In [3]:
import os
import csv
# Folder layout of the OCR benchmark: source images, one label text file per
# image, and the cropped plates that the OCR models read.
OCR_TEST_IMAGES_DIR = "ocr-test/images"
OCR_TEST_LABELS_DIR = "ocr-test/labels"
OCR_TEST_CROPS_DIR = "ocr-test/crops"
# Denominator that the OCR test cells pass to write_results when computing accuracy.
TEST_NUMBER = 100

# CSV file to which write_results appends one accuracy row per model.
OCR_TEST_CSV_FILE_NAME = "ocr-test/results.csv"

def cropped_image_and_labels_loader(directory):
    """Yield ``(image, label)`` for every readable image file in *directory*.

    The label is the first line of ``<name>.txt`` in ``OCR_TEST_LABELS_DIR``,
    where ``<name>`` is the image file name without its extension.

    Args:
        directory: Path of the folder that contains the cropped plate images.

    Yields:
        Tuples ``(image, label)`` with the image as returned by ``cv2.imread``
        and the label text.

    Raises:
        FileNotFoundError: If an image has no matching label file.
    """
    valid_extensions = {".jpg", ".png", ".jpeg"}
    for entry in os.scandir(directory):
        if not entry.is_file():
            continue
        name, ext = os.path.splitext(entry.name)
        # Case-insensitive match so files such as "PLATE.JPG" are included.
        if ext.lower() not in valid_extensions:
            continue

        label_path = os.path.join(OCR_TEST_LABELS_DIR, f"{name}.txt")
        with open(label_path, "r", encoding="utf-8") as f:
            image_label = f.readline().rstrip("\n")

        image = cv2.imread(entry.path)
        if image is None:
            # cv2.imread returns None for unreadable files; consumers access
            # image.shape, so skip instead of handing them None.
            continue
        yield image, image_label

def write_results(model_name, correct_predictions, image_count):
    """Append one accuracy row for *model_name* to ``OCR_TEST_CSV_FILE_NAME``.

    The header row ``Model,Accuracy`` is written first when the file does not
    exist yet or exists but is empty.

    Args:
        model_name: Name written in the ``Model`` column.
        correct_predictions: Number of correctly read plates.
        image_count: Number of plates the accuracy is computed over.

    Raises:
        ZeroDivisionError: If *image_count* is 0. The file is left untouched.
    """
    # Compute the value before opening the file so an invalid image_count
    # cannot leave a header-only CSV behind.
    accuracy = round(correct_predictions / image_count, 3)

    needs_header = (
        not os.path.isfile(OCR_TEST_CSV_FILE_NAME)
        or os.path.getsize(OCR_TEST_CSV_FILE_NAME) == 0
    )
    with open(OCR_TEST_CSV_FILE_NAME, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(['Model', 'Accuracy'])
        writer.writerow([model_name, accuracy])


### Prueba con Tesseract

In [10]:
import cv2
import pytesseract

# Tesseract benchmark: binarize every cropped plate, read it, and count the
# exact matches against the label.
success_count = 0
# Restrict the output to alphanumeric characters; --psm 7 is Tesseract's
# "treat the image as a single text line" page segmentation mode.
custom_config = r'-c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 --psm 7'

for image, label in cropped_image_and_labels_loader(OCR_TEST_CROPS_DIR):
    if image is None:
        # Unreadable crop: nothing to recognise.
        continue
    # Otsu thresholding needs a single-channel image.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) == 3 else image
    _, image_threshold = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    text = pytesseract.image_to_string(image_threshold, config=custom_config)
    # Same normalisation as the EasyOCR test (upper-case, no spaces) so both
    # models are scored under the same rules.
    text = text.upper().replace(" ", "").strip()
    if not text:
        continue
    if text == label:
        success_count += 1

# Accuracy is reported over the fixed TEST_NUMBER, not over the crops read.
write_results("Tesseract", success_count, TEST_NUMBER)


### Prueba con EasyOCR

In [10]:
import easyocr
import cv2

# EasyOCR benchmark: read every cropped plate and count the exact matches
# against the label.
success_count = 0
reader = easyocr.Reader(['en'])

for image, label in cropped_image_and_labels_loader(OCR_TEST_CROPS_DIR):
    if image is None:
        # Unreadable crop: nothing to recognise.
        continue
    # NOTE(review): the original also computed an Otsu-thresholded copy of the
    # image here but never passed it to the reader; that dead code was removed
    # and the reader still receives the unprocessed crop. Confirm that the raw
    # image (not the binarized one) is the intended input.
    fragments = reader.readtext(image, detail=0)
    # With detail=0 the reader returns a list of strings; joining covers the
    # zero-, one- and many-fragment cases alike.
    text = ''.join(fragments).upper().replace(" ", "").strip()
    if not text:
        continue
    if text == label:
        success_count += 1
# Accuracy is reported over the fixed TEST_NUMBER, not over the crops read.
write_results("EasyOCR", success_count, TEST_NUMBER)
