In [None]:
!pip install -q ultralytics
!pip install -q tools
!pip install -q --upgrade pymupdf
import fitz

In [3]:
from ultralytics import YOLO
import os
import json
import tempfile
from PIL import Image

In [4]:
#from google.colab import drive
#drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Weights file handed to YOLO(...) below; relative to the notebook's working directory.
model_path = "YOLOv8n_finetuned_onselected.pt"
# Alternative Google Drive locations (presumably for Colab runs with Drive mounted), kept for reference.
#test_files_path = "/content/drive/MyDrive/HACKATHON_ARMETA/for_submit_files"
#output_json_path = "/content/drive/MyDrive/HACKATHON_ARMETA/submit_jsons/predictions_ft.json"

# Directory scanned for input PDFs, and the JSON file that run() writes the predictions to.
test_files_path = "test_files"
output_json_path = "output_json/annotations.json"

In [6]:
# YOLO inference settings; all four values are forwarded to model.predict()
# in run_inference_and_build_json.
# NOTE(review): -1 is not a concrete batch size -- confirm how model.predict
# treats it when the source is a list of image paths.
BATCH_SIZE = -1       # adjust down if GPU OOM (8 -> 4 -> 2 -> 1)
IMG_SIZE = 832      # inference image size (imgsz); adjust down if GPU OOM
CONF_THRES = 0.25   # passed as `conf`: confidence threshold for detections
DEVICE = 0           # 0 = first GPU, 'cpu' to force CPU

In [7]:
# Maps the model's numeric class id to the label name written to the OUTPUT JSON.
# The detector was trained with the class name 'qrcode', but the output format
# requires the shorter 'qr'.
id2label = dict([
    (0, "signature"),
    (1, "stamp"),
    (2, "qr"),
])

In [8]:
# Load the detector once at module level; run_inference_and_build_json reads this global.
model = YOLO(model_path)

# Helper functions

In [9]:
def collect_pdf_paths(root_dir):
    """
    Return the sorted paths of all PDF files located directly in ``root_dir``.

    The ".pdf" extension is matched case-insensitively. Only regular files are
    returned, so a sub-directory whose name happens to end in ".pdf" is not
    mistaken for a document. Sub-directories are not searched.

    Args:
        root_dir: directory to scan.

    Returns:
        List of ``os.path.join(root_dir, name)`` strings in ascending order;
        empty when the directory contains no PDFs.

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. ``root_dir`` is missing).
    """
    candidates = (
        os.path.join(root_dir, fname)
        for fname in os.listdir(root_dir)
        if fname.lower().endswith(".pdf")
    )
    # isfile() drops directories (and dangling symlinks) that merely look like PDFs.
    return sorted(path for path in candidates if os.path.isfile(path))

In [10]:
import os
import fitz          # from pymupdf
from PIL import Image

def pdfs_to_images(pdf_paths, tmp_dir):
    """
    Render every page of every PDF to a PNG file inside ``tmp_dir``.

    Each page is rendered at PyMuPDF's default resolution and then resized to
    1684x1190 (landscape, width >= height) or 1190x1684 (portrait). The
    resize targets are fixed, so the page's aspect ratio is not preserved.

    Args:
        pdf_paths: iterable of PDF file paths.
        tmp_dir: existing directory that receives the PNG files.

    Returns:
        A list with one dict per page, in input order:
          [{
             "img_path": "<tmp_dir>/<pdf stem>_page_3.png",
             "pdf_key": "<pdf file name, including '.pdf'>",
             "page_num": 3    # 1-based page number
           }, ...]
    """
    pages_info = []

    for pdf_path in pdf_paths:
        pdf_filename = os.path.basename(pdf_path)      # file name incl. extension
        pdf_stem, _ = os.path.splitext(pdf_filename)   # file name without extension

        # The context manager closes the document even when rendering or
        # saving a page raises, so no file handle is leaked on errors.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                # 1) Render the page. alpha=False keeps the pixmap at 3 bytes
                #    per pixel, which is what the "RGB" decoding below expects.
                pix = page.get_pixmap(alpha=False)

                # 2) Convert pixmap to PIL image
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

                # 3) Decide target size based on orientation
                if pix.width >= pix.height:
                    target_size = (1684, 1190)   # landscape
                else:
                    target_size = (1190, 1684)   # portrait

                img = img.resize(target_size, Image.BILINEAR)

                # 4) Save image
                # NOTE(review): the name is built from the stem only, so two inputs
                # that differ just in extension case would overwrite each other.
                img_name = f"{pdf_stem}_page_{page_num}.png"
                img_path = os.path.join(tmp_dir, img_name)
                img.save(img_path)

                pages_info.append({
                    "img_path": img_path,
                    "pdf_key": pdf_filename,  # keep ".pdf" in key
                    "page_num": page_num
                })

    return pages_info


In [12]:
def _iter_page_batches(pages_info, batch_size):
    """Yield consecutive slices of ``pages_info`` with at most ``batch_size`` pages each.

    A missing or non-positive ``batch_size`` (e.g. the -1 sentinel) yields all
    pages as a single slice.
    """
    if batch_size is None or batch_size <= 0:
        batch_size = max(len(pages_info), 1)
    for start in range(0, len(pages_info), batch_size):
        yield pages_info[start:start + batch_size]


def run_inference_and_build_json(pages_info):
    """
    Run YOLO on all page images and build the final nested JSON structure.

    Reads the module-level ``model``, ``BATCH_SIZE``, ``CONF_THRES``, ``DEVICE``,
    ``IMG_SIZE`` and ``id2label``.

    Pages are sent to the model in explicit chunks of ``BATCH_SIZE`` images, so
    lowering ``BATCH_SIZE`` really limits how many pages go through one forward
    pass. A non-positive ``BATCH_SIZE`` sends every page in one call.

    Args:
        pages_info: iterable of dicts with the keys "img_path", "pdf_key" and
            "page_num", as produced by ``pdfs_to_images``.

    Returns:
        Nested dict ready for ``json.dump``:
          {
            "<pdf file name>": {
                "page_3": {
                    "annotations": [ { "annotation_1": {...} }, ... ],
                    "page_size": { "width": ..., "height": ... }
                },
                ...
            },
            ...
          }
        Pages without detections are present with an empty "annotations" list.
        Bounding boxes are top-left x/y plus width/height in pixels of the page image.
    """
    predictions = {}

    pages_info = list(pages_info)
    if not pages_info:
        # Nothing to predict; avoid calling the model with an empty source.
        return predictions

    for batch_pages in _iter_page_batches(pages_info, BATCH_SIZE):
        batch_results = model.predict(
            source=[p["img_path"] for p in batch_pages],
            stream=True,
            conf=CONF_THRES,
            batch=len(batch_pages),
            device=DEVICE,
            imgsz=IMG_SIZE,
            verbose=True,
        )

        # Results come back in the same order as the source list. They are the
        # first zip argument so that zip advances the prediction generator until
        # it is exhausted, i.e. it finishes before the next predict() call.
        for result, page_info in zip(batch_results, batch_pages):
            pdf_key = page_info["pdf_key"]
            page_key = f"page_{page_info['page_num']}"

            # Size of the image the model was given, as (height, width); this
            # avoids re-opening every PNG from disk just to read its size.
            page_height, page_width = (int(v) for v in result.orig_shape[:2])

            page_entry = predictions.setdefault(pdf_key, {}).setdefault(
                page_key,
                {
                    "annotations": [],
                    "page_size": {
                        "width": page_width,
                        "height": page_height
                    }
                },
            )

            # If there are no detections, keep the page with an empty list
            if result.boxes is None or len(result.boxes) == 0:
                continue

            # Boxes in normalized xywh format (center_x, center_y, width, height)
            boxes_xywhn = result.boxes.xywhn.cpu().numpy()
            classes = result.boxes.cls.cpu().numpy()

            for i, (cx_n, cy_n, w_n, h_n) in enumerate(boxes_xywhn):
                cls_id = int(classes[i])
                # Unknown ids fall back to a generic "class_<id>" label
                category = id2label.get(cls_id, f"class_{cls_id}")

                # Convert from normalized [0,1] to pixel coordinates
                w = float(w_n * page_width)
                h = float(h_n * page_height)
                x = float(cx_n * page_width - w / 2.0)   # top-left x
                y = float(cy_n * page_height - h / 2.0)  # top-left y

                # Annotation keys are numbered from 1 within each page
                ann_key = f"annotation_{len(page_entry['annotations']) + 1}"
                page_entry["annotations"].append({
                    ann_key: {
                        "category": category,
                        "bbox": {
                            "x": x,
                            "y": y,
                            "width": w,
                            "height": h
                        },
                        "area": w * h
                    }
                })

    return predictions

In [13]:
def run():
    """
    Execute the whole pipeline: collect PDFs, render pages, detect, write JSON.

    Reads the module-level ``test_files_path`` and ``output_json_path``.
    Page images live in a temporary directory that is removed as soon as
    inference has finished. If no PDFs are found, a message is printed and
    nothing is written.
    """
    pdf_paths = collect_pdf_paths(test_files_path)
    if not pdf_paths:
        print(f"No PDF files found in {test_files_path}")
        return

    # Create the output folder up front: otherwise a missing directory would
    # only surface as FileNotFoundError after all the inference work is done.
    output_dir = os.path.dirname(output_json_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as tmp_dir:
        print(f"Using temporary directory for page images: {tmp_dir}")

        # 1) Convert all PDFs to images
        pages_info = pdfs_to_images(pdf_paths, tmp_dir)
        print(f"Total pages to process: {len(pages_info)}")

        # 2) Run YOLO and build JSON (must happen while the images still exist)
        predictions = run_inference_and_build_json(pages_info)

    # 3) Save JSON; ensure_ascii=False keeps non-ASCII file names readable
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(predictions, f, ensure_ascii=False, indent=2)

    print(f"Saved predictions to {output_json_path}")

In [14]:
# Execute the full pipeline; the predictions are written to output_json_path.
run()

Using temporary directory for page images: /tmp/tmpr47wbb9s
Total pages to process: 129

0: 832x832 1 qr, 3.7ms
1: 832x832 1 qr, 3.7ms
2: 832x832 1 qr, 3.7ms
3: 832x832 1 qr, 3.7ms
4: 832x832 1 qr, 3.7ms
5: 832x832 1 qr, 3.7ms
6: 832x832 1 qr, 3.7ms
7: 832x832 1 signature, 1 qr, 3.7ms
8: 832x832 16 qrs, 3.7ms
9: 832x832 1 qr, 3.7ms
10: 832x832 1 qr, 3.7ms
11: 832x832 1 qr, 3.7ms
12: 832x832 1 qr, 3.7ms
13: 832x832 1 qr, 3.7ms
14: 832x832 1 qr, 3.7ms
15: 832x832 1 qr, 3.7ms
16: 832x832 1 signature, 1 qr, 3.7ms
17: 832x832 16 qrs, 3.7ms
18: 832x832 1 qr, 3.7ms
19: 832x832 1 qr, 3.7ms
20: 832x832 1 qr, 3.7ms
21: 832x832 1 qr, 3.7ms
22: 832x832 1 qr, 3.7ms
23: 832x832 1 qr, 3.7ms
24: 832x832 2 qrs, 3.7ms
25: 832x832 1 signature, 1 qr, 3.7ms
26: 832x832 16 qrs, 3.7ms
27: 832x832 1 signature, 1 qr, 3.7ms
28: 832x832 16 qrs, 3.7ms
29: 832x832 3 signatures, 1 stamp, 3.7ms
30: 832x832 (no detections), 3.7ms
31: 832x832 1 signature, 1 qr, 3.7ms
32: 832x832 1 qr, 3.7ms
33: 832x832 1 qr, 3.7ms
34: