In [8]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import matplotlib.pyplot as plt
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import pandas as pd
import os

local_model_path = "../model/"
# Load model and processor from local path.
# The weights are placed on the CPU (device_map="cpu") and the dtype is taken
# from the checkpoint (torch_dtype="auto").
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    local_model_path, torch_dtype="auto", device_map="cpu", local_files_only=True
)
# local_files_only=True mirrors the model load above, so the processor is also
# resolved strictly from disk instead of falling back to a Hub lookup.
processor = AutoProcessor.from_pretrained(local_model_path, local_files_only=True)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [12]:
def load_and_process_files(file_paths, resize_to=(512, 512), dpi=150):
    """Load images and PDF pages as RGB PIL images, optionally resized.

    Args:
        file_paths: A single path string or an iterable of path strings. Files
            whose extension is ``.pdf`` (case-insensitive) are rendered page by
            page; every other file is opened as an image.
        resize_to: ``(width, height)`` passed to ``Image.resize``. A falsy
            value (``None``) keeps every image at its native size.
        dpi: Resolution passed to ``page.get_pixmap`` when rendering PDF pages.

    Returns:
        A list of PIL images in input order (PDF pages in page order).
    """
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    all_images = []

    for path in file_paths:
        ext = os.path.splitext(path)[1].lower()

        if ext == '.pdf':
            # The context manager closes the document even if rendering fails.
            with fitz.open(path) as doc:
                for page in doc:
                    pix = page.get_pixmap(dpi=dpi)
                    # assumes the pixmap is 3-channel RGB without alpha
                    # (get_pixmap's defaults) — TODO confirm for unusual PDFs
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    # Without a target size the page is kept as rendered rather
                    # than being dropped or replaced by a stale earlier image.
                    all_images.append(img.resize(resize_to) if resize_to else img)
        else:
            # convert() produces an independent in-memory image, so the file
            # handle can be released immediately.
            with Image.open(path) as source:
                img = source.convert("RGB")
            all_images.append(img.resize(resize_to) if resize_to else img)

    return all_images




In [13]:
def get_message(image_paths, prompt):
    """Build a single-turn user chat message with images followed by a prompt.

    Args:
        image_paths: A single image reference or an iterable of them. Each
            item is stored unchanged under the ``"image"`` key. A bare string
            is treated as one path rather than iterated character by character.
        prompt: Instruction text, appended after all image entries.

    Returns:
        A one-element list holding a ``{"role": "user", "content": [...]}``
        dict, where ``content`` lists the image entries then the text entry.
    """
    if isinstance(image_paths, str):
        image_paths = [image_paths]

    content = [{"type": "image", "image": path} for path in image_paths]
    # The instruction goes last so it follows every image in the turn.
    content.append({"type": "text", "text": prompt})

    return [{"role": "user", "content": content}]

In [14]:
def get_output_text(messages, max_new_tokens=100):
    """Run the module-level ``model`` on a chat message and decode its answer.

    Args:
        messages: Chat messages in the structure accepted by
            ``processor.apply_chat_template`` and ``process_vision_info``.
        max_new_tokens: Upper bound on generated tokens. Defaults to 100;
            raise it when longer answers come back truncated.

    Returns:
        The list returned by ``processor.batch_decode``: one decoded string per
        input sequence, containing only the newly generated tokens.
    """
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Keep the inputs on the same device as the model weights so the function
    # still works if the model is loaded somewhere other than the CPU.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + completion; drop the prompt prefix of each row.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    return output_text


In [15]:
import re

def get_data_from_output_text(output_text):
    """Parse ``key: value`` lines of a model answer into a dictionary.

    Each line is split at its first colon. Keys are trimmed of everything
    before their first letter and of trailing non-letter symbols, so list
    bullets, markdown emphasis and JSON quotes are removed. Values are trimmed
    of a trailing comma, markdown asterisks and one pair of surrounding quotes.
    Lines without a colon, or with an empty key or value, are ignored.

    Args:
        output_text: The list of decoded strings returned by the model (only
            the first element is parsed) or a single string. An empty or
            ``None`` input yields an empty dict.

    Returns:
        A dict mapping cleaned keys to cleaned values; a later line with the
        same key overwrites an earlier one.
    """
    if not output_text:
        return {}
    text = output_text if isinstance(output_text, str) else output_text[0]

    data = {}
    for line in text.split('\n'):
        line = line.strip()
        if ':' not in line:
            continue

        raw_key, raw_value = line.split(':', 1)

        # Strip first: whitespace before the colon ("**Nom** :") would
        # otherwise stop the end-anchored suffix pattern from matching.
        cleaned_key = raw_key.strip()
        cleaned_key = re.sub(r'^[^a-zA-ZÀ-ÿ]+', '', cleaned_key)        # Remove prefix
        cleaned_key = re.sub(r'[^a-zA-ZÀ-ÿ\s]+$', '', cleaned_key)      # Remove suffix
        cleaned_key = cleaned_key.strip()

        # Drop the decorations of JSON-style ('"x",') and markdown-style
        # ('** x') answers so only the payload remains.
        value = raw_value.strip().rstrip(',').strip()
        value = value.strip('*').strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1].strip()

        if cleaned_key and value:
            data[cleaned_key] = value

    return data


In [None]:
# Extraction run on the "grise_vehicule" image: the prompt asks for the plate
# number, owner, end of validity and P.T.A.C., returned as JSON.
grise_path = ["../images/grise_vehicule.jpg"]

# Adjacent literals are concatenated into the single prompt string.
grise_prompt = (
    "From this image, Extract these informations: "
    "Numéro d'immatriculation, Propriétaire, Fin de validité, P.T.A.C.  "
    "Return in JSON."
)
message_grise = get_message(image_paths=grise_path, prompt=grise_prompt)
grise_infos = get_output_text(messages=message_grise)
print(grise_infos)


In [None]:
# Extraction run on the "assurance_vehicule" image: the prompt asks for the
# plate number, coverage period, name and make/type, and describes the expected
# plate format.
# NOTE(review): the recorded output of this cell shows only "4913" for the
# plate, i.e. not the digits-letter-digits shape given in the prompt — confirm
# whether the prompt or the image is the cause.
assurance_path = [
    "../images/assurance_vehicule.jpg",
   ]

# The triple-quoted literal is sent as-is, including its leading newline and
# per-line indentation.
assurance_prompt = """
    From these images, extract the following informations:
    Numéro d'immatriculation (matricule)
    Période de garantie
    Nom
    Marque et type
    Important: The 'matricule' follows this format: a sequence of digits, followed by a single uppercase letter, enclosed between two hyphens (e.g., 123-A-456).
    """
message_assurance = get_message(assurance_path, assurance_prompt)
assurance_infos = get_output_text(message_assurance)
print(assurance_infos)


["Here are the extracted informations from the image:\n\n- **Nom**: STF TOP AUTO\n- **N° d'immatriculation**: 4913\n- **PERIODE DE GARANTIE**: JANVIER 2025 - DÉCEMBRE 2025"]


In [8]:
# Parse the model's free-text answer into a {field: value} dict; as the last
# expression of the cell, the dict is displayed as the cell output.
get_data_from_output_text(assurance_infos)

{'Nom': 'STF TOP AUTO',
 "N° d'immatriculation": '4913',
 'PERIODE DE GARANTIE': 'JANVIER 2025 - DÉCEMBRE 2025'}

In [None]:
# Extraction run on the "vignite_vehicule" image: the prompt asks for the
# control date, validity date, plate number and owner.
vignite_path = [
    "../images/vignite_vehicule.jpg",
   ]

# The triple-quoted literal is sent as-is, including its leading newline and
# per-line indentation.
# NOTE(review): the prompt first describes the plate letter as "a single
# uppercase letter" (example 123-A-456) and then requires it to be an Arabic
# letter — confirm which form the model is expected to return.
vignite_prompt = """
    From these images, extract the following information:
    - DATE DU CONTROLE
    - DATE DE VALIDITE
    - Numéro d'immatriculation
    - Propriétaire
    Important: The 'matricule' follows this format: a sequence of digits, followed by a single uppercase letter, enclosed between two hyphens (e.g., 123-A-456).
    The Arabic letter is part of the Arabic alphabet and not a number. It must be one of the following:
     أ, ب, ت, ث, ج, ح, خ, د, ذ, ر, ز, س, ش, ص, ض, ط, ظ, ع, غ, ف, ق, ك, ل, م, ن, ه, و, ي
    """
message_vignite = get_message(vignite_path, vignite_prompt)
vignite_infos = get_output_text(message_vignite)
print(vignite_infos)


In [None]:
# Extraction run on the "VT_vehicule" image: the prompt asks for the control
# date, plate number and owner, returned as JSON.
VT_path = ["../images/VT_vehicule.jpg"]

# Adjacent literals are concatenated into the single prompt string.
VT_prompt = (
    "From this image, Extract these informations: "
    "DATE DU CONTROLE, Immatriculation, PROPRIETAIRE. "
    "Return in JSON."
)
message_VT = get_message(image_paths=VT_path, prompt=VT_prompt)
VT_infos = get_output_text(messages=message_VT)
print(VT_infos)

In [None]:
# Extraction run on the "autorisa_circ_vehicule" image.
# NOTE(review): this path points into "../Qwen2.5-VL-3B-Instruct/images/",
# whereas every other cell reads from "../images/" — confirm the location.
circulation_path = ["../Qwen2.5-VL-3B-Instruct/images/autorisa_circ_vehicule.jpg"]

# NOTE(review): the requested fields are identical to the VT cell's prompt —
# confirm they are the right fields for this document.
circulation_prompt = (
    "From this image, Extract these informations: "
    "DATE DU CONTROLE, Immatriculation, PROPRIETAIRE. "
    "Return in JSON."
)
message_circulation = get_message(image_paths=circulation_path, prompt=circulation_prompt)
circulation_infos = get_output_text(messages=message_circulation)
print(circulation_infos)