In [1]:
import base64
import datetime
import json
import os
import traceback
import uuid
from typing import List, Union, Dict, Literal

import pydantic
import requests
from pydantic import field_validator

In [2]:
# Settings to adapt for your application.
# The credentials are read from the environment so that they never have to be
# saved inside the notebook; export VLM_API_KEY and VLM_ENDPOINT_URL before
# starting the kernel. When a variable is unset the value falls back to the
# empty string.
API_KEY = os.environ.get("VLM_API_KEY", "")
ENDPOINT_URL = os.environ.get("VLM_ENDPOINT_URL", "")
# Model identifier sent with every request and used as output-file prefix.
MODEL = "vllm-llama-4-scout-17b-16e-instruct"

In [3]:
# Instruction text sent to the model together with each drawing image (it is
# the "text" part of the user message built in get_vlm_response).
# The prompt asks the model to wrap its JSON answer between the markers
# "OUTPUT START" and "OUTPUT END"; convert_llm_response_to_dict splits the
# completion on exactly these markers, so the two must stay in sync.
# The JSON keys requested here mirror the fields of the TechDrawing model.
# NOTE(review): "Cilindricity" in item 5d looks like a misspelling of
# "Cylindricity". It is left untouched because the class name is echoed in
# the model output and may be compared against the labels - confirm before
# changing it.
prompt = """You are given an image of a technical drawing. The drawing will have textual and visual features like lines, hatching and other geometry. Focus only on the text. Extract the text and make some deductions from it:
There will most likely be several views of the same part in the drawing, as well as a table, that contains global information.

From the table extract:
1. Material information
2. Tolerance information (likely depicted using a DIN norm like 2768). The interesting parts are the two characters after the DIN. The characters may be one of ["f", "m", "c", "v"] for the first letter and one of ["H", "K", "L"] for the second. The class might spelled out: f = fein, m = mittel, c = grob, v = sehr grob.
3. Name of the part
4. Surface Information, which is most often referenced using Rz, Ra or Rt in front of a floating point number and is connected to a triangle shape
5. Extract Gd&Ts. These are rectangles that contain a symbol that specifies the GDT class, and a runout in form of a float. Match the symbols in the drawing to these descriptions:
    a) horizontal line -> Straightness
    b) parallelogram -> Flatness
    c) circle (dont confuse with diameter character "⌀", no through line) -> Circularity
    d) circle with two parallel lines on each side -> Cilindricity
    e) half circle with line at bottom -> Profile of a Surface
    f) half circle/ arc -> Profile of a Line
    g) horizontal line and a vertical line
    intersecting at 90°, like an upside down T -> Perpendicularity
    h) Two lines intersecting at an acute angle -> Angularity
    i) Two parallel lines like // -> Parallelism
    j) Circle with a cross -> Position
    k) Circle containing another circle -> Concentricity
    l) 3 parallel horizontal lines: Symmetry
    m) Arrow pointing top right -> Circular Runout
    n) Two arrows pointing top right, connected by a horizontal line -> Total Runout
 and add the classname (after the "->") to the result.
6. Also Extract Threads in all possible forms
7. Extract outer measures of the part. For this, disregard detail views. These are typically annotated using a capital letter and a scale e.g. "A 5:1" or some variation. If there is a diameter sign or "SW" present in the measurement, it indicates that the part is rotationally symmetric and thus the measurement has to be used for two dimensions. To detect biggest measures in each dimension, extract the biggest measure for both dimensions of each view. Afterwards merge the values using views that are adjacent. Reason through this. If you dont find 3 values, fill the output with copies of values that are already in extracted until outer_measures contains 3 floats.

Output the results in the following json format:

OUTPUT START
{
"material": [material strings],
"general_tolerances": {"char1": str, "char2": str},
"name": string,
"surfaces": [surface strings],
"gdts": [{"name": string, "runout": number}],
"threads": [thread strings],
"outer_measures": [x,y,z]
}
OUTPUT END

Mark the json output in your response by adding OUTPUT START and OUTPUT END. Do not add any other indicators, such as 'json' or use "```" to indicate the output. Make sure to use proper double quotes " for the field names. Do NOT add comments inside of the json. Do NOT add example values inside of the json output, only things you actually extracted from the image.

"""

class TechDrawing(pydantic.BaseModel):
    """Validated form of the JSON object the model is asked to return.

    Fields:
        material: material strings.
        general_tolerances: mapping with the keys "char1" and "char2", one
            tolerance letter each.
        name: name of the part.
        surfaces: surface strings.
        gdts: list of ``{"name": str, "runout": number}`` entries.
        threads: thread strings.
        outer_measures: exactly three floats.

    The validators raise ``ValueError``; pydantic collects these and reports
    them to the caller as a ``pydantic.ValidationError``.
    """
    material: List[str]
    general_tolerances: Dict[Literal["char1", "char2"], str]
    name: str
    surfaces: List[str]
    gdts: List[Dict[Literal["name", "runout"], Union[str, float]]]
    threads: List[str]
    outer_measures: List[float]

    @field_validator('outer_measures')
    @classmethod
    def validate_outer_measures(cls, v):
        """Require exactly three outer measures."""
        if len(v) != 3:
            # pydantic.ValidationError cannot be instantiated directly in
            # pydantic v2; a ValueError is what validators are expected to
            # raise and is converted into a proper validation error.
            raise ValueError("outer_measures must have 3 elements")
        return v

    @field_validator('gdts')
    @classmethod
    def validate_gdts(cls, v):
        """Require every entry to carry a string 'name' and a numeric 'runout'."""
        for gdt in v:
            if "name" not in gdt or "runout" not in gdt:
                raise ValueError("gdts must contain both 'name' and 'runout' keys")
            if not isinstance(gdt["name"], str):
                raise ValueError("gdts['name'] must be a string")
            if not isinstance(gdt["runout"], (int, float)):
                raise ValueError("gdts['runout'] must be a number")
        return v

    @field_validator('general_tolerances')
    @classmethod
    def validate_general_tolerances(cls, v):
        """Require both tolerance keys and reject values longer than one character.

        Empty strings are still accepted, as before; only the length upper
        bound is enforced.
        """
        for position, key in enumerate(("char1", "char2")):
            if key not in v:
                # Report a missing key as a validation error instead of
                # letting a bare KeyError escape from the validator.
                raise ValueError(f"general_tolerances must contain the '{key}' key")
            if len(v[key]) > 1:
                raise ValueError(f"Error at [Pos {position}]: General Tolerances must have exactly 1 char in each position")
        return v


def convert_llm_response_to_dict(response):
    """Turn a chat-completion HTTP response into a ``TechDrawing`` instance.

    The text of the first choice is cut down to what lies after the last
    "OUTPUT START" marker and before the following "OUTPUT END" marker,
    newlines are removed, and the remainder is parsed as JSON. The parsed
    object is printed and then validated through ``TechDrawing``.

    Args:
        response: object exposing ``.json()`` that returns the completion
            payload (``choices[0].message.content``).

    Returns:
        The validated ``TechDrawing``.

    Raises:
        KeyError / IndexError: the payload or the parsed JSON lacks an
            expected entry.
        json.JSONDecodeError: the marked section is not valid JSON.
        pydantic.ValidationError: the parsed values fail validation.
    """
    message = response.json()["choices"][0]["message"]
    completion = str(message["content"])

    # rpartition/partition return the whole text when a marker is absent,
    # which matches taking split(...)[-1] and split(...)[0].
    after_start = completion.rpartition("OUTPUT START")[2]
    marked_section = after_start.partition("OUTPUT END")[0]
    single_line = "".join(marked_section.split("\n"))

    parsed = json.loads(single_line)
    print(parsed)

    field_names = (
        "material",
        "general_tolerances",
        "name",
        "surfaces",
        "gdts",
        "threads",
        "outer_measures",
    )
    return TechDrawing(**{field: parsed[field] for field in field_names})

In [4]:
def load_image_from_name(name):
    """Read a drawing file and return its content as base64 text.

    Args:
        name: file name relative to ``./data/drawings/``.

    Returns:
        The file's bytes, base64-encoded, as a ``str``.

    Raises:
        OSError: the file cannot be opened or read.
    """
    file_path = "./data/drawings/" + name
    with open(file_path, "rb") as f:
        img_bytes = f.read()
    # b64encode returns ASCII-only bytes; decode them directly instead of
    # stripping the b'...' wrapper from the bytes repr.
    return base64.b64encode(img_bytes).decode("ascii")

In [5]:
def get_vlm_response(img_name, model_name):
    """Send one drawing plus the extraction prompt to the chat endpoint.

    The image is loaded through ``load_image_from_name`` and embedded as a
    base64 data URL next to the prompt text. The raw response body is
    printed, then parsed with ``convert_llm_response_to_dict``.

    Args:
        img_name: file name of the drawing to send.
        model_name: model identifier placed in the request body.

    Returns:
        The parsed ``TechDrawing``, or ``None`` when the request or the
        parsing raised; in that case the exception is printed.
        Errors while loading the image happen before the request and are
        not caught here.
    """
    request_headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
        "accept": "application/json"
    }

    image_url = f"data:image/png;base64,{load_image_from_name(img_name)}"
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": image_url}},
        ]
    }
    payload = {"messages": [user_message], "model": model_name}

    try:
        response = requests.post(
            ENDPOINT_URL,
            headers=request_headers,
            data=json.dumps(payload),
            timeout=120,
        )
        print(response.text)
        parsed = convert_llm_response_to_dict(response)
    except Exception as e:
        # Best effort: report the problem and let the caller skip this image.
        print(e)
        parsed = None
    return parsed

In [None]:
# Run the model over every labelled drawing and store the answers next to the
# ground truth in a uniquely named JSON file.

# JSON text is UTF-8 by specification; do not depend on the platform default.
with open("./labels.json", "r", encoding="utf-8") as f:
    data = json.load(f)

results = []

start = datetime.datetime.now()

for i, img_data in enumerate(data):
    img_name = img_data["name"]

    try:
        print(i, img_name)
        # Use the MODEL constant from the configuration cell; the previous
        # lowercase `model` was undefined, so every iteration failed with a
        # NameError that the except below swallowed.
        response_data = get_vlm_response(img_name, MODEL)
        if response_data is not None:
            result = {
                "ground_truth": img_data["data"],
                # NOTE(review): key is spelled "vlm_reponse"; kept as is
                # because consumers of the output file may rely on it.
                "vlm_reponse": response_data.model_dump_json()
            }
            results.append(result)
    except Exception:
        # A single failing image must not abort the run: log and move on.
        traceback.print_exc()

end = datetime.datetime.now()

output = {
    "prompt": prompt,
    "model": MODEL,
    "time_spent": (end - start).total_seconds(),
    "results": results
}

# The random UUID suffix keeps repeated runs from overwriting each other.
with open(MODEL + "___" + str(uuid.uuid4()) + ".json", "w") as f:
    json.dump(output, f)