In [None]:
import base64
import io
import json
from os import getenv

import requests
from PIL import Image
from pydantic import BaseModel, ConfigDict, Field, ValidationError
from typing import List
from typing import Optional

from pydantic import BaseModel, Field, ConfigDict, validator, ValidationError
from pydantic import BaseModel, Field, ConfigDict, field_validator

In [None]:
# python-dotenv provides both helpers. They are imported here because no
# earlier cell brings them into scope, so a fresh kernel would otherwise
# stop with a NameError on this line.
from dotenv import find_dotenv, load_dotenv

# Locate a .env file and load its entries into the process environment so the
# getenv() calls below can see the OpenRouter keys.
load_dotenv(find_dotenv())

In [None]:
# OpenRouter credentials come from the environment, never from the notebook.
OPENROUTER_API_KEY_1 = getenv("OPENROUTER_API_KEY_1")
OPENROUTER_API_KEY_2 = getenv("OPENROUTER_API_KEY_2")

# Fail fast only when neither key is present. The message names the variables
# that are actually read so the reader knows exactly what to export.
if not (OPENROUTER_API_KEY_1 or OPENROUTER_API_KEY_2):
    raise ValueError(
        "Neither OPENROUTER_API_KEY_1 nor OPENROUTER_API_KEY_2 is set in the environment."
    )

In [None]:
def image_to_base64(image_path: str) -> str:
    """
    Load the file at ``image_path`` as raw bytes and return them Base64-encoded.

    The encoded bytes are decoded as UTF-8, so the caller receives a ``str``.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode("utf-8")

def image_url(image_path: str) -> str:
    """
    Build a Base64 ``data:`` URL embedding the image stored at ``image_path``.

    The MIME type is guessed from the file extension, so a ``.png`` ticket is
    labelled ``image/png`` instead of always being declared ``image/jpeg``.
    When the extension is unknown or does not map to an image type, the
    previous ``image/jpeg`` label is used as the fallback.
    """
    import mimetypes  # local import keeps this cell runnable on its own

    guessed_type, _ = mimetypes.guess_type(image_path)
    if not guessed_type or not guessed_type.startswith("image/"):
        guessed_type = "image/jpeg"
    return f"data:{guessed_type};base64,{image_to_base64(image_path)}"

In [None]:
from datetime import datetime
from zoneinfo import available_timezones, ZoneInfo

_VALID_ABBR: set[str] = {
    abbr
    for tz in available_timezones()
    if (abbr := ZoneInfo(tz).tzname(datetime.utcnow()))
    and abbr.isalpha()
}
_VALID_ABBR

In [None]:
class MovieMetadata(BaseModel):
    # Structured details extracted from a movie ticket. Every field is
    # optional, so a value that is absent from the payload defaults to
    # None (or to an empty list for `seats`).
    # (Kept as comments on purpose: a class docstring would be copied into
    # the generated JSON schema as its "description".)

    # Unknown keys are rejected and instances are immutable.
    model_config = ConfigDict(extra="forbid", frozen=True)

    movie: Optional[str] = Field(default=None, description="Name of the movie")
    date: Optional[str] = Field(default=None, description="Date of the movie")
    time: Optional[str] = Field(default=None, description="Time of the movie")
    timezone_abbrv: Optional[str] = Field(
        default=None,
        description="Timezone abbreviation (e.g., IST, EST)",
    )
    theater: Optional[str] = Field(
        default=None,
        description="Name of the theater or cinema where the movie is shown",
    )
    seats: Optional[List[str]] = Field(
        default_factory=list,
        description="List of seat identifiers",
    )
    language: Optional[str] = Field(default=None, description="Language of the movie")
    screen: Optional[str] = Field(default=None, description="Screen number or details")
    booking_ref: Optional[str] = Field(
        default=None,
        description="Booking reference or ticket ID",
    )
    certificate: Optional[str] = Field(
        default=None,
        description="Movie certificate details",
    )

    @field_validator("timezone_abbrv", mode="after")
    @classmethod
    def check_abbrv(cls, v: Optional[str]) -> Optional[str]:
        """Keep ``v`` only when it is listed in ``_VALID_ABBR``; otherwise return ``None``."""
        is_known = v is not None and v in _VALID_ABBR
        return v if is_known else None




# To inspect the schema while debugging, pretty-print it with:
#   print(json.dumps(MovieMetadata.model_json_schema(), indent=2))

# JSON Schema dict generated by pydantic from MovieMetadata; later cells read
# its "title" and "properties" entries.
raw_schema = MovieMetadata.model_json_schema()


In [None]:
# Display the generated schema for inspection.
raw_schema

In [None]:
# System message sent with every extraction request below. It spells out the
# expected JSON keys, the date/time formats, the seat-parsing rules and the
# "null when unsure" policy the model must follow.
SYSTEM_PROMPT = """
You are a highly reliable assistant specialized in extracting structured data from images of movie tickets.

Goal:
- Analyze the attached image (photo, screenshot, scanned, or printed ticket).
- Extract as much ticket information as possible: movie title, date (YYYY-MM-DD), time (HH:MM), theater name, seats, language, screen, booking reference if visible, certificate if visible, and any other relevant fields.
- Based on the theatre, infer the **alphabetic timezone abbreviation** with **proper casing** (e.g., "IST", "EST", "ChST"). Do **not** use numeric offsets like “+05:30”. If uncertain, set `timezone_abbrv` to null.


Return Format:
- Output **only valid JSON** matching this schema:
  {
    "movie": string or null,
    "date": "YYYY-MM-DD" or null,
    "time": "HH:MM" or null,
    "timezone_abbrv": string or null,
    "theater": string or null,
    "seats": [string, ...] or empty array,
    "language": string or null,
    "screen": string or null,
    "booking_ref": string or null,
    "certificate": string or null
  }

Seats Parsing Rules:
- Interpret the “seats” field as a JSON array listing each seat code.
- If the ticket text reads `"PE - G17, G18"`, output `["G17", "G18"]`.
- Steps:
  1. Split the raw seats text on commas.
  2. Trim whitespace.
  3. Remove any common prefix ending with a hyphen (e.g., `"PE - "`).
- Return only the cleaned seat identifiers.

General Constraints:
- Do NOT include commentary or extra keys—only this JSON structure.
- If a field is missing or unreadable, use `null` (or `[]` for Seats).
- Use exact `"YYYY-MM-DD"` and `"HH:MM"` 24-hour format based on the ticket's local time.
- Ensure OCR correctness — do not guess or hallucinate.
- Output must be raw JSON (no markdown or explanation).

An image will be provided in the user message.
"""

In [None]:
# Text part of the user message; the ticket image is attached alongside it in
# each request below.
USER_PROMPT = """
Here is the movie ticket. Please extract the details exactly following the specified JSON schema and constraints.
"""

In [None]:
# Fields exposed to the model, in the same order as they are declared on
# MovieMetadata. "timezone_abbrv" used to be listed under "required" without
# having an entry under "properties", which made the schema self-contradictory.
_SCHEMA_FIELDS = (
    "movie",
    "date",
    "time",
    "timezone_abbrv",
    "theater",
    "seats",
    "language",
    "screen",
    "booking_ref",
    "certificate",
)

# Hand-assembled schema for the `response_format` of the requests below.
# Property definitions are reused from the pydantic-generated schema so the
# two cannot drift apart. Every property is listed as required: strict
# structured outputs expect all keys to be present, and each field already
# accepts null, so "missing" is expressed as null rather than by omitting the key.
movie_schema = {
    "title": raw_schema.get("title", "MovieMetadata"),
    "type": "object",
    "additionalProperties": False,
    "properties": {
        field_name: raw_schema["properties"][field_name]
        for field_name in _SCHEMA_FIELDS
    },
    "required": list(_SCHEMA_FIELDS),
}
movie_schema

In [None]:

# Structured extraction through the raw OpenRouter REST endpoint.

# Ticket image sent with this request (same path as before; the `f` prefix was
# dropped because the string has no placeholders).
TICKET_IMAGE_PATH = r"C:\Documents\movie-log\test-images\ticket-2.png"

# Without a timeout, requests.post() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 120

# The start-up check accepts either key, so fall back to the first key when the
# second one is not configured instead of sending "Bearer None".
api_key = OPENROUTER_API_KEY_2 or OPENROUTER_API_KEY_1

request_payload = {
    "model": "qwen/qwen2.5-vl-72b-instruct:free",
    "messages": [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": USER_PROMPT},
                {
                    "type": "image_url",
                    "image_url": {"url": image_url(TICKET_IMAGE_PATH)},
                },
            ],
        },
    ],
    "response_format": {
        "type": "json_schema",
        "json_schema": {
            "name": "MovieTicket",
            "strict": True,
            # Must be the schema dict itself, not a JSON-encoded string.
            "schema": movie_schema,
        },
    },
}

response = requests.post(
    url="https://openrouter.ai/api/v1/chat/completions",
    headers={
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        # Optional OpenRouter ranking headers: "HTTP-Referer" and "X-Title".
    },
    json=request_payload,
    timeout=REQUEST_TIMEOUT_SECONDS,
)

print(response.status_code)

# Parse the body once and reuse it for both printouts.
response_body = response.json()
print(json.dumps(response_body, indent=2))

# Error payloads carry no "choices" key; report them instead of raising KeyError.
choices = response_body.get("choices")
if choices:
    print(repr(choices[0]["message"]["content"]))
else:
    print("No choices returned; error payload:", response_body.get("error"))

In [None]:
# supported_models = requests.get("https://openrouter.ai/api/v1/models?supported_parameters=structured_outputs")

# print(json.dumps(supported_models.json(), indent=2))

In [None]:

# # movie_schema = MovieMetadata.model_json_schema()

# response = requests.post(
#     url="https://openrouter.ai/api/v1/chat/completions",
#     headers={
#         "Authorization": f"Bearer {OPENROUTER_API_KEY_2}",
#         "Content-Type": "application/json",
#         # "HTTP-Referer": "<YOUR_SITE_URL>",  # Optional. Site URL for rankings on openrouter.ai.
#         # "X-Title": "<YOUR_SITE_NAME>",  # Optional. Site title for rankings on openrouter.ai.
#     },
#     data=json.dumps({
#         "model": "qwen/qwen2.5-vl-72b-instruct:free",
#         # "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
#         # "provider": {"require_parameters": True},
#         "messages": [
#             {"role": "system", "content": SYSTEM_PROMPT},
#             {
#                 "role": "user",
#                 "content": [
#                     {"type": "text", "text": USER_PROMPT},
#                     {
#                         "type": "image_url",
#                         "image_url": {
#                             "url": image_url(
#                                 rf"C:\Documents\movie-log\test-images\ticket-1.jpg"
#                             ),
#                         },
#                     },
#                 ],
#             },
#         ],
#         "response_format": {
#             "type": "json_object",
#             "object": MovieMetadata.model_json_schema(),
#         }
#     },)
# )


# print(response.status_code)
# print(
#     json.dumps(
#         response.json(),
#         indent=2,
#     )
# )

# print(repr(response.json()["choices"][0]["message"]["content"]))

In [None]:
from openai import OpenAI

In [None]:
# OpenAI SDK client pointed at OpenRouter's OpenAI-compatible endpoint.
# The start-up check accepts either OpenRouter key, so use whichever one is set.
# NOTE(review): with api_key=None the SDK is documented to fall back to the
# OPENAI_API_KEY environment variable, which should never be sent to OpenRouter.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=getenv("OPENROUTER_API_KEY_1") or getenv("OPENROUTER_API_KEY_2"),
)

In [None]:
# Same extraction through the OpenAI SDK, constrained by the hand-built schema.
# OpenRouter-only arguments (e.g. fallback "models") would go in `extra_body`,
# and the optional ranking headers in `extra_headers`.
ticket_1_data_url = image_url(r"C:\Documents\movie-log\test-images\ticket-1.jpg")

ticket_1_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": USER_PROMPT},
            {"type": "image_url", "image_url": {"url": ticket_1_data_url}},
        ],
    },
]

ticket_schema_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "MovieTicket",
        "strict": True,
        "schema": movie_schema,
    },
}

completion = client.chat.completions.create(
    model="qwen/qwen2.5-vl-72b-instruct:free",
    messages=ticket_1_messages,
    response_format=ticket_schema_format,
)

# Full response object first, then just the model's text.
print(completion)

print(completion.choices[0].message.content)


In [None]:
# Let the SDK parse the reply straight into MovieMetadata.
# The reply is validated against the model inside parse() itself, so the call
# has to sit inside the `try`; wrapping only the `.parsed` access would never
# see a ValidationError.
try:
    response = client.beta.chat.completions.parse(
        model="qwen/qwen2.5-vl-72b-instruct:free",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": USER_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url(
                                r"C:\Documents\movie-log\test-images\ticket-3.png"
                            ),
                        },
                    },
                ],
            },
        ],
        response_format=MovieMetadata,
    )
except ValidationError as e:
    print("Validation error:", e)
else:
    parsed_message = response.choices[0].message
    ticket: Optional[MovieMetadata] = parsed_message.parsed
    if ticket is None:
        # `.parsed` is empty when the model refuses or returns no content.
        print("No parsed ticket returned; refusal:", getattr(parsed_message, "refusal", None))
    else:
        print(ticket.model_dump_json(indent=2))
