In [None]:
import dotenv

from openai import OpenAI
from pydantic import BaseModel, Field

from enum import Enum
from typing import List, Optional

# Load environment variables before the client is created
# (presumably from a local .env file holding the API credentials — TODO confirm).
dotenv.load_dotenv()
# Shared OpenAI client used by the helper functions below. It is built with no
# explicit arguments, so its configuration presumably comes from the environment.
client = OpenAI()

## Text Summarization

In [None]:
def _summarize_article(text):
    """
    Request a structured, Remote Sensing-oriented summary of an article.

    The article is sent as the user message, preceded by a fixed system
    prompt, to the ``gpt-4o-mini-2024-07-18`` model via
    ``client.responses.parse``; the reply is requested in the shape of the
    local ``Summary`` model.

    Args:
        text: Article text to summarize.

    Returns:
        The ``output_parsed`` attribute of the response, expected to be a
        ``Summary`` instance.
        NOTE(review): presumably ``None`` when the model produces no parsable
        output — confirm against the SDK before relying on it.
    """

    # Output schema. NOTE(review): the class name, docstring and field
    # description are presumably forwarded to the model as part of the JSON
    # schema, so they are treated as runtime text and kept verbatim.
    class Summary(BaseModel):
        """
        Domain-specific summary of the provided text, tailored to the field of Remote Sensing.
        """

        summary: str = Field(
            description="Summary of the input text.",
        )

    # Fixed instructions for the summarizer role.
    instructions = """
    You are an AI-powered Earth Observation and Remote Sensing expert summarizer.\n
    Distill the following text into a readable summary of the main information and key visual details.\n
    You will be penalized for missing critical information and details and rewarded for providing a concise and visually informative summary, while discarding redundant text.
    """

    # System prompt first, then the article itself as the user turn.
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": text},
    ]

    reply = client.responses.parse(
        model="gpt-4o-mini-2024-07-18",
        input=conversation,
        text_format=Summary,
    )
    return reply.output_parsed

## Metadata Generation & Caption Generation

In [None]:
def _extract_data(image_url, article_text):
    """
    Extract Remote Sensing metadata and captions for an image and its article.

    A single user turn containing the article text followed by the image is
    sent, after a fixed system prompt, to the ``gpt-4o-2024-08-06`` model via
    ``client.responses.parse``; the reply is requested in the shape of the
    local ``RS_image`` model.

    Args:
        image_url: Value passed as ``image_url`` of the ``input_image`` part.
            NOTE(review): presumably an http(s) or data URL — confirm with
            callers.
        article_text: Supplementary text describing the image.

    Returns:
        The ``output_parsed`` attribute of the response, expected to be an
        ``RS_image`` instance.
        NOTE(review): presumably ``None`` when the model produces no parsable
        output — confirm against the SDK before relying on it.
    """

    # Output schema. NOTE(review): class names, docstrings and field
    # descriptions are presumably forwarded to the model as part of the JSON
    # schema, so they are treated as runtime text and kept verbatim
    # (including existing spelling).
    class ResolutionEnum(str, Enum):
        """
        This class is used to define the various types of sensor/instrument spatial resolutions in Remote Sensing.
        """

        low = "Low"
        moderate = "Moderate"
        high = "High"
        very_high = "Very High"

    class RS_image(BaseModel):
        """
        This class is used to capture Remote Sensiong domain-specific metadata and captions according to the input image and text.
        """

        # Every metadata field is nullable but declared without a default;
        # only `captions` is non-nullable.
        tag: Optional[List[str]] = Field(
            description="Three generic domain-specific tags that describe the input topic. This can be a keyword, a category, or a topic."
        )
        location: Optional[str] = Field(
            description="Precise point location of the input, preferably in 'Landmark, City, Country' format, or alternatively in broader terms such as Region or Continent."
        )
        modality: Optional[List[str]] = Field(
            description="The Remote Sensing modalities or sensor types used to capture the image (e.g. Optical, Thermal)."
        )
        satellite: Optional[List[str]] = Field(
            description="The name of the Remote Sensing satellite that captured the image, if applicable (e.g. Sentinel-2)."
        )
        sensor: Optional[List[str]] = Field(
            description="The Remote Sensing sensor/instrument used to capture the image, if applicable (e.g. MultiSpectral Instrument (MSI))."
        )
        resolution: Optional[ResolutionEnum] = Field(
            description="The spatial resolution of the Remote Sensing sensor or instrument."
        )
        captions: List[str] = Field(
            description="Five captions that describe the Remote Sensing image comprehensively, according to the guidelines."
        )

    # Fixed captioning instructions: reward criteria, penalty criteria and the
    # mandatory caption prefix.
    instructions = """
    You are an Earth Observation (EO) and Remote Sensing (RS) image interpretation and captioning expert assistant.\n
    You will be provided with a RS image along with some text providing valuable information about the same RS image you are analyzing.\n
    Leverage this information to enrich your context and your textual descriptions of the scene.\n
    Your descriptions should convey a deep analysis of the image, as if you are observing and interpreting the scene.\n
    Your task is to extract useful domain-specific metadata and then generate five diverse and detailed captions that describe the RS image comprehensively and incorporate the extracted metadata.\n
    
    You will be rewarded for including the following details into your captions:\n
    (1) Earth Observation and Remote Sensing domain specific vocabulary.\n
    (2) The count, shapes and types of elements visible in the image.\n
    (3) The position of each element and the relative position between the elements within the image frame.\n
    (4) Temporal information (i.e. month or inferred season) based on the supplementary text and/or the image's datetime, if provided. Include these in the caption only if they are visually confirmed by the image and are useful for its interpretation.\n
    (5) Causal interpretations of the factors, processes, interactions, or relationships between the elements in the scene.\n
    (6) Atmospheric, environmental and socioeconomic conditions, land use and land cover, events, patterns, anomalies or any other significant observations.\n
    (7) Key visual features that offer insights into the scene, such as variations in brightness, saturation, texture.\n
    (8) The RS imagery modalities or sensor types used to capture the image (e.g. Optical, Thermal, In-Situ).\n
    (9) The satellite (e.g. Sentinel-2), sensor/instrument (e.g. MultiSpectral Instrument (MSI)) that captured the image.\n
    (10) The RS sensor/instrument spatial resolution [Low, Moderate, High, Very High], not the resolution of the image itself, always mentioned as "spatial resolution".\n
    (11) Any other in-domain details that can be confidently inferred directly from the image, supplemented by information mentioned within the text.\n
    (12) Introduce acronyms and abbreviations with their full names on their first use.\n

    You will be penalized when your captions:\n
    (1) Repeat or paraphrase exactly the same information and interpretations across any of the five captions.\n
    (2) Include hallucinations or provide superficial, incomplete or false EO and RS information.\n
    (3) Use fancy words or phrases that are not commonly used in the EO and RS domain.\n
    (4) Include subjective interpretations (e.g., "the image is inspiring," "creates a unique visual").\n
    (5) Refer to the RS image as "photograph".\n
    (6) Refer to the RS image resolution itself, instead of the sensor's spatial resolution.\n
    (7) Mention RS satellite or sensor details without mentioning the satellite or sensor name.\n
    (8) Include direct references to dates, coordinates, historical events and credits, unless they are directly visible in the image.\n
    (9) Fail to condense as much unique and diverese information as possible, based on the supplementary text and the extracted metadata.\n
    (10) Fail to address every single rewarding detail mentioned above for each image.\n

    You MUST start each caption with 'The image showcases:' followed by a first letter capitalized noun.\n
    """

    # One multimodal user turn: the article text first, then the image.
    text_part = {"type": "input_text", "text": article_text}
    image_part = {"type": "input_image", "image_url": image_url}
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": [text_part, image_part]},
    ]

    reply = client.responses.parse(
        model="gpt-4o-2024-08-06",
        input=conversation,
        text_format=RS_image,
    )
    return reply.output_parsed