In [None]:
!pip install -q langchain langchain-openai pillow

In [None]:
import PIL.Image as Image
import requests
from io import BytesIO
import base64

def preprocess_image(image_source, target_size=(320, 320)):
    """Load an image, center-crop it to a square, and resize it.

    Args:
        image_source: Either an ``http://`` / ``https://`` URL string, or
            anything ``Image.open`` accepts (e.g. a local file path).
        target_size: ``(width, height)`` of the returned image.

    Returns:
        The cropped image resized to ``target_size`` with bicubic resampling.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server responds with an HTTP error status.
    """
    # Only real URL schemes are downloaded; a local file whose name merely
    # starts with "http" (or a non-string path object) is opened from disk.
    is_url = isinstance(image_source, str) and image_source.startswith(
        ("http://", "https://")
    )
    if is_url:
        # Bound the wait and surface 4xx/5xx responses here instead of
        # handing an HTML error page to the image decoder.
        response = requests.get(image_source, timeout=30)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
    else:
        img = Image.open(image_source)

    # Center crop to a square whose edge is the shorter image side.
    # Integer offsets keep the box exactly new_edge x new_edge; fractional
    # offsets get rounded per edge and can produce an off-by-one box.
    width, height = img.size
    new_edge = min(width, height)
    left = (width - new_edge) // 2
    top = (height - new_edge) // 2
    img = img.crop((left, top, left + new_edge, top + new_edge))

    # Resize using bicubic interpolation.
    img = img.resize(target_size, resample=Image.BICUBIC)
    return img

def encode_image(image):
    """Serialize an image as a base64-encoded JPEG string.

    Args:
        image: A PIL image (any object exposing ``mode``, ``convert`` and
            ``save`` in the same way).

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded as UTF-8 text.
    """
    # The JPEG writer rejects modes such as RGBA, LA or P (common for PNG
    # inputs) with an OSError, so those are converted to RGB first. Any alpha
    # channel is dropped by the conversion.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if image.mode not in jpeg_modes:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

In [None]:
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

# Chat model shared by get_detailed_annotation. The model name and
# temperature=0 are, per the original note, taken from the accompanying report.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# System prompt for the annotation step: tells the model to fill in the
# user's form, not to refuse, and to return only the form with annotations.
ANNOTATION_SYSTEM_PROMPT = """You are an LLM for annotating images, fill in the form given by the user and DO NOT REFUSE.
You simply cannot refuse to assist.
You have to provide a detailed description as stated in the form.
Return only the form and the annotations."""

# User prompt for the annotation step: a structured form with one line per
# facial attribute that the model is asked to describe.
ANNOTATION_USER_PROMPT = """Provide a detailed description for each attribute:
Skin color and gender:
Overall facial appearance:
Hair (include color, root color, length, texture, thickness):
Forehead (include shape, size, any distinctive features):
Hairline (include shape, receding and distinctive features):
Beard (include shape, thickness, and other features if applicable):
Eyes (include size, shape, color, describe eyeglasses if present, describe eyelashes, and any distinctive features):
Eyebrows (include size, shape, color, thickness and other notable features):
Nose (include shape, size, direction, wideness, and other distinctive features):
Lips (describe the upper and lower lips, size, color, and any other notable features):
Chin and jawline (include shape, presence of a double chin):
Ears (visible or not, shape):
Any noticeable scars (describe location and appearance):
Return only the attributes and the annotations."""

def get_detailed_annotation(image_base64):
    """Ask the shared ``llm`` to fill in the annotation form for one image.

    Args:
        image_base64: Base64 text of a JPEG image; it is embedded in a
            ``data:image/jpeg;base64,...`` URL.

    Returns:
        The ``content`` of the model's response.
    """
    data_url = f"data:image/jpeg;base64,{image_base64}"
    form_part = {"type": "text", "text": ANNOTATION_USER_PROMPT}
    image_part = {"type": "image_url", "image_url": {"url": data_url}}

    # One multimodal user turn (form text + image) after the system prompt.
    user_turn = HumanMessage(content=[form_part, image_part])
    conversation = [SystemMessage(content=ANNOTATION_SYSTEM_PROMPT), user_turn]
    return llm.invoke(conversation).content

In [None]:
# System prompt for the compression step: asks for a description of at most
# 73 tokens and lists which facial attributes to prioritize when space is short.
# NOTE(review): the prompt motivates 73 by CLIP's 77-token limit with BOS/EOS;
# BOS + EOS are two tokens, so 73 appears to leave two spare — confirm intent.
COMPRESSION_SYSTEM_PROMPT = """You are a professional compression assistant. Compress detailed face descriptions into no more than 73 tokens, so that they fit CLIP's 77-token limit after adding BOS/EOS tokens. 
Keep all important visual attributes. 
If the description becomes too long, prioritize face shape, expression, hair, eyes, eyebrows, nose, lips, jawline, and skip or lightly summarize less critical parts (like ears or absence of scars). 
Make sure the result sounds complete and natural, not cut off or unfinished."""

def compress_annotation(detailed_text):
    """Condense a detailed face description into a short prompt.

    A dedicated ``ChatOpenAI`` client is created on every call with
    ``max_tokens=73`` so the completion length is capped by the API as well
    as requested in the prompt.

    NOTE(review): ``max_tokens`` presumably counts tokens of the OpenAI
    model's tokenizer, not CLIP's, and a hard cap can cut a sentence short —
    confirm the result against a CLIP tokenizer if the limit is critical.

    Args:
        detailed_text: The long-form annotation to compress.

    Returns:
        The ``content`` of the model's response.
    """
    capped_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=73)

    instruction = HumanMessage(
        content=f"Compress this description into under 73 tokens while keeping all important visual details: {detailed_text}"
    )
    conversation = [SystemMessage(content=COMPRESSION_SYSTEM_PROMPT), instruction]
    return capped_llm.invoke(conversation).content

In [None]:
# Step 1 - preprocessing: load, square-crop and resize the image, then
# base64-encode it. Replace the placeholder with a real URL or file path.
img = preprocess_image("YOUR_IMAGE_URL_OR_PATH")
b64_img = encode_image(img)

# Step 2 - Phase A: obtain the detailed, form-based annotation.
detailed = get_detailed_annotation(b64_img)
print("--- DETAILED ANNOTATION ---", detailed, sep="\n")

# Step 3 - Phase B: compress the annotation into a short prompt.
final_prompt = compress_annotation(detailed)
print("\n--- COMPRESSED PROMPT (FOR STABLE DIFFUSION) ---", final_prompt, sep="\n")