# Automated Attention-Aware Output Generator
This notebook iterates through NEAR project folders, analyzes gaze data with OpenAI GPT-4o, and saves composite result images.

In [8]:
!pip install -q openai

In [9]:
import os
import re
import glob
import time
import base64
from typing import Optional, Tuple

from tqdm.auto import tqdm
from PIL import Image, ImageDraw, ImageFont
from openai import OpenAI
from google.colab import drive, userdata

# ============================================================
# 1) AUTHENTICATION
# ============================================================
# Read the API key from the Colab secret named "OPENAI_KEY" and build the
# OpenAI client that call_gpt4o uses.
OPENAI_API_KEY = userdata.get("OPENAI_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

# Mount Google Drive so the input/output paths below are reachable.
drive.mount("/content/drive")

# ============================================================
# 2) PATH CONFIG
# ============================================================
# BASE_PATH: parent of the per-task folders; the main loop reads
#            <BASE_PATH>/<folder>/frames.
# OUTPUT_ROOT: composites are written to <OUTPUT_ROOT>/<group>/.
BASE_PATH = "/content/drive/My Drive/AI_event_demo/NEAR_Experiment_Design_Output/"
OUTPUT_ROOT = "/content/drive/My Drive/AI_event_demo/Attention-Aware_Output"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# One inner list per group prefix (the text before the first "_").
# Commented-out rows are groups excluded from this run; uncomment to include.
TASK_GROUPS = [
    # ["AT_1", "AT_2", "AT_3_1", "AT_3_2"],
    ["Ayu_1", "Ayu_2", "Ayu_3"],
    # ["JC_1", "JC_2", "JC_3_1", "JC_3_2"],
    # ["KC_1", "KC_2", "KC_3_1", "KC_3_2"],
    # ["LKH_1", "LKH_2", "LKH_3_1", "LKH_3_2"],
    # ["SYH_1_simple", "SYH_2_simple", "SYH_3_1_simple", "SYH_3_2_simple"],
    # ["YL_1", "YL_2", "YL_3"],
]

# Flat list of every folder name, in TASK_GROUPS order.
ALL_FOLDERS = [f for g in TASK_GROUPS for f in g]

# ============================================================
# 3) FONT SETUP
# ============================================================

def load_fonts():
    """
    Pick a usable TrueType font family for rendering, with a safe fallback.

    Each candidate is a (regular, bold) pair of font file paths. The first
    pair whose two files exist and load successfully wins. If none works,
    PIL's built-in default font is used for all three roles so rendering
    never fails because of fonts.

    Returns:
        (header_font, label_font, body_font): bold 16, bold 12, regular 12
        when a TrueType pair is found; otherwise three default PIL fonts.
    """
    dejavu_dir = "/usr/share/fonts/truetype/dejavu"
    liberation_dir = "/usr/share/fonts/truetype/liberation"
    noto_dir = "/usr/share/fonts/truetype/noto"
    alt_dejavu_dir = "/usr/share/fonts/dejavu"

    # Ordered by preference; each entry is (regular, bold).
    font_pairs = (
        (f"{dejavu_dir}/DejaVuSans.ttf", f"{dejavu_dir}/DejaVuSans-Bold.ttf"),
        (f"{liberation_dir}/LiberationSans-Regular.ttf",
         f"{liberation_dir}/LiberationSans-Bold.ttf"),
        (f"{noto_dir}/NotoSans-Regular.ttf", f"{noto_dir}/NotoSans-Bold.ttf"),
        (f"{alt_dejavu_dir}/DejaVuSans.ttf", f"{alt_dejavu_dir}/DejaVuSans-Bold.ttf"),
    )

    for regular_path, bold_path in font_pairs:
        both_present = os.path.exists(regular_path) and os.path.exists(bold_path)
        if not both_present:
            continue
        try:
            fonts = (
                ImageFont.truetype(bold_path, 16),     # header
                ImageFont.truetype(bold_path, 12),     # label
                ImageFont.truetype(regular_path, 12),  # body
            )
            print(f"[Font] Using: {regular_path} / {bold_path}")
            return fonts
        except Exception as e:
            # A broken font file should not stop the run; try the next pair.
            print(f"[Font] Failed to load {regular_path}: {e}")

    # Nothing usable on disk: fall back to the bundled PIL font.
    print("[Font] Falling back to default PIL font (may look less pretty).")
    return tuple(ImageFont.load_default() for _ in range(3))

# Module-level fonts shared by the rendering code.
FONT_HEADER, FONT_LABEL, FONT_BODY = load_fonts()

# ============================================================
# 4) HELPERS
# ============================================================

def encode_image(path):
    """Return the raw bytes of the file at *path* as a base64 text string."""
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def parse_task_type(folder_name):
    """
    Map a folder name such as "AT_3_1" or "SYH_2_simple" to a task label.

    Any "_simple" marker is ignored. The two-part task-3 markers are checked
    first (anywhere in the name), then the single-digit suffixes. Returns
    "unknown" when nothing matches.
    """
    stem = folder_name.replace("_simple", "")

    # Sub-tasks of task 3 must win over the plain "_1" / "_2" suffix checks.
    for marker, label in (("_3_1", "task3_1"), ("_3_2", "task3_2")):
        if marker in stem:
            return label

    for suffix, label in (("_1", "task1"), ("_2", "task2"), ("_3", "task3")):
        if stem.endswith(suffix):
            return label

    return "unknown"

def get_group_name(folder):
    """Return the group prefix of *folder*: the text before the first "_"."""
    prefix, _sep, _rest = folder.partition("_")
    return prefix

def get_reference_folder(folder, suffix):
    """
    Build the name of the reference folder that belongs to *folder*'s group.

    Args:
        folder: Current folder name, e.g. "AT_3_2" or "SYH_3_2_simple".
        suffix: Task suffix of the reference folder, e.g. "_3_1" or "_1".

    Returns:
        "<group><suffix>", plus "_simple" when *folder* itself ends with
        "_simple". Without this, "SYH_3_2_simple" would resolve to "SYH_3_1",
        which is not one of the configured folders ("SYH_3_1_simple" is), and
        the reference crop would silently never be found.
    """
    # Group prefix = text before the first "_" (same rule as get_group_name).
    group = folder.split("_")[0]
    variant = "_simple" if folder.endswith("_simple") else ""
    return f"{group}{suffix}{variant}"

def prefer_crop(frames_dir, idx):
    """
    Return the path of the crop image for frame *idx* inside *frames_dir*.

    "src_<idx>_crop.png" is preferred; "heat_<idx>_crop.png" is the fallback.
    Returns None when neither file exists.
    """
    for prefix in ("src", "heat"):
        candidate = os.path.join(frames_dir, f"{prefix}_{idx}_crop.png")
        if os.path.exists(candidate):
            return candidate
    return None

def sanitize_model_text(text: str) -> str:
    """
    Remove common markdown artifacts that LLMs sometimes output.

    Strips leading heading markers ("#" to "######"), removes every "*"
    (bold/italic markers), collapses runs of three or more newlines to a
    single blank line, and trims surrounding whitespace.

    Args:
        text: Raw model output; None or "" is accepted.

    Returns:
        The cleaned text ("" for empty input).
    """
    if not text:
        return ""

    # Remove markdown heading markers at the start of a line. Only spaces and
    # tabs are consumed around the hashes: "\s" would also match newlines under
    # MULTILINE and swallow the blank line that separates paragraphs.
    text = re.sub(r"^[ \t]*#{1,6}[ \t]*", "", text, flags=re.MULTILINE)

    # Remove bold/italic markers
    text = text.replace("*", "")

    # Collapse excessive blank lines
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Font] Using: /usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf / /usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf


In [10]:
# ============================================================
# 5) PROMPT ENGINEERING
# ============================================================

def build_prompt(task_type):
    """
    Assemble the instruction text sent to the model for *task_type*.

    Every prompt starts with the same formatting instructions; a
    task-specific section is appended for the known task types
    ("task1", "task2", "task3_1", "task3_2", "task3"). Any other value
    yields the shared instructions only.
    """
    shared_instructions = """
You are analyzing a user's visual attention in an experiment.

Return:
Line 1: A short heading (<=10 words).
Line 2+: A focused paragraph (3-6 sentences).
No bullet points.
Be concise and emphasize key differences or important objects.
"""

    # (task type, section appended to the shared instructions)
    task_sections = (
        ("task1", """
Task 1:
State what the participant is looking at.
Describe key visual features of the attended object.
"""),
        ("task2", """
Task 2:
State what the participant is doing (e.g., comparing).
Explain similarities and differences being inspected.
"""),
        ("task3_1", """
Task 3_1:
Participant is reviewing the photo.
Describe multiple important objects and their details.
"""),
        ("task3_2", """
Task 3_2:
Compare the REFERENCE image and CURRENT image.
Describe what is different.
"""),
        ("task3", """
Task 3:
Compare REFERENCE (Task1) and CURRENT image.
Explain what is missing and how gaze suggests the participant is reasoning.
"""),
    )

    for known_type, section in task_sections:
        if task_type == known_type:
            return shared_instructions + section
    return shared_instructions

# ============================================================
# 6) GPT-4o CALL
# ============================================================

def call_gpt4o(prompt, image_paths):
    """
    Send *prompt* plus the given images to the "gpt-4o" chat model.

    Args:
        prompt: Instruction text (first content part of the user message).
        image_paths: Paths of image files; each is embedded, in order, as a
            base64 "data:image/png" URL.

    Returns:
        The message content of the first choice in the response.
    """
    content_parts = [{"type": "text", "text": prompt}]
    content_parts.extend(
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{encode_image(image_path)}"
            },
        }
        for image_path in image_paths
    )

    # Single user turn carrying the text and all images.
    user_turn = {"role": "user", "content": content_parts}

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[user_turn],
        max_tokens=600
    )

    first_choice = response.choices[0]
    return first_choice.message.content

# ============================================================
# 7) RENDER OUTPUT IMAGE
# ============================================================

def wrap_text(draw, text, font, max_width):
    """
    Greedily wrap *text* into lines no wider than *max_width*.

    Args:
        draw: Object providing textlength(text, font=...) for measuring,
            as the callers' ImageDraw instance does.
        text: Text to wrap; it is split on any whitespace, so existing line
            breaks are not preserved.
        font: Font passed through to draw.textlength.
        max_width: Maximum line width, in the units textlength returns.

    Returns:
        List of lines. A single word wider than *max_width* is kept whole on
        its own line (it is not split). No empty lines are produced.
    """
    lines = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if draw.textlength(candidate, font=font) <= max_width:
            current = candidate
            continue
        # Flush the line built so far. Skip it when empty: that happens when
        # the very first word is already too wide, and appending "" would
        # render a blank line and waste one of the caller's line slots.
        if current:
            lines.append(current)
        current = word
    if current:
        lines.append(current)
    return lines

def create_output_image(heat_path, right_images, text, save_path, title):
    """
    Create the final composite image and save it to *save_path*.

    Layout (1200x820 canvas):
      - Top: Title
      - Left: Heatmap context (resized to 640x360)
      - Right: AOI crop(s): one 260x260 crop, or reference + current
        stacked at 260x170 each
      - Bottom: Text block (clean + clipped to box)

    Args:
        heat_path: Path of the heatmap image shown on the left.
        right_images: Sequence of crop image paths. Exactly one entry draws a
            single crop; otherwise entries [0] and [1] are drawn as
            "Reference" and "Current".
        text: Raw model output. After sanitizing, the first line becomes the
            heading and the remainder the body.
        save_path: Destination file path for the composite.
        title: Text appended to "Experiment Task: " in the header.
    """

    # -----------------------------
    # Canvas / layout configuration
    # -----------------------------
    CANVAS_W, CANVAS_H = 1200, 820

    # Heatmap is always resized to a fixed 640x360 (aspect ratio not preserved)
    heat_img = Image.open(heat_path).convert("RGB").resize((640, 360))

    # Right panel x anchor and shared top edge of heatmap / crops
    RIGHT_X = 740
    TOP_Y = 100

    # Bottom text box corners (x1, y1) - (x2, y2)
    BOX_X1, BOX_Y1 = 24, 520
    BOX_X2, BOX_Y2 = 1176, 805

    # Text padding inside the box
    PAD_X = 18
    PAD_TOP = 14

    canvas = Image.new("RGB", (CANVAS_W, CANVAS_H), (248, 248, 248))
    draw = ImageDraw.Draw(canvas)

    # -----------------------------
    # Header and labels
    # -----------------------------
    draw.text((24, 18), f"Experiment Task: {title}", font=FONT_HEADER, fill=(20, 20, 20))
    draw.text((24, 70), "User Gaze Context (Heatmap)", font=FONT_LABEL, fill=(90, 90, 90))
    draw.text((RIGHT_X, 70), "Extracted AOI", font=FONT_LABEL, fill=(90, 90, 90))

    # -----------------------------
    # Left heatmap
    # -----------------------------
    canvas.paste(heat_img, (24, TOP_Y))

    # -----------------------------
    # Right crop(s)
    # -----------------------------
    if len(right_images) == 1:
        crop = Image.open(right_images[0]).convert("RGB").resize((260, 260))
        canvas.paste(crop, (RIGHT_X, TOP_Y))
    else:
        # NOTE(review): this branch indexes [0] and [1], so an empty
        # right_images would raise IndexError; callers pass 1 or 2 paths.
        ref = Image.open(right_images[0]).convert("RGB").resize((260, 170))
        cur = Image.open(right_images[1]).convert("RGB").resize((260, 170))
        canvas.paste(ref, (RIGHT_X, TOP_Y))
        canvas.paste(cur, (RIGHT_X, TOP_Y + 190))

        # Captions are drawn just below each crop
        draw.text((RIGHT_X, TOP_Y + 170 + 10), "Reference", font=FONT_LABEL, fill=(120, 120, 120))
        draw.text((RIGHT_X, TOP_Y + 190 + 170 + 20), "Current", font=FONT_LABEL, fill=(120, 120, 120))

    # -----------------------------
    # Bottom text box
    # -----------------------------
    draw.rounded_rectangle(
        [BOX_X1, BOX_Y1, BOX_X2, BOX_Y2],
        radius=18,
        fill=(255, 255, 255),
        outline=(220, 220, 220),
        width=2
    )

    draw.text(
        (BOX_X1 + PAD_X, BOX_Y1 + PAD_TOP),
        "AI Attention-Aware Output",
        font=FONT_LABEL,
        fill=(25, 90, 160)
    )

    # -----------------------------
    # Clean + parse model output
    # -----------------------------
    text = sanitize_model_text(text)

    # First line -> heading (with a default when empty); everything else -> body
    lines = text.split("\n", 1)
    heading = (lines[0].strip() if lines and lines[0].strip() else "Attention Summary")
    body = (lines[1].strip() if len(lines) > 1 else "")

    # -----------------------------
    # Render heading
    # -----------------------------
    # NOTE(review): the heading is drawn as-is, without wrapping or clipping,
    # so a very long first line could extend past the box.
    heading_x = BOX_X1 + PAD_X
    heading_y = BOX_Y1 + 52
    draw.text((heading_x, heading_y), heading, font=FONT_HEADER, fill=(30, 30, 30))

    # -----------------------------
    # Render body (WRAPPED + CLIPPED)
    # -----------------------------
    body_x = BOX_X1 + PAD_X
    body_y = heading_y + 50

    body_max_w = (BOX_X2 - BOX_X1) - 2 * PAD_X
    body_max_h = (BOX_Y2 - body_y) - 14  # bottom padding

    # Wrap body text to the available width
    wrapped = wrap_text(draw, body, FONT_BODY, body_max_w)

    # Fixed vertical step between body lines, in pixels.
    # (load_fonts loads FONT_BODY at size 12, so 24 is generous spacing.)
    line_h = 24

    # Max lines that fit inside the box height
    max_lines = max(0, body_max_h // line_h)

    # Clip lines and add ellipsis if needed
    truncated = len(wrapped) > max_lines
    wrapped = wrapped[:max_lines]

    if truncated and max_lines > 0:
        last = wrapped[-1]
        ell = "..."
        # Trim characters until the last line plus ellipsis fits the width
        while last and draw.textlength(last + ell, font=FONT_BODY) > body_max_w:
            last = last[:-1].rstrip()
        wrapped[-1] = (last + ell) if last else ell

    # Draw the body lines
    y = body_y
    for ln in wrapped:
        draw.text((body_x, y), ln, font=FONT_BODY, fill=(20, 20, 20))
        y += line_h

    canvas.save(save_path)

In [11]:
# ============================================================
# 8) MAIN LOOP
# ============================================================

# For every configured folder: find the src_<idx>.png frames, pair each one
# with its heatmap and AOI crop (plus a reference crop for comparison tasks),
# ask GPT-4o for a description, and save one composite PNG per frame.

# Task types that compare against another folder of the same group, mapped to
# the suffix of that reference folder.
REFERENCE_SUFFIX = {"task3_2": "_3_1", "task3": "_1"}

print("Starting NEAR Pipeline...")

for folder in ALL_FOLDERS:

    frames_dir = os.path.join(BASE_PATH, folder, "frames")
    if not os.path.exists(frames_dir):
        # Say so instead of skipping silently, so a typo in TASK_GROUPS or a
        # missing Drive folder is visible in the output.
        print(f"\nSkipping {folder}: no frames directory at {frames_dir}")
        continue

    group = get_group_name(folder)
    group_out = os.path.join(OUTPUT_ROOT, group)
    os.makedirs(group_out, exist_ok=True)

    task_type = parse_task_type(folder)
    prompt = build_prompt(task_type)

    # Full-frame sources only; the "*_crop.png" files are looked up per index.
    sources = sorted(glob.glob(os.path.join(frames_dir, "src_*.png")))
    sources = [p for p in sources if not p.endswith("_crop.png")]

    print(f"\nProcessing {folder} ({len(sources)} images)")

    # Frames directory of the reference folder (None for non-comparison tasks).
    ref_frames = None
    ref_suffix = REFERENCE_SUFFIX.get(task_type)
    if ref_suffix is not None:
        ref_folder = get_reference_folder(folder, ref_suffix)
        ref_frames = os.path.join(BASE_PATH, ref_folder, "frames")

    for src_path in tqdm(sources, desc=folder):

        # The glob also matches names without a numeric index
        # (e.g. "src_overview.png"); skip those instead of crashing on None.
        idx_match = re.search(r"src_(\d+)\.png$", src_path)
        if idx_match is None:
            tqdm.write(f"[Skip] {src_path}: no numeric frame index in file name")
            continue
        idx = idx_match.group(1)

        heat_path = os.path.join(frames_dir, f"heat_{idx}.png")
        current_crop = prefer_crop(frames_dir, idx)

        # Without a crop or heatmap neither the API call nor the composite can
        # be built (open(None) / a missing file would abort the whole run).
        if current_crop is None or not os.path.exists(heat_path):
            tqdm.write(f"[Skip] {folder} idx {idx}: missing crop or heatmap")
            continue

        reference_crop = None
        if ref_frames is not None:
            reference_crop = prefer_crop(ref_frames, idx)

        # Image order sent to the model: full frame, optional reference, current crop.
        image_inputs = [src_path]

        if reference_crop:
            image_inputs.append(reference_crop)

        image_inputs.append(current_crop)

        result = call_gpt4o(prompt, image_inputs)

        title = f"{group} {task_type.replace('task','task ')} - idx {idx}"

        if reference_crop:
            right_imgs = (reference_crop, current_crop)
        else:
            right_imgs = (current_crop,)

        save_path = os.path.join(group_out, f"{folder}_idx_{idx}.png")

        create_output_image(heat_path, right_imgs, result, save_path, title)

print("\nFinished.")

Starting NEAR Pipeline...

Processing Ayu_1 (19 images)


Ayu_1:   0%|          | 0/19 [00:00<?, ?it/s]


Processing Ayu_2 (13 images)


Ayu_2:   0%|          | 0/13 [00:00<?, ?it/s]


Processing Ayu_3 (25 images)


Ayu_3:   0%|          | 0/25 [00:00<?, ?it/s]


Finished.


# Run all testers

In [12]:
import os
import re
import glob
import time
import base64
from typing import Optional, Tuple

from tqdm.auto import tqdm
from PIL import Image, ImageDraw, ImageFont
from openai import OpenAI
from google.colab import drive, userdata

# ============================================================
# 1) AUTHENTICATION
# ============================================================
# Read the API key from the Colab secret named "OPENAI_KEY" and build the
# OpenAI client that call_gpt4o uses.
OPENAI_API_KEY = userdata.get("OPENAI_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

# Mount Google Drive so the input/output paths below are reachable.
drive.mount("/content/drive")

# ============================================================
# 2) PATH CONFIG
# ============================================================
# BASE_PATH: parent of the per-task folders; the main loop reads
#            <BASE_PATH>/<folder>/frames.
# OUTPUT_ROOT: composites are written to <OUTPUT_ROOT>/<group>/.
BASE_PATH = "/content/drive/My Drive/AI_event_demo/NEAR_Experiment_Design_Output/"
OUTPUT_ROOT = "/content/drive/My Drive/AI_event_demo/GazeLLM"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# One inner list per group prefix (the text before the first "_").
# This cell runs every group.
TASK_GROUPS = [
    ["AT_1", "AT_2", "AT_3_1", "AT_3_2"],
    ["Ayu_1", "Ayu_2", "Ayu_3"],
    ["JC_1", "JC_2", "JC_3_1", "JC_3_2"],
    ["KC_1", "KC_2", "KC_3_1", "KC_3_2"],
    ["LKH_1", "LKH_2", "LKH_3_1", "LKH_3_2"],
    ["SYH_1_simple", "SYH_2_simple", "SYH_3_1_simple", "SYH_3_2_simple"],
    ["YL_1", "YL_2", "YL_3"],
]

# Flat list of every folder name, in TASK_GROUPS order.
ALL_FOLDERS = [f for g in TASK_GROUPS for f in g]

# ============================================================
# 3) FONT SETUP
# ============================================================

def load_fonts():
    """
    Pick a usable TrueType font family for rendering, with a safe fallback.

    Each candidate is a (regular, bold) pair of font file paths. The first
    pair whose two files exist and load successfully wins. If none works,
    PIL's built-in default font is used for all three roles so rendering
    never fails because of fonts.

    Returns:
        (header_font, label_font, body_font): bold 16, bold 12, regular 12
        when a TrueType pair is found; otherwise three default PIL fonts.
    """
    dejavu_dir = "/usr/share/fonts/truetype/dejavu"
    liberation_dir = "/usr/share/fonts/truetype/liberation"
    noto_dir = "/usr/share/fonts/truetype/noto"
    alt_dejavu_dir = "/usr/share/fonts/dejavu"

    # Ordered by preference; each entry is (regular, bold).
    font_pairs = (
        (f"{dejavu_dir}/DejaVuSans.ttf", f"{dejavu_dir}/DejaVuSans-Bold.ttf"),
        (f"{liberation_dir}/LiberationSans-Regular.ttf",
         f"{liberation_dir}/LiberationSans-Bold.ttf"),
        (f"{noto_dir}/NotoSans-Regular.ttf", f"{noto_dir}/NotoSans-Bold.ttf"),
        (f"{alt_dejavu_dir}/DejaVuSans.ttf", f"{alt_dejavu_dir}/DejaVuSans-Bold.ttf"),
    )

    for regular_path, bold_path in font_pairs:
        both_present = os.path.exists(regular_path) and os.path.exists(bold_path)
        if not both_present:
            continue
        try:
            fonts = (
                ImageFont.truetype(bold_path, 16),     # header
                ImageFont.truetype(bold_path, 12),     # label
                ImageFont.truetype(regular_path, 12),  # body
            )
            print(f"[Font] Using: {regular_path} / {bold_path}")
            return fonts
        except Exception as e:
            # A broken font file should not stop the run; try the next pair.
            print(f"[Font] Failed to load {regular_path}: {e}")

    # Nothing usable on disk: fall back to the bundled PIL font.
    print("[Font] Falling back to default PIL font (may look less pretty).")
    return tuple(ImageFont.load_default() for _ in range(3))

# Module-level fonts shared by the rendering code.
FONT_HEADER, FONT_LABEL, FONT_BODY = load_fonts()

# ============================================================
# 4) HELPERS
# ============================================================

def encode_image(path):
    """Return the raw bytes of the file at *path* as a base64 text string."""
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def parse_task_type(folder_name):
    """
    Map a folder name such as "AT_3_1" or "SYH_2_simple" to a task label.

    Any "_simple" marker is ignored. The two-part task-3 markers are checked
    first (anywhere in the name), then the single-digit suffixes. Returns
    "unknown" when nothing matches.
    """
    stem = folder_name.replace("_simple", "")

    # Sub-tasks of task 3 must win over the plain "_1" / "_2" suffix checks.
    for marker, label in (("_3_1", "task3_1"), ("_3_2", "task3_2")):
        if marker in stem:
            return label

    for suffix, label in (("_1", "task1"), ("_2", "task2"), ("_3", "task3")):
        if stem.endswith(suffix):
            return label

    return "unknown"

def get_group_name(folder):
    """Return the group prefix of *folder*: the text before the first "_"."""
    prefix, _sep, _rest = folder.partition("_")
    return prefix

def get_reference_folder(folder, suffix):
    """
    Build the name of the reference folder that belongs to *folder*'s group.

    Args:
        folder: Current folder name, e.g. "AT_3_2" or "SYH_3_2_simple".
        suffix: Task suffix of the reference folder, e.g. "_3_1" or "_1".

    Returns:
        "<group><suffix>", plus "_simple" when *folder* itself ends with
        "_simple". Without this, "SYH_3_2_simple" would resolve to "SYH_3_1",
        which is not one of the configured folders ("SYH_3_1_simple" is), and
        the reference crop would silently never be found.
    """
    # Group prefix = text before the first "_" (same rule as get_group_name).
    group = folder.split("_")[0]
    variant = "_simple" if folder.endswith("_simple") else ""
    return f"{group}{suffix}{variant}"

def prefer_crop(frames_dir, idx):
    """
    Return the path of the crop image for frame *idx* inside *frames_dir*.

    "src_<idx>_crop.png" is preferred; "heat_<idx>_crop.png" is the fallback.
    Returns None when neither file exists.
    """
    for prefix in ("src", "heat"):
        candidate = os.path.join(frames_dir, f"{prefix}_{idx}_crop.png")
        if os.path.exists(candidate):
            return candidate
    return None

def sanitize_model_text(text: str) -> str:
    """
    Remove common markdown artifacts that LLMs sometimes output.

    Strips leading heading markers ("#" to "######"), removes every "*"
    (bold/italic markers), collapses runs of three or more newlines to a
    single blank line, and trims surrounding whitespace.

    Args:
        text: Raw model output; None or "" is accepted.

    Returns:
        The cleaned text ("" for empty input).
    """
    if not text:
        return ""

    # Remove markdown heading markers at the start of a line. Only spaces and
    # tabs are consumed around the hashes: "\s" would also match newlines under
    # MULTILINE and swallow the blank line that separates paragraphs.
    text = re.sub(r"^[ \t]*#{1,6}[ \t]*", "", text, flags=re.MULTILINE)

    # Remove bold/italic markers
    text = text.replace("*", "")

    # Collapse excessive blank lines
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()

# ============================================================
# 5) PROMPT ENGINEERING
# ============================================================

def build_prompt(task_type):
    """
    Assemble the instruction text sent to the model for *task_type*.

    Every prompt starts with the same formatting instructions; a
    task-specific section is appended for the known task types
    ("task1", "task2", "task3_1", "task3_2", "task3"). Any other value
    yields the shared instructions only.
    """
    shared_instructions = """
You are analyzing a user's visual attention in an experiment.

Return:
Line 1: A short heading (<=10 words).
Line 2+: A focused paragraph (3-6 sentences).
No bullet points.
Be concise and emphasize key differences or important objects.
"""

    # (task type, section appended to the shared instructions)
    task_sections = (
        ("task1", """
Task 1:
State what the participant is looking at.
Describe key visual features of the attended object.
"""),
        ("task2", """
Task 2:
State what the participant is doing (e.g., comparing).
Explain similarities and differences being inspected.
"""),
        ("task3_1", """
Task 3_1:
Participant is reviewing the photo.
Describe multiple important objects and their details.
"""),
        ("task3_2", """
Task 3_2:
Compare the REFERENCE image and CURRENT image.
Describe what is different.
"""),
        ("task3", """
Task 3:
Compare REFERENCE (Task1) and CURRENT image.
Explain what is missing and how gaze suggests the participant is reasoning.
"""),
    )

    for known_type, section in task_sections:
        if task_type == known_type:
            return shared_instructions + section
    return shared_instructions

# ============================================================
# 6) GPT-4o CALL
# ============================================================

def call_gpt4o(prompt, image_paths):
    """
    Send *prompt* plus the given images to the "gpt-4o" chat model.

    Args:
        prompt: Instruction text (first content part of the user message).
        image_paths: Paths of image files; each is embedded, in order, as a
            base64 "data:image/png" URL.

    Returns:
        The message content of the first choice in the response.
    """
    content_parts = [{"type": "text", "text": prompt}]
    content_parts.extend(
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{encode_image(image_path)}"
            },
        }
        for image_path in image_paths
    )

    # Single user turn carrying the text and all images.
    user_turn = {"role": "user", "content": content_parts}

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[user_turn],
        max_tokens=600
    )

    first_choice = response.choices[0]
    return first_choice.message.content

# ============================================================
# 7) RENDER OUTPUT IMAGE
# ============================================================

def wrap_text(draw, text, font, max_width):
    """
    Greedily wrap *text* into lines no wider than *max_width*.

    Args:
        draw: Object providing textlength(text, font=...) for measuring,
            as the callers' ImageDraw instance does.
        text: Text to wrap; it is split on any whitespace, so existing line
            breaks are not preserved.
        font: Font passed through to draw.textlength.
        max_width: Maximum line width, in the units textlength returns.

    Returns:
        List of lines. A single word wider than *max_width* is kept whole on
        its own line (it is not split). No empty lines are produced.
    """
    lines = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if draw.textlength(candidate, font=font) <= max_width:
            current = candidate
            continue
        # Flush the line built so far. Skip it when empty: that happens when
        # the very first word is already too wide, and appending "" would
        # render a blank line and waste one of the caller's line slots.
        if current:
            lines.append(current)
        current = word
    if current:
        lines.append(current)
    return lines

def create_output_image(heat_path, right_images, text, save_path, title):
    """
    Create the final composite image and save it to *save_path*.

    Layout (1200x820 canvas):
      - Title at y=8, section labels at y=44
      - Left: heatmap (resized to 640x360) pasted at y=60
      - Right: AOI crop(s): one 260x260 crop, or reference + current
        stacked at 260x170 each
      - Bottom: framed text box spanning y=500..760 with the sanitized,
        wrapped and clipped model text

    Args:
        heat_path: Path of the heatmap image shown on the left.
        right_images: Sequence of crop image paths. Exactly one entry draws a
            single crop; otherwise entries [0] and [1] are drawn as
            "Reference" and "Current".
        text: Raw model output. After sanitizing, the first line becomes the
            heading and the remainder the body.
        save_path: Destination file path for the composite.
        title: Text appended to "Experiment Task: " in the header.
    """

    # -----------------------------
    # Canvas / layout configuration
    # -----------------------------
    CANVAS_W, CANVAS_H = 1200, 820  # overall canvas size
    # Vertical anchors of the top area (title / labels / images).
    TITLE_Y = 8                # Title y
    LABEL_Y = 44               # Section labels y
    TOP_Y = 60                 # Top y where heatmap/AOIs start

    # Heatmap is always resized to this fixed size (aspect ratio not preserved)
    HEAT_W, HEAT_H = 640, 360

    # Right panel x anchor
    RIGHT_X = 740

    # Bottom text box corners (x1, y1) - (x2, y2); the box ends 60 px above
    # the canvas bottom.
    BOX_X1, BOX_Y1 = 24, 500   # top-left corner
    BOX_X2, BOX_Y2 = 1176, 760 # bottom-right corner

    # Text padding inside the box
    PAD_X = 18
    PAD_TOP = 14

    # -----------------------------
    # Load images
    # -----------------------------
    heat_img = Image.open(heat_path).convert("RGB").resize((HEAT_W, HEAT_H))

    canvas = Image.new("RGB", (CANVAS_W, CANVAS_H), (248, 248, 248))
    draw = ImageDraw.Draw(canvas)

    # -----------------------------
    # Header and labels
    # -----------------------------
    draw.text((24, TITLE_Y), f"Experiment Task: {title}", font=FONT_HEADER, fill=(20, 20, 20))
    draw.text((24, LABEL_Y), "User Gaze Context (Heatmap)", font=FONT_LABEL, fill=(90, 90, 90))
    draw.text((RIGHT_X, LABEL_Y), "Extracted AOI", font=FONT_LABEL, fill=(90, 90, 90))

    # -----------------------------
    # Paste heatmap and AOI
    # -----------------------------
    canvas.paste(heat_img, (24, TOP_Y))

    if len(right_images) == 1:
        crop = Image.open(right_images[0]).convert("RGB").resize((260, 260))
        canvas.paste(crop, (RIGHT_X, TOP_Y))
    else:
        # Stacked reference + current crops.
        # NOTE(review): this branch indexes [0] and [1], so an empty
        # right_images would raise IndexError; callers pass 1 or 2 paths.
        ref = Image.open(right_images[0]).convert("RGB").resize((260, 170))
        cur = Image.open(right_images[1]).convert("RGB").resize((260, 170))
        canvas.paste(ref, (RIGHT_X, TOP_Y))
        canvas.paste(cur, (RIGHT_X, TOP_Y + 190))
        # Captions are drawn just below each crop
        draw.text((RIGHT_X, TOP_Y + 170 + 10), "Reference", font=FONT_LABEL, fill=(120, 120, 120))
        draw.text((RIGHT_X, TOP_Y + 190 + 170 + 20), "Current", font=FONT_LABEL, fill=(120, 120, 120))

    # -----------------------------
    # Bottom text box
    # -----------------------------
    draw.rounded_rectangle(
        [BOX_X1, BOX_Y1, BOX_X2, BOX_Y2],
        radius=18,
        fill=(255, 255, 255),
        outline=(220, 220, 220),
        width=2
    )
    draw.text((BOX_X1 + PAD_X, BOX_Y1 + PAD_TOP), "AI Attention-Aware Output", font=FONT_LABEL, fill=(25, 90, 160))

    # -----------------------------
    # Clean + parse model output
    # -----------------------------
    text = sanitize_model_text(text)

    # First line -> heading (with a default when empty); everything else -> body
    lines = text.split("\n", 1)
    heading = (lines[0].strip() if lines and lines[0].strip() else "Attention Summary")
    body = (lines[1].strip() if len(lines) > 1 else "")

    # -----------------------------
    # Render heading (inside box)
    # -----------------------------
    # NOTE(review): the heading is drawn as-is, without wrapping or clipping,
    # so a very long first line could extend past the box.
    heading_x = BOX_X1 + PAD_X
    heading_y = BOX_Y1 + 52
    draw.text((heading_x, heading_y), heading, font=FONT_HEADER, fill=(30, 30, 30))

    # -----------------------------
    # Render body (WRAPPED + CLIPPED)
    # -----------------------------
    body_x = BOX_X1 + PAD_X
    body_y = heading_y + 50

    body_max_w = (BOX_X2 - BOX_X1) - 2 * PAD_X
    body_max_h = (BOX_Y2 - body_y) - 14  # bottom padding

    # Wrap body text to the available width
    wrapped = wrap_text(draw, body, FONT_BODY, body_max_w)

    # Fixed vertical step between body lines, in pixels.
    # (load_fonts loads FONT_BODY at size 12, so 24 is generous spacing.)
    line_h = 24

    # Max lines that fit inside the box height
    max_lines = max(0, body_max_h // line_h)

    # Clip lines and add ellipsis if needed
    truncated = len(wrapped) > max_lines
    wrapped = wrapped[:max_lines]

    if truncated and max_lines > 0:
        last = wrapped[-1]
        ell = "..."
        # Trim characters until the last line plus ellipsis fits the width
        while last and draw.textlength(last + ell, font=FONT_BODY) > body_max_w:
            last = last[:-1].rstrip()
        wrapped[-1] = (last + ell) if last else ell

    # Draw the body lines
    y = body_y
    for ln in wrapped:
        draw.text((body_x, y), ln, font=FONT_BODY, fill=(20, 20, 20))
        y += line_h

    # -----------------------------
    # Save final canvas
    # -----------------------------
    canvas.save(save_path)

# ============================================================
# 8) MAIN LOOP
# ============================================================

# For every configured folder: find the src_<idx>.png frames, pair each one
# with its heatmap and AOI crop (plus a reference crop for comparison tasks),
# ask GPT-4o for a description, and save one composite PNG per frame.

# Task types that compare against another folder of the same group, mapped to
# the suffix of that reference folder.
REFERENCE_SUFFIX = {"task3_2": "_3_1", "task3": "_1"}

print("Starting NEAR Pipeline...")

for folder in ALL_FOLDERS:

    frames_dir = os.path.join(BASE_PATH, folder, "frames")
    if not os.path.exists(frames_dir):
        # Say so instead of skipping silently, so a typo in TASK_GROUPS or a
        # missing Drive folder is visible in the output.
        print(f"\nSkipping {folder}: no frames directory at {frames_dir}")
        continue

    group = get_group_name(folder)
    group_out = os.path.join(OUTPUT_ROOT, group)
    os.makedirs(group_out, exist_ok=True)

    task_type = parse_task_type(folder)
    prompt = build_prompt(task_type)

    # Full-frame sources only; the "*_crop.png" files are looked up per index.
    sources = sorted(glob.glob(os.path.join(frames_dir, "src_*.png")))
    sources = [p for p in sources if not p.endswith("_crop.png")]

    print(f"\nProcessing {folder} ({len(sources)} images)")

    # Frames directory of the reference folder (None for non-comparison tasks).
    ref_frames = None
    ref_suffix = REFERENCE_SUFFIX.get(task_type)
    if ref_suffix is not None:
        ref_folder = get_reference_folder(folder, ref_suffix)
        ref_frames = os.path.join(BASE_PATH, ref_folder, "frames")

    for src_path in tqdm(sources, desc=folder):

        # The glob also matches names without a numeric index
        # (e.g. "src_overview.png"); skip those instead of crashing on None.
        idx_match = re.search(r"src_(\d+)\.png$", src_path)
        if idx_match is None:
            tqdm.write(f"[Skip] {src_path}: no numeric frame index in file name")
            continue
        idx = idx_match.group(1)

        heat_path = os.path.join(frames_dir, f"heat_{idx}.png")
        current_crop = prefer_crop(frames_dir, idx)

        # Without a crop or heatmap neither the API call nor the composite can
        # be built (open(None) / a missing file would abort the whole run).
        if current_crop is None or not os.path.exists(heat_path):
            tqdm.write(f"[Skip] {folder} idx {idx}: missing crop or heatmap")
            continue

        reference_crop = None
        if ref_frames is not None:
            reference_crop = prefer_crop(ref_frames, idx)

        # Image order sent to the model: full frame, optional reference, current crop.
        image_inputs = [src_path]

        if reference_crop:
            image_inputs.append(reference_crop)

        image_inputs.append(current_crop)

        result = call_gpt4o(prompt, image_inputs)

        title = f"{group} {task_type.replace('task','task ')} - idx {idx}"

        if reference_crop:
            right_imgs = (reference_crop, current_crop)
        else:
            right_imgs = (current_crop,)

        save_path = os.path.join(group_out, f"{folder}_idx_{idx}.png")

        create_output_image(heat_path, right_imgs, result, save_path, title)

print("\nFinished.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Font] Using: /usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf / /usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf
Starting NEAR Pipeline...

Processing AT_1 (23 images)


AT_1:   0%|          | 0/23 [00:00<?, ?it/s]


Processing AT_2 (9 images)


AT_2:   0%|          | 0/9 [00:00<?, ?it/s]


Processing AT_3_1 (6 images)


AT_3_1:   0%|          | 0/6 [00:00<?, ?it/s]


Processing AT_3_2 (14 images)


AT_3_2:   0%|          | 0/14 [00:00<?, ?it/s]


Processing Ayu_1 (19 images)


Ayu_1:   0%|          | 0/19 [00:00<?, ?it/s]


Processing Ayu_2 (13 images)


Ayu_2:   0%|          | 0/13 [00:00<?, ?it/s]


Processing Ayu_3 (25 images)


Ayu_3:   0%|          | 0/25 [00:00<?, ?it/s]


Processing JC_1 (18 images)


JC_1:   0%|          | 0/18 [00:00<?, ?it/s]


Processing JC_2 (31 images)


JC_2:   0%|          | 0/31 [00:00<?, ?it/s]


Processing JC_3_1 (9 images)


JC_3_1:   0%|          | 0/9 [00:00<?, ?it/s]


Processing JC_3_2 (13 images)


JC_3_2:   0%|          | 0/13 [00:00<?, ?it/s]


Processing KC_1 (9 images)


KC_1:   0%|          | 0/9 [00:00<?, ?it/s]


Processing KC_2 (15 images)


KC_2:   0%|          | 0/15 [00:00<?, ?it/s]


Processing KC_3_1 (5 images)


KC_3_1:   0%|          | 0/5 [00:00<?, ?it/s]


Processing KC_3_2 (5 images)


KC_3_2:   0%|          | 0/5 [00:00<?, ?it/s]


Processing LKH_1 (12 images)


LKH_1:   0%|          | 0/12 [00:00<?, ?it/s]


Processing LKH_2 (13 images)


LKH_2:   0%|          | 0/13 [00:00<?, ?it/s]


Processing LKH_3_1 (4 images)


LKH_3_1:   0%|          | 0/4 [00:00<?, ?it/s]


Processing LKH_3_2 (9 images)


LKH_3_2:   0%|          | 0/9 [00:00<?, ?it/s]


Processing SYH_1_simple (16 images)


SYH_1_simple:   0%|          | 0/16 [00:00<?, ?it/s]


Processing SYH_2_simple (11 images)


SYH_2_simple:   0%|          | 0/11 [00:00<?, ?it/s]


Processing SYH_3_1_simple (7 images)


SYH_3_1_simple:   0%|          | 0/7 [00:00<?, ?it/s]


Processing SYH_3_2_simple (7 images)


SYH_3_2_simple:   0%|          | 0/7 [00:00<?, ?it/s]


Processing YL_1 (14 images)


YL_1:   0%|          | 0/14 [00:00<?, ?it/s]


Processing YL_2 (9 images)


YL_2:   0%|          | 0/9 [00:00<?, ?it/s]


Processing YL_3 (7 images)


YL_3:   0%|          | 0/7 [00:00<?, ?it/s]


Finished.
