In [1]:
# ========================================
# COLAB HIGH-ACCURACY ICON DETECTION API
# Final Production Version - Full Prompt Mode
# ========================================

# =============================================================================
# STEP 1: INSTALL ALL DEPENDENCIES
# =============================================================================
!pip install -q pyngrok flask transformers accelerate qwen-vl-utils easyocr

# =============================================================================
# STEP 2: CLEANUP & NGROK SETUP
# =============================================================================
!kill -9 $(lsof -t -i:5001) 2>/dev/null || true

from pyngrok import ngrok
import getpass

# Terminate any ngrok process pyngrok knows about before configuring a new
# tunnel (presumably one left over from an earlier run of this cell).
ngrok.kill()

# Point the operator at the dashboard page where the authtoken is shown.
print("="*60)
print("Get your ngrok authtoken from:")
print("https://dashboard.ngrok.com/get-started/your-authtoken")
print("="*60)
# getpass keeps the token from being echoed into the notebook output.
ngrok_token = getpass.getpass("Enter your ngrok authtoken: ")
# Register the token with pyngrok; the tunnel itself is opened later in the cell.
ngrok.set_auth_token(ngrok_token)

# =============================================================================
# STEP 3: IMPORTS & CONSTANTS
# =============================================================================
import torch
import re
import io
import os
import math
import base64
import time
import warnings
from dataclasses import dataclass, field
from typing import Optional, Tuple, List, Dict, Any
from PIL import Image
import numpy as np
from flask import Flask, request, jsonify

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# -----------------------------------------------------------------------------
# CONSTANTS CALIBRATED FOR 1920x1080
# -----------------------------------------------------------------------------
# Reference screen size: used to clamp Detection.bbox and as the default
# DetectionResult.image_size.
SCREEN_WIDTH = 1920
SCREEN_HEIGHT = 1080

# Nominal icon edge lengths in pixels. Only "medium" is read in this cell
# (Detection.bbox uses half of it as the box radius).
ICON_SIZES = {
    "small": 48,
    "medium": 64,
    "large": 96,
    "taskbar": 32,
}

# Edge length in pixels of the square crop handed to the next refinement pass.
# "initial" is used after pass 0 and "refinement" after later passes;
# "final" is not referenced in this cell.
CROP_SIZES = {
    "initial": 512,
    "refinement": 384,
    "final": 256,
}

# Confidence bands. NOTE(review): none of these three are referenced in this
# cell — confirm whether a client relies on them before removing.
CONFIDENCE_HIGH = 0.85
CONFIDENCE_MEDIUM = 0.6
CONFIDENCE_LOW = 0.4

# OCR adjustment applied to a detection's confidence: the boost is added when
# a keyword is found near the point; half of the penalty is subtracted when
# only unrelated text is found (see OCRVerifier.verify_detection).
OCR_MAX_BOOST = 0.1
OCR_MISMATCH_PENALTY = 0.15

# Default pixel distance below which two detections are put in one cluster.
DUPLICATE_DISTANCE_THRESHOLD = 100

# =============================================================================
# STEP 4: DATA STRUCTURES
# =============================================================================
@dataclass
class BoundingBox:
    """Pixel rectangle given by two corners, (x1, y1) and (x2, y2)."""

    x1: int
    y1: int
    x2: int
    y2: int

    @property
    def center(self) -> Tuple[int, int]:
        """Return the midpoint of the box, floor-divided to whole pixels."""
        mid_x = (self.x1 + self.x2) // 2
        mid_y = (self.y1 + self.y2) // 2
        return (mid_x, mid_y)

    @property
    def area(self) -> int:
        """Return width times height; corners are used as given, unclamped."""
        width = self.x2 - self.x1
        height = self.y2 - self.y1
        return width * height


@dataclass
class Detection:
    """One grounding answer from the model plus the scores attached to it."""

    coords: Tuple[int, int]           # point in original-image pixels
    raw_response: str                 # model text the point was parsed from
    vlm_confidence: float             # heuristic score from response parsing
    position_confidence: float        # plausibility score from PositionValidator
    ocr_modifier: float = 0.0         # signed adjustment set by OCRVerifier
    iteration: int = 0                # refinement pass that produced this point
    crop_region: Optional[Tuple[int, int, int, int]] = None
    ocr_texts_found: List[str] = field(default_factory=list)

    @property
    def final_confidence(self) -> float:
        """Blend the scores 70/30, add the OCR modifier, clamp to [0, 1]."""
        blended = self.vlm_confidence * 0.7 + self.position_confidence * 0.3
        shifted = blended + self.ocr_modifier
        capped = min(1.0, shifted)
        return max(0.0, capped)

    @property
    def bbox(self) -> BoundingBox:
        """Return a medium-icon-sized box around coords, clamped to the reference screen."""
        radius = ICON_SIZES["medium"] // 2
        cx = self.coords[0]
        cy = self.coords[1]
        left = max(0, cx - radius)
        top = max(0, cy - radius)
        right = min(SCREEN_WIDTH, cx + radius)
        bottom = min(SCREEN_HEIGHT, cy + radius)
        return BoundingBox(x1=left, y1=top, x2=right, y2=bottom)


@dataclass
class DetectionResult:
    """Outcome of one detect() call, including every intermediate detection."""

    prompt: str
    found: bool
    coords: Optional[Tuple[int, int]]
    confidence: float
    method: str
    detections: List[Detection] = field(default_factory=list)
    ocr_verification: Optional[str] = None
    warnings: List[str] = field(default_factory=list)
    image_size: Tuple[int, int] = (SCREEN_WIDTH, SCREEN_HEIGHT)
    time_seconds: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Flatten the result into the JSON-friendly dict returned by the API.

        "iterations" is the number of detections recorded and "raw_response"
        is the text of the last one (None when there are none).
        """
        if self.coords:
            x_value = self.coords[0]
            y_value = self.coords[1]
        else:
            x_value = None
            y_value = None

        if self.detections:
            last_raw = self.detections[-1].raw_response
        else:
            last_raw = None

        payload: Dict[str, Any] = {}
        payload["found"] = self.found
        payload["x"] = x_value
        payload["y"] = y_value
        payload["confidence"] = round(self.confidence, 3)
        payload["method"] = self.method
        payload["ocr_verification"] = self.ocr_verification
        payload["iterations"] = len(self.detections)
        payload["warnings"] = self.warnings
        payload["time_seconds"] = round(self.time_seconds, 2)
        payload["raw_response"] = last_raw
        return payload

# =============================================================================
# STEP 5: OCR VERIFIER
# =============================================================================
class OCRVerifier:
    """Reads text around a detection with EasyOCR to nudge its confidence.

    OCR never produces coordinates here; it only returns a signed modifier
    that the caller stores on the detection.
    """

    # Minimum OCR confidence for a text fragment to be considered at all.
    MIN_OCR_CONFIDENCE = 0.4
    # Shortest OCR fragment that may count as evidence on its own: shorter
    # fragments are contained in almost any keyword and would match by accident.
    MIN_TEXT_LEN = 3

    def __init__(self):
        self.reader = None
        self._initialized = False
        self._available = False

    def _lazy_init(self) -> bool:
        """Create the EasyOCR reader on first use.

        Returns:
            True when a reader is available. The attempt is made only once;
            later calls return the cached outcome.
        """
        if self._initialized:
            return self._available
        self._initialized = True
        try:
            import easyocr
            self.reader = easyocr.Reader(
                ['en'],
                gpu=torch.cuda.is_available(),
                verbose=False
            )
            self._available = True
            print("  ✓ OCR initialized")
            return True
        except Exception as e:
            print(f"  ⚠ OCR unavailable: {e}")
            self._available = False
            return False

    def verify_detection(
        self,
        image: Image.Image,
        detection: Detection,
        target_keywords: List[str]
    ) -> float:
        """Compare text near the detection with the expected keywords.

        Args:
            image: Full screenshot the detection coordinates refer to.
            detection: Detection to check; its ``ocr_texts_found`` is
                overwritten with the lower-cased fragments that were read.
            target_keywords: Words expected in the icon's label.

        Returns:
            ``OCR_MAX_BOOST`` when a keyword matches a fragment, half of
            ``-OCR_MISMATCH_PENALTY`` when only unrelated text of at least
            ``MIN_TEXT_LEN`` characters is present, and 0.0 otherwise
            (including when OCR is unavailable or fails).
        """
        if not self._lazy_init() or not target_keywords:
            return 0.0

        # Blank keywords would be a substring of every fragment.
        keywords = [k.lower().strip() for k in target_keywords if k and k.strip()]
        if not keywords:
            return 0.0

        try:
            bbox = detection.bbox
            margin = 25  # extra pixels so a label next to the icon is included
            region = (
                max(0, bbox.x1 - margin),
                max(0, bbox.y1 - margin),
                min(image.width, bbox.x2 + margin),
                min(image.height, bbox.y2 + margin)
            )
            results = self.reader.readtext(np.array(image.crop(region)))
            if not results:
                return 0.0

            found_texts = []
            for _, text, conf in results:
                cleaned = text.lower().strip()
                # Drop low-confidence reads and fragments that are empty
                # after stripping.
                if conf > self.MIN_OCR_CONFIDENCE and cleaned:
                    found_texts.append(cleaned)

            detection.ocr_texts_found = found_texts

            for keyword in keywords:
                for text in found_texts:
                    if keyword in text:
                        return OCR_MAX_BOOST
                    # A truncated label ("note" for "notepad") still counts,
                    # but only when it is long enough to be meaningful.
                    if len(text) >= self.MIN_TEXT_LEN and text in keyword:
                        return OCR_MAX_BOOST

            if any(len(t) >= self.MIN_TEXT_LEN for t in found_texts):
                return -OCR_MISMATCH_PENALTY * 0.5

            return 0.0
        except Exception as e:
            # Verification is best-effort: report the problem and leave the
            # detection's confidence untouched.
            print(f"  ⚠ OCR verification failed: {e}")
            return 0.0

# =============================================================================
# STEP 6: POSITION VALIDATOR
# =============================================================================
class PositionValidator:
    """Scores how plausible a point is for the screen area that was asked for."""

    DESKTOP_MARGIN = 10   # pixels from the left/top edge treated as suspicious
    TASKBAR_HEIGHT = 48   # height of the strip at the bottom of the image

    def validate(self, coords: Tuple[int, int], img_size: Tuple[int, int], context: str = "desktop") -> float:
        """Return a plausibility score for ``coords`` inside an image.

        Args:
            coords: (x, y) point to judge.
            img_size: (width, height) of the image the point refers to.
            context: "taskbar", "desktop", or anything else for no preference.

        Returns:
            0.0 for a point outside the image; otherwise a score that depends
            on ``context`` and on whether the point lies in the bottom strip.
        """
        px, py = coords
        width, height = img_size

        inside = 0 <= px < width and 0 <= py < height
        if not inside:
            return 0.0

        in_taskbar = py >= height - self.TASKBAR_HEIGHT

        if context == "taskbar":
            if in_taskbar:
                return 1.0
            return 0.3

        if context != "desktop":
            return 0.8

        if in_taskbar:
            return 0.5

        hugging_edge = px < self.DESKTOP_MARGIN or py < self.DESKTOP_MARGIN
        return 0.7 if hugging_edge else 1.0

# =============================================================================
# STEP 7: SPATIAL CLUSTERER
# =============================================================================
class SpatialClusterer:
    """Groups detections that lie close together and picks one per group."""

    @staticmethod
    def cluster_detections(
        detections: List[Detection],
        threshold: int = DUPLICATE_DISTANCE_THRESHOLD
    ) -> List[List[Detection]]:
        """Group detections around seed points.

        Detections are visited in order. Each one not yet assigned starts a
        new group and claims every other unassigned detection whose Euclidean
        distance to that seed is below ``threshold``. Distances are measured
        to the seed only, not between members.
        """
        if not detections:
            return []

        groups = []
        assigned = set()

        for seed_idx, seed in enumerate(detections):
            if seed_idx in assigned:
                continue

            assigned.add(seed_idx)
            members = [seed]
            seed_x = seed.coords[0]
            seed_y = seed.coords[1]

            for idx, candidate in enumerate(detections):
                if idx in assigned:
                    continue
                dx = seed_x - candidate.coords[0]
                dy = seed_y - candidate.coords[1]
                gap = math.sqrt(dx**2 + dy**2)
                if gap < threshold:
                    members.append(candidate)
                    assigned.add(idx)

            groups.append(members)

        return groups

    @staticmethod
    def select_best(clusters: List[List[Detection]]) -> List[Detection]:
        """Return the highest-``final_confidence`` detection of each cluster."""
        def score(det):
            return det.final_confidence

        winners = []
        for group in clusters:
            winners.append(max(group, key=score))
        return winners

# =============================================================================
# STEP 8: KEYWORD EXTRACTOR
# =============================================================================
# Application names recognised anywhere in a prompt, compared in lower case.
_KNOWN_APPS = (
    "notepad", "chrome", "firefox", "edge", "explorer", "recycle",
    "settings", "calculator", "terminal", "cmd", "powershell",
    "vscode", "word", "excel", "outlook", "teams", "discord",
    "spotify", "steam", "obs", "vlc", "photoshop", "illustrator",
    "blender", "unity", "unreal", "code", "git", "docker",
)

# Whole-word matcher: plain substring tests would find "word" in "password",
# "git" in "digital" or "code" in "vscode".
_KNOWN_APP_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(app) for app in _KNOWN_APPS) + r")\b"
)

# Capitalised words are treated as likely proper names (e.g. "Notepad").
_CAPITALIZED_WORD_PATTERN = re.compile(r"\b[A-Z][a-z]+\b")

# Capitalised words that are instructions rather than names. The last four
# occur in the instruction template that the /detect endpoint wraps around
# each description.
_PROMPT_STOPWORDS = frozenset({
    "locate", "find", "the", "icon", "from", "this", "desktop",
    "screenshot", "return", "center", "coordinates",
    "your", "if", "description", "answer",
})


def extract_keywords(prompt: str) -> List[str]:
    """Extract keywords from prompt for OCR verification.

    Args:
        prompt: Text sent to the model; may be a bare description or a full
            instruction prompt.

    Returns:
        Lower-cased, de-duplicated keywords in sorted order: every known
        application name that appears as a whole word, plus every capitalised
        word that is not an instruction stop-word. Empty for an empty prompt.
    """
    if not prompt:
        return []

    keywords = set(_KNOWN_APP_PATTERN.findall(prompt.lower()))

    for word in _CAPITALIZED_WORD_PATTERN.findall(prompt):
        lowered = word.lower()
        if lowered not in _PROMPT_STOPWORDS:
            keywords.add(lowered)

    # Sorted so identical prompts always give an identical keyword list.
    return sorted(keywords)

# =============================================================================
# STEP 9: MAIN DETECTOR
# =============================================================================
class HighAccuracyIconDetector:
    """
    Icon locator that queries a Qwen2-VL grounding checkpoint with the
    caller's full prompt and refines the answer on progressively smaller crops.

    Each pass asks the model for a point, maps it back to original-image
    pixels and scores it; the best-scoring detection is returned.
    """

    # Coordinate formats accepted in a model response: "(x, y)" is preferred,
    # a bare "x, y" pair is the fallback.
    _COORD_PATTERNS = (
        re.compile(r'\((\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\)'),
        re.compile(r'(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)'),
    )
    # A response that is nothing but one "(x, y)" pair. Whitespace around the
    # comma is tolerated exactly as in the extraction pattern above.
    _CLEAN_ANSWER = re.compile(r'^\(\d+(?:\.\d+)?\s*,\s*\d+(?:\.\d+)?\)$')
    # Phrases that mark the model as unsure of its own answer.
    _HEDGES = ("not sure", "might be", "possibly", "unclear", "cannot find", "don't see")

    def __init__(
        self,
        model_name: Optional[str] = None,
        use_ocr: bool = True,
        max_image_dim: int = 1920,  # No resize for 1080p by default
    ):
        """Configure the detector; the model itself is loaded by load_model().

        Args:
            model_name: Checkpoint id; when None, load_model() picks one from
                the available GPU memory.
            use_ocr: Whether to create an OCRVerifier.
            max_image_dim: Longest image side, in pixels, passed to the model;
                larger screenshots are downscaled first. A falsy value
                disables downscaling.
        """
        self.model_name = model_name
        self.max_image_dim = max_image_dim
        self.model = None
        self.processor = None
        self.device = None

        self.ocr = OCRVerifier() if use_ocr else None
        self.position_validator = PositionValidator()
        self.clusterer = SpatialClusterer()
        self._warnings = []

    def load_model(self) -> bool:
        """Load the model and its processor.

        When no model name was given, the 7B UGround checkpoint is chosen if
        the first GPU reports at least 16 GB (total bytes / 1e9) and the 2B
        checkpoint otherwise.

        Returns:
            True on success; False when transformers is missing or loading
            fails (the reason is printed).
        """
        try:
            from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
        except ImportError:
            print("Install: pip install transformers accelerate qwen-vl-utils")
            return False

        torch.cuda.empty_cache()

        has_gpu = torch.cuda.is_available()
        gpu_memory = 0

        if has_gpu:
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
            print(f"  ✓ GPU: {torch.cuda.get_device_name(0)} ({gpu_memory:.1f} GB)")
        else:
            print("  ⚠ No GPU - will be slow!")

        if self.model_name is None:
            if gpu_memory >= 16:
                self.model_name = "osunlp/UGround-V1-7B"
            else:
                self.model_name = "osunlp/UGround-V1-2B"

        print(f"  Loading {self.model_name}...")
        t0 = time.time()

        try:
            self.model = Qwen2VLForConditionalGeneration.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if has_gpu else torch.float32,
                device_map="auto" if has_gpu else None,
            )
            self.processor = AutoProcessor.from_pretrained(self.model_name)
            # With device_map="auto" the placement is decided by the loader,
            # so ask the model where its parameters ended up.
            self.device = next(self.model.parameters()).device if has_gpu else torch.device("cpu")

            if not has_gpu:
                self.model = self.model.to(self.device)

            print(f"  ✓ Model loaded on {self.device} in {time.time() - t0:.1f}s")
            return True

        except Exception as e:
            print(f"  ✗ Load failed: {e}")
            return False

    def _parse_response(
        self,
        response: str,
        img_w: int,
        img_h: int
    ) -> Tuple[Optional[Tuple[int, int]], float]:
        """Parse coordinates and estimate confidence.

        The first number pair in the response is read as a point on a
        0-1000 grid and scaled to the image size.

        Returns:
            ``(coords, confidence)``. ``coords`` is None (confidence 0.0) when
            no in-range pair is found. Otherwise confidence is 0.9 for a bare
            "(x, y)" answer, 0.5 when the text contains a hedging phrase and
            0.75 in every other case.
        """
        coords = None
        for pattern in self._COORD_PATTERNS:
            match = pattern.search(response)
            if not match:
                continue
            try:
                x = float(match.group(1))
                y = float(match.group(2))
            except (ValueError, IndexError):
                continue
            if 0 <= x <= 1000 and 0 <= y <= 1000:
                pixel_x = int((x / 1000) * img_w)
                pixel_y = int((y / 1000) * img_h)
                # x or y == 1000 would land one past the last pixel.
                pixel_x = max(0, min(pixel_x, img_w - 1))
                pixel_y = max(0, min(pixel_y, img_h - 1))
                coords = (pixel_x, pixel_y)
                break

        if coords is None:
            return None, 0.0

        if self._CLEAN_ANSWER.match(response.strip()):
            return coords, 0.9

        lowered = response.lower()
        if any(hedge in lowered for hedge in self._HEDGES):
            return coords, 0.5

        return coords, 0.75

    def _ground_single(
        self,
        image: Image.Image,
        prompt: str
    ) -> Tuple[Optional[Tuple[int, int]], str, float]:
        """Single grounding query with full prompt.

        Returns:
            ``(coords, response, confidence)`` where ``coords`` are pixels
            within ``image`` (None when nothing could be parsed) and
            ``response`` is the decoded model text.

        Raises:
            RuntimeError: If load_model() has not succeeded yet.
        """
        if self.model is None:
            raise RuntimeError("Model not loaded")

        from qwen_vl_utils import process_vision_info

        if image.mode != 'RGB':
            image = image.convert('RGB')

        img_w, img_h = image.size

        # Use the prompt directly - no modification
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }]

        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(self.device)

        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=32,
                do_sample=False,
            )

        # generate() returns prompt + completion; keep only the completion.
        generated = [out[len(inp):] for inp, out in zip(inputs.input_ids, output)]
        response = self.processor.batch_decode(
            generated,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0].strip()

        coords, confidence = self._parse_response(response, img_w, img_h)
        return coords, response, confidence

    def _compute_crop_region(
        self,
        center: Tuple[int, int],
        crop_size: int,
        img_size: Tuple[int, int]
    ) -> Tuple[int, int, int, int]:
        """Return an ``(x1, y1, x2, y2)`` crop of about ``crop_size`` around ``center``.

        The window is clamped to the image; when clamping shortens it, the
        opposite edge is pushed outwards so the crop keeps the requested size
        whenever the image is large enough.
        """
        img_w, img_h = img_size
        half = crop_size // 2

        x1 = max(0, center[0] - half)
        y1 = max(0, center[1] - half)
        x2 = min(img_w, center[0] + half)
        y2 = min(img_h, center[1] + half)

        if x2 - x1 < crop_size:
            if x1 == 0:
                x2 = min(img_w, crop_size)
            else:
                x1 = max(0, x2 - crop_size)

        if y2 - y1 < crop_size:
            if y1 == 0:
                y2 = min(img_h, crop_size)
            else:
                y1 = max(0, y2 - crop_size)

        return (x1, y1, x2, y2)

    def _resize_for_model(self, image: Image.Image) -> Image.Image:
        """Downscale ``image`` so its longest side is at most ``max_image_dim``.

        The image is returned unchanged when it already fits or when
        ``max_image_dim`` is falsy; the aspect ratio is preserved otherwise.
        """
        longest = max(image.size)
        if not self.max_image_dim or longest <= self.max_image_dim:
            return image
        ratio = self.max_image_dim / longest
        new_size = (
            max(1, round(image.width * ratio)),
            max(1, round(image.height * ratio)),
        )
        return image.resize(new_size, Image.LANCZOS)

    def detect(
        self,
        image: Image.Image,
        prompt: str,
        context: str = "desktop",
        num_iterations: int = 2
    ) -> DetectionResult:
        """
        Main detection with full prompt.

        Args:
            image: Screenshot
            prompt: Full prompt to send to model
            context: "desktop", "taskbar", or "any"
            num_iterations: Refinement passes

        Returns:
            A DetectionResult whose coordinates are in pixels of the image as
            passed in, even when it was downscaled for the model.
            ``found`` is False when the first pass yields no coordinates.
        """
        t0 = time.time()

        if image.mode != 'RGB':
            image = image.convert('RGB')

        orig_w, orig_h = image.size
        # Warnings are collected per call so that concurrent requests cannot
        # mix theirs; the attribute is kept for existing readers.
        run_warnings: List[str] = []
        self._warnings = run_warnings

        # Honour max_image_dim; results are scaled back to original pixels below.
        working_image = self._resize_for_model(image)

        img_w, img_h = working_image.size

        # Extract keywords for OCR
        keywords = extract_keywords(prompt)

        # Detection loop
        detections = []
        current_image = working_image
        current_offset = (0, 0)

        for iteration in range(num_iterations):
            crop_size = CROP_SIZES["initial"] if iteration == 0 else CROP_SIZES["refinement"]

            coords, response, vlm_conf = self._ground_single(current_image, prompt)

            if coords is None:
                if detections:
                    run_warnings.append(
                        f"Refinement pass {iteration} returned no coordinates; "
                        "using earlier result"
                    )
                break

            # Map to global coordinates
            global_x = coords[0] + current_offset[0]
            global_y = coords[1] + current_offset[1]
            global_x = max(0, min(global_x, img_w - 1))
            global_y = max(0, min(global_y, img_h - 1))
            global_coords = (global_x, global_y)

            # Scale back to original image size
            if orig_w != img_w or orig_h != img_h:
                scale_x = orig_w / img_w
                scale_y = orig_h / img_h
                final_coords = (int(global_x * scale_x), int(global_y * scale_y))
            else:
                final_coords = global_coords

            # Position validation
            pos_conf = self.position_validator.validate(final_coords, (orig_w, orig_h), context)

            det = Detection(
                coords=final_coords,
                raw_response=response,
                vlm_confidence=vlm_conf,
                position_confidence=pos_conf,
                iteration=iteration,
            )

            detections.append(det)

            # Prepare crop for next iteration
            if iteration < num_iterations - 1:
                crop_region = self._compute_crop_region(global_coords, crop_size, (img_w, img_h))
                current_image = working_image.crop(crop_region)
                current_offset = (crop_region[0], crop_region[1])
                det.crop_region = crop_region

        # OCR runs on the last detection actually produced, so an early exit
        # from the loop does not skip verification. It reads the full-size
        # image because detection coordinates are in original pixels.
        if detections and self.ocr and keywords:
            last_det = detections[-1]
            last_det.ocr_modifier = self.ocr.verify_detection(image, last_det, keywords)

        elapsed = time.time() - t0

        # Cluster and select best
        if detections:
            clusters = self.clusterer.cluster_detections(detections)
            best_detections = self.clusterer.select_best(clusters)

            if len(clusters) > 1:
                run_warnings.append(f"Found {len(clusters)} clusters")

            best = max(best_detections, key=lambda d: d.final_confidence)

            return DetectionResult(
                prompt=prompt,
                found=True,
                coords=best.coords,
                confidence=best.final_confidence,
                method="ReGround-v2",
                detections=detections,
                ocr_verification="match" if best.ocr_modifier > 0 else (
                    "mismatch" if best.ocr_modifier < 0 else "none"
                ),
                warnings=run_warnings,
                image_size=(orig_w, orig_h),
                time_seconds=elapsed,
            )

        return DetectionResult(
            prompt=prompt,
            found=False,
            coords=None,
            confidence=0.0,
            method="ReGround-v2",
            warnings=run_warnings + ["No valid detection"],
            image_size=(orig_w, orig_h),
            time_seconds=elapsed,
        )

# =============================================================================
# STEP 10: INITIALIZE DETECTOR
# =============================================================================
print("\n" + "="*60)
print("INITIALIZING HIGH-ACCURACY ICON DETECTOR")
print("="*60)

# Module-level detector instance; the Flask handlers defined further down
# read it as a global.
detector = HighAccuracyIconDetector(
    use_ocr=True,
    max_image_dim=1920,  # No resize for 1080p
)

# Stop the cell here rather than start a server that cannot answer.
if not detector.load_model():
    raise RuntimeError("Failed to load model")

# Build the OCR reader now instead of during the first verification call.
# A failure is only printed; /health then reports ocr_enabled as false.
if detector.ocr:
    detector.ocr._lazy_init()

# =============================================================================
# STEP 11: FLASK API
# =============================================================================
app = Flask(__name__)


@app.route('/health', methods=['GET'])
def health():
    """Return a JSON snapshot of the detector's configuration."""
    ocr = detector.ocr
    ocr_ready = ocr is not None and ocr._available

    status = {
        "status": "ok",
        "model": detector.model_name,
        "device": str(detector.device),
        "ocr_enabled": ocr_ready,
        "max_image_dim": detector.max_image_dim,
        "method": "ReGround-v2",
        "mode": "full_prompt",
    }
    return jsonify(status)

@app.route('/detect', methods=['POST'])
def detect_icon():
    """
    Detect icon using full prompt.

    Request JSON:
    {
        "image": "base64_encoded_image",
        "description": "Locate the Notepad icon",
        "context": "desktop",      // optional
        "iterations": 2            // optional, clamped to 1..3
    }

    Also accepts "prompt" or "app_name" as alias for "description".

    Responds with DetectionResult.to_dict() as JSON, 400 with an "error"
    message for a malformed request, or 500 for an unexpected failure.
    """
    try:
        # silent=True yields None for a non-JSON body so that it is reported
        # as a client error below instead of surfacing as a 500.
        data = request.get_json(silent=True)

        if not isinstance(data, dict) or 'image' not in data:
            return jsonify({"error": "Missing 'image' in request"}), 400

        # Accept description, prompt, or app_name
        description = data.get('description') or data.get('prompt') or data.get('app_name')
        if not description or not isinstance(description, str):
            return jsonify({
                "error": "Missing 'description' (or 'prompt' / 'app_name') in request"
            }), 400

        context = data.get('context', 'desktop')
        try:
            iterations = min(3, max(1, int(data.get('iterations', 2))))
        except (TypeError, ValueError):
            return jsonify({"error": "'iterations' must be an integer"}), 400

        # Decode image
        try:
            image_data = base64.b64decode(data['image'])
            image = Image.open(io.BytesIO(image_data))
            image.load()  # force decoding so a truncated file fails here
        except (TypeError, ValueError, OSError) as e:
            return jsonify({"error": f"Invalid 'image': {e}"}), 400

        # PREPEND SYSTEM PROMPT
        prompt = f"""Your task is to help the user identify the precise coordinates (x, y) of a specific area/element/object on the screen based on a description.
- Your response should aim to point to the center or a representative point within the described area/element/object as accurately as possible.
- If the description is unclear or ambiguous, infer the most relevant area or element based on its likely context or purpose.
- Your answer should be a single string (x, y) corresponding to the point of the interest.
Description: {description}
Answer:"""

        print(f"\n[REQUEST] Description: {description[:80]}{'...' if len(description) > 80 else ''}")
        print(f"[IMAGE] Size: {image.size}")
        print(f"[CONFIG] Context: {context}, Iterations: {iterations}")

        # Detect with the prompt built from this request's description.
        result = detector.detect(
            image,
            prompt,
            context=context,
            num_iterations=iterations
        )

        # Log result
        if result.found:
            print(f"  ✓ Found at {result.coords} ({result.confidence:.0%})")
            print(f"  OCR: {result.ocr_verification}")
        else:
            print(f"  ✗ Not found")

        if result.warnings:
            for w in result.warnings:
                print(f"  ⚠ {w}")

        return jsonify(result.to_dict())

    except Exception as e:
        # Last-resort boundary: log the traceback and report a server error.
        print(f"[ERROR] {e}")
        import traceback
        traceback.print_exc()
        return jsonify({"error": str(e)}), 500

# =============================================================================
# STEP 12: START SERVER
# =============================================================================
# Local port Flask listens on and the ngrok tunnel forwards to.
PORT = 5001

print("\n" + "="*60)
print("STARTING API SERVER")
print("="*60)

public_url = ngrok.connect(PORT)
# Prefer the tunnel object's public_url attribute. Fall back to the string
# form for return values without it (a plain URL string, or a repr in which
# the URL is the first quoted part).
url_str = getattr(public_url, "public_url", None)
if not url_str:
    url_str = str(public_url)
    if '"' in url_str:
        url_str = url_str.split('"')[1]

print(f"\n{'='*60}")
print(f"✓ API URL: {url_str}")
print(f"{'='*60}")
# The example uses the "description" key, which the /detect handler reads.
print(f"""
ENDPOINTS:
  GET  {url_str}/health
  POST {url_str}/detect

REQUEST EXAMPLE (full prompt mode):
{{
    "image": "<base64>",
    "description": "Locate the Notepad Windows application icon from this desktop screenshot and return the center coordinates as (x, y)",
    "context": "desktop",
    "iterations": 2
}}

RESPONSE EXAMPLE:
{{
    "found": true,
    "x": 150,
    "y": 200,
    "confidence": 0.87,
    "method": "ReGround-v2",
    "ocr_verification": "match",
    "iterations": 2,
    "warnings": [],
    "time_seconds": 3.2,
    "raw_response": "(78, 185)"
}}
""")
print("="*60)
print("Keep this cell running!")
print("="*60)

# Blocks until interrupted; binds all interfaces so the tunnel can reach it.
app.run(host='0.0.0.0', port=PORT)

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.9/2.9 MB 51.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.2/41.2 MB 16.3 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 180.7/180.7 kB 10.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 978.2/978.2 kB 43.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 300.6/300.6 kB 14.7 MB/s eta 0:00:00
Get your ngrok authtoken from:
https://dashboard.ngrok.com/get-started/your-authtoken
Enter your ngrok authtoken: ··········

INITIALIZING HIGH-ACCURACY ICON DETECTOR


`torch_dtype` is deprecated! Use `dtype` instead!


  ✓ GPU: Tesla T4 (15.8 GB)
  Loading osunlp/UGround-V1-2B...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/4.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/143 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/348 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

  ✓ Model loaded on cuda:0 in 70.4s




  ✓ OCR initialized

STARTING API SERVER

✓ API URL: https://aa6154f92f24.ngrok-free.app

ENDPOINTS:
  GET  https://aa6154f92f24.ngrok-free.app/health
  POST https://aa6154f92f24.ngrok-free.app/detect

REQUEST EXAMPLE (full prompt mode):
{
    "image": "<base64>",
    "prompt": "Locate the Notepad Windows application icon from this desktop screenshot and return the center coordinates as (x, y)",
    "context": "desktop",
    "iterations": 2
}

RESPONSE EXAMPLE:
{
    "found": true,
    "x": 150,
    "y": 200,
    "confidence": 0.87,
    "method": "ReGround-v2",
    "ocr_verification": "match",
    "iterations": 2,
    "time_seconds": 3.2
}

Keep this cell running!
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://172.28.0.12:5001
INFO:werkzeug:Press CTRL+C to quit
