In [None]:
"""
CLIP – NPU demo (no torch, no qai-hub-models)
Screenshot → QNN → similarity table
"""
from __future__ import annotations
from pathlib import Path
import platform, time

import cv2 as cv
import numpy as np
from PIL import ImageGrab
import onnxruntime as ort
from transformers import CLIPTokenizer

# ── 0.  Environment info ────────────────────────────────────────────────
print("Python arch :", platform.machine())          # arm64 or AMD64
print("ONNX RT     :", ort.__version__)

# ── 1.  Build QNN session ───────────────────────────────────────────────
onnx_path = Path("openai_clip.onnx")                 # QDQ / quantised!
assert onnx_path.exists(), f"Missing {onnx_path}"

ort_dir  = Path(ort.__file__).parent
qnn_dll  = ort_dir / "capi" / "QnnHtp.dll"

sess = ort.InferenceSession(
    onnx_path,
    providers=[("QNNExecutionProvider", {"backend_path": str(qnn_dll)}),
               "CPUExecutionProvider"]
)
print("Providers   :", sess.get_providers())

img_name = sess.get_inputs()[0].name                 # "images"
txt_name = sess.get_inputs()[1].name                 # "texts"
out_name = sess.get_outputs()[0].name                # "similarities"

# ── 2.  Tokeniser (pure Python) ─────────────────────────────────────────
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")

def encode_text(prompt: str) -> np.ndarray:
    """Return a (1,77) int32 tensor for CLIP."""
    ids = tokenizer(prompt,
                    padding="max_length",
                    truncation=True,
                    max_length=77,
                    return_tensors="np").input_ids
    return ids.astype(np.int32)

# ── 3.  Image pre-processing (OpenCV/NumPy) ─────────────────────────────
def preprocess_image(pil_img) -> np.ndarray:
    """PIL → (1,3,224,224) float32, NCHW, normalized."""
    img = cv.cvtColor(np.array(pil_img), cv.COLOR_RGB2BGR)
    img = cv.resize(img, (224, 224), interpolation=cv.INTER_CUBIC)
    img = img[:, :, ::-1]  # BGR→RGB
    img = img.astype(np.float32) / 255.0
    img = (img - [0.485,0.456,0.406]) / [0.229,0.224,0.225]
    chw  = np.transpose(img, (2,0,1))[None]          # (1,3,224,224)
    return chw.astype(np.float32)

# ── 4.  Capture screen once ─────────────────────────────────────────────
print("📸  Capturing screenshot …")
time.sleep(1)
pil_shot = ImageGrab.grab()
print("✅  Screenshot captured.")

img_tensor = preprocess_image(pil_shot)

# ── 5.  Label list  – edit freely ───────────────────────────────────────
labels = [
    "a web browser", "a terminal", "a YouTube video", "a game", "a cat", "a spreadsheet"
]


Python arch : ARM64
ONNX RT     : 1.22.0
Providers   : ['QNNExecutionProvider', 'CPUExecutionProvider']
📸  Capturing screenshot …
✅  Screenshot captured.

🧠  Classification result
   → label        : a code editor
   → score (×100) : 31.50
   → probability  : 15.42%

🔍  Full table
   - a web browser       :  25.39   (14.51%)
   - a terminal          :  27.00   (14.74%)
   - a YouTube video     :  19.73   (13.71%)
   - a code editor       :  31.50   (15.42%)
   - a game              :  20.45   (13.81%)
   - a cat               :  20.28   (13.79%)
   - a spreadsheet       :  21.97   (14.02%)


In [9]:

scores = []
for lbl in labels:
    txt_tensor = encode_text(lbl)
    sim = sess.run([out_name],
                   {img_name: img_tensor,
                    txt_name: txt_tensor})[0]        # (1,1) ×100
    scores.append(float(sim.squeeze()))

scores = np.array(scores)
probs  = np.exp(scores/100 - scores.max()/100)
probs /= probs.sum()

best = int(scores.argmax())
print("\n🧠  Classification result")
print(f"   → label        : {labels[best]}")
print(f"   → score (×100) : {scores[best]:.2f}")
print(f"   → probability  : {probs[best]:.2%}")

print("\n🔍  Full table")
for lbl, sc, pb in zip(labels, scores, probs):
    print(f"   - {lbl:20s}: {sc:6.2f}   ({pb:.2%})")


🧠  Classification result
   → label        : a code editor
   → score (×100) : 31.50
   → probability  : 15.42%

🔍  Full table
   - a web browser       :  25.39   (14.51%)
   - a terminal          :  27.00   (14.74%)
   - a YouTube video     :  19.73   (13.71%)
   - a code editor       :  31.50   (15.42%)
   - a game              :  20.45   (13.81%)
   - a cat               :  20.28   (13.79%)
   - a spreadsheet       :  21.97   (14.02%)


In [2]:
from __future__ import annotations
from pathlib import Path
import platform, time

import cv2 as cv
# import numpy as np


: 

In [1]:
import os   # standard library

# 0 = keep OpenCV’s baseline-feature guard ON
# 1 = skip the guard entirely
os.environ["OPENCV_SKIP_CPU_BASELINE_CHECK"] = "1"

from pathlib import Path
import platform, time
import cv2 as cv          # safe to import now

print("OpenCV", cv.getVersionString(), "on", platform.machine())


OpenCV 4.12.0 on ARM64
