### 0. Importy

In [2]:
# ### 0. Importy i konfiguracja środowiska (CLIP search)
# Funkcja:
# - sprawdza, czy notebook działa na właściwym kernelu (.venv-clip / clip-search),
# - dodaje repo root do sys.path (żeby działało: from src...),
# - ładuje minimalny zestaw importów (bez duplikatów),
# - wycisza tylko wybrane ostrzeżenia (tqdm).

import sys
import subprocess
import warnings
import os
import io
import re
import base64
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import open_clip
import pyarrow as pa
import pyarrow.parquet as pq
from PIL import Image, ImageDraw
from tqdm import tqdm
from tqdm import TqdmWarning
from IPython.display import display, HTML

# Silence only tqdm's own warning category; every other warning stays visible.
warnings.filterwarnings("ignore", category=TqdmWarning)

# 0) Kernel sanity check: stop immediately unless the interpreter path contains ".venv-clip".
exe = sys.executable
if ".venv-clip" in exe:
    print("OK kernel:", exe)
else:
    raise RuntimeError(
        f"Zły kernel: {exe}\n"
        "Wybierz: Kernel → Change Kernel → Python (clip-search)"
    )

# 1) Put the repository root on sys.path BEFORE anything is imported from `src`.
_repo_root = subprocess.check_output(
    ["git", "rev-parse", "--show-toplevel"],
    text=True,
).strip()
REPO_ROOT = Path(_repo_root)
if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)

print("Repo root:", _repo_root)
print("sys.path[0]:", sys.path[0])

# 2) ipywidgets is optional: when it is missing, `widgets` is None and HAS_WIDGETS is False
#    so that later cells can detect the widget-less mode.
try:
    import ipywidgets as widgets
    from IPython.display import display as _display_widgets
except ModuleNotFoundError:
    widgets = None
    HAS_WIDGETS = False
    print("[WARN] ipywidgets nie jest zainstalowane w tym kernelu. Część UI (dropdown/button/upload) będzie niedostępna.")
else:
    HAS_WIDGETS = True

# 3) Helper that downloads raw bytes from GCS: prefer the project module, fall back to the gcloud CLI.
try:
    from src.viz.gcs_cat import gcs_cat_bytes
except ModuleNotFoundError:
    def gcs_cat_bytes(gs_path: str) -> bytes:
        """Download one object via ``gcloud storage cat <gs_path>`` and return its bytes.

        Args:
            gs_path: Object URL passed verbatim to the gcloud CLI.

        Returns:
            The raw stdout of the command.

        Raises:
            FileNotFoundError: If the command exits non-zero or produces no bytes.
                The message always starts with ``gs_path`` so the failing object can
                be identified even when gcloud wrote nothing to stderr.
        """
        r = subprocess.run(["gcloud", "storage", "cat", gs_path], capture_output=True)
        if r.returncode != 0 or not r.stdout:
            detail = r.stderr.decode("utf-8", errors="ignore").strip()[:800]
            # An exit code of 0 with an empty payload leaves stderr empty; say so explicitly.
            raise FileNotFoundError(f"{gs_path}: {detail or 'pusta odpowiedź z gcloud storage cat'}")
        return r.stdout
    print("[WARN] Nie znaleziono src.viz.gcs_cat. Używam fallback: gcloud storage cat ...")

# Report the versions of the key libraries and whether the widget UI is available.
print(
    f"torch: {torch.__version__}",
    f"open_clip: {open_clip.__version__}",
    f"numpy: {np.__version__}",
    f"pyarrow: {pa.__version__}",
    f"pandas: {pd.__version__}",
    f"HAS_WIDGETS: {HAS_WIDGETS}",
    sep="\n",
)

OK kernel: /home/jupyter/ocr-search/.venv-clip/bin/python
Repo root: /home/jupyter/ocr-search
sys.path[0]: /home/jupyter/ocr-search
torch: 2.10.0+cu128
open_clip: 3.2.0
numpy: 1.26.4
pyarrow: 23.0.0
pandas: 2.3.3
HAS_WIDGETS: True


### 2. Wybór folderu w GCS + ścieżki indeksów (per-folder)

In [4]:
# ### 2. Wybór zestawu obrazów do wyszukiwania (folder w GCS)
# Funkcja:
# - wybiera folder w gs://ocr-2026/, który traktujemy jako „zestaw” obrazów,
# - ten folder jest zakresem wyszukiwania: TYLKO w nim budujemy indeks i TYLKO w nim szukamy podobieństw,
# - ustawia GCS_PREFIX (prefix folderu) oraz ścieżki plików indeksów (global + patch) zapisanych per-zestaw.

# Bucket whose folders are offered as selectable image sets.
BUCKET_ROOT = "gs://ocr-2026"
# Optional cap on the number of images, e.g. 200 for a test run; None means everything.
LIMIT_IMAGES = None

# Folders that are not image sets and must not be offered for indexing.
EXCLUDE_FOLDERS = {"referencje"}

# Index files are stored under <repo root>/outputs/clip_index; the directory is created on demand.
REPO_ROOT = Path(
    subprocess.check_output(["git", "rev-parse", "--show-toplevel"], text=True).strip()
)
INDEX_DIR = REPO_ROOT.joinpath("outputs", "clip_index")
INDEX_DIR.mkdir(parents=True, exist_ok=True)

def list_bucket_folders(bucket_root: str) -> list[str]:
    """Return the folder names reported by ``gcloud storage ls <bucket_root>/*``.

    Only output lines that end with "/" are treated as folders; the folder name is
    the last path segment of such a line. Names listed in EXCLUDE_FOLDERS are dropped.

    Args:
        bucket_root: Bucket URL; a trailing "/" is tolerated.

    Returns:
        Sorted list of unique folder names.

    Raises:
        RuntimeError: If gcloud exits non-zero; carries up to 2000 characters of stderr.
    """
    pattern = bucket_root.rstrip("/") + "/*"
    proc = subprocess.run(
        ["gcloud", "storage", "ls", pattern],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        raise RuntimeError(proc.stderr.strip()[:2000])

    # A set removes duplicates before the exclusion filter and the final sort.
    names = {
        entry.rstrip("/").split("/")[-1]
        for entry in map(str.strip, proc.stdout.splitlines())
        if entry.endswith("/")
    }
    return sorted(name for name in names if name not in EXCLUDE_FOLDERS)

def safe_slug(name: str) -> str:
    """Turn a folder name into a token that is safe to embed in a file name.

    Surrounding whitespace is stripped, every run of whitespace becomes one "_",
    and then every run of characters outside ``0-9A-Za-z._-`` becomes one "_".
    """
    collapsed = "_".join(re.split(r"\s+", name.strip()))
    return re.sub(r"[^0-9A-Za-z._-]+", "_", collapsed)

folders = list_bucket_folders(BUCKET_ROOT)
if not folders:
    raise RuntimeError(f"Brak folderów-zestawów w {BUCKET_ROOT}/")


def _apply_folder(folder: str) -> None:
    """Publish the per-set globals for ``folder`` and print them.

    Sets GCS_PREFIX (folder URL inside BUCKET_ROOT) and the two per-set index file
    paths INDEX_GLOBAL_PARQUET / INDEX_PATCH_PARQUET (under INDEX_DIR, named after
    the slug of the folder).
    """
    slug = safe_slug(folder)

    globals()["GCS_PREFIX"] = f"{BUCKET_ROOT.rstrip('/')}/{folder}"
    globals()["INDEX_GLOBAL_PARQUET"] = str(INDEX_DIR / f"clip_global__{slug}.parquet")
    globals()["INDEX_PATCH_PARQUET"]  = str(INDEX_DIR / f"clip_patch__{slug}.parquet")

    print("GCS_PREFIX           =", GCS_PREFIX)
    print("INDEX_GLOBAL_PARQUET =", INDEX_GLOBAL_PARQUET)
    print("INDEX_PATCH_PARQUET  =", INDEX_PATCH_PARQUET)


if widgets is not None:
    folder_dd = widgets.Dropdown(
        options=folders,
        value=folders[0],
        description="Zestaw:",
        layout=widgets.Layout(width="520px"),
    )
    # Dedicated name for this cell's output area. The callback below looks the widget up
    # at call time, and the GLOBAL search cell also assigns a global called `out`; with a
    # shared name, a later dropdown change would clear and write into that other output.
    set_paths_out = widgets.Output()
    out = set_paths_out  # legacy alias, kept in case other cells still read `out`
else:
    # `widgets` is None when ipywidgets could not be imported: run without any UI.
    folder_dd = None
    set_paths_out = None
    out = None
    print(f"[WARN] Brak ipywidgets – używam pierwszego zestawu: {folders[0]}")


def _set_paths(_=None):
    """Apply the currently selected folder (dropdown callback; also called once at startup).

    Without widgets the first listed folder is used, which is the same value the
    dropdown would start with.
    """
    if folder_dd is None:
        _apply_folder(folders[0])
        return
    with set_paths_out:
        set_paths_out.clear_output()
        _apply_folder(folder_dd.value)


if folder_dd is not None:
    folder_dd.observe(_set_paths, names="value")
    display(folder_dd, set_paths_out)
_set_paths()

Dropdown(description='Zestaw:', layout=Layout(width='520px'), options=('other photos', 'photos'), value='other…

Output()

### 4. Wyszukiwanie GLOBAL (podobne zdjęcia) – referencja z GCS (dropdown) + TopK

In [9]:
# ### 4. Wyszukiwanie GLOBAL (podobne zdjęcia) – referencja z GCS (dropdown) + TopK
# Funkcja:
# - pozwala wybrać obraz referencyjny z gs://ocr-2026/referencje/ (albo z bieżącego zestawu GCS_PREFIX),
# - liczy embedding CLIP dla referencji,
# - porównuje do indeksu GLOBAL (df_global) i pokazuje TopK wyników jako kafelki.

import io
import base64

# --- reference-source configuration ---
REFS_PREFIX = "gs://ocr-2026/referencje"
# File extensions (lower-case) that count as images when listing a prefix.
IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".tif", ".tiff", ".webp")

def list_gcs_images(prefix: str) -> list[str]:
    """Recursively list image objects under ``prefix`` via ``gcloud storage ls <prefix>/**``.

    Args:
        prefix: GCS URL of the folder to scan; a trailing "/" is tolerated.

    Returns:
        Sorted list of object URLs whose lower-cased name ends with one of IMAGE_EXTS.
        An empty list is returned when gcloud reports that the pattern matched no
        objects, so callers can show their own "no images" message.

    Raises:
        RuntimeError: For any other non-zero gcloud exit; carries up to 2000
            characters of stderr.
    """
    r = subprocess.run(
        ["gcloud", "storage", "ls", f"{prefix.rstrip('/')}/**"],
        capture_output=True,
        text=True,
    )
    if r.returncode != 0:
        # NOTE(review): gcloud is expected to signal an empty match with the text
        # "matched no objects" — confirm against the installed CLI version. If the
        # wording differs, this simply falls through to the RuntimeError as before.
        if "matched no objects" in r.stderr.lower():
            return []
        raise RuntimeError(r.stderr.strip()[:2000])

    # Blank lines and folder entries (ending in "/") can never end with an image
    # extension, so the extension test alone filters them out.
    return sorted(
        p for p in map(str.strip, r.stdout.splitlines())
        if p.lower().endswith(IMAGE_EXTS)
    )

def _ensure_clip_model():
    # model/preprocess w tej komórce zawsze dostępne (nawet jeśli indeks tylko wczytany)
    if "model" in globals() and "preprocess" in globals() and "device" in globals():
        return

    globals()["device"] = "cuda" if torch.cuda.is_available() else "cpu"
    model_name = "ViT-B-32"
    pretrained = "laion2b_s34b_b79k"
    m, _, pp = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
    m = m.to(device).eval()
    globals()["model"] = m
    globals()["preprocess"] = pp
    globals()["clip_model_id"] = f"{model_name}::{pretrained}"

def _embed_pil(img: Image.Image) -> np.ndarray:
    """Encode one PIL image with the CLIP image encoder.

    The image is converted to RGB, run through the global ``preprocess`` and
    ``model.encode_image`` without gradient tracking, divided by its norm along the
    last axis, and returned as a float32 NumPy array with the leading batch axis
    squeezed away.
    """
    _ensure_clip_model()
    batch = preprocess(img.convert("RGB")).unsqueeze(0)
    batch = batch.to(device)
    with torch.no_grad():
        feats = model.encode_image(batch)
        unit = feats / feats.norm(dim=-1, keepdim=True)
    vec = unit.squeeze(0).detach().cpu().numpy()
    return vec.astype(np.float32)

def _decode_emb_f16(b: bytes, dim: int) -> np.ndarray:
    v = np.frombuffer(b, dtype=np.float16, count=int(dim)).astype(np.float32)
    # embeddings były normalizowane przy zapisie, ale defensywnie:
    n = np.linalg.norm(v) + 1e-12
    return v / n

_GLOBAL_M = None
_GLOBAL_META = None

def _get_global_matrix():
    global _GLOBAL_M, _GLOBAL_META

    if _GLOBAL_M is not None and _GLOBAL_META is not None:
        return _GLOBAL_M, _GLOBAL_META

    if "df_global" not in globals() or df_global is None or len(df_global) == 0:
        raise RuntimeError("Brak df_global. Uruchom najpierw komórkę 3 (indeks GLOBAL).")

    need_cols = ["gcs_path", "file_name", "emb_f16", "dim"]
    for c in need_cols:
        if c not in df_global.columns:
            raise KeyError(f"df_global nie ma kolumny '{c}'. Ma: {df_global.columns.tolist()}")

    dff = df_global[df_global["emb_f16"].notna()].copy()
    if len(dff) == 0:
        raise RuntimeError("df_global nie zawiera żadnych embeddingów (emb_f16).")

    embs = []
    meta = []
    for _, r in dff.iterrows():
        b = r["emb_f16"]
        dim = int(r["dim"])
        try:
            v = _decode_emb_f16(b, dim)
            embs.append(v)
            meta.append((str(r["gcs_path"]), str(r["file_name"])))
        except Exception:
            # pomiń uszkodzone rekordy
            continue

    if not embs:
        raise RuntimeError("Nie udało się zdekodować żadnych embeddingów z df_global.")

    M = np.vstack(embs).astype(np.float32)  # [N, D]
    _GLOBAL_M = M
    _GLOBAL_META = meta
    return _GLOBAL_M, _GLOBAL_META

def _thumb_data_uri(gs_path: str, max_side: int) -> str:
    """Fetch an image from GCS and return it as a JPEG ``data:`` URI thumbnail.

    The image is converted to RGB and scaled down (never up) so that its longer side
    is at most ``max_side`` pixels, then encoded as JPEG (quality 85) and base64.

    Args:
        gs_path: Object URL passed to ``gcs_cat_bytes``.
        max_side: Maximum length of the longer side, in pixels.

    Returns:
        A string starting with ``data:image/jpeg;base64,``.
    """
    b = gcs_cat_bytes(gs_path)
    img = Image.open(io.BytesIO(b)).convert("RGB")
    W, H = img.size
    scale = min(max_side / max(W, H), 1.0)
    # Clamp to 1 px: for very elongated images int() would truncate the short side
    # to 0, and resizing to a zero-sized image fails.
    nw = max(1, int(W * scale))
    nh = max(1, int(H * scale))
    if (nw, nh) != (W, H):
        img = img.resize((nw, nh))
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=85)
    return "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode("ascii")

def _render_cards(results: list[dict], max_side: int):
    """Display search results as a wrapping row of thumbnail cards.

    Args:
        results: Items with the keys ``gcs_path``, ``file_name`` and ``score``.
        max_side: Thumbnail size limit, forwarded to ``_thumb_data_uri`` and used
            for the card and image widths.

    Each card shows the file name, the similarity formatted to three decimals, and
    the thumbnail. The file name is HTML-escaped because it comes from object names
    and may contain characters such as ``<`` or ``&`` that would break the markup.
    """
    from html import escape  # stdlib; imported locally so the cell needs no extra import

    cards = []
    for it in results:
        uri = _thumb_data_uri(it["gcs_path"], max_side=max_side)
        score = it["score"]
        fn = escape(str(it["file_name"]))
        cards.append(f"""
        <div style="width:{max_side+40}px; margin:10px;">
            <div style="font-size:12px; margin-bottom:6px;">
                <b>{fn}</b><br/>
                sim: {score:.3f}
            </div>
            <img src="{uri}" style="max-width:{max_side}px; border:1px solid #ddd;" />
        </div>
        """)
    markup = "<div style='display:flex; flex-wrap:wrap; align-items:flex-start;'>" + "\n".join(cards) + "</div>"
    display(HTML(markup))

# --- UI: reference picker and search controls ---
ref_src = widgets.Dropdown(
    description="Źródło:",
    options=[
        ("referencje (GCS)", "refs"),
        ("bieżący zestaw (GCS_PREFIX)", "set"),
    ],
    value="refs",
    layout=widgets.Layout(width="520px"),
)

# Starts empty; the refresh callback fills it with the images found under the chosen source.
ref_dd = widgets.Dropdown(
    description="Referencja:",
    options=[],
    layout=widgets.Layout(width="820px"),
)

topk = widgets.IntSlider(
    description="TopK:",
    value=20,
    min=5,
    max=80,
    step=5,
    layout=widgets.Layout(width="520px"),
)
max_side = widgets.Dropdown(
    description="Miniatura:",
    options=[360, 480, 720],
    value=480,
    layout=widgets.Layout(width="260px"),
)
btn = widgets.Button(
    description="Szukaj (GLOBAL)",
    layout=widgets.Layout(width="220px"),
)
# Output area that the callbacks of this cell print into.
out = widgets.Output()

def _refresh_ref_list(_=None):
    """Reload the reference dropdown from the selected source (observer callback).

    The source is REFS_PREFIX when ``ref_src`` is "refs", otherwise the current
    GCS_PREFIX. Messages are printed into ``out``, which is cleared first. When no
    images are found a warning is printed and the dropdown is emptied.

    Labels are the object path relative to the listed prefix. The listing is
    recursive, so a bare file name would make same-named files from different
    subfolders indistinguishable; for files directly under the prefix the label is
    still just the file name.
    """
    with out:
        out.clear_output()
        base = REFS_PREFIX if ref_src.value == "refs" else GCS_PREFIX
        paths = list_gcs_images(base)
        if not paths:
            print(f"[WARN] Brak obrazów pod: {base}")
            ref_dd.options = []
            return
        root = base.rstrip("/") + "/"
        opts = [
            (p[len(root):] if p.startswith(root) else p.split("/")[-1], p)
            for p in paths
        ]
        ref_dd.options = opts
        ref_dd.value = opts[0][1]
        print("Referencje:", len(paths), "| base:", base)

def _on_search(_):
    """Run the GLOBAL similarity search for the selected reference (button callback).

    Steps: download and embed the reference image, score it against every row of the
    global embedding matrix with a dot product, keep the TopK highest scores in
    descending order, and render them as thumbnail cards. All output goes to ``out``,
    which is cleared first.
    """
    with out:
        out.clear_output()

        if not ref_dd.value:
            print("[WARN] Brak wybranej referencji.")
            return

        # 1) embedding of the reference image
        ref_path = ref_dd.value
        ref_img = Image.open(io.BytesIO(gcs_cat_bytes(ref_path))).convert("RGB")
        q = _embed_pil(ref_img)  # [D]

        # 2) matrix of global embeddings
        M, meta = _get_global_matrix()  # M:[N,D], meta:[(gcs_path,file_name)]
        if M.shape[1] != q.shape[-1]:
            # An index built with a different CLIP model has another dimension;
            # report it clearly instead of failing inside the matrix product.
            print(
                f"[WARN] Wymiar embeddingu referencji ({q.shape[-1]}) różni się od "
                f"wymiaru indeksu ({M.shape[1]}). Przebuduj indeks GLOBAL tym samym modelem CLIP."
            )
            return
        sims = M @ q  # dot product; equals cosine similarity when both sides are unit-length

        # 3) TopK: argpartition picks the k best in linear time, then only those are sorted.
        k = min(int(topk.value), len(sims))
        idxs = np.argpartition(-sims, kth=k - 1)[:k]
        idxs = idxs[np.argsort(-sims[idxs])]

        results = []
        for i in idxs:
            gcs_path, file_name = meta[int(i)]
            results.append({"gcs_path": gcs_path, "file_name": file_name, "score": float(sims[int(i)])})

        print("Referencja:", ref_path)
        print("Wyniki:", len(results))
        _render_cards(results, max_side=int(max_side.value))

# Wire the callbacks: run the search on click, re-list references when the source changes.
btn.on_click(_on_search)
ref_src.observe(_refresh_ref_list, names="value")

# Layout: source picker, reference picker, one row of search controls, then the output area.
display(
    widgets.VBox(
        [
            widgets.HBox([ref_src]),
            widgets.HBox([ref_dd]),
            widgets.HBox([topk, max_side, btn]),
            out,
        ]
    )
)

# Fill the reference dropdown for the initially selected source.
_refresh_ref_list()

VBox(children=(HBox(children=(Dropdown(description='Źródło:', layout=Layout(width='520px'), options=(('referen…