In [1]:
!pip install python-dotenv requests Pillow numpy soundfile librosa ffmpeg-python tqdm open-clip-torch torch transformers sentence-transformers
# ensure ffmpeg binary installed on the system (apt/brew)





[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
#!/usr/bin/env python3
"""
ensemble_pipeline.py

Complete pipeline:
- parse prompt with LLM (OpenAI HTTP)
- fetch from Pexels (images+videos) and Freesound (audio)
- preprocess in-memory to CLIP-ready tensors
- embed with primary CLIP (open_clip) and a secondary model (ALIGN or other)
- compute per-model similarity to prompt and an ensemble ranking
- optional BLIP caption re-rank (disabled by default)

Configure via environment variables or .env file:
  OPENAI_API_KEY, PEXELS_API_KEY, FREESOUND_API_KEY
Optional:
  SECOND_MODEL = "align" or name like "ViT-L-14" (open_clip)
  ENABLE_BLIP = "1" to enable BLIP caption scoring (requires more downloads)
"""

import os
import io
import json
import math
import time
import requests
import numpy as np
from PIL import Image
from typing import List, Dict, Optional
from dotenv import load_dotenv
from tqdm import tqdm

# audio libs
import soundfile as sf
import librosa
import ffmpeg

# ML libs
import torch
import open_clip

# optional HF models for ALIGN or others
from transformers import AutoProcessor, AutoModelForVision2Seq, AutoModel, AutoTokenizer
# BLIP (optional)
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load .env if present
load_dotenv()

# -------------------------
# Config / env
# -------------------------
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PEXELS_KEY = os.getenv("PEXELS_API_KEY")
FREESOUND_KEY = os.getenv("FREESOUND_API_KEY")
SECOND_MODEL = os.getenv("SECOND_MODEL", "ViT-L-14")  # use "align" to attempt HF align
ENABLE_BLIP = os.getenv("ENABLE_BLIP", "0") == "1"

if not OPENAI_API_KEY:
    raise RuntimeError("Set OPENAI_API_KEY in environment (or load .env with load_dotenv()).")
if not PEXELS_KEY:
    print("[warn] No PEXELS_API_KEY set; Pexels fetcher will return empty.")
if not FREESOUND_KEY:
    print("[warn] No FREESOUND_API_KEY set; Freesound fetcher will return empty.")

# -------------------------
# LLM prompt parser (OpenAI HTTP)
# -------------------------
_SYSTEM_PROMPT = """You are an assistant that converts an unstructured user prompt into a structured search plan for media assets (images, videos, audio). Always consider all three asset types and decide which are useful for the prompt.

Your output MUST be valid JSON only with exactly these fields:
- clean_prompt: (string) a cleaned, concise query suitable for search engines like Pexels, Unsplash, or Freesound
- num_images: (integer) number of image assets to fetch (0–10)
- num_videos: (integer) number of video assets to fetch (0–10)
- num_audio: (integer) number of audio assets to fetch (0–10)
- notes: (string) brief reasoning why you chose these asset types and counts

Guidelines (methods you MUST follow):
1. Parse the user's intent. If the user explicitly mentions a medium (e.g., 'background music'), allocate more to that medium.
2. If the prompt is visually focused (people, landscapes, actions), include images and consider videos (default at least 1 video for visual scenes).
3. If the prompt implies sound (birds chirping, music, crowd noise), include audio (music, SFX, ambience).
4. Decide numbers based on diversity: broad/ambiguous prompts -> larger counts (up to 10); very specific prompts -> fewer (1–4).
5. Keep results practical: use integers, clamp counts to 0..10.
6. Output JSON ONLY. Do NOT include extra text or explanation.
"""

def _clamp_int(v, lo=0, hi=10):
    try:
        vi = int(v)
    except Exception:
        return lo
    if vi < lo: return lo
    if vi > hi: return hi
    return vi

def parse_prompt_with_openai_http(raw_prompt: str) -> Dict:
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    user_message = f"User prompt: {raw_prompt}\n\nReturn JSON only."
    payload = {
        "model": "gpt-4o-mini",   # change if needed
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": user_message}
        ],
        "temperature": 0.0,
        "max_tokens": 300,
    }
    try:
        r = requests.post(url, headers=headers, json=payload, timeout=20)
        r.raise_for_status()
        data = r.json()
        text = data["choices"][0]["message"]["content"]
        # extract JSON blob robustly
        if "{" in text and "}" in text:
            start = text.index("{")
            end = text.rindex("}") + 1
            json_str = text[start:end]
            j = json.loads(json_str)
        else:
            j = json.loads(text)
        clean_prompt = str(j.get("clean_prompt") or raw_prompt).strip()
        num_images = _clamp_int(j.get("num_images", 0))
        num_videos = _clamp_int(j.get("num_videos", 0))
        num_audio  = _clamp_int(j.get("num_audio", 0))
        notes = str(j.get("notes") or "").strip()
        if (num_images + num_videos + num_audio) == 0:
            low = raw_prompt.lower()
            if any(w in low for w in ("music","song","sound","audio","ambient","sfx","effect")):
                num_audio = max(2, num_audio)
            if any(w in low for w in ("photo","image","picture","video","clip","scene","shot","portrait","landscape")):
                num_images = max(3, num_images); num_videos = max(1, num_videos)
            if (num_images + num_videos + num_audio) == 0:
                num_images = 3; num_videos = 1
        return {
            "clean_prompt": clean_prompt,
            "num_images": num_images,
            "num_videos": num_videos,
            "num_audio": num_audio,
            "notes": notes
        }
    except Exception as e:
        print("[warn] parse prompt failed:", e)
        raw = raw_prompt.strip()
        low = raw.lower()
        if any(w in low for w in ("music","song","sound","audio","ambient","sfx","effect")):
            return {"clean_prompt": raw, "num_images":0, "num_videos":0, "num_audio":6, "notes":"fallback audio"}
        else:
            return {"clean_prompt": raw, "num_images":5, "num_videos":2, "num_audio":1, "notes":"fallback visual"}

# -------------------------
# Fetchers: Pexels & Freesound (metadata only + thumbnail/preview URLs)
# -------------------------
def unified_record(provider, id, typ, title, description, url, thumbnail_url=None,
                   duration=None, uploader=None, published_at=None, tags=None, raw=None):
    return {
        "provider": provider,
        "id": str(id),
        "type": typ,  # image|video|audio
        "title": title or "",
        "description": description,
        "url": url,
        "thumbnail_url": thumbnail_url,
        "duration": duration,
        "uploader": uploader,
        "published_at": published_at,
        "tags": tags or [],
        "raw_meta": raw or {}
    }

def search_pexels_images(prompt: str, per_page: int = 10) -> List[Dict]:
    if not PEXELS_KEY:
        return []
    endpoint = "https://api.pexels.com/v1/search"
    headers = {"Authorization": PEXELS_KEY}
    params = {"query": prompt, "per_page": per_page}
    r = requests.get(endpoint, headers=headers, params=params, timeout=15)
    r.raise_for_status()
    out = []
    for it in r.json().get("photos", []):
        out.append(unified_record(
            provider="pexels",
            id=it.get("id"),
            typ="image",
            title=it.get("alt") or "",
            description=None,
            url=(it.get("src") or {}).get("original"),
            thumbnail_url=(it.get("src") or {}).get("medium"),
            duration=None,
            uploader=it.get("photographer"),
            published_at=None,
            tags=[],
            raw=it
        ))
    return out

def search_pexels_videos(prompt: str, per_page: int = 8) -> List[Dict]:
    if not PEXELS_KEY:
        return []
    endpoint = "https://api.pexels.com/videos/search"
    headers = {"Authorization": PEXELS_KEY}
    params = {"query": prompt, "per_page": per_page}
    r = requests.get(endpoint, headers=headers, params=params, timeout=15)
    r.raise_for_status()
    out = []
    for it in r.json().get("videos", []):
        file_url = None
        for vf in it.get("video_files", []) or []:
            if vf.get("quality") == "hd":
                file_url = vf.get("link"); break
        if not file_url and it.get("video_files"):
            file_url = it.get("video_files")[0].get("link")
        out.append(unified_record(
            provider="pexels",
            id=it.get("id"),
            typ="video",
            title=(it.get("user") or {}).get("name") or str(it.get("id")),
            description=it.get("url"),
            url=file_url,
            thumbnail_url=it.get("image"),
            duration=it.get("duration"),
            uploader=(it.get("user") or {}).get("name"),
            published_at=None,
            tags=[],
            raw=it
        ))
    return out

def search_freesound(prompt: str, per_page: int = 10) -> List[Dict]:
    if not FREESOUND_KEY:
        return []
    endpoint = "https://freesound.org/apiv2/search/text/"
    headers = {"Authorization": f"Token {FREESOUND_KEY}"}
    params = {"query": prompt, "page_size": per_page}
    r = requests.get(endpoint, headers=headers, params=params, timeout=15)
    r.raise_for_status()
    out = []
    for it in r.json().get("results", []):
        preview = (it.get("previews") or {}).get("preview-hq-mp3") or (it.get("previews") or {}).get("preview-lq-mp3")
        out.append(unified_record(
            provider="freesound",
            id=it.get("id"),
            typ="audio",
            title=it.get("name"),
            description=it.get("description"),
            url=preview,
            thumbnail_url=None,
            duration=it.get("duration"),
            uploader=it.get("username"),
            published_at=it.get("created"),
            tags=it.get("tags") or [],
            raw=it
        ))
    return out

def fetch_assets(clean_prompt: str, num_images:int, num_videos:int, num_audio:int) -> List[Dict]:
    results = []
    if num_images > 0:
        imgs = search_pexels_images(clean_prompt, per_page=num_images)
        results.extend(imgs[:num_images])
    if num_videos > 0:
        vids = search_pexels_videos(clean_prompt, per_page=num_videos)
        results.extend(vids[:num_videos])
    if num_audio > 0:
        aud = search_freesound(clean_prompt, per_page=num_audio)
        results.extend(aud[:num_audio])
    return results

# -------------------------
# In-memory download & preprocessing
# -------------------------
def download_bytes(url: str, timeout=20) -> Optional[bytes]:
    if not url:
        return None
    try:
        r = requests.get(url, timeout=timeout, stream=True)
        r.raise_for_status()
        return r.content
    except Exception as e:
        print("[warn] download failed:", e)
        return None

def download_image_pil(url: str, timeout=15) -> Optional[Image.Image]:
    b = download_bytes(url, timeout=timeout)
    if not b:
        return None
    try:
        img = Image.open(io.BytesIO(b)).convert("RGB")
        return img
    except Exception as e:
        print("[warn] PIL open failed:", e)
        return None

def center_crop_and_resize_pil(img: Image.Image, size:int=224) -> Image.Image:
    w,h = img.size
    m = min(w,h)
    left = (w-m)//2
    top = (h-m)//2
    img = img.crop((left, top, left+m, top+m))
    img = img.resize((size,size), Image.LANCZOS)
    return img

def pil_to_clip_tensor(img: Image.Image, size:int=224, normalize:bool=True) -> Optional[np.ndarray]:
    if img is None:
        return None
    img = center_crop_and_resize_pil(img, size=size)
    arr = np.asarray(img).astype(np.float32)/255.0  # H W C
    arr = np.transpose(arr,(2,0,1)).copy()  # C H W
    if normalize:
        arr = arr*2.0 - 1.0
    return arr

def decode_audio_bytes_to_waveform(audio_bytes: bytes, target_sr:int=16000, duration:float=5.0) -> Optional[np.ndarray]:
    if not audio_bytes:
        return None
    try:
        bio = io.BytesIO(audio_bytes)
        data, sr = sf.read(bio, dtype='float32')
        if data.ndim > 1:
            data = np.mean(data, axis=1)
        if sr != target_sr:
            data = librosa.resample(data, sr, target_sr)
        desired = int(target_sr * duration)
        if len(data) > desired:
            start = max(0, (len(data)-desired)//2)
            data = data[start:start+desired]
        elif len(data) < desired:
            data = np.concatenate([data, np.zeros(desired - len(data), dtype=np.float32)])
        return data.astype(np.float32)
    except Exception as e:
        try:
            proc = (
                ffmpeg.input('pipe:0')
                .output('pipe:1', format='f32le', ar=target_sr, ac=1)
                .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True)
            )
            out, err = proc.communicate(input=audio_bytes)
            if proc.returncode != 0:
                raise RuntimeError("ffmpeg decode fail")
            data = np.frombuffer(out, dtype=np.float32)
            desired = int(target_sr * duration)
            if len(data) > desired:
                start = max(0, (len(data)-desired)//2)
                data = data[start:start+desired]
            elif len(data) < desired:
                data = np.concatenate([data, np.zeros(desired - len(data), dtype=np.float32)])
            return data.astype(np.float32)
        except Exception as e2:
            print("[warn] audio decode failed:", e, e2)
            return None

def waveform_to_mel_image_tensor(wav: np.ndarray, sr:int=16000, n_mels:int=128, n_fft:int=2048, hop_length:int=512, size:int=224) -> Optional[np.ndarray]:
    if wav is None:
        return None
    S = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    S_db = librosa.power_to_db(S, ref=np.max)
    S_min, S_max = S_db.min(), S_db.max()
    S_norm = (S_db - S_min) / (S_max - S_min + 1e-9)
    img_arr = (S_norm * 255.0).astype(np.uint8)
    pil = Image.fromarray(img_arr)
    pil = pil.resize((size,size), Image.LANCZOS).convert("RGB")
    arr = np.asarray(pil).astype(np.float32)/255.0
    arr = np.transpose(arr,(2,0,1)).copy()
    arr = arr*2.0 - 1.0
    return arr

def preprocess_records(records: List[Dict], image_size:int=224, audio_duration:float=5.0, audio_sr:int=16000) -> List[Dict]:
    processed = []
    for r in tqdm(records, desc="preprocess"):
        rec = dict(r)
        typ = rec["type"]
        if typ == "image":
            url = rec.get("thumbnail_url") or rec.get("url")
            pil = download_image_pil(url)
            rec["img_pil"] = pil
            rec["img_tensor"] = pil_to_clip_tensor(pil, size=image_size) if pil is not None else None
        elif typ == "video":
            url = rec.get("thumbnail_url") or rec.get("url")
            pil = download_image_pil(url)
            rec["img_pil"] = pil
            rec["img_tensor"] = pil_to_clip_tensor(pil, size=image_size) if pil is not None else None
        elif typ == "audio":
            url = rec.get("url")
            audio_bytes = download_bytes(url)
            wav = decode_audio_bytes_to_waveform(audio_bytes, target_sr=audio_sr, duration=audio_duration)
            rec["waveform"] = wav
            rec["audio_mel_tensor"] = waveform_to_mel_image_tensor(wav, sr=audio_sr, size=image_size) if wav is not None else None
        else:
            rec["img_tensor"] = None
            rec["audio_mel_tensor"] = None
        processed.append(rec)
    return processed

# -------------------------
# Embedding wrappers: primary CLIP (open_clip) and secondary model
# -------------------------
class CLIPWrapper:
    def __init__(self, model_name="ViT-B-32", pretrained="openai", device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
        self.model = model.to(self.device).eval()
        self.preprocess = preprocess
        self.tokenizer = open_clip.get_tokenizer(model_name)
        self.dtype = next(self.model.parameters()).dtype

    @torch.no_grad()
    def embed_images(self, imgs: List[np.ndarray], batch_size:int=32) -> np.ndarray:
        if len(imgs) == 0:
            return np.zeros((0, self.model.visual.output_dim), dtype=np.float32)
        embs = []
        for i in range(0, len(imgs), batch_size):
            batch = imgs[i:i+batch_size]
            t = torch.from_numpy(np.stack(batch, axis=0)).to(self.device)
            if t.max() <= 1.0 + 1e-6 and t.min() >= -1.0 - 1e-6:
                t = (t + 1.0) / 2.0
            t = t.type(self.dtype)
            img_emb = self.model.encode_image(t)
            img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
            embs.append(img_emb.cpu().numpy())
        return np.vstack(embs).astype(np.float32)

    @torch.no_grad()
    def embed_texts(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """
        Robust text embedding that handles different tokenizer return types:
          - torch.LongTensor -> call model.encode_text(tensor)
          - list/ndarray -> convert to torch.LongTensor and call model.encode_text(tensor)
          - dict -> try to convert values to tensors and:
             * if dict has single tensor-like value -> call model.encode_text(tensor)
             * else try model.encode_text(**dict) and fallback to passing first tensor
        Returns numpy array (N, D), L2-normalized.
        """
        if len(texts) == 0:
            try:
                text_dim = self.model.text_projection.shape[1]
            except Exception:
                text_dim = 512
            return np.zeros((0, text_dim), dtype=np.float32)

        out_embs = []
        for i in range(0, len(texts), batch_size):
            chunk = texts[i:i+batch_size]
            toks = self.tokenizer(chunk)  # may be tensor, list, ndarray, or dict

            # Normalize into torch tensors on device
            toks_norm = None
            if isinstance(toks, torch.Tensor):
                toks_norm = toks.to(self.device)
                # call positional
                try:
                    txt_emb = self.model.encode_text(toks_norm)
                except TypeError:
                    # some models expect kwargs, try as dict
                    try:
                        txt_emb = self.model.encode_text(input_ids=toks_norm)
                    except Exception as e:
                        raise RuntimeError(f"encode_text failed for tensor tokenizer output: {e}")
            elif isinstance(toks, (list, tuple, np.ndarray)):
                # convert to tensor
                toks_tensor = torch.tensor(toks, device=self.device)
                try:
                    txt_emb = self.model.encode_text(toks_tensor)
                except TypeError:
                    try:
                        txt_emb = self.model.encode_text(input_ids=toks_tensor)
                    except Exception as e:
                        raise RuntimeError(f"encode_text failed for list/ndarray tokenizer output: {e}")
            elif isinstance(toks, dict):
                # convert values to tensors if needed
                dict_t = {}
                for k, v in toks.items():
                    if isinstance(v, torch.Tensor):
                        dict_t[k] = v.to(self.device)
                    else:
                        try:
                            dict_t[k] = torch.tensor(v, device=self.device)
                        except Exception:
                            # fallback: skip this key
                            pass

                # If dict has exactly one tensor, pass it positionally
                tensor_values = [v for v in dict_t.values() if isinstance(v, torch.Tensor)]
                if len(tensor_values) == 1:
                    single = tensor_values[0]
                    try:
                        txt_emb = self.model.encode_text(single)
                    except TypeError:
                        # try with kwarg name guesses
                        try:
                            txt_emb = self.model.encode_text(input_ids=single)
                        except Exception as e:
                            raise RuntimeError(f"encode_text failed for single-token dict: {e}")
                else:
                    # try calling with kwargs; if that fails, try passing first tensor positional
                    try:
                        txt_emb = self.model.encode_text(**dict_t)
                    except TypeError:
                        if tensor_values:
                            first = tensor_values[0]
                            try:
                                txt_emb = self.model.encode_text(first)
                            except Exception as e:
                                raise RuntimeError(f"encode_text failed for dict fallback: {e}")
                        else:
                            raise RuntimeError("Tokenizer returned dict but no tensor-like values found.")
            else:
                raise RuntimeError(f"Unsupported tokenizer output type: {type(toks)}")

            # normalize and append
            txt_emb = txt_emb / (txt_emb.norm(dim=-1, keepdim=True) + 1e-12)
            out_embs.append(txt_emb.cpu().numpy())

        return np.vstack(out_embs).astype(np.float32)

# Secondary model loader: try ALIGN via HF if requested, otherwise load another open_clip model name
class SecondaryWrapper:
    def __init__(self, mode="align", device=None):
        self.mode = mode.lower()
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.available = False
        # try ALIGN via HF if mode == "align"
        if self.mode == "align":
            try:
                # We'll attempt to load a vision-text model if available; if not, fallback.
                # Note: ALIGN HF checkpoint interfaces differ; this is a best-effort attempt.
                print("[info] Attempting to load ALIGN via HF 'kakaobrain/align-base' (may download large files)...")
                self.processor = AutoProcessor.from_pretrained("kakaobrain/align-base")
                self.model_hf = AutoModel.from_pretrained("kakaobrain/align-base").to(self.device).eval()
                # For text embedding, we need tokenizer — try AutoTokenizer
                self.tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")
                self.available = True
                print("[info] ALIGN model loaded.")
            except Exception as e:
                print("[warn] ALIGN load failed:", e)
                self.available = False
        # fallback: load another open_clip variant if mode is like 'ViT-L-14' or fallback
        if not self.available:
            try:
                model_name = mode if mode and mode.lower() != "align" else "ViT-L-14"
                print(f"[info] Loading secondary OpenCLIP model {model_name}")
                self.clip = CLIPWrapper(model_name=model_name, pretrained="openai", device=self.device)
                self.available = True
            except Exception as e:
                print("[warn] secondary open_clip load failed:", e)
                self.available = False

    def embed_images(self, imgs: List[np.ndarray], batch_size=16) -> np.ndarray:
        if not self.available:
            return np.zeros((len(imgs), 512), dtype=np.float32)
        if hasattr(self, "clip"):
            return self.clip.embed_images(imgs, batch_size=batch_size)
        else:
            # best-effort HF path: convert PIL? we won't implement complex logic here
            raise NotImplementedError("HF ALIGN image embedding path not implemented in this script.")

    def embed_texts(self, texts: List[str], batch_size=16) -> np.ndarray:
        if not self.available:
            return np.zeros((len(texts), 512), dtype=np.float32)
        if hasattr(self, "clip"):
            return self.clip.embed_texts(texts, batch_size=batch_size)
        else:
            raise NotImplementedError("HF ALIGN text embedding path not implemented in this script.")


# -------------------------
# BLIP captioner (optional)
# -------------------------
class BLIPCaptioner:
    def __init__(self, device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        try:
            self.proc = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
            self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(self.device)
            self.model.eval()
            self.available = True
        except Exception as e:
            print("[warn] BLIP load failed:", e)
            self.available = False

    @torch.no_grad()
    def caption(self, pil_images: List[Image.Image], max_length:int=30) -> List[str]:
        if not self.available:
            return ["" for _ in pil_images]
        caps = []
        for img in pil_images:
            if img is None:
                caps.append("")
                continue
            inputs = self.proc(images=img, return_tensors="pt").to(self.device)
            out = self.model.generate(**inputs, max_new_tokens=max_length)
            txt = self.proc.decode(out[0], skip_special_tokens=True)
            caps.append(txt)
        return caps

# -------------------------
# Utilities: scoring and fusion
# -------------------------
def cosine_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # a: (N,d), b: (M,d) -> (N,M)
    if a.size == 0 or b.size == 0:
        return np.zeros((a.shape[0], b.shape[0]), dtype=np.float32)
    an = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-12)
    bn = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-12)
    return an.dot(bn.T)

def minmax_normalize(x: np.ndarray) -> np.ndarray:
    if x.size == 0:
        return x
    mn, mx = x.min(), x.max()
    if mx <= mn:
        return np.ones_like(x) * 0.5
    return (x - mn) / (mx - mn)

def fuse_minmax_average(score_list: List[np.ndarray]) -> np.ndarray:
    # score_list: list of 1D arrays of same length
    if not score_list:
        return np.array([])
    nm = [minmax_normalize(s) for s in score_list]
    stacked = np.vstack(nm)  # (models, N)
    return stacked.mean(axis=0)

# -------------------------
# Pipeline orchestration
# -------------------------
def pipeline_run(user_prompt: str, top_k:int=10):
    plan = parse_prompt_with_openai_http(user_prompt)
    print("LLM plan:", plan)

    recs = fetch_assets(plan["clean_prompt"], plan["num_images"], plan["num_videos"], plan["num_audio"])
    print(f"Fetched total {len(recs)} assets from providers")

    processed = preprocess_records(recs, image_size=224, audio_duration=5.0, audio_sr=16000)

    # instantiate models
    device = "cuda" if torch.cuda.is_available() else "cpu"
    clip_primary = CLIPWrapper(model_name="ViT-B-32", pretrained="openai", device=device)
    clip_secondary = SecondaryWrapper(mode=SECOND_MODEL, device=device)
    blip = BLIPCaptioner(device=device) if ENABLE_BLIP else None

    # embed prompt text using both models
    txt_emb_primary = clip_primary.embed_texts([plan["clean_prompt"]])[0]
    txt_emb_secondary = clip_secondary.embed_texts([plan["clean_prompt"]])[0] if clip_secondary.available else None

    # prepare image tensors and indices
    img_tensors = []
    img_indices = []
    for i, r in enumerate(processed):
        if r["type"] in ("image","video") and r.get("img_tensor") is not None:
            img_tensors.append(r["img_tensor"].astype(np.float32))
            img_indices.append(i)

    # embed images
    if img_tensors:
        print(f"Embedding {len(img_tensors)} images with primary CLIP...")
        img_embs_primary = clip_primary.embed_images(img_tensors, batch_size=16)
        img_embs_secondary = clip_secondary.embed_images(img_tensors, batch_size=16) if clip_secondary.available else np.zeros_like(img_embs_primary)
    else:
        img_embs_primary = np.zeros((0, txt_emb_primary.shape[0]))
        img_embs_secondary = np.zeros((0, txt_emb_primary.shape[0])) if txt_emb_primary is not None else np.zeros((0,512))

    # audio embedding via mel-image fallback through CLIP (primary/secondary)
    audio_tensors = []
    audio_indices = []
    for i, r in enumerate(processed):
        if r["type"] == "audio" and r.get("audio_mel_tensor") is not None:
            audio_tensors.append(r["audio_mel_tensor"].astype(np.float32))
            audio_indices.append(i)
    if audio_tensors:
        print(f"Embedding {len(audio_tensors)} audio mel-images with primary CLIP...")
        audio_embs_primary = clip_primary.embed_images(audio_tensors, batch_size=8)
        audio_embs_secondary = clip_secondary.embed_images(audio_tensors, batch_size=8) if clip_secondary.available else np.zeros_like(audio_embs_primary)
    else:
        audio_embs_primary = np.zeros((0, txt_emb_primary.shape[0]))
        audio_embs_secondary = np.zeros((0, txt_emb_primary.shape[0]))

    # compute similarities
    results = []
    # images
    if img_tensors:
        sims_p = (img_embs_primary @ txt_emb_primary) / (np.linalg.norm(img_embs_primary,axis=1) * (np.linalg.norm(txt_emb_primary)+1e-12))
        sims_s = (img_embs_secondary @ txt_emb_secondary) / (np.linalg.norm(img_embs_secondary,axis=1) * (np.linalg.norm(txt_emb_secondary)+1e-12)) if txt_emb_secondary is not None else np.zeros_like(sims_p)
        for local_idx, (sp, ss) in enumerate(zip(sims_p, sims_s)):
            rec_idx = img_indices[local_idx]
            r = processed[rec_idx]
            results.append({
                "provider": r["provider"], "id": r["id"], "type": r["type"], "title": r["title"],
                "primary_score": float(sp), "secondary_score": float(ss),
                "url": r.get("url"), "thumbnail": r.get("thumbnail_url")
            })
    # audio
    if audio_tensors:
        sims_p = (audio_embs_primary @ txt_emb_primary) / (np.linalg.norm(audio_embs_primary,axis=1) * (np.linalg.norm(txt_emb_primary)+1e-12))
        sims_s = (audio_embs_secondary @ txt_emb_secondary) / (np.linalg.norm(audio_embs_secondary,axis=1) * (np.linalg.norm(txt_emb_secondary)+1e-12)) if txt_emb_secondary is not None else np.zeros_like(sims_p)
        for local_idx, (sp, ss) in enumerate(zip(sims_p, sims_s)):
            rec_idx = audio_indices[local_idx]
            r = processed[rec_idx]
            results.append({
                "provider": r["provider"], "id": r["id"], "type": r["type"], "title": r["title"],
                "primary_score": float(sp), "secondary_score": float(ss),
                "url": r.get("url"), "thumbnail": r.get("thumbnail_url")
            })

    if not results:
        print("[info] No embeddable results found. Exiting.")
        return []

    # build arrays and ensemble
    prim = np.array([r["primary_score"] for r in results], dtype=np.float32)
    sec  = np.array([r["secondary_score"] for r in results], dtype=np.float32)
    fused = fuse_minmax_average([prim, sec])  # average of minmax-normalized scores

    # Add fused & ranking
    for i, r in enumerate(results):
        r["secondary_score"] = float(sec[i])
        r["primary_score"] = float(prim[i])
        r["ensemble_score"] = float(fused[i])

    # optional BLIP captions & caption similarity (optional stronger re-rank)
    if ENABLE_BLIP and blip and blip.available:
        # caption all items that have pil
        pil_images = []
        map_idx_to_result = []
        for i_res, r in enumerate(results):
            # find corresponding processed record to get pil image
            # find first processed with matching id
            found = None
            for pr in processed:
                if pr["id"] == r["id"]:
                    found = pr
                    break
            pil = found.get("img_pil") if found else None
            pil_images.append(pil)
            map_idx_to_result.append(i_res)
        captions = blip.caption(pil_images)
        # embed captions via primary CLIP text encoder and compute similarity
        cap_embs = clip_primary.embed_texts(captions)
        prompt_emb = clip_primary.embed_texts([plan["clean_prompt"]])[0]
        cap_sims = (cap_embs @ prompt_emb) / (np.linalg.norm(cap_embs,axis=1) * (np.linalg.norm(prompt_emb)+1e-12))
        # normalize and combine into ensemble (replace previous ensemble or combine further)
        cap_nm = minmax_normalize(cap_sims)
        fused_with_caption = (fused + cap_nm) / 2.0
        for i_r, r in enumerate(results):
            r["blip_caption"] = captions[i_r]
            r["caption_sim"] = float(cap_sims[i_r])
            r["ensemble_score_caption"] = float(fused_with_caption[i_r])
        # present ranked by ensemble_with_caption
        rank_key = "ensemble_score_caption"
    else:
        rank_key = "ensemble_score"

    results_sorted = sorted(results, key=lambda x: -x.get(rank_key, 0.0))

    # print top_k
    print("\nTop results (ranked by {}):".format(rank_key))
    for i, rr in enumerate(results_sorted[:top_k], start=1):
        print(f"{i}. [{rr['type']}] {rr['provider']} id={rr['id']} ensemble={rr.get('ensemble_score'):.4f}")
        print(f"    primary_score  : {rr.get('primary_score'):.4f}")
        print(f"    secondary_score: {rr.get('secondary_score'):.4f}")
        if "caption_sim" in rr:
            print(f"    caption_sim    : {rr.get('caption_sim'):.4f}")
        print(f"    title: {rr['title']}")
        print(f"    url  : {rr['url']}")
        print()

    return results_sorted

# -------------------------
# CLI entry
# -------------------------
if __name__ == "__main__":
    prompt = input("Enter user prompt: ").strip()
    if not prompt:
        prompt = "a girl talking to a man"
    out = pipeline_run(prompt, top_k=10)


LLM plan: {'clean_prompt': 'a girl talking to a man', 'num_images': 5, 'num_videos': 2, 'num_audio': 0, 'notes': 'The prompt is visually focused on a scene involving two people, so I included images and videos to capture the interaction.'}
Fetched total 7 assets from providers


preprocess: 100%|██████████| 7/7 [00:01<00:00,  4.23it/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


[info] Loading secondary OpenCLIP model ViT-L-14
Embedding 7 images with primary CLIP...

Top results (ranked by ensemble_score):
1. [image] pexels id=6668312 ensemble=1.0000
    primary_score  : 0.2388
    secondary_score: 0.1854
    title: Father and daughter share a bonding moment reading a storybook indoors.
    url  : https://images.pexels.com/photos/6668312/pexels-photo-6668312.jpeg

2. [video] pexels id=7426713 ensemble=0.7997
    primary_score  : 0.2363
    secondary_score: 0.1680
    title: Pavel Danilyuk
    url  : https://videos.pexels.com/video-files/7426713/7426713-hd_720_1280_25fps.mp4

3. [image] pexels id=5710988 ensemble=0.5945
    primary_score  : 0.2327
    secondary_score: 0.1517
    title: A group therapy session indoors with diverse adults in a supportive environment.
    url  : https://images.pexels.com/photos/5710988/pexels-photo-5710988.jpeg

4. [image] pexels id=5711017 ensemble=0.4032
    primary_score  : 0.2305
    secondary_score: 0.1350
    title: A divers

Model Comparison: OpenCLIP Primary vs Secondary

Primary Model: ViT-B-32 (OpenCLIP)

Parameters: ~150 million

Image Resolution: 224×224

Strengths: Lightweight, fast inference, low memory usage, and provides decent general relevance across a wide range of prompts.

Weaknesses: Lower semantic depth — it struggles with complex relationships or emotional/contextual compositions (e.g., human interactions, nuanced scenes).

Secondary Model: ViT-L-14 (OpenCLIP)

Parameters: ~428 million

Image Resolution: 224×224 or 336×336

Strengths: Much larger embedding space, captures compositional and contextual understanding better, and aligns more strongly with text semantics.

Weaknesses: Requires over 1.7 GB of model weights, needs significantly more GPU/CPU memory, and runs slower than ViT-B-32.