In [1]:
!pip install openai requests python-dotenv Pillow numpy soundfile librosa ffmpeg-python tqdm open-clip-torch torch
# System-level: ffmpeg must be installed on the machine (apt install ffmpeg / brew install ffmpeg)





[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
pip install --upgrade openai


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
"""
llm_fetch_clip_pipeline.py

End-to-end script:
1) parse user prompt with OpenAI LLM to determine: cleaned prompt, #assets, which types (image/video/audio)
2) fetch metadata from Pexels (images + videos) and Freesound (audio) accordingly
3) download thumbnails/previews in-memory and preprocess into CLIP-ready tensors
4) embed prompt and assets using OpenCLIP and compute cosine similarity (relevance)
5) print top-K per type

Environment variables:
  OPENAI_API_KEY, PEXELS_API_KEY, FREESOUND_API_KEY

NOTE: This script does not save files to disk.
"""

import os
import io
import json
import math
import time
import requests
import numpy as np
from PIL import Image
from typing import List, Dict, Optional
from dotenv import load_dotenv
load_dotenv()
from tqdm import tqdm

# audio libs
import soundfile as sf
import librosa
import ffmpeg

# openai
from openai import OpenAI
import json
import os
import openai

# open_clip
try:
    import open_clip
    _OPENCLIP_AVAILABLE = True
except Exception:
    _OPENCLIP_AVAILABLE = False

# -------------------------
# Config
# -------------------------
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PEXELS_KEY = os.getenv("PEXELS_API_KEY")
FREESOUND_KEY = os.getenv("FREESOUND_API_KEY")

if not OPENAI_API_KEY:
    raise RuntimeError("Set OPENAI_API_KEY in env")
if not PEXELS_KEY:
    print("[WARN] No PEXELS_API_KEY found; Pexels fetching will return empty.")
if not FREESOUND_KEY:
    print("[WARN] No FREESOUND_API_KEY found; Freesound fetching will return empty.")

openai.api_key = OPENAI_API_KEY

# -------------------------
# 1) LLM prompt parser (OpenAI)
# -------------------------
client = OpenAI()

def parse_prompt_with_openai(raw_prompt: str) -> dict:
    """
    Uses the new openai client API:
      client.chat.completions.create(...)
    Returns a dict with clean_prompt, num_images, num_videos, num_audio, notes
    """
    system = (
        "You are a helpful assistant that converts an unstructured user prompt "
        "into a structured search plan. Return JSON only with fields: clean_prompt (string), "
        "num_images (int), num_videos (int), num_audio (int), notes (string). "
        "Rules:\n"
        "- If the user didn't request videos, set num_videos to 0.\n"
        "- If the user didn't request audio, set num_audio to 0.\n"
        "- Make counts sensible (1-10 per type). Be conservative if uncertain.\n"
        "- Clean the prompt for image/video/audio search (no extra commentary)."
    )
    user = f"User prompt: {raw_prompt}\n\nReturn JSON only."

    # call new client
    resp = client.chat.completions.create(
        model="gpt-4o-mini",   # replace if unavailable
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user}
        ],
        temperature=0.0,
        max_tokens=300,
    )

    # The new response shape: resp.choices[0].message.content
    text = ""
    try:
        text = resp.choices[0].message["content"]
    except Exception:
        # fallback: sometimes it's resp.choices[0].message.content
        try:
            text = resp.choices[0].message.content
        except Exception:
            text = str(resp)

    # parse JSON
    try:
        j = json.loads(text)
        return {
            "clean_prompt": j.get("clean_prompt") or raw_prompt,
            "num_images": int(j.get("num_images") or 0),
            "num_videos": int(j.get("num_videos") or 0),
            "num_audio": int(j.get("num_audio") or 0),
            "notes": j.get("notes","")
        }
    except Exception as e:
        # fallback heuristic if parsing fails
        print("[warn] OpenAI response not parseable as JSON; falling back. Raw text:", text[:400])
        return {
            "clean_prompt": raw_prompt,
            "num_images": 5,
            "num_videos": 0,
            "num_audio": 0,
            "notes": "fallback heuristic used"
        }# -------------------------
# 2) Fetchers (Pexels & Freesound)
# -------------------------
def unified_record(provider, id, typ, title, description, url, thumbnail_url=None,
                   duration=None, uploader=None, published_at=None, tags=None, raw=None):
    return {
        "provider": provider,
        "id": str(id),
        "type": typ,  # image|video|audio
        "title": title or "",
        "description": description,
        "url": url,
        "thumbnail_url": thumbnail_url,
        "duration": duration,
        "uploader": uploader,
        "published_at": published_at,
        "tags": tags or [],
        "raw_meta": raw or {}
    }

# Pexels images
def search_pexels_images(prompt: str, per_page: int = 10) -> List[Dict]:
    if not PEXELS_KEY:
        return []
    endpoint = "https://api.pexels.com/v1/search"
    headers = {"Authorization": PEXELS_KEY}
    params = {"query": prompt, "per_page": per_page}
    r = requests.get(endpoint, headers=headers, params=params, timeout=15)
    r.raise_for_status()
    out = []
    for it in r.json().get("photos", []):
        out.append(unified_record(
            provider="pexels",
            id=it.get("id"),
            typ="image",
            title=it.get("alt") or "",
            description=None,
            url=(it.get("src") or {}).get("original"),
            thumbnail_url=(it.get("src") or {}).get("medium"),
            duration=None,
            uploader=it.get("photographer"),
            published_at=None,
            tags=[],
            raw=it
        ))
    return out

# Pexels videos
def search_pexels_videos(prompt: str, per_page: int = 8) -> List[Dict]:
    if not PEXELS_KEY:
        return []
    endpoint = "https://api.pexels.com/videos/search"
    headers = {"Authorization": PEXELS_KEY}
    params = {"query": prompt, "per_page": per_page}
    r = requests.get(endpoint, headers=headers, params=params, timeout=15)
    r.raise_for_status()
    out = []
    for it in r.json().get("videos", []):
        file_url = None
        for vf in it.get("video_files", []) or []:
            if vf.get("quality") == "hd":
                file_url = vf.get("link"); break
        if not file_url and it.get("video_files"):
            file_url = it.get("video_files")[0].get("link")
        out.append(unified_record(
            provider="pexels",
            id=it.get("id"),
            typ="video",
            title=(it.get("user") or {}).get("name") or str(it.get("id")),
            description=it.get("url"),
            url=file_url,
            thumbnail_url=it.get("image"),
            duration=it.get("duration"),
            uploader=(it.get("user") or {}).get("name"),
            published_at=None,
            tags=[],
            raw=it
        ))
    return out

# Freesound
def search_freesound(prompt: str, per_page: int = 10) -> List[Dict]:
    if not FREESOUND_KEY:
        return []
    endpoint = "https://freesound.org/apiv2/search/text/"
    headers = {"Authorization": f"Token {FREESOUND_KEY}"}
    params = {"query": prompt, "page_size": per_page}
    r = requests.get(endpoint, headers=headers, params=params, timeout=15)
    r.raise_for_status()
    out = []
    for it in r.json().get("results", []):
        preview = (it.get("previews") or {}).get("preview-hq-mp3") or (it.get("previews") or {}).get("preview-lq-mp3")
        out.append(unified_record(
            provider="freesound",
            id=it.get("id"),
            typ="audio",
            title=it.get("name"),
            description=it.get("description"),
            url=preview,
            thumbnail_url=None,
            duration=it.get("duration"),
            uploader=it.get("username"),
            published_at=it.get("created"),
            tags=it.get("tags") or [],
            raw=it
        ))
    return out

# Fetch orchestration
def fetch_assets(clean_prompt: str, num_images:int, num_videos:int, num_audio:int) -> List[Dict]:
    results = []
    if num_images > 0:
        imgs = search_pexels_images(clean_prompt, per_page=num_images)
        results.extend(imgs[:num_images])
    if num_videos > 0:
        vids = search_pexels_videos(clean_prompt, per_page=num_videos)
        results.extend(vids[:num_videos])
    if num_audio > 0:
        aud = search_freesound(clean_prompt, per_page=num_audio)
        results.extend(aud[:num_audio])
    return results

# -------------------------
# 3) In-memory download & preprocessing to CLIP-ready tensors
# -------------------------
def download_bytes(url: str, timeout=20) -> Optional[bytes]:
    if not url:
        return None
    try:
        r = requests.get(url, timeout=timeout, stream=True)
        r.raise_for_status()
        return r.content
    except Exception as e:
        print("[warn] download failed:", e)
        return None

def download_image_pil(url: str, timeout=15) -> Optional[Image.Image]:
    b = download_bytes(url, timeout=timeout)
    if not b:
        return None
    try:
        img = Image.open(io.BytesIO(b)).convert("RGB")
        return img
    except Exception as e:
        print("[warn] PIL open failed:", e)
        return None

def center_crop_and_resize_pil(img: Image.Image, size:int=224) -> Image.Image:
    w,h = img.size
    m = min(w,h)
    left = (w-m)//2
    top = (h-m)//2
    img = img.crop((left, top, left+m, top+m))
    img = img.resize((size,size), Image.LANCZOS)
    return img

def pil_to_clip_tensor(img: Image.Image, size:int=224, normalize:bool=True) -> Optional[np.ndarray]:
    if img is None:
        return None
    img = center_crop_and_resize_pil(img, size=size)
    arr = np.asarray(img).astype(np.float32)/255.0  # H W C
    arr = np.transpose(arr,(2,0,1)).copy()  # C H W
    if normalize:
        arr = arr*2.0 - 1.0
    return arr

# audio decode & mel -> 3-channel image-like tensor
def decode_audio_bytes_to_waveform(audio_bytes: bytes, target_sr:int=16000, duration:float=5.0) -> Optional[np.ndarray]:
    if not audio_bytes:
        return None
    try:
        bio = io.BytesIO(audio_bytes)
        data, sr = sf.read(bio, dtype='float32')
        if data.ndim > 1:
            data = np.mean(data, axis=1)
        if sr != target_sr:
            data = librosa.resample(data, sr, target_sr)
        desired = int(target_sr * duration)
        if len(data) > desired:
            start = max(0, (len(data)-desired)//2)
            data = data[start:start+desired]
        elif len(data) < desired:
            data = np.concatenate([data, np.zeros(desired - len(data), dtype=np.float32)])
        return data.astype(np.float32)
    except Exception as e:
        # fallback: ffmpeg
        try:
            proc = (
                ffmpeg.input('pipe:0')
                .output('pipe:1', format='f32le', ar=target_sr, ac=1)
                .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True)
            )
            out, err = proc.communicate(input=audio_bytes)
            if proc.returncode != 0:
                raise RuntimeError("ffmpeg decode fail")
            data = np.frombuffer(out, dtype=np.float32)
            desired = int(target_sr * duration)
            if len(data) > desired:
                start = max(0, (len(data)-desired)//2)
                data = data[start:start+desired]
            elif len(data) < desired:
                data = np.concatenate([data, np.zeros(desired - len(data), dtype=np.float32)])
            return data.astype(np.float32)
        except Exception as e2:
            print("[warn] audio decode failed:", e, e2)
            return None

def waveform_to_mel_image_tensor(wav: np.ndarray, sr:int=16000, n_mels:int=128, n_fft:int=2048, hop_length:int=512, size:int=224) -> Optional[np.ndarray]:
    if wav is None:
        return None
    S = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    S_db = librosa.power_to_db(S, ref=np.max)
    S_min, S_max = S_db.min(), S_db.max()
    S_norm = (S_db - S_min) / (S_max - S_min + 1e-9)
    img_arr = (S_norm * 255.0).astype(np.uint8)  # H(n_mels) x T
    pil = Image.fromarray(img_arr)
    pil = pil.resize((size,size), Image.LANCZOS).convert("RGB")
    arr = np.asarray(pil).astype(np.float32)/255.0
    arr = np.transpose(arr,(2,0,1)).copy()
    arr = arr*2.0 - 1.0
    return arr

# preprocess a list of records in-memory
def preprocess_records(records: List[Dict], image_size:int=224, audio_duration:float=5.0, audio_sr:int=16000) -> List[Dict]:
    processed = []
    for r in tqdm(records, desc="preprocess"):
        rec = dict(r)
        typ = rec["type"]
        if typ == "image":
            url = rec.get("thumbnail_url") or rec.get("url")
            pil = download_image_pil(url)
            rec["img_pil"] = pil
            rec["img_tensor"] = pil_to_clip_tensor(pil, size=image_size) if pil is not None else None
        elif typ == "video":
            url = rec.get("thumbnail_url") or rec.get("url")
            pil = download_image_pil(url)
            rec["img_pil"] = pil
            rec["img_tensor"] = pil_to_clip_tensor(pil, size=image_size) if pil is not None else None
        elif typ == "audio":
            url = rec.get("url")
            audio_bytes = download_bytes(url)
            wav = decode_audio_bytes_to_waveform(audio_bytes, target_sr=audio_sr, duration=audio_duration)
            rec["waveform"] = wav
            rec["audio_mel_tensor"] = waveform_to_mel_image_tensor(wav, sr=audio_sr, size=image_size) if wav is not None else None
        else:
            rec["img_tensor"] = None
            rec["audio_mel_tensor"] = None
        processed.append(rec)
    return processed

# -------------------------
# 4) CLIP embedding & scoring (OpenCLIP)
# -------------------------
# Replace your previous CLIPEmbedder with this fixed implementation.

import numpy as np
import torch
import open_clip
from typing import List, Optional

class CLIPEmbedder:
    def __init__(self, model_name: str = "ViT-B-32", pretrained: str = "openai", device: Optional[str] = None):
        """
        Correctly initializes OpenCLIP model, preprocess transform, and text tokenizer.
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        # create model and transforms; open_clip returns (model, _, preprocess) in many versions
        model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
        self.model = model.to(self.device).eval()
        self.preprocess = preprocess  # torchvision transform for PIL images
        # get text tokenizer separately (this returns a callable tokenizer)
        self.tokenizer = open_clip.get_tokenizer(model_name)
        # keep dtype
        self.dtype = next(self.model.parameters()).dtype
        # helper dims (text projection dim may be on model.text_projection)
        try:
            self.image_dim = self.model.visual.output_dim
        except Exception:
            # fallback size
            self.image_dim = 512

    @torch.no_grad()
    def embed_images(self, imgs: List[np.ndarray], batch_size: int = 32) -> np.ndarray:
        """
        imgs: list of numpy arrays shape (3,H,W), float32 in [-1,1] or [0,1]
        returns: numpy array (N, D) normalized (unit)
        """
        if len(imgs) == 0:
            return np.zeros((0, self.image_dim), dtype=np.float32)

        embs_list = []
        for i in range(0, len(imgs), batch_size):
            batch = imgs[i:i+batch_size]
            # convert each numpy (3,H,W) to torch tensor (C,H,W) and then stack -> (B,C,H,W)
            t = torch.from_numpy(np.stack(batch, axis=0)).to(self.device)
            # If values are in [-1,1], convert to [0,1] since open_clip preprocess expects 0..1 images (then normalization)
            if t.max() <= 1.0 + 1e-6 and t.min() >= -1.0 - 1e-6:
                t = (t + 1.0) / 2.0
            # ensure float dtype matches model dtype
            t = t.type(self.dtype)
            # encode
            img_emb = self.model.encode_image(t)
            img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
            embs_list.append(img_emb.cpu().numpy())
        return np.vstack(embs_list).astype(np.float32)

    @torch.no_grad()
    def embed_texts(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """
        texts: list of strings
        returns: numpy array (N, D) normalized (unit)
        Handles different tokenizer return types:
          - dict of torch tensors (best case)
          - dict of lists/ndarrays (convert to torch)
          - single tensor/list/ndarray (wrap as {'input_ids': tensor})
        """
        if len(texts) == 0:
            try:
                text_dim = self.model.text_projection.shape[1]
            except Exception:
                text_dim = 512
            return np.zeros((0, text_dim), dtype=np.float32)

        embs_list = []
        for i in range(0, len(texts), batch_size):
            chunk = texts[i:i+batch_size]

            # Tokenize using tokenizer (may return dict or tensor or list)
            toks = self.tokenizer(chunk)

            # Normalize token output to a dict of torch tensors on device
            if isinstance(toks, dict):
                toks_norm = {}
                for k, v in toks.items():
                    # v could be torch.Tensor already or list/numpy
                    if isinstance(v, torch.Tensor):
                        toks_norm[k] = v.to(self.device)
                    else:
                        toks_norm[k] = torch.tensor(v, device=self.device)
            else:
                # toks is not a dict (could be torch.Tensor of shape (B, L) or list of lists)
                if isinstance(toks, torch.Tensor):
                    toks_norm = {"input_ids": toks.to(self.device)}
                else:
                    # assume list/ndarray -> convert to tensor
                    toks_norm = {"input_ids": torch.tensor(toks, device=self.device)}

            # Some open_clip versions expect 'text' key instead of 'input_ids' or expect specific names.
            # If model.encode_text errors, try these fallbacks in order.
            try:
                txt_emb = self.model.encode_text(**toks_norm)
            except TypeError:
                # Try remove unexpected keys, keep only tensor-like keys
                allowed = {k: v for k,v in toks_norm.items() if isinstance(v, torch.Tensor)}
                try:
                    txt_emb = self.model.encode_text(**allowed)
                except Exception as e:
                    # Last fallback: if tokens are just input_ids, try pass as single arg
                    if "input_ids" in allowed:
                        try:
                            txt_emb = self.model.encode_text(allowed["input_ids"])
                        except Exception as e2:
                            raise RuntimeError(f"encode_text failed with fallbacks: {e} / {e2}")
                    else:
                        raise RuntimeError(f"encode_text failed and no known fallback available: {e}")

            txt_emb = txt_emb / (txt_emb.norm(dim=-1, keepdim=True) + 1e-12)
            embs_list.append(txt_emb.cpu().numpy())
        return np.vstack(embs_list).astype(np.float32)

    @staticmethod
    def cosine_sim_matrix(img_embs: np.ndarray, txt_embs: np.ndarray) -> np.ndarray:
        """
        Compute cosine similarity matrix (N_img, N_txt)
        """
        if img_embs.size == 0 or txt_embs.size == 0:
            return np.zeros((img_embs.shape[0], txt_embs.shape[0]), dtype=np.float32)
        a = img_embs / (np.linalg.norm(img_embs, axis=1, keepdims=True) + 1e-12)
        b = txt_embs / (np.linalg.norm(txt_embs, axis=1, keepdims=True) + 1e-12)
        return a.dot(b.T)

# -------------------------
# 5) Pipeline orchestration
# -------------------------
def pipeline_run(user_prompt: str, top_k:int=5):
    # 1) parse prompt via OpenAI LLM
    plan = parse_prompt_with_openai(user_prompt)
    print("LLM plan:", plan)

    # 2) fetch assets according to plan
    recs = fetch_assets(plan["clean_prompt"], plan["num_images"], plan["num_videos"], plan["num_audio"])
    print(f"Fetched total {len(recs)} assets from providers")

    # 3) preprocess in-memory to tensors
    processed = preprocess_records(recs, image_size=224, audio_duration=5.0, audio_sr=16000)

    # 4) prepare arrays for CLIP
    clip = CLIPEmbedder(model_name="ViT-B-32", pretrained="openai")
    # text embedding for the prompt
    txt_emb = clip.embed_texts([plan["clean_prompt"]])[0]

    # collect image tensors (images + video thumbs)
    img_tensors = []
    img_indices = []
    for i, r in enumerate(processed):
        if r["type"] in ("image","video") and r.get("img_tensor") is not None:
            img_tensors.append(r["img_tensor"].astype(np.float32))
            img_indices.append(i)

    # embed images
    if img_tensors:
        print(f"Embedding {len(img_tensors)} images with CLIP...")
        img_embs = clip.embed_images(img_tensors, batch_size=16)
    else:
        img_embs = np.zeros((0,512))

    # audio via mel-image fallback through CLIP (if any)
    audio_tensors = []
    audio_indices = []
    for i, r in enumerate(processed):
        if r["type"] == "audio" and r.get("audio_mel_tensor") is not None:
            audio_tensors.append(r["audio_mel_tensor"].astype(np.float32))
            audio_indices.append(i)
    if audio_tensors:
        print(f"Embedding {len(audio_tensors)} audio mel-images with CLIP...")
        audio_embs = clip.embed_images(audio_tensors, batch_size=8)
    else:
        audio_embs = np.zeros((0, img_embs.shape[1] if img_embs.size else 512))

    # 5) compute similarities and rank
    results = []
    # images:
    if img_tensors:
        sims = (img_embs @ txt_emb) / (np.linalg.norm(img_embs, axis=1) * (np.linalg.norm(txt_emb)+1e-12))
        for idx_local, sim in enumerate(sims):
            rec_idx = img_indices[idx_local]
            r = processed[rec_idx]
            results.append({
                "provider": r["provider"],
                "id": r["id"],
                "type": r["type"],
                "title": r["title"],
                "score": float(sim),
                "url": r.get("url"),
                "thumbnail": r.get("thumbnail_url") or None
            })
    # audio:
    if audio_tensors:
        sims_a = (audio_embs @ txt_emb) / (np.linalg.norm(audio_embs, axis=1) * (np.linalg.norm(txt_emb)+1e-12))
        for idx_local, sim in enumerate(sims_a):
            rec_idx = audio_indices[idx_local]
            r = processed[rec_idx]
            results.append({
                "provider": r["provider"],
                "id": r["id"],
                "type": r["type"],
                "title": r["title"],
                "score": float(sim),
                "url": r.get("url"),
                "thumbnail": r.get("thumbnail_url") or None
            })

    # sort by score desc and print top_k
    results_sorted = sorted(results, key=lambda x: -x["score"])
    print("\nTop results:")
    for i, rr in enumerate(results_sorted[:top_k], start=1):
        print(f"{i}. [{rr['type']}] {rr['provider']} id={rr['id']} score={rr['score']:.4f}")
        print(f"    title: {rr['title']}")
        print(f"    url  : {rr['url']}")
        print()

    return results_sorted

# -------------------------
# CLI
# -------------------------
if __name__ == "__main__":
    raw = input("Enter user prompt: ").strip()
    if not raw:
        raw = "a happy person running on a beach at sunset with soft warm lighting"
    out = pipeline_run(raw, top_k=10)


LLM plan: {'clean_prompt': 'a girl talking to a man', 'num_images': 5, 'num_videos': 0, 'num_audio': 0, 'notes': ''}
Fetched total 5 assets from providers


preprocess: 100%|██████████| 5/5 [00:01<00:00,  4.72it/s]


Embedding 5 images with CLIP...

Top results:
1. [image] pexels id=6668312 score=0.2388
    title: Father and daughter share a bonding moment reading a storybook indoors.
    url  : https://images.pexels.com/photos/6668312/pexels-photo-6668312.jpeg

2. [image] pexels id=5710988 score=0.2327
    title: A group therapy session indoors with diverse adults in a supportive environment.
    url  : https://images.pexels.com/photos/5710988/pexels-photo-5710988.jpeg

3. [image] pexels id=5711017 score=0.2305
    title: A diverse group of people sitting in a circle during a therapy session in a sports hall.
    url  : https://images.pexels.com/photos/5711017/pexels-photo-5711017.jpeg

4. [image] pexels id=5710922 score=0.2217
    title: A group therapy session with six adults seated in a circle, discussing support and mental health.
    url  : https://images.pexels.com/photos/5710922/pexels-photo-5710922.jpeg

5. [image] pexels id=5711382 score=0.2192
    title: Black and white photo of a suppor

In [None]:
!pip install openai==0.28

Collecting openai==0.28
  Using cached openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Using cached openai-0.28.0-py3-none-any.whl (76 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 2.0.0
    Uninstalling openai-2.0.0:
      Successfully uninstalled openai-2.0.0
Successfully installed openai-0.28.0



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
!pip install python-dotenv





[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
# given lists (top->bottom) from each model
align_ids = ["5710922","5711017","6668312","5710988","5711382"]
clip_ids  = ["6668312","5710988","5711017","5710922","5711382"]

# 1) Spearman rank correlation
from scipy.stats import spearmanr, kendalltau
spearman_corr, _ = spearmanr([align_ids.index(i) for i in align_ids],
                             [clip_ids.index(i) for i in align_ids])
kendall_corr, _ = kendalltau([align_ids.index(i) for i in align_ids],
                             [clip_ids.index(i) for i in align_ids])
print("Spearman (align vs clip):", spearman_corr)
print("Kendall tau:", kendall_corr)

# 2) simple normalized-rank difference per item
def norm_rank_score(rank, n):
    return 1.0 - (rank / (n-1))  # 1 for top, 0 for bottom
n = len(align_ids)
for i in align_ids:
    a_rank = align_ids.index(i)
    c_rank = clip_ids.index(i)
    print(i, "align_rank", a_rank, "clip_rank", c_rank, "rank_diff", a_rank-c_rank)


Spearman (align vs clip): 0.09999999999999999
Kendall tau: 0.0
5710922 align_rank 0 clip_rank 3 rank_diff -3
5711017 align_rank 1 clip_rank 2 rank_diff -1
6668312 align_rank 2 clip_rank 0 rank_diff 2
5710988 align_rank 3 clip_rank 1 rank_diff 2
5711382 align_rank 4 clip_rank 4 rank_diff 0
