In [None]:
# ============================================================================
# CELL 2 ‚Äî IMPORTS, ENV, CONFIG
# ============================================================================

import os
import numpy as np
import pandas as pd
import cv2

from dotenv import load_dotenv
from apify_client import ApifyClient

import torch
import clip
from PIL import Image

# your existing helper
from mine_redis import get_files_gem  # def get_files_gem(REEL_URL, REEL_NO='0', task_id='default'):

# --------------------------------------------------------------------------
# ENVIRONMENT VARS
# --------------------------------------------------------------------------

load_dotenv()

if not APIFY_API_KEY:
    raise RuntimeError("Missing APIFY_API_KEY (set it in .env or env vars)")

apify = ApifyClient(APIFY_API_KEY)

# --------------------------------------------------------------------------
# GLOBAL CONFIG
# --------------------------------------------------------------------------

CREATOR_LIST = [
    "badassbrownbeauty",
    "museumofsoum",
    "mahiekasharma",
    "riapalkar",
    "nevaforevaa",
]

MAX_REELS_PER_CREATOR = 10     # reels per creator
MAX_FRAMES_PER_REEL   = 16     # sampled frames per reel
DELETE_AFTER_PROCESS  = False    # delete .mp4 after analysis

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Using device: cpu


In [2]:
import shutil
from pathlib import Path

def cache_video_to_reel_cache(downloaded_path: str, cache_path: str) -> str:
    Path(cache_path).parent.mkdir(parents=True, exist_ok=True)

    # If cache already exists and looks valid, use it
    if os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:
        return cache_path

    # Otherwise copy downloaded file into cache
    shutil.copy2(downloaded_path, cache_path)
    return cache_path


In [3]:
import json
import time
from pathlib import Path

def _manifest_path(cache_dir: str, handle: str) -> str:
    safe_handle = (handle or "unknown").lstrip("@").lower()
    return str(Path(cache_dir) / safe_handle / "manifest.jsonl")

def append_to_manifest(cache_dir: str, handle: str, record: dict):
    Path(Path(cache_dir) / handle.lstrip("@").lower()).mkdir(parents=True, exist_ok=True)
    mp = _manifest_path(cache_dir, handle)
    with open(mp, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

def load_manifest_df(cache_dir: str, handle: str) -> pd.DataFrame:
    mp = _manifest_path(cache_dir, handle)
    if not os.path.exists(mp):
        return pd.DataFrame()
    rows = []
    with open(mp, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except:
                continue
    return pd.DataFrame(rows)


In [4]:
import os
import re
import hashlib
from pathlib import Path
import pandas as pd

# ----------------------------
# Helpers: stable reel_id + paths
# ----------------------------
_REEL_ID_PATTERNS = [
    r"instagram\.com/reel/([^/?#]+)/?",
    r"instagram\.com/p/([^/?#]+)/?",
    r"instagram\.com/tv/([^/?#]+)/?",
]

def extract_reel_id(reel_url: str) -> str:
    """Best-effort stable ID from Instagram URL (shortcode)."""
    if not reel_url or not isinstance(reel_url, str):
        return ""
    for pat in _REEL_ID_PATTERNS:
        m = re.search(pat, reel_url)
        if m:
            return m.group(1)
    # Fallback: hash the URL so cache still works
    return hashlib.md5(reel_url.encode("utf-8")).hexdigest()[:12]

def video_path_for_reel(cache_dir: str, handle: str, reel_id: str) -> str:
    # Store under handle to keep things tidy
    safe_handle = (handle or "unknown").lstrip("@").lower()
    return str(Path(cache_dir) / safe_handle / f"{reel_id}.mp4")

# ----------------------------
# Your existing helpers
# ----------------------------
def flatten_comments(latest_comments, max_n=50):
    """Convert Apify comment objects ‚Üí simple text list."""
    if not isinstance(latest_comments, list):
        return []
    out = []
    for c in latest_comments[:max_n]:
        if isinstance(c, dict):
            txt = c.get("text") or c.get("body") or ""
            if txt.strip():
                out.append(txt.strip())
    return out


def fetch_reels_from_apify_with_comments(
    handle: str,
    max_items: int = 8,
    cache_dir: str = "./reel_cache",
    use_local_cache_only: bool = False,   # ‚úÖ NEW FLAG
) -> pd.DataFrame:
    """
    Fetch recent reel posts + basic metadata for a creator using an Apify actor.

    If use_local_cache_only=True:
      - DOES NOT call Apify
      - Scans cache_dir/<handle>/*.mp4 and returns those as the reel list

    Otherwise:
      - Calls Apify and returns reel_url, caption, flat_comments plus:
        reel_id, local_video_path, is_downloaded
    """
    safe_handle = (handle or "unknown").lstrip("@").lower()
    handle_dir = Path(cache_dir) / safe_handle
    handle_dir.mkdir(parents=True, exist_ok=True)

    # ----------------------------
    # LOCAL-ONLY MODE (NO APIFY CALLS)
    # ----------------------------
    if use_local_cache_only:
        dfm = load_manifest_df(cache_dir, handle)

        if dfm.empty:
            # fallback: still try mp4 scan so you at least can run scoring
            mp4s = [p for p in handle_dir.rglob("*.mp4") if p.is_file() and p.stat().st_size > 0]
            mp4s = sorted(mp4s, key=lambda p: p.stat().st_mtime, reverse=True)[:max_items]
            rows = []
            for p in mp4s:
                rows.append({
                    "reel_url": None,
                    "caption": "",
                    "flat_comments": [],
                    "reel_id": p.stem,
                    "local_video_path": str(p),
                    "is_downloaded": True,
                })
            print(f"\nüìÅ Local-only mode: manifest missing; using {len(rows)} cached mp4s for @{safe_handle}")
            return pd.DataFrame(rows).reset_index(drop=True)

        # Normalize fields the pipeline expects
        # (manifest row contains these keys if you store them as below)
        dfm["is_downloaded"] = dfm["local_video_path"].apply(lambda p: os.path.exists(p) and os.path.getsize(p) > 0)

        # Keep only downloaded reels, newest first if you stored timestamp
        if "downloaded_at" in dfm.columns:
            dfm = dfm.sort_values("downloaded_at", ascending=False)
        dfm = dfm[dfm["is_downloaded"]].head(max_items)

        out = dfm[["reel_url", "caption", "flat_comments", "reel_id", "local_video_path", "is_downloaded"]].copy()

        print(f"\nüìÅ Local-only mode: using {len(out)} cached reels for @{safe_handle} (Apify not called)")
        return out.reset_index(drop=True)

    # ----------------------------
    # APIFY MODE (original behavior)
    # ----------------------------
    print(f"\nüì∏ Fetching reels for @{handle} via Apify...")

    try:
        run_input = {
            "username": [handle],     # MUST be array for this actor
            "resultsLimit": max_items,
        }

        run = apify.actor("xMc5Ga1oCONPmWJIa").call(run_input=run_input)
        items = apify.dataset(run["defaultDatasetId"]).list_items().items
        
                # Persist raw Apify output + derived fields (manifest)
        for it in items:
            # derive reel_url similarly to your DF logic
            reel_url = it.get("url") or it.get("postUrl")
            if (not reel_url) and it.get("shortCode"):
                reel_url = "https://www.instagram.com/reel/" + str(it["shortCode"]) + "/"

            if not reel_url:
                continue

            rid = extract_reel_id(reel_url)
            local_path = video_path_for_reel(cache_dir, handle, rid)

            record = {
                "downloaded_at": time.time(),
                "handle": handle,
                "reel_id": rid,
                "reel_url": reel_url,
                "caption": it.get("caption") or "",
                "flat_comments": flatten_comments(it.get("latestComments"), max_n=50),
                "local_video_path": local_path,
                "apify_raw": it,  # ‚úÖ store entire Apify object
            }
            append_to_manifest(cache_dir, handle, record)


        if not items:
            print("  ‚úó No items returned.")
            return pd.DataFrame()

        df = pd.DataFrame(items)
        print(f"  ‚úì Apify returned {len(df)} items. Columns: {list(df.columns)}")

        # URL column
        if "url" in df.columns:
            df["reel_url"] = df["url"]
        elif "postUrl" in df.columns:
            df["reel_url"] = df["postUrl"]
        elif "shortCode" in df.columns:
            df["reel_url"] = "https://www.instagram.com/reel/" + df["shortCode"].astype(str) + "/"
        else:
            df["reel_url"] = None

        # Caption
        df["caption_norm"] = df["caption"] if "caption" in df.columns else ""

        # Comments (shallow)
        if "latestComments" in df.columns:
            df["flat_comments"] = df["latestComments"].apply(lambda x: flatten_comments(x, max_n=50))
        else:
            df["flat_comments"] = [[]]

        # Filter valid reel URLs
        mask = (
            df["reel_url"].notna()
            & (df["reel_url"].str.contains("/reel/") | df["reel_url"].str.contains("/p/"))
        )

        out = df.loc[mask, ["reel_url", "caption_norm", "flat_comments"]].copy()
        out = out.rename(columns={"caption_norm": "caption"})

        # NEW: reel_id + local cache check
        out["reel_id"] = out["reel_url"].apply(extract_reel_id)
        out["local_video_path"] = out["reel_id"].apply(lambda rid: video_path_for_reel(cache_dir, handle, rid))

        out["is_downloaded"] = out["local_video_path"].apply(lambda p: os.path.exists(p) and os.path.getsize(p) > 0)

        print(f"  ‚úì {len(out)} valid reels for @{handle} ({out['is_downloaded'].sum()} already downloaded)")
        return out.reset_index(drop=True)

    except Exception as e:
        print(f"  ‚úó Apify error for @{handle}: {e}")
        return pd.DataFrame()


# ----------------------------
# OPTIONAL: run pipeline directly (skip downloads if present)
# ----------------------------
def ensure_download_and_score_reels(
    df_reels: pd.DataFrame,
    download_fn,   # function: (reel_url, out_path) -> out_path
    score_fn,      # function: (video_path) -> dict (your compute_three_change_metrics_for_video)
) -> pd.DataFrame:
    """
    For each reel:
      - If local_video_path exists ‚Üí skip download, score immediately
      - Else download (if reel_url exists) ‚Üí then score
    Returns df with added scoring columns.
    """
    rows = []
    for _, r in df_reels.iterrows():
        reel_url = r.get("reel_url")
        video_path = r.get("local_video_path")

        if not video_path:
            rows.append({**r.to_dict(), "download_error": "Missing local_video_path"})
            continue

        # Download only if missing AND we actually have a URL to download from
        has_local = os.path.exists(video_path) and os.path.getsize(video_path) > 0
        if not has_local:
            if not reel_url:
                rows.append({**r.to_dict(), "download_error": "Video not cached and reel_url is None (local-only mode?)"})
                continue
            try:
                # ensure parent dir exists
                Path(video_path).parent.mkdir(parents=True, exist_ok=True)
                download_fn(reel_url, video_path)
            except Exception as e:
                rows.append({**r.to_dict(), "download_error": str(e)})
                continue

        # Score
        try:
            metrics = score_fn(video_path)
            rows.append({**r.to_dict(), **metrics, "download_error": ""})
        except Exception as e:
            rows.append({**r.to_dict(), "score_error": str(e), "download_error": ""})

    return pd.DataFrame(rows)


In [5]:
# ============================================================================
# CELL 4 ‚Äî FRAME SAMPLING FROM VIDEO
# ============================================================================

def sample_uniform_frames(video_path: str, max_frames: int = 32):
    """
    Sample up to `max_frames` frames roughly uniformly across the video.
    Returns: list of np.ndarray (BGR images)
    """
    frames = []
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("    ‚úó Could not open video for frame sampling.")
        return frames

    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Fallback if metadata is broken
    if frame_count <= 0:
        print("    ‚ö†Ô∏è CAP_PROP_FRAME_COUNT not available, reading sequentially.")
        i = 0
        while i < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
            i += 1
        cap.release()
        print(f"    ‚úì Sampled {len(frames)} frames (sequential fallback).")
        return frames

    # Normal path: uniform indices
    if frame_count <= max_frames:
        indices = list(range(frame_count))
    else:
        indices = np.linspace(0, frame_count - 1, max_frames, dtype=int)

    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ret, frame = cap.read()
        if not ret:
            continue
        frames.append(frame)

    cap.release()
    print(f"    ‚úì Sampled {len(frames)} frames (uniform across {frame_count} total).")
    return frames


In [6]:
# ============================================================================
# CELL 5 ‚Äî HISTOGRAM-BASED DISTANCE (METHOD #3 CORE PRIMITIVE)
# ============================================================================

def compute_hist_distance(frame1, frame2, bins=32):
    """
    Compute Bhattacharyya distance between color histograms of two frames.
    Returns a float in [0, 1+] (0 = identical, larger = more different).
    """
    # Convert to HSV or just use BGR; HSV can be more stable for lighting
    f1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2HSV)
    f2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2HSV)

    h1 = cv2.calcHist([f1], [0, 1, 2], None, [bins, bins, bins], [0, 180, 0, 256, 0, 256])
    h2 = cv2.calcHist([f2], [0, 1, 2], None, [bins, bins, bins], [0, 180, 0, 256, 0, 256])

    h1 = h1.flatten().astype("float32")
    h2 = h2.flatten().astype("float32")

    h1 /= (h1.sum() + 1e-8)
    h2 /= (h2.sum() + 1e-8)

    dist = cv2.compareHist(h1, h2, cv2.HISTCMP_BHATTACHARYYA)
    return float(dist)


In [7]:
# ============================================================================
# CELL 6 ‚Äî CLIP SETUP (FOR METHOD #2)
# ============================================================================

clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

def clip_embed_frame(frame_bgr):
    """
    Compute CLIP embedding (L2-normalized) for a single frame (BGR).
    Returns 1D numpy vector.
    """
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(frame_rgb)

    img = clip_preprocess(pil_img).unsqueeze(0).to(device)
    with torch.no_grad():
        emb = clip_model.encode_image(img)
        emb = emb / emb.norm(dim=-1, keepdim=True)

    return emb.cpu().numpy().flatten().astype("float32")


In [8]:
# ============================================================================
# CELL 7 ‚Äî THREE METRICS FOR ONE REEL
#   1) Scene-change density (from histogram jumps)
#   2) CLIP-embedding distance
#   3) Histogram distance
# ============================================================================

def compute_three_change_metrics_for_video(video_path: str, max_frames: int = 32):
    """
    For a given video:
      - Sample frames
      - Compute three frame-to-frame change metrics:
          1) Scene-change density (hist-based threshold ‚Üí approx shot boundaries)
          2) Mean CLIP embedding distance between consecutive frames
          3) Mean histogram distance between consecutive frames
      - Also returns normalized scores 0‚Äì10 for each metric.
    """
    frames = sample_uniform_frames(video_path, max_frames=max_frames)
    n_frames = len(frames)

    if n_frames < 2:
        return {
            "n_frames_used": n_frames,
            "scene_change_count": 0,
            "scene_change_density": 0.0,
            "scene_score_0_10": 0.0,
            "mean_clip_dist": 0.0,
            "std_clip_dist": 0.0,
            "clip_score_0_10": 0.0,
            "mean_hist_dist": 0.0,
            "std_hist_dist": 0.0,
            "hist_score_0_10": 0.0,
        }

    # -------------------------
    # METHOD 3: Histogram diffs
    # -------------------------
    hist_dists = []
    for i in range(1, n_frames):
        d = compute_hist_distance(frames[i - 1], frames[i])
        hist_dists.append(d)
    hist_dists = np.array(hist_dists, dtype=np.float32)

    mean_hist = float(hist_dists.mean())
    std_hist = float(hist_dists.std())

    # We'll assume typical Bhattacharyya distances are in [0, 1].
    # Clip to [0,1] before mapping to 0‚Äì10
    mean_hist_clipped = float(np.clip(mean_hist, 0.0, 1.0))
    hist_score = round(mean_hist_clipped * 10.0, 2)

    # -----------------------------------
    # METHOD 1: Scene-change density
    #   - Use histogram jumps above a threshold to count "scene changes"
    # -----------------------------------
    # Heuristic threshold: tuneable
    scene_thresh = 0.5
    scene_changes = int((hist_dists > scene_thresh).sum())
    scene_change_density = scene_changes / float(n_frames - 1)

    # Normalize density (assuming >5 changes per 32 frames is already "very high")
    scene_density_clipped = float(np.clip(scene_change_density * 5.0, 0.0, 1.0))
    scene_score = round(scene_density_clipped * 10.0, 2)

    # -----------------------------------
    # METHOD 2: CLIP embedding distances
    # -----------------------------------
    clip_embs = []
    for f in frames:
        e = clip_embed_frame(f)
        clip_embs.append(e)
    clip_embs = np.stack(clip_embs, axis=0)  # [n_frames, d]

    # compute distances between consecutive embeddings
    clip_dists = []
    for i in range(1, n_frames):
        v1 = clip_embs[i - 1]
        v2 = clip_embs[i]
        # Since vectors are normalized, 1 - cosine similarity ‚àà [0, 2]
        cos_sim = float(np.dot(v1, v2))
        d = 1.0 - cos_sim
        clip_dists.append(d)
    clip_dists = np.array(clip_dists, dtype=np.float32)

    mean_clip = float(clip_dists.mean())
    std_clip = float(clip_dists.std())

    # Clip-sim distance is usually within [0, 1]; clip to [0,1]
    mean_clip_clipped = float(np.clip(mean_clip, 0.0, 1.0))
    clip_score = round(mean_clip_clipped * 10.0, 2)

    return {
        "n_frames_used": n_frames,
        # scene-based
        "scene_change_count": int(scene_changes),
        "scene_change_density": float(scene_change_density),
        "scene_score_0_10": scene_score,
        # CLIP-based
        "mean_clip_dist": mean_clip,
        "std_clip_dist": std_clip,
        "clip_score_0_10": clip_score,
        # histogram-based
        "mean_hist_dist": mean_hist,
        "std_hist_dist": std_hist,
        "hist_score_0_10": hist_score,
    }


In [9]:
# ============================================================================
# CELL 8 ‚Äî MAIN PIPELINE: CREATOR ‚Üí REELS ‚Üí DOWNLOAD/CACHE ‚Üí 3 SCORES
# ============================================================================

def run_change_metrics_pipeline_for_creators(
    creator_list,
    max_reels_per_creator: int = MAX_REELS_PER_CREATOR,
    max_frames_per_reel: int = MAX_FRAMES_PER_REEL,
    delete_after: bool = DELETE_AFTER_PROCESS,
    use_local_cache_only: bool = True,
    cache_dir: str = "./reel_cache",
):
    rows = []

    for creator in creator_list:
        print(f"\n=== Processing creator (3 change metrics): {creator} ===")

        df_reels = fetch_reels_from_apify_with_comments(
            handle=creator,
            max_items=max_reels_per_creator,
            cache_dir=cache_dir,
            use_local_cache_only=use_local_cache_only,
        )

        if df_reels.empty:
            print(f"  ‚úó No reels found for {creator}, skipping.")
            continue

        for reel_idx, r in df_reels.iterrows():
            reel_url = r.get("reel_url")
            caption  = r.get("caption", "")

            # ----------------------------
            # Resolve video_path
            # ----------------------------
            video_path = None

            if use_local_cache_only:
                video_path = r.get("local_video_path")
                print(f"\n  ‚ñ∂ Reel {reel_idx} for {creator}: [LOCAL] {video_path}")

            else:
                print(f"\n  ‚ñ∂ Reel {reel_idx} for {creator}: {reel_url}")

                if not reel_url:
                    print("    ‚úó Missing reel_url from Apify row, skipping.")
                    continue

                # Stable cache path (based on shortcode/hash)
                reel_id    = extract_reel_id(reel_url)
                cache_path = video_path_for_reel(cache_dir, creator, reel_id)

                # If already cached, skip download
                if os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:
                    video_path = cache_path
                    print(f"    ‚úÖ Using cached mp4: {video_path}")
                else:
                    # Download via your helper
                    task_id = f"change3_{creator.replace('@', '')}"
                    downloaded_path = get_files_gem(reel_url, str(reel_idx), task_id)

                    if (not downloaded_path) or (not os.path.exists(downloaded_path)):
                        print("    ‚úó Download failed or path missing, skipping this reel.")
                        continue

                    # Copy into cache so local-only mode works later
                    video_path = cache_video_to_reel_cache(downloaded_path, cache_path)
                    print(f"    üíæ Cached video: {video_path}")

            if (not video_path) or (not os.path.exists(video_path)):
                print("    ‚úó Video path missing / not found, skipping this reel.")
                continue

            print(f"    üìÇ Local video path: {video_path}")

            # ----------------------------
            # Compute metrics
            # ----------------------------
            try:
                metrics = compute_three_change_metrics_for_video(
                    video_path,
                    max_frames=max_frames_per_reel,  # keep explicit
                )
            except Exception as e:
                print(f"    ‚úó Error during change-metric computation: {repr(e)}")
                metrics = {
                    "n_frames_used": 0,
                    "scene_change_count": 0,
                    "scene_change_density": 0.0,
                    "scene_score_0_10": 0.0,
                    "mean_clip_dist": 0.0,
                    "std_clip_dist": 0.0,
                    "clip_score_0_10": 0.0,
                    "mean_hist_dist": 0.0,
                    "std_hist_dist": 0.0,
                    "hist_score_0_10": 0.0,
                }

            # ----------------------------
            # Cleanup (ONLY for non-cache mode)
            # ----------------------------
            if delete_after and (not use_local_cache_only):
                # NOTE: video_path might be cache_path; we should NEVER delete cache.
                # Only delete the temporary download if it differs from cache_path.
                try:
                    # If we used cached, no delete. If we downloaded+copied, delete the downloaded file only.
                    # We don't have downloaded_path in all branches, so do a safe check.
                    if (not os.path.abspath(video_path).startswith(os.path.abspath(cache_dir))):
                        os.remove(video_path)
                        print(f"    üßπ Deleted local video: {video_path}")
                    else:
                        print("    üßπ Skipping delete (video is in cache_dir).")
                except OSError as e:
                    print(f"    ‚ö†Ô∏è Could not delete video: {e}")

            row = {
                "creator": creator,
                "reel_idx": int(reel_idx),
                "reel_url": reel_url,
                "caption": caption,
            }
            row.update(metrics)
            rows.append(row)

    df_reels_scores = pd.DataFrame(rows)
    print("\n=== 3-METRIC CHANGE PIPELINE DONE ===")
    print("Per-reel rows:", len(df_reels_scores))

    if not df_reels_scores.empty:
        df_creator_agg = (
            df_reels_scores
            .groupby("creator", as_index=False)
            .agg(
                n_reels=("reel_url", "count"),
                mean_scene_score=("scene_score_0_10", "mean"),
                mean_clip_score=("clip_score_0_10", "mean"),
                mean_hist_score=("hist_score_0_10", "mean"),
                max_scene_score=("scene_score_0_10", "max"),
                max_clip_score=("clip_score_0_10", "max"),
                max_hist_score=("hist_score_0_10", "max"),
            )
        )
    else:
        df_creator_agg = pd.DataFrame()

    return df_reels_scores, df_creator_agg


In [11]:
# ============================================================================
# CELL 9 ‚Äî RUN PIPELINE & VIEW COMPARATIVE SCORES
# ============================================================================

df_reels_change, df_creator_change = run_change_metrics_pipeline_for_creators(
    CREATOR_LIST,
    max_reels_per_creator=MAX_REELS_PER_CREATOR,
    max_frames_per_reel=MAX_FRAMES_PER_REEL,
    delete_after=DELETE_AFTER_PROCESS,
    use_local_cache_only = True
)

print("\n=== PER-REEL CHANGE SCORES (TOP 10 ROWS) ===")
display(
    df_reels_change[
        [
            "creator",
            "reel_idx",
            "reel_url",
            "scene_score_0_10",
            "clip_score_0_10",
            "hist_score_0_10",
            "scene_change_count",
            "mean_clip_dist",
            "mean_hist_dist",
        ]
    ].head(10)
)

print("\n=== PER-CREATOR AVERAGE SCORES ===")
display(df_creator_change)



=== Processing creator (3 change metrics): badassbrownbeauty ===

üìÅ Local-only mode: using 10 cached reels for @badassbrownbeauty (Apify not called)

  ‚ñ∂ Reel 0 for badassbrownbeauty: [LOCAL] reel_cache\badassbrownbeauty\DRxArMLDBOa.mp4
    üìÇ Local video path: reel_cache\badassbrownbeauty\DRxArMLDBOa.mp4
    ‚úì Sampled 16 frames (uniform across 3681 total).

  ‚ñ∂ Reel 1 for badassbrownbeauty: [LOCAL] reel_cache\badassbrownbeauty\DSFZKKMkaHP.mp4
    üìÇ Local video path: reel_cache\badassbrownbeauty\DSFZKKMkaHP.mp4
    ‚úì Sampled 16 frames (uniform across 1041 total).

  ‚ñ∂ Reel 2 for badassbrownbeauty: [LOCAL] reel_cache\badassbrownbeauty\DR4oT2ijOhC.mp4
    üìÇ Local video path: reel_cache\badassbrownbeauty\DR4oT2ijOhC.mp4
    ‚úì Sampled 16 frames (uniform across 2729 total).

  ‚ñ∂ Reel 3 for badassbrownbeauty: [LOCAL] reel_cache\badassbrownbeauty\DRo-zZAEf55.mp4
    üìÇ Local video path: reel_cache\badassbrownbeauty\DRo-zZAEf55.mp4
    ‚úì Sampled 16 frames (uniform

Unnamed: 0,creator,reel_idx,reel_url,scene_score_0_10,clip_score_0_10,hist_score_0_10,scene_change_count,mean_clip_dist,mean_hist_dist
0,badassbrownbeauty,0,https://www.instagram.com/p/DRxArMLDBOa/,10.0,0.65,1.85,4,0.065067,0.184517
1,badassbrownbeauty,1,https://www.instagram.com/p/DSFZKKMkaHP/,10.0,2.07,3.87,7,0.206956,0.386735
2,badassbrownbeauty,2,https://www.instagram.com/p/DR4oT2ijOhC/,0.0,0.28,0.84,0,0.027633,0.083681
3,badassbrownbeauty,3,https://www.instagram.com/p/DRo-zZAEf55/,0.0,0.51,0.92,0,0.050889,0.09222
4,badassbrownbeauty,4,https://www.instagram.com/p/DR4oqOqDEUR/,0.0,0.26,0.84,0,0.02614,0.084284
5,badassbrownbeauty,5,https://www.instagram.com/p/DR9Y07wkX8b/,10.0,1.7,3.73,7,0.169986,0.373392
6,badassbrownbeauty,6,https://www.instagram.com/p/DSCzM7mDY95/,0.0,0.35,0.61,0,0.035469,0.061027
7,badassbrownbeauty,7,https://www.instagram.com/p/DSCzayUjW1f/,0.0,0.34,0.61,0,0.033741,0.061275
8,badassbrownbeauty,8,https://www.instagram.com/p/DRo_FVpEWjD/,0.0,0.47,0.92,0,0.047379,0.092152
9,badassbrownbeauty,9,https://www.instagram.com/p/DR9Y07wkX8b/,10.0,1.7,3.73,7,0.169986,0.373392



=== PER-CREATOR AVERAGE SCORES ===


Unnamed: 0,creator,n_reels,mean_scene_score,mean_clip_score,mean_hist_score,max_scene_score,max_clip_score,max_hist_score
0,badassbrownbeauty,10,4.0,0.833,1.792,10.0,2.07,3.87
1,mahiekasharma,10,6.334,0.942,2.869,10.0,1.89,6.15
2,museumofsoum,7,4.284286,0.985714,3.817143,10.0,1.49,5.84
3,nevaforevaa,10,5.998,0.865,3.806,10.0,1.17,5.7
4,riapalkar,10,8.333,2.129,4.762,10.0,4.41,8.56


In [15]:
import pandas as pd
import numpy as np
import re

# ----------------------------
# Load + normalize train_data.csv
# ----------------------------
df_train = pd.read_csv("train_data.csv")

def norm_col(c):
    c = str(c).replace("\r", "").replace("\n", "").replace("\t", "")
    c = c.strip().lower()
    c = re.sub(r"\s+", "_", c)
    c = re.sub(r"_+", "_", c)
    return c

df_train.columns = [norm_col(c) for c in df_train.columns]

df_train["creator_norm"] = df_train["creator"].astype(str).str.strip().str.lstrip("@").str.lower()

traits = ["cool", "aspirational", "relatable", "credible", "communication", "story_telling"]

# Ensure all trait columns exist (prints columns if not)
missing = [t for t in traits if t not in df_train.columns]
if missing:
    raise ValueError(f"Missing trait cols {missing}. Available: {list(df_train.columns)}")

df_labels = (
    df_train.groupby("creator_norm", as_index=False)
    .agg(**{t: (t, "mean") for t in traits})
)

# ----------------------------
# Prepare metrics df (df_creator_change) + normalize creator
# ----------------------------
df_metrics = df_creator_change.copy()
df_metrics["creator_norm"] = df_metrics["creator"].astype(str).str.strip().str.lstrip("@").str.lower()

if "mean_scene_score" not in df_metrics.columns:
    raise ValueError(f"mean_scene_score not found in df_creator_change. Columns: {list(df_metrics.columns)}")

# ----------------------------
# Merge + correlate (mean_scene_score vs each trait)
# ----------------------------
df_m = df_labels.merge(
    df_metrics[["creator_norm", "mean_scene_score"]],
    on="creator_norm",
    how="inner"
)

def corr(x, y):
    x = pd.to_numeric(x, errors="coerce")
    y = pd.to_numeric(y, errors="coerce")
    m = x.notna() & y.notna()
    if m.sum() < 3:
        return {"n": int(m.sum()), "pearson": np.nan, "spearman": np.nan}
    return {
        "n": int(m.sum()),
        "pearson": float(x[m].corr(y[m], method="pearson")),
        "spearman": float(x[m].corr(y[m], method="spearman")),
    }

out = pd.DataFrame([
    {"label": t, "metric": "mean_scene_score", **corr(df_m[t], df_m["mean_scene_score"])}
    for t in traits
]).sort_values("pearson", ascending=False)

print("Creators with labels:", len(df_labels))
print("Creators with scene score:", df_metrics["mean_scene_score"].notna().sum())
print("Creators matched:", len(df_m))

display(out)
display(df_m.sort_values("mean_scene_score", ascending=False))


Creators with labels: 78
Creators with scene score: 5
Creators matched: 5


Unnamed: 0,label,metric,n,pearson,spearman
0,cool,mean_scene_score,5,0.511727,0.6
1,aspirational,mean_scene_score,5,0.472109,0.6
4,communication,mean_scene_score,5,-0.226413,-0.2
5,story_telling,mean_scene_score,5,-0.39033,-0.153897
3,credible,mean_scene_score,5,-0.640386,-0.666886
2,relatable,mean_scene_score,5,-0.737641,-0.737865


Unnamed: 0,creator_norm,cool,aspirational,relatable,credible,communication,story_telling,mean_scene_score
4,riapalkar,6.0,6.0,4.0,3.0,4.0,4.0,8.333
1,mahiekasharma,8.0,9.0,5.0,3.0,6.5,4.0,6.334
3,nevaforevaa,2.0,2.0,4.0,2.0,1.0,1.0,5.998
2,museumofsoum,4.0,4.0,5.0,4.0,2.0,3.0,4.284286
0,badassbrownbeauty,3.0,3.0,7.0,9.0,9.0,9.0,4.0


In [9]:
import os
import cv2
import numpy as np
import pandas as pd
from pathlib import Path

# ----------------------------
# HAAR FACE HELPERS (self-contained)
# ----------------------------
_FACE_CASCADE = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
VIDEO_EXTS = {".mp4", ".mov", ".mkv", ".webm"}

def frame_has_face_haar(frame_bgr, min_face_width_frac=0.06):
    if frame_bgr is None:
        return False
    h, w = frame_bgr.shape[:2]
    if h == 0 or w == 0:
        return False

    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    gray = cv2.equalizeHist(gray)

    min_w = max(24, int(w * min_face_width_frac))
    faces = _FACE_CASCADE.detectMultiScale(
        gray,
        scaleFactor=1.1,
        minNeighbors=5,
        minSize=(min_w, min_w),
        flags=cv2.CASCADE_SCALE_IMAGE,
    )
    return len(faces) > 0

def compute_face_density_for_frames(frames, min_face_width_frac=0.06):
    if not frames:
        return 0, 0.0
    flags = [frame_has_face_haar(f, min_face_width_frac=min_face_width_frac) for f in frames]
    n_face = int(np.sum(flags))
    dens = float(n_face / len(frames))
    return n_face, dens

def compute_face_density_for_video(video_path: str, max_frames: int = 16, min_face_width_frac: float = 0.06):
    frames = sample_uniform_frames(video_path, max_frames=max_frames)  # uses your existing sampler
    n = len(frames)
    if n == 0:
        return {"n_frames_used": 0, "n_face_frames": 0, "face_frame_density": 0.0, "face_density_0_10": 0.0}

    n_face, dens = compute_face_density_for_frames(frames, min_face_width_frac=min_face_width_frac)
    return {
        "n_frames_used": int(n),
        "n_face_frames": int(n_face),
        "face_frame_density": float(dens),
        "face_density_0_10": round(float(np.clip(dens, 0.0, 1.0) * 10.0), 2),
    }

# ----------------------------
# LOCAL CACHE SCAN
# ----------------------------
def list_cached_videos_for_creator(cache_root: str, creator: str, max_items: int = 10):
    creator_norm = str(creator).strip().lstrip("@").lower()
    root = Path(cache_root)

    cand = []
    # 1) reels_cache/<creator>/**/*
    p1 = root / creator_norm
    if p1.exists():
        cand += [p for p in p1.rglob("*") if p.is_file() and p.suffix.lower() in VIDEO_EXTS and p.stat().st_size > 0]

    # 2) reels_cache/**/*<creator>*/*.*
    if root.exists():
        cand += [
            p for p in root.rglob("*")
            if p.is_file()
            and p.suffix.lower() in VIDEO_EXTS
            and p.stat().st_size > 0
            and (creator_norm in str(p.parent).lower() or creator_norm in str(p).lower())
        ]

    # dedupe + sort newest first
    cand = list({str(p): p for p in cand}.values())
    cand = sorted(cand, key=lambda p: p.stat().st_mtime, reverse=True)[:max_items]
    return cand

def run_face_density_pipeline_for_creators_local(
    creator_list,
    cache_dir: str = "reels_cache",
    max_reels_per_creator: int = 5,
    max_frames_per_reel: int = 16,
    min_face_width_frac: float = 0.06,
):
    rows = []

    for creator in creator_list:
        creator_norm = str(creator).strip().lstrip("@").lower()
        vids = list_cached_videos_for_creator(cache_dir, creator, max_items=max_reels_per_creator)

        print(f"\n=== Face-density (LOCAL ONLY): {creator_norm} ===")
        print(f"Found {len(vids)} cached videos in {cache_dir}")

        for i, p in enumerate(vids):
            video_path = str(p)
            print(f"  ‚ñ∂ {i}: {video_path}")

            metrics = compute_face_density_for_video(
                video_path,
                max_frames=max_frames_per_reel,
                min_face_width_frac=min_face_width_frac,
            )

            rows.append({
                "creator": creator_norm,
                "reel_idx": i,
                "video_path": video_path,
                **metrics,
            })

    df_reels_face = pd.DataFrame(rows)

    if not df_reels_face.empty:
        df_creator_face = (
            df_reels_face
            .groupby("creator", as_index=False)
            .agg(
                n_reels=("reel_idx", "count"),
                mean_face_density=("face_frame_density", "mean"),
                mean_face_density_0_10=("face_density_0_10", "mean"),
            )
        )
        df_creator_face["creator_norm"] = df_creator_face["creator"]
    else:
        df_creator_face = pd.DataFrame(columns=[
            "creator", "n_reels", "mean_face_density", "mean_face_density_0_10", "creator_norm"
        ])

    return df_reels_face, df_creator_face

# ----------------------------
# RUN (change cache_dir to your actual folder name)
# ----------------------------
df_reels_face, df_creator_face = run_face_density_pipeline_for_creators_local(
    CREATOR_LIST,
    cache_dir="reel_cache",  # <-- you used this in your error; change if your folder is reels_cache
    max_reels_per_creator=MAX_REELS_PER_CREATOR,
    max_frames_per_reel=MAX_FRAMES_PER_REEL,
    min_face_width_frac=0.06,
)

display(df_reels_face.head(10))
display(df_creator_face)



=== Face-density (LOCAL ONLY): badassbrownbeauty ===
Found 9 cached videos in reel_cache
  ‚ñ∂ 0: reel_cache\badassbrownbeauty\DR9Y07wkX8b.mp4
    ‚úì Sampled 16 frames (uniform across 1336 total).
  ‚ñ∂ 1: reel_cache\badassbrownbeauty\DRo-zZAEf55.mp4
    ‚úì Sampled 16 frames (uniform across 1780 total).
  ‚ñ∂ 2: reel_cache\badassbrownbeauty\DR4oT2ijOhC.mp4
    ‚úì Sampled 16 frames (uniform across 2729 total).
  ‚ñ∂ 3: reel_cache\badassbrownbeauty\DRxArMLDBOa.mp4
    ‚úì Sampled 16 frames (uniform across 3681 total).
  ‚ñ∂ 4: reel_cache\badassbrownbeauty\DRo_FVpEWjD.mp4
    ‚úì Sampled 16 frames (uniform across 1780 total).
  ‚ñ∂ 5: reel_cache\badassbrownbeauty\DSCzayUjW1f.mp4
    ‚úì Sampled 16 frames (uniform across 2001 total).
  ‚ñ∂ 6: reel_cache\badassbrownbeauty\DSCzM7mDY95.mp4
    ‚úì Sampled 16 frames (uniform across 2001 total).
  ‚ñ∂ 7: reel_cache\badassbrownbeauty\DR4oqOqDEUR.mp4
    ‚úì Sampled 16 frames (uniform across 2729 total).
  ‚ñ∂ 8: reel_cache\badassbrownbeauty\

Unnamed: 0,creator,reel_idx,video_path,n_frames_used,n_face_frames,face_frame_density,face_density_0_10
0,badassbrownbeauty,0,reel_cache\badassbrownbeauty\DR9Y07wkX8b.mp4,16,16,1.0,10.0
1,badassbrownbeauty,1,reel_cache\badassbrownbeauty\DRo-zZAEf55.mp4,16,16,1.0,10.0
2,badassbrownbeauty,2,reel_cache\badassbrownbeauty\DR4oT2ijOhC.mp4,16,16,1.0,10.0
3,badassbrownbeauty,3,reel_cache\badassbrownbeauty\DRxArMLDBOa.mp4,16,15,0.9375,9.38
4,badassbrownbeauty,4,reel_cache\badassbrownbeauty\DRo_FVpEWjD.mp4,16,16,1.0,10.0
5,badassbrownbeauty,5,reel_cache\badassbrownbeauty\DSCzayUjW1f.mp4,16,16,1.0,10.0
6,badassbrownbeauty,6,reel_cache\badassbrownbeauty\DSCzM7mDY95.mp4,16,16,1.0,10.0
7,badassbrownbeauty,7,reel_cache\badassbrownbeauty\DR4oqOqDEUR.mp4,16,16,1.0,10.0
8,badassbrownbeauty,8,reel_cache\badassbrownbeauty\DSFZKKMkaHP.mp4,16,16,1.0,10.0
9,museumofsoum,0,reel_cache\museumofsoum\DPbcj8rjONK.mp4,16,16,1.0,10.0


Unnamed: 0,creator,n_reels,mean_face_density,mean_face_density_0_10,creator_norm
0,badassbrownbeauty,9,0.993056,9.931111,badassbrownbeauty
1,mahiekasharma,10,0.5875,5.875,mahiekasharma
2,museumofsoum,7,0.741071,7.411429,museumofsoum
3,nevaforevaa,10,0.69375,6.939,nevaforevaa
4,riapalkar,10,0.54375,5.436,riapalkar


In [None]:
import pandas as pd
import numpy as np

df_train = pd.read_csv("train_data.csv")
df_train.columns = [c.strip().lower() for c in df_train.columns]
df_train["creator_norm"] = df_train["creator"].astype(str).str.strip().str.lstrip("@").str.lower()

traits = ["cool", "aspirational", "relatable", "credible", "communication", "story_telling"]

df_labels = (
    df_train.groupby("creator_norm", as_index=False)
    .agg(**{t: (t, "mean") for t in traits})
)

df_m = df_labels.merge(
    df_creator_face[["creator_norm", "mean_face_density"]],
    on="creator_norm",
    how="inner"
)

def corr(x, y):
    x = pd.to_numeric(x, errors="coerce")
    y = pd.to_numeric(y, errors="coerce")
    m = x.notna() & y.notna()
    if m.sum() < 3:
        return {"n": int(m.sum()), "pearson": np.nan, "spearman": np.nan}
    return {
        "n": int(m.sum()),
        "pearson": float(x[m].corr(y[m], method="pearson")),
        "spearman": float(x[m].corr(y[m], method="spearman")),
    }

out_rows = []
for t in traits:
    out_rows.append({
        "label": t,
        "metric": "mean_face_density",
        **corr(df_m[t], df_m["mean_face_density"])
    })

out = pd.DataFrame(out_rows).sort_values("pearson", ascending=False)

display(out)
display(df_m.sort_values("mean_face_density", ascending=False))


Unnamed: 0,label,metric,n,pearson,spearman
0,cool,mean_face_density,5,-0.635939,-0.6
1,aspirational,mean_face_density,5,-0.615553,-0.6


Unnamed: 0,creator_norm,cool,aspirational,mean_face_density
0,badassbrownbeauty,3.0,3.0,0.993056
2,museumofsoum,4.0,4.0,0.741071
3,nevaforevaa,2.0,2.0,0.69375
1,mahiekasharma,8.0,9.0,0.5875
4,riapalkar,6.0,6.0,0.54375


In [18]:
import os
import cv2
import numpy as np
import pandas as pd
from pathlib import Path

# ----------------------------
# HAAR FACE HELPERS (self-contained)
# ----------------------------
import cv2
import numpy as np

_FRONTAL = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
_PROFILE = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_profileface.xml")
VIDEO_EXTS = {".mp4", ".mov", ".mkv", ".webm"}


def _detect_any(cascade, gray, min_w):
    faces = cascade.detectMultiScale(
        gray,
        scaleFactor=1.1,
        minNeighbors=5,
        minSize=(min_w, min_w),
        flags=cv2.CASCADE_SCALE_IMAGE,
    )
    return len(faces) > 0

def frame_has_face_haar(frame_bgr, min_face_width_frac=0.06, include_side=True):
    """
    Returns True if a face is detected:
      - frontal cascade (always)
      - plus profile cascade (optional) on original + flipped frame
    """
    if frame_bgr is None:
        return False

    h, w = frame_bgr.shape[:2]
    if h == 0 or w == 0:
        return False

    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    gray = cv2.equalizeHist(gray)

    min_w = max(24, int(w * min_face_width_frac))

    # 1) frontal
    if _detect_any(_FRONTAL, gray, min_w):
        return True

    if not include_side:
        return False

    # 2) side/profile (original)
    if _detect_any(_PROFILE, gray, min_w):
        return True

    # 3) side/profile (flipped) to catch opposite direction
    gray_flip = cv2.flip(gray, 1)
    if _detect_any(_PROFILE, gray_flip, min_w):
        return True

    return False


def compute_face_density_for_frames(frames, min_face_width_frac=0.06):
    if not frames:
        return 0, 0.0
    flags = [frame_has_face_haar(f, min_face_width_frac=min_face_width_frac) for f in frames]
    n_face = int(np.sum(flags))
    dens = float(n_face / len(frames))
    return n_face, dens

def compute_face_density_for_video(video_path: str, max_frames: int = 16, min_face_width_frac: float = 0.06):
    frames = sample_uniform_frames(video_path, max_frames=max_frames)  # uses your existing sampler
    n = len(frames)
    if n == 0:
        return {"n_frames_used": 0, "n_face_frames": 0, "face_frame_density": 0.0, "face_density_0_10": 0.0}

    n_face, dens = compute_face_density_for_frames(frames, min_face_width_frac=min_face_width_frac)
    return {
        "n_frames_used": int(n),
        "n_face_frames": int(n_face),
        "face_frame_density": float(dens),
        "face_density_0_10": round(float(np.clip(dens, 0.0, 1.0) * 10.0), 2),
    }

# ----------------------------
# LOCAL CACHE SCAN
# ----------------------------
def list_cached_videos_for_creator(cache_root: str, creator: str, max_items: int = 10):
    creator_norm = str(creator).strip().lstrip("@").lower()
    root = Path(cache_root)

    cand = []
    # 1) reels_cache/<creator>/**/*
    p1 = root / creator_norm
    if p1.exists():
        cand += [p for p in p1.rglob("*") if p.is_file() and p.suffix.lower() in VIDEO_EXTS and p.stat().st_size > 0]

    # 2) reels_cache/**/*<creator>*/*.*
    if root.exists():
        cand += [
            p for p in root.rglob("*")
            if p.is_file()
            and p.suffix.lower() in VIDEO_EXTS
            and p.stat().st_size > 0
            and (creator_norm in str(p.parent).lower() or creator_norm in str(p).lower())
        ]

    # dedupe + sort newest first
    cand = list({str(p): p for p in cand}.values())
    cand = sorted(cand, key=lambda p: p.stat().st_mtime, reverse=True)[:max_items]
    return cand

def run_face_density_pipeline_for_creators_local(
    creator_list,
    cache_dir: str = "reels_cache",
    max_reels_per_creator: int = 5,
    max_frames_per_reel: int = 16,
    min_face_width_frac: float = 0.06,
):
    rows = []

    for creator in creator_list:
        creator_norm = str(creator).strip().lstrip("@").lower()
        vids = list_cached_videos_for_creator(cache_dir, creator, max_items=max_reels_per_creator)

        print(f"\n=== Face-density (LOCAL ONLY): {creator_norm} ===")
        print(f"Found {len(vids)} cached videos in {cache_dir}")

        for i, p in enumerate(vids):
            video_path = str(p)
            print(f"  ‚ñ∂ {i}: {video_path}")

            metrics = compute_face_density_for_video(
                video_path,
                max_frames=max_frames_per_reel,
                min_face_width_frac=min_face_width_frac,
            )

            rows.append({
                "creator": creator_norm,
                "reel_idx": i,
                "video_path": video_path,
                **metrics,
            })

    df_reels_face = pd.DataFrame(rows)

    if not df_reels_face.empty:
        df_creator_face = (
            df_reels_face
            .groupby("creator", as_index=False)
            .agg(
                n_reels=("reel_idx", "count"),
                mean_face_density=("face_frame_density", "mean"),
                mean_face_density_0_10=("face_density_0_10", "mean"),
            )
        )
        df_creator_face["creator_norm"] = df_creator_face["creator"]
    else:
        df_creator_face = pd.DataFrame(columns=[
            "creator", "n_reels", "mean_face_density", "mean_face_density_0_10", "creator_norm"
        ])

    return df_reels_face, df_creator_face

# ----------------------------
# RUN (change cache_dir to your actual folder name)
# ----------------------------
df_reels_face, df_creator_face = run_face_density_pipeline_for_creators_local(
    CREATOR_LIST,
    cache_dir="reel_cache",  # <-- you used this in your error; change if your folder is reels_cache
    max_reels_per_creator=MAX_REELS_PER_CREATOR,
    max_frames_per_reel=MAX_FRAMES_PER_REEL,
    min_face_width_frac=0.06,
)

display(df_reels_face.head(10))
display(df_creator_face)



=== Face-density (LOCAL ONLY): badassbrownbeauty ===
Found 9 cached videos in reel_cache
  ‚ñ∂ 0: reel_cache\badassbrownbeauty\DR9Y07wkX8b.mp4
    ‚úì Sampled 16 frames (uniform across 1336 total).
  ‚ñ∂ 1: reel_cache\badassbrownbeauty\DRo-zZAEf55.mp4
    ‚úì Sampled 16 frames (uniform across 1780 total).
  ‚ñ∂ 2: reel_cache\badassbrownbeauty\DR4oT2ijOhC.mp4
    ‚úì Sampled 16 frames (uniform across 2729 total).
  ‚ñ∂ 3: reel_cache\badassbrownbeauty\DRxArMLDBOa.mp4
    ‚úì Sampled 16 frames (uniform across 3681 total).
  ‚ñ∂ 4: reel_cache\badassbrownbeauty\DRo_FVpEWjD.mp4
    ‚úì Sampled 16 frames (uniform across 1780 total).
  ‚ñ∂ 5: reel_cache\badassbrownbeauty\DSCzayUjW1f.mp4
    ‚úì Sampled 16 frames (uniform across 2001 total).
  ‚ñ∂ 6: reel_cache\badassbrownbeauty\DSCzM7mDY95.mp4
    ‚úì Sampled 16 frames (uniform across 2001 total).
  ‚ñ∂ 7: reel_cache\badassbrownbeauty\DR4oqOqDEUR.mp4
    ‚úì Sampled 16 frames (uniform across 2729 total).
  ‚ñ∂ 8: reel_cache\badassbrownbeauty\

Unnamed: 0,creator,reel_idx,video_path,n_frames_used,n_face_frames,face_frame_density,face_density_0_10
0,badassbrownbeauty,0,reel_cache\badassbrownbeauty\DR9Y07wkX8b.mp4,16,16,1.0,10.0
1,badassbrownbeauty,1,reel_cache\badassbrownbeauty\DRo-zZAEf55.mp4,16,16,1.0,10.0
2,badassbrownbeauty,2,reel_cache\badassbrownbeauty\DR4oT2ijOhC.mp4,16,16,1.0,10.0
3,badassbrownbeauty,3,reel_cache\badassbrownbeauty\DRxArMLDBOa.mp4,16,15,0.9375,9.38
4,badassbrownbeauty,4,reel_cache\badassbrownbeauty\DRo_FVpEWjD.mp4,16,16,1.0,10.0
5,badassbrownbeauty,5,reel_cache\badassbrownbeauty\DSCzayUjW1f.mp4,16,16,1.0,10.0
6,badassbrownbeauty,6,reel_cache\badassbrownbeauty\DSCzM7mDY95.mp4,16,16,1.0,10.0
7,badassbrownbeauty,7,reel_cache\badassbrownbeauty\DR4oqOqDEUR.mp4,16,16,1.0,10.0
8,badassbrownbeauty,8,reel_cache\badassbrownbeauty\DSFZKKMkaHP.mp4,16,16,1.0,10.0
9,museumofsoum,0,reel_cache\museumofsoum\DPbcj8rjONK.mp4,16,16,1.0,10.0


Unnamed: 0,creator,n_reels,mean_face_density,mean_face_density_0_10,creator_norm
0,badassbrownbeauty,9,0.993056,9.931111,badassbrownbeauty
1,mahiekasharma,10,0.6375,6.373,mahiekasharma
2,museumofsoum,7,0.839286,8.392857,museumofsoum
3,nevaforevaa,10,0.78125,7.815,nevaforevaa
4,riapalkar,10,0.7,6.999,riapalkar


In [19]:
import pandas as pd
import numpy as np

df_train = pd.read_csv("train_data.csv")
df_train.columns = [c.strip().lower() for c in df_train.columns]
df_train["creator_norm"] = df_train["creator"].astype(str).str.strip().str.lstrip("@").str.lower()

traits = ["cool", "aspirational", "relatable", "credible", "communication", "story_telling"]

df_labels = (
    df_train.groupby("creator_norm", as_index=False)
    .agg(**{t: (t, "mean") for t in traits})
)

df_m = df_labels.merge(
    df_creator_face[["creator_norm", "mean_face_density"]],
    on="creator_norm",
    how="inner"
)

def corr(x, y):
    x = pd.to_numeric(x, errors="coerce")
    y = pd.to_numeric(y, errors="coerce")
    m = x.notna() & y.notna()
    if m.sum() < 3:
        return {"n": int(m.sum()), "pearson": np.nan, "spearman": np.nan}
    return {
        "n": int(m.sum()),
        "pearson": float(x[m].corr(y[m], method="pearson")),
        "spearman": float(x[m].corr(y[m], method="spearman")),
    }

out_rows = []
for t in traits:
    out_rows.append({
        "label": t,
        "metric": "mean_face_density",
        **corr(df_m[t], df_m["mean_face_density"])
    })

out = pd.DataFrame(out_rows).sort_values("pearson", ascending=False)

display(out)
display(df_m.sort_values("mean_face_density", ascending=False))


Unnamed: 0,label,metric,n,pearson,spearman
3,credible,mean_face_density,5,0.838407,0.666886
2,relatable,mean_face_density,5,0.752135,0.527046
5,story_telling,mean_face_density,5,0.613666,0.153897
4,communication,mean_face_density,5,0.312286,0.1
0,cool,mean_face_density,5,-0.739604,-0.7
1,aspirational,mean_face_density,5,-0.742319,-0.7


Unnamed: 0,creator_norm,cool,aspirational,relatable,credible,communication,story_telling,mean_face_density
0,badassbrownbeauty,3.0,3.0,7.0,9.0,9.0,9.0,0.993056
2,museumofsoum,4.0,4.0,5.0,4.0,2.0,3.0,0.839286
3,nevaforevaa,2.0,2.0,4.0,2.0,1.0,1.0,0.78125
4,riapalkar,6.0,6.0,4.0,3.0,4.0,4.0,0.7
1,mahiekasharma,8.0,9.0,5.0,3.0,6.5,4.0,0.6375


In [15]:

# ------------------------------------------------------------
# Global word_count distribution + creator outlier ratios
# + correlation vs train_data.csv attributes
# ------------------------------------------------------------
import os
import numpy as np
import pandas as pd

def norm_creator(x: str) -> str:
    if pd.isna(x):
        return np.nan
    return str(x).strip().lstrip("@").lower()

# ---- Paths (edit if needed) ----
REELS_PATH = "my_current_data.csv"   # your uploaded "spreadsheet" CSV
TRAIN_PATH = "train_data.csv"        # you said you have this file

# ---- 1) Load reels + fit global distribution on word_count ----
reels = pd.read_csv(REELS_PATH)

if "creator" not in reels.columns or "word_count" not in reels.columns:
    raise ValueError(f"Expected columns ['creator','word_count'] in {REELS_PATH}. Found: {list(reels.columns)}")

reels["creator_norm"] = reels["creator"].map(norm_creator)
reels["word_count"] = pd.to_numeric(reels["word_count"], errors="coerce")

wc = reels["word_count"].dropna().astype(float)
if len(wc) < 5:
    raise ValueError("Not enough non-null word_count values to fit a distribution.")

# "Fit a global distribution" (Normal): for Normal, MLE mu/sigma are mean/std (ddof=0)
mu = float(wc.mean())
sigma = float(wc.std(ddof=0))

low_thr  = mu - 2.0 * sigma
high_thr = mu + 2.0 * sigma
# low_thr  = mu - sigma
# high_thr = mu + sigma

print(f"Global Normal fit on word_count: mu={mu:.3f}, sigma={sigma:.3f}")
print(f"2œÉ thresholds: low<{low_thr:.3f}, high>{high_thr:.3f}")

# ---- 2) Per-creator ratios on either side of ¬±2œÉ ----
tmp = reels.dropna(subset=["creator_norm"]).copy()
tmp["is_low_2sigma"]  = tmp["word_count"].lt(low_thr)
tmp["is_high_2sigma"] = tmp["word_count"].gt(high_thr)

creator_metrics = (
    tmp.groupby("creator_norm", as_index=False)
       .agg(
           n_posts=("word_count", "size"),
           n_wc_nonnull=("word_count", lambda s: int(s.notna().sum())),
           low_2sigma_count=("is_low_2sigma", "sum"),
           high_2sigma_count=("is_high_2sigma", "sum"),
       )
)

# ratios use ONLY non-null word_count rows (so NaNs don‚Äôt dilute ratios)
den = creator_metrics["n_wc_nonnull"].replace(0, np.nan)
creator_metrics["low_2sigma_ratio"]  = creator_metrics["low_2sigma_count"]  / den
creator_metrics["high_2sigma_ratio"] = creator_metrics["high_2sigma_count"] / den
creator_metrics["outlier_2sigma_ratio"] = (creator_metrics["low_2sigma_count"] + creator_metrics["high_2sigma_count"]) / den

# ---- 3) Load train_data + correlations with attributes ----
if not os.path.exists(TRAIN_PATH):
    raise FileNotFoundError(
        f"Couldn't find {TRAIN_PATH}. Put train_data.csv at that path (or change TRAIN_PATH)."
    )

train = pd.read_csv(TRAIN_PATH)

# Assume train has a creator column; try common variants
creator_col = None
for c in ["creator", "handle", "username", "creator_handle", "ig_handle"]:
    if c in train.columns:
        creator_col = c
        break
if creator_col is None:
    raise ValueError(f"train_data.csv must have a creator identifier column. Found: {list(train.columns)}")

train["creator_norm"] = train[creator_col].map(norm_creator)

# Merge creator-level metrics into train
merged = train.merge(creator_metrics, on="creator_norm", how="left")

# Pick attribute columns that exist (assume these may be present)
candidate_attrs = [
    "cool", "aspirational", "relatable", "credible",
    "communication", "story_telling", "storytelling"
]
attr_cols = [c for c in candidate_attrs if c in merged.columns]

if not attr_cols:
    raise ValueError(
        f"None of the expected attribute columns found in train_data.csv. "
        f"Expected one of: {candidate_attrs}. Found: {list(merged.columns)}"
    )

metric_cols = ["low_2sigma_ratio", "high_2sigma_ratio", "outlier_2sigma_ratio"]

# Helper: compute Pearson + Spearman (with p-values)
from scipy.stats import pearsonr, spearmanr

rows = []
for metric in metric_cols:
    for attr in attr_cols:
        sub = merged[[metric, attr]].dropna()
        if len(sub) < 3:
            continue
        x = sub[metric].astype(float).values
        y = pd.to_numeric(sub[attr], errors="coerce").astype(float).values
        mask = np.isfinite(x) & np.isfinite(y)
        x, y = x[mask], y[mask]
        if len(x) < 3:
            continue

        pr, pp = pearsonr(x, y)
        sr, sp = spearmanr(x, y)

        rows.append({
            "metric": metric,
            "attribute": attr,
            "n": int(len(x)),
            "pearson_r": float(pr),
            "pearson_p": float(pp),
            "spearman_r": float(sr),
            "spearman_p": float(sp),
        })

corr_df = pd.DataFrame(rows).sort_values(
    by=["attribute", "metric", "pearson_p"], ascending=[True, True, True]
)

print("\nTop correlations (by smallest Pearson p-value):")
print(corr_df.sort_values("pearson_p").head(20).to_string(index=False))

# # Optional: save outputs
# creator_metrics.to_csv("/mnt/data/creator_wordcount_outlier_metrics.csv", index=False)
# corr_df.to_csv("/mnt/data/wordcount_outlier_correlations.csv", index=False)
# print("\nSaved:")
# print(" - /mnt/data/creator_wordcount_outlier_metrics.csv")
# print(" - /mnt/data/wordcount_outlier_correlations.csv")

# Optional: quick correlation matrix (Pearson) for the merged table
# (only numeric columns among metrics + attributes)
numeric_cols = metric_cols + attr_cols
corr_mat = merged[numeric_cols].apply(pd.to_numeric, errors="coerce").corr(method="pearson")
print("\nPearson correlation matrix (metrics + attributes):")
print(corr_mat.to_string())


Global Normal fit on word_count: mu=43.107, sigma=50.016
2œÉ thresholds: low<-56.926, high>143.139

Top correlations (by smallest Pearson p-value):
              metric     attribute  n  pearson_r  pearson_p  spearman_r  spearman_p
   high_2sigma_ratio  aspirational 12   0.622368   0.030679    0.759897    0.004131
outlier_2sigma_ratio  aspirational 12   0.622368   0.030679    0.759897    0.004131
   high_2sigma_ratio communication 12   0.550194   0.063813    0.637756    0.025670
outlier_2sigma_ratio communication 12   0.550194   0.063813    0.637756    0.025670
   high_2sigma_ratio          cool 12   0.535226   0.072938    0.600134    0.039107
outlier_2sigma_ratio          cool 12   0.535226   0.072938    0.600134    0.039107
   high_2sigma_ratio story_telling 12   0.464869   0.127829    0.605543    0.036922
outlier_2sigma_ratio story_telling 12   0.464869   0.127829    0.605543    0.036922
outlier_2sigma_ratio     relatable 12   0.092279   0.775465    0.167428    0.602988
   high_2sig

  pr, pp = pearsonr(x, y)
  sr, sp = spearmanr(x, y)
  pr, pp = pearsonr(x, y)
  sr, sp = spearmanr(x, y)
  pr, pp = pearsonr(x, y)
  sr, sp = spearmanr(x, y)
  pr, pp = pearsonr(x, y)
  sr, sp = spearmanr(x, y)
  pr, pp = pearsonr(x, y)
  sr, sp = spearmanr(x, y)
  pr, pp = pearsonr(x, y)
  sr, sp = spearmanr(x, y)


In [None]:
# =============================================================================
# GEMINI MODULE ‚Äî call_gemini_for_reel
# Target: per-reel `gemini_raw` (JSON string with numeric features)
# =============================================================================

import os
import json
import textwrap
from typing import List, Optional

from google import genai
from dotenv import load_dotenv

# -------------------------------------------------------------------------
# ENV + CLIENT SETUP (matching notebook pattern)
# -------------------------------------------------------------------------
load_dotenv()


# GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError("Missing GEMINI_API_KEY (set it in .env or env vars)")

# Use the same style of model name as in the notebook
GEMINI_MODEL_NAME = "models/gemini-2.0-flash-001"

_gemini_client: Optional[genai.Client] = None


def get_gemini_client() -> genai.Client:
    """Singleton-style client initialisation (same pattern as notebook)."""
    global _gemini_client
    if _gemini_client is None:
        _gemini_client = genai.Client(api_key=GEMINI_API_KEY)
    return _gemini_client


# -------------------------------------------------------------------------
# Prompt template (aligned with how you want gemini_raw to look)
# -------------------------------------------------------------------------
GEMINI_PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are helping a beauty and personal-care brand evaluate Instagram creators
    for potential collaborations.

    You will receive ONLY text data for ONE reel in this format:

    INSTAGRAM REEL TEXT DATA

    --- CAPTION ---
    {caption}

    --- TRANSCRIPT (ASR) ---
    {transcript}

    --- COMMENTS (if any) ---
    {comments_block}

    Language may be English, Hinglish (Hindi in Latin script), or a mix.

    Your job is to analyse this reel and return ONLY a compact JSON summary with
    the following fields. Do NOT repeat the caption, transcript, or comments in
    your output. Do NOT add extra text or explanations.

    For each reel, infer:

    1) genz_word_count (integer)
       - Count how many Gen Z / internet slang terms are used in the TRANSCRIPT. 
       - The spellings in transcript may vary.
       - Examples: "lit", "low-key", "high-key", "slay", "vibe", "aesthetic",
         "fr", "no cap", "cap", "on fleek", "serving", "ate", "mood", "delulu",
         "rizz", "iykyk", "brooo", "lol", "lmao", etc.
       - Count total occurrences (if "slay" appears 3 times, that is 3).

    2) is_marketing (0 or 1)
       - 1 if the reel is doing ANY kind of marketing or promotion of beauty /
         personal-care products, brands, or routines.
       - This includes sponsored content, product mentions, recommendations,
         discount codes, affiliate links, "shop now", "use my code", etc.
       - Else 0.

    3) is_educational (0 or 1)
       - 1 if the reel is primarily educational or informative (beauty/hair/skin
         tips, how-to steps, ingredient explanations, routines, "do this / don't
         do this").
       - Else 0.

    4) is_vlog (0 or 1)
       - 1 if the reel is a vlog-style snippet of the creator's life (day in the
         life, GRWM, routine, "come with me", events narrated in first person).
       - Else 0.

    5) has_humour (0 or 1)
       - Look at both TRANSCRIPT and COMMENTS.
       - 1 if there is clear humour or playful/comedic tone, or comments react
         with laughter (üòÇ, ü§£, "I'm dead", "too funny", etc.).
       - Else 0.

    6) comment_sentiment_counts (object)
       - For each TOP COMMENT, classify it into exactly ONE of these buckets:
         - "questioning"   ‚Üí asking questions, clarifications, doubts
         - "agreeing"      ‚Üí agreeing or saying "same", "relatable", "me too"
         - "appreciating"  ‚Üí compliments, praise, admiration
         - "negative"      ‚Üí criticism, dislike, disagreement
         - "neutral"       ‚Üí factual/unclear/irrelevant/any other not fitting above
       - Then return only the aggregate counts of how many comments fall into each
         bucket.

    ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    OUTPUT FORMAT (STRICT)
    ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

    Return your answer as VALID JSON inside <res> ... </res> and nothing else.

    <res>
    {{
      "genz_word_count": INTEGER,
      "is_marketing": 0,
      "is_educational": 0,
      "is_vlog": 0,
      "has_humour": 0,
      "comment_sentiment_counts": {{
        "questioning": INTEGER,
        "agreeing": INTEGER,
        "appreciating": INTEGER,
        "negative": INTEGER,
        "neutral": INTEGER
      }}
    }}
    </res>

    Rules:
    - Do NOT include reasons, explanations, or any extra fields.
    - Do NOT repeat or summarise the caption, transcript, or comments.
    - Always fill every field with an integer (for counts) or 0/1 for binary flags.
    """
).strip()



def _build_gemini_prompt(caption: str, transcript: str, comments: List[str]) -> str:
    """Format the prompt with caption / transcript / comments."""
    caption = caption or ""
    transcript = transcript or ""

    # ‚îÄ‚îÄ NORMALISE COMMENTS TO A SIMPLE LIST OF STRINGS ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    # comments can be:
    # - None
    # - Python list/tuple of strings
    # - numpy array / pandas Series
    # - a single string
    if comments is None:
        comments_list = []
    elif isinstance(comments, (list, tuple)):
        comments_list = list(comments)
    elif hasattr(comments, "tolist"):  # e.g. numpy array, pandas Series
        comments_list = comments.tolist()
    else:
        # single scalar (string or something else) ‚Üí wrap in a list
        comments_list = [comments]

    # Ensure everything is a stripped string and non-empty
    cleaned_comments = []
    for c in comments_list:
        if c is None:
            continue
        s = str(c).strip()
        if s:
            cleaned_comments.append(s)

    if cleaned_comments:
        comments_block = "\n".join(f"- {c}" for c in cleaned_comments[:20])
    else:
        comments_block = "None"

    return GEMINI_PROMPT_TEMPLATE.format(
        caption=caption,
        transcript=transcript,
        comments_block=comments_block,
    )



def _extract_json_object(text: str) -> str:
    """
    Try to extract a JSON object substring from the model output.
    Returns the substring if it parses as JSON, else raises ValueError.
    """
    text = text.strip()
    # Fast path: whole string is JSON
    try:
        json.loads(text)
        return text
    except Exception:
        pass

    # Try to find first '{' and last '}' and parse that substring
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        raise ValueError("No JSON object found in Gemini output.")

    candidate = text[start : end + 1]
    json.loads(candidate)  # will raise if invalid
    return candidate


def call_gemini_for_reel(
    caption: str,
    transcript: str,
    comments: List[str],
    temperature: float = 0.1,
) -> str:
    """
    Call Gemini on a single reel's text information and return a JSON string.

    Inputs:
        caption    - reel caption (string, can be empty)
        transcript - Whisper transcript (string, can be empty)
        comments   - list of comment strings (can be empty)

    Output:
        gemini_raw (str): a JSON string representing a dict of numeric features.
                          This is what you store in your dataframe column 'gemini_raw'.
    """
    prompt = _build_gemini_prompt(caption, transcript, comments)
    client = get_gemini_client()

    try:
        resp = client.models.generate_content(
            model=GEMINI_MODEL_NAME,
            contents=prompt,
            config={"temperature": temperature},
        )
        # New google-genai client: text is on resp.text
        raw_text = (getattr(resp, "text", None) or "").strip()
        if not raw_text:
            print("    ‚úó Gemini returned empty text")
            return ""
    except Exception as e:
        print(f"    ‚úó Gemini API error: {e}")
        return ""

    # Try to extract a valid JSON object from the output
    try:
        json_str = _extract_json_object(raw_text)
    except Exception as e:
        print(f"    ‚úó Could not extract JSON from Gemini output: {e}")
        return ""

    # Final validation: ensure it loads and values are numeric
    try:
        data = json.loads(json_str)
        if not isinstance(data, dict):
            raise ValueError("Gemini JSON is not an object.")
        # Coerce numeric-looking strings into numbers
        for k, v in list(data.items()):
            if isinstance(v, str):
                try:
                    if "." in v:
                        data[k] = float(v)
                    else:
                        data[k] = int(v)
                except Exception:
                    # leave non-numeric strings as-is; they'll be ignored with numeric_only=True
                    pass
    except Exception as e:
        print(f"    ‚úó Gemini JSON validation failed: {e}")
        return ""

    # Return the cleaned JSON string
    return json.dumps(data)


In [25]:
# =============================================================================
# FULL LOCAL-ONLY PIPELINE: cached reels -> change metrics + whisper + gemini
# =============================================================================

import os
import json
import re
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import whisper

# ----------------------------
# CONFIG
# ----------------------------
CACHE_DIR = "reel_cache"          # <-- your cache folder root
VIDEO_EXTS = {".mp4", ".mov", ".mkv", ".webm"}

WHISPER_MODEL_NAME = "medium"      # "tiny" | "base" | "small" | "medium" | "large"
WHISPER_LANGUAGE = None            # set "en" if you want force English; else None = auto
MAX_TRANSCRIPT_CHARS = 12000       # keep prompt manageable

# =============================================================================
# 1) LOCAL CACHE DISCOVERY
# =============================================================================

def list_cached_videos_for_creator(cache_root: str, creator: str, max_items: int = 10):
    """
    Local-only: find cached videos for creator under cache_root.
    Supports common layouts:
      - reels_cache/<creator>/**/*.mp4
      - reels_cache/**/*<creator>*/*.mp4   (e.g. reels_cache/change3_<creator>/0.mp4)
    Returns list[Path], newest first.
    """
    creator_norm = str(creator).strip().lstrip("@").lower()
    root = Path(cache_root)

    cand = []

    # Layout 1: cache_root/<creator>/**
    p1 = root / creator_norm
    if p1.exists():
        cand += [p for p in p1.rglob("*")
                 if p.is_file() and p.suffix.lower() in VIDEO_EXTS and p.stat().st_size > 0]

    # Layout 2: anywhere containing creator in path
    if root.exists():
        cand += [
            p for p in root.rglob("*")
            if p.is_file()
            and p.suffix.lower() in VIDEO_EXTS
            and p.stat().st_size > 0
            and (creator_norm in str(p.parent).lower() or creator_norm in str(p).lower())
        ]

    # Dedup + newest first
    cand = list({str(p): p for p in cand}.values())
    cand = sorted(cand, key=lambda p: p.stat().st_mtime, reverse=True)[:max_items]
    return cand


# =============================================================================
# 2) WHISPER TRANSCRIPTION (in-memory + disk cache)
# =============================================================================

print(f"Loading Whisper model: {WHISPER_MODEL_NAME} ...")
_whisper_model = whisper.load_model(WHISPER_MODEL_NAME)
print("Whisper model loaded.")

_transcript_cache: dict[str, str] = {}

def transcript_cache_path(video_path: str) -> str:
    # store next to mp4
    vp = Path(video_path)
    return str(vp.with_suffix(vp.suffix + ".whisper.txt"))

def load_transcript_cache(video_path: str) -> str:
    p = transcript_cache_path(video_path)
    if os.path.exists(p) and os.path.getsize(p) > 0:
        try:
            return Path(p).read_text(encoding="utf-8").strip()
        except Exception:
            return ""
    return ""

def save_transcript_cache(video_path: str, text: str):
    p = transcript_cache_path(video_path)
    try:
        Path(p).parent.mkdir(parents=True, exist_ok=True)
        Path(p).write_text(text or "", encoding="utf-8")
    except Exception as e:
        print(f"    ‚ö†Ô∏è Could not write transcript cache: {e}")

def transcribe_reel(video_path: str, reel_url: str | None = None, force: bool = False) -> str:
    """
    Shared transcript helper.

    - Takes local `video_path` (already cached).
    - Optional `reel_url` is only used as a cache key.
    - Returns a plain text transcript ("" on failure).
    """
    if not video_path or not os.path.exists(video_path):
        print("    ‚úó Video file not found for transcription:", video_path)
        return ""

    cache_key = reel_url or video_path

    # in-memory cache
    if not force and cache_key in _transcript_cache:
        return _transcript_cache[cache_key]

    # disk cache
    if not force:
        disk = load_transcript_cache(video_path)
        if disk:
            _transcript_cache[cache_key] = disk
            return disk

    try:
        print("    üéô Transcribing audio with Whisper ...")
        use_fp16 = torch.cuda.is_available()
        result = _whisper_model.transcribe(
            video_path,
            fp16=use_fp16,
            language=WHISPER_LANGUAGE,
        )
        text = (result.get("text") or "").strip()
    except Exception as e:
        print(f"    ‚úó Whisper transcription failed: {e}")
        text = ""

    # truncate for prompt safety
    if len(text) > MAX_TRANSCRIPT_CHARS:
        text = text[:MAX_TRANSCRIPT_CHARS] + " ..."

    _transcript_cache[cache_key] = text
    save_transcript_cache(video_path, text)
    return text


# =============================================================================
# 3) GEMINI CACHE (disk)
# =============================================================================

def gemini_cache_path_for_video(video_path: str) -> str:
    # store next to mp4
    vp = Path(video_path)
    return str(vp.with_suffix(vp.suffix + ".gemini.json"))

def load_gemini_cache(video_path: str) -> str:
    p = gemini_cache_path_for_video(video_path)
    if os.path.exists(p) and os.path.getsize(p) > 0:
        try:
            return Path(p).read_text(encoding="utf-8").strip()
        except Exception:
            return ""
    return ""

def save_gemini_cache(video_path: str, gemini_raw: str):
    p = gemini_cache_path_for_video(video_path)
    try:
        Path(p).parent.mkdir(parents=True, exist_ok=True)
        Path(p).write_text(gemini_raw or "", encoding="utf-8")
    except Exception as e:
        print(f"    ‚ö†Ô∏è Could not write Gemini cache: {e}")


# =============================================================================
# 4) MAIN LOCAL-ONLY PIPELINE
# =============================================================================

def run_local_cached_pipeline_for_creators(
    creator_list,
    cache_dir: str = CACHE_DIR,
    max_reels_per_creator: int = 5,
    max_frames_per_reel: int = 16,
    run_whisper: bool = True,
    force_whisper: bool = False,
    run_gemini: bool = True,
    force_gemini: bool = False,
):
    """
    Local-only: NO Apify, NO downloads.
    For each cached video:
      - compute change metrics
      - (optional) whisper transcript
      - (optional) gemini features
    """
    rows = []

    for creator in creator_list:
        creator_norm = str(creator).strip().lstrip("@").lower()
        vids = list_cached_videos_for_creator(cache_dir, creator_norm, max_items=max_reels_per_creator)

        print(f"\n=== LOCAL PIPELINE: {creator_norm} ===")
        print(f"Found {len(vids)} cached videos under {cache_dir}")

        if not vids:
            continue

        for i, p in enumerate(vids):
            video_path = str(p)
            reel_url = None     # unknown in local-only mode
            caption = ""        # optional: load from your own manifest if you have it
            comments = []       # optional: load from manifest

            print(f"\n  ‚ñ∂ Reel {i}: {video_path}")

            # --- change metrics ---
            try:
                change_metrics = compute_three_change_metrics_for_video(
                    video_path,
                    max_frames=max_frames_per_reel
                )
            except Exception as e:
                print(f"    ‚úó Change metric failed: {e}")
                change_metrics = {
                    "n_frames_used": 0,
                    "scene_change_count": 0,
                    "scene_change_density": 0.0,
                    "scene_score_0_10": 0.0,
                    "mean_clip_dist": 0.0,
                    "std_clip_dist": 0.0,
                    "clip_score_0_10": 0.0,
                    "mean_hist_dist": 0.0,
                    "std_hist_dist": 0.0,
                    "hist_score_0_10": 0.0,
                }

            # --- transcript ---
            transcript = ""
            if run_whisper:
                transcript = transcribe_reel(video_path, reel_url=reel_url, force=force_whisper)

            # --- gemini ---
            gemini_raw = ""
            if run_gemini:
                if not force_gemini:
                    gemini_raw = load_gemini_cache(video_path)

                if gemini_raw:
                    print("    ‚úÖ Gemini cache hit")
                else:
                    print("    ü§ñ Calling Gemini ...")
                    gemini_raw = call_gemini_for_reel(
                        caption=caption,
                        transcript=transcript,
                        comments=comments,
                        temperature=0.1,
                    )
                    save_gemini_cache(video_path, gemini_raw)
                    print("    üíæ Saved Gemini cache")

            row = {
                "creator": creator_norm,
                "reel_idx": i,
                "reel_url": reel_url,
                "caption": caption,
                "video_path": video_path,
                "transcript": transcript,
                "gemini_raw": gemini_raw,
            }
            row.update(change_metrics)
            rows.append(row)

    df_reels_all = pd.DataFrame(rows)

    # per-creator aggregation
    if not df_reels_all.empty:
        df_creator_all = (
            df_reels_all
            .groupby("creator", as_index=False)
            .agg(
                n_reels=("reel_idx", "count"),
                mean_scene_score=("scene_score_0_10", "mean"),
                mean_clip_score=("clip_score_0_10", "mean"),
                mean_hist_score=("hist_score_0_10", "mean"),
                max_scene_score=("scene_score_0_10", "max"),
                max_clip_score=("clip_score_0_10", "max"),
                max_hist_score=("hist_score_0_10", "max"),
            )
        )
    else:
        df_creator_all = pd.DataFrame()

    return df_reels_all, df_creator_all


# =============================================================================
# 5) RUN
# =============================================================================

df_reels_all, df_creator_all = run_local_cached_pipeline_for_creators(
    CREATOR_LIST,
    cache_dir=CACHE_DIR,
    max_reels_per_creator=MAX_REELS_PER_CREATOR,
    max_frames_per_reel=MAX_FRAMES_PER_REEL,
    run_whisper=True,
    run_gemini=True,
    force_whisper=False,
    force_gemini=False,
)

display(df_reels_all.head(10))
display(df_creator_all)


Loading Whisper model: medium ...


KeyboardInterrupt: 

In [2]:
# =============================================================================
# FULL LOCAL-ONLY PIPELINE (WITH GEMINI PROMPT + WHISPER + CACHES)
# - No Apify
# - No downloads
# - Reads cached mp4s from CACHE_DIR
# - Uses caption/comments from sidecar JSON if available
# =============================================================================

import os
import re
import json
import textwrap
from pathlib import Path
from typing import List, Optional

import numpy as np
import pandas as pd

from dotenv import load_dotenv
from google import genai

import torch
import whisper

# ----------------------------
# CONFIG
# ----------------------------
CACHE_DIR = "reel_cache"          # <-- set this to your cache root
VIDEO_EXTS = {".mp4", ".mov", ".mkv", ".webm"}

WHISPER_MODEL_NAME = "medium"
GEMINI_MODEL_NAME = "models/gemini-2.0-flash-001"

MAX_FRAMES_PER_REEL = 16
MAX_REELS_PER_CREATOR = 5
MAX_TRANSCRIPT_CHARS = 12000

# =============================================================================
# 1) GEMINI MODULE (prompt + helpers) ‚Äî FROM YOUR CELL (with small fixes)
# =============================================================================

load_dotenv()
GEMINI_API_KEY="AIzaSyCKS5i6kcsAPK_TuAOno9OUHdoqFRx-PRU"

if not GEMINI_API_KEY:
    raise RuntimeError("Missing GEMINI_API_KEY (set it in .env or env vars)")


GEMINI_MODEL_NAME = "models/gemini-2.0-flash-001"

_gemini_client: Optional[genai.Client] = None

def get_gemini_client() -> genai.Client:
    global _gemini_client
    if _gemini_client is None:
        _gemini_client = genai.Client(api_key=GEMINI_API_KEY)
    return _gemini_client


# -------------------------------------------------------------------------
# Prompt template (UPDATED)
# -------------------------------------------------------------------------
GEMINI_PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are helping a beauty and personal-care brand evaluate Instagram creators
    for potential collaborations.

    You will receive ONLY text data for ONE reel in this format:

    INSTAGRAM REEL TEXT DATA

    --- CAPTION ---
    {caption}

    --- TRANSCRIPT (ASR) ---
    {transcript}

    --- COMMENTS (if any) ---
    {comments_block}

    Language may be English, Hinglish (Hindi in Latin script), or a mix.

    Your job is to analyse this reel and return ONLY a compact JSON summary with
    the following fields. Do NOT repeat the caption, transcript, or comments in
    your output. Do NOT add extra text or explanations.

    For each reel, infer:

    1) genz_word_count (integer)
       - Count how many Gen Z / internet slang terms are used in the TRANSCRIPT.
       - Count total occurrences.

    2) is_marketing (0 or 1)
       - 1 if the reel is doing ANY kind of marketing or promotion.

    3) is_educational (0 or 1)
       - 1 if the reel is primarily educational or informative.

    4) is_vlog (0 or 1)
       - 1 if the reel is vlog-style / day-in-life / GRWM / narrated routine.

    5) has_humour (0 or 1)
       - Look at TRANSCRIPT and COMMENTS.
       - 1 if there is clear humour / comedic tone / laughter reactions.

    6) comment_sentiment_counts (object)
       - For each TOP COMMENT, classify it into exactly ONE bucket:
         "questioning", "agreeing", "appreciating", "negative", "neutral"
       - Return only aggregate counts.

    7) is_arts_culture (0 or 1)   ‚úÖ NEW
       - 1 if the reel meaningfully discusses or references topics like:
         - Art / artists / painting / sculpture / design / architecture
         - Museums / exhibitions / galleries
         - Literature / poetry / books / authors
         - Theatre / dance / performance / classical arts
         - Culture / cultural identity / heritage / tradition / mythology
         - Cultural history / historical narratives / history explanations
         - Movies / cinema / film / TV / documentaries / famous scenes
         - Pop-culture commentary is allowed IF it‚Äôs about movies/film/media
       - Use CAPTION + TRANSCRIPT primarily (COMMENTS can provide weak evidence).
       - If it's only a passing mention (e.g., ‚Äúmovie night lol‚Äù), keep it 0.

    ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    OUTPUT FORMAT (STRICT)
    ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

    Return your answer as VALID JSON inside <res> ... </res> and nothing else.

    <res>
    {{
      "genz_word_count": INTEGER,
      "is_marketing": 0,
      "is_educational": 0,
      "is_vlog": 0,
      "has_humour": 0,
      "is_arts_culture": 0,
      "comment_sentiment_counts": {{
        "questioning": INTEGER,
        "agreeing": INTEGER,
        "appreciating": INTEGER,
        "negative": INTEGER,
        "neutral": INTEGER
      }}
    }}
    </res>

    Rules:
    - Do NOT include reasons, explanations, or any extra fields.
    - Do NOT repeat or summarise the caption, transcript, or comments.
    - Always fill every field with an integer (for counts) or 0/1 for binary flags.
    """
).strip()


def _build_gemini_prompt(caption: str, transcript: str, comments: List[str]) -> str:
    caption = caption or ""
    transcript = transcript or ""

    if comments is None:
        comments_list = []
    elif isinstance(comments, (list, tuple)):
        comments_list = list(comments)
    elif hasattr(comments, "tolist"):
        comments_list = comments.tolist()
    else:
        comments_list = [comments]

    cleaned_comments = []
    for c in comments_list:
        if c is None:
            continue
        s = str(c).strip()
        if s:
            cleaned_comments.append(s)

    comments_block = "\n".join(f"- {c}" for c in cleaned_comments[:20]) if cleaned_comments else "None"

    return GEMINI_PROMPT_TEMPLATE.format(
        caption=caption,
        transcript=transcript,
        comments_block=comments_block,
    )


def _extract_json_object(text: str) -> str:
    text = (text or "").strip()

    # Prefer <res> ... </res>
    m = re.search(r"<res>\s*(\{.*?\})\s*</res>", text, flags=re.DOTALL)
    if m:
        candidate = m.group(1).strip()
        json.loads(candidate)
        return candidate

    # Fast path: whole string is JSON
    try:
        json.loads(text)
        return text
    except Exception:
        pass

    # Fallback: first '{' ... last '}'
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        raise ValueError("No JSON object found in Gemini output.")
    candidate = text[start:end+1]
    json.loads(candidate)
    return candidate


# Expected schema (so your pipeline doesn't break if Gemini misses a field)
_EXPECTED_TOP_LEVEL = {
    "genz_word_count": 0,
    "is_marketing": 0,
    "is_educational": 0,
    "is_vlog": 0,
    "has_humour": 0,
    "is_arts_culture": 0,  # ‚úÖ NEW
    "comment_sentiment_counts": {
        "questioning": 0,
        "agreeing": 0,
        "appreciating": 0,
        "negative": 0,
        "neutral": 0,
    },
}

def _coerce_int01(x, default=0):
    try:
        v = int(float(x))
        return 1 if v >= 1 else 0
    except Exception:
        return default

def _coerce_int(x, default=0):
    try:
        return int(float(x))
    except Exception:
        return default


def call_gemini_for_reel(
    caption: str,
    transcript: str,
    comments: List[str],
    temperature: float = 0.1,
) -> str:
    prompt = _build_gemini_prompt(caption, transcript, comments)
    client = get_gemini_client()

    try:
        resp = client.models.generate_content(
            model=GEMINI_MODEL_NAME,
            contents=prompt,
            config={"temperature": temperature},
        )
        raw_text = (getattr(resp, "text", None) or "").strip()
        if not raw_text:
            print("    ‚úó Gemini returned empty text")
            return ""
    except Exception as e:
        print(f"    ‚úó Gemini API error: {e}")
        return ""

    try:
        json_str = _extract_json_object(raw_text)
        data = json.loads(json_str)
        if not isinstance(data, dict):
            raise ValueError("Gemini JSON is not an object.")
    except Exception as e:
        print(f"    ‚úó Gemini JSON parse failed: {e}")
        return ""

    # ---- Schema enforcement + coercion ----
    out = dict(_EXPECTED_TOP_LEVEL)  # defaults

    out["genz_word_count"] = _coerce_int(data.get("genz_word_count", 0), 0)
    out["is_marketing"] = _coerce_int01(data.get("is_marketing", 0), 0)
    out["is_educational"] = _coerce_int01(data.get("is_educational", 0), 0)
    out["is_vlog"] = _coerce_int01(data.get("is_vlog", 0), 0)
    out["has_humour"] = _coerce_int01(data.get("has_humour", 0), 0)
    out["is_arts_culture"] = _coerce_int01(data.get("is_arts_culture", 0), 0)  # ‚úÖ NEW

    csc = data.get("comment_sentiment_counts", {}) or {}
    out["comment_sentiment_counts"] = {
        "questioning": _coerce_int(csc.get("questioning", 0), 0),
        "agreeing": _coerce_int(csc.get("agreeing", 0), 0),
        "appreciating": _coerce_int(csc.get("appreciating", 0), 0),
        "negative": _coerce_int(csc.get("negative", 0), 0),
        "neutral": _coerce_int(csc.get("neutral", 0), 0),
    }

    return json.dumps(out)

# =============================================================================
# 2) WHISPER TRANSCRIPT MODULE (your logic + disk cache)
# =============================================================================

print(f"Loading Whisper model: {WHISPER_MODEL_NAME} ...")
_whisper_model = whisper.load_model(WHISPER_MODEL_NAME)
print("Whisper model loaded.")

_transcript_cache: dict[str, str] = {}

def transcript_cache_path(video_path: str) -> str:
    vp = Path(video_path)
    return str(vp.with_suffix(vp.suffix + ".whisper.txt"))

def load_transcript_cache(video_path: str) -> str:
    p = transcript_cache_path(video_path)
    if os.path.exists(p) and os.path.getsize(p) > 0:
        try:
            return Path(p).read_text(encoding="utf-8").strip()
        except Exception:
            return ""
    return ""

def save_transcript_cache(video_path: str, text: str):
    p = transcript_cache_path(video_path)
    try:
        Path(p).parent.mkdir(parents=True, exist_ok=True)
        Path(p).write_text(text or "", encoding="utf-8")
    except Exception as e:
        print(f"    ‚ö†Ô∏è Could not write transcript cache: {e}")

def transcribe_reel(video_path: str, reel_url: str | None = None, force: bool = False) -> str:
    if not video_path or not os.path.exists(video_path):
        print("    ‚úó Video file not found for transcription:", video_path)
        return ""

    cache_key = reel_url or video_path
    if not force and cache_key in _transcript_cache:
        return _transcript_cache[cache_key]

    if not force:
        disk = load_transcript_cache(video_path)
        if disk:
            _transcript_cache[cache_key] = disk
            return disk

    try:
        print("    üéô Transcribing audio with Whisper ...")
        use_fp16 = torch.cuda.is_available()
        result = _whisper_model.transcribe(video_path, fp16=use_fp16)
        text = (result.get("text") or "").strip()
    except Exception as e:
        print(f"    ‚úó Whisper transcription failed: {e}")
        text = ""

    if len(text) > MAX_TRANSCRIPT_CHARS:
        text = text[:MAX_TRANSCRIPT_CHARS] + " ..."

    _transcript_cache[cache_key] = text
    save_transcript_cache(video_path, text)
    return text


# =============================================================================
# 3) GEMINI DISK CACHE
# =============================================================================

def gemini_cache_path(video_path: str) -> str:
    vp = Path(video_path)
    return str(vp.with_suffix(vp.suffix + ".gemini.json"))

def load_gemini_cache(video_path: str) -> str:
    p = gemini_cache_path(video_path)
    if os.path.exists(p) and os.path.getsize(p) > 0:
        try:
            return Path(p).read_text(encoding="utf-8").strip()
        except Exception:
            return ""
    return ""

def save_gemini_cache(video_path: str, gemini_raw: str):
    p = gemini_cache_path(video_path)
    try:
        Path(p).parent.mkdir(parents=True, exist_ok=True)
        Path(p).write_text(gemini_raw or "", encoding="utf-8")
    except Exception as e:
        print(f"    ‚ö†Ô∏è Could not write Gemini cache: {e}")


# =============================================================================
# 4) OPTIONAL: LOAD CAPTION/COMMENTS FROM SIDECAR JSON
#    Put a JSON next to mp4 like: <video>.meta.json
#    Example fields it can contain:
#      { "reel_url": "...", "caption": "...", "latestComments": [...], "flat_comments": [...] }
# =============================================================================

def flatten_comments(latest_comments, max_n=50):
    if not isinstance(latest_comments, list):
        return []
    out = []
    for c in latest_comments[:max_n]:
        if isinstance(c, dict):
            txt = c.get("text") or c.get("body") or ""
            if txt.strip():
                out.append(txt.strip())
        elif isinstance(c, str) and c.strip():
            out.append(c.strip())
    return out

def load_text_sidecar(video_path: str):
    """
    Looks for:
      - <video>.meta.json
      - OR any *.json in the same folder (first match) that contains caption/url/comments
    Returns: (reel_url, caption, flat_comments)
    """
    vp = Path(video_path)
    meta1 = vp.with_suffix(vp.suffix + ".meta.json")

    candidates = []
    if meta1.exists():
        candidates.append(meta1)
    # fallback: any json in same directory
    candidates += list(vp.parent.glob("*.json"))

    for p in candidates:
        try:
            data = json.loads(p.read_text(encoding="utf-8"))
            if not isinstance(data, dict):
                continue
            reel_url = data.get("reel_url") or data.get("url") or data.get("postUrl") or None
            caption = data.get("caption") or ""
            # accept either "flat_comments" already or build from "latestComments"
            if "flat_comments" in data:
                flat_comments = data.get("flat_comments") or []
            else:
                flat_comments = flatten_comments(data.get("latestComments"), max_n=50)
            return reel_url, caption, flat_comments
        except Exception:
            continue

    return None, "", []


# =============================================================================
# 5) LOCAL CACHE DISCOVERY
# =============================================================================

def list_cached_videos_for_creator(cache_root: str, creator: str, max_items: int = 10):
    creator_norm = str(creator).strip().lstrip("@").lower()
    root = Path(cache_root)

    cand = []

    # Layout 1: cache_root/<creator>/**
    p1 = root / creator_norm
    if p1.exists():
        cand += [p for p in p1.rglob("*")
                 if p.is_file() and p.suffix.lower() in VIDEO_EXTS and p.stat().st_size > 0]

    # Layout 2: anywhere containing creator in path
    if root.exists():
        cand += [
            p for p in root.rglob("*")
            if p.is_file()
            and p.suffix.lower() in VIDEO_EXTS
            and p.stat().st_size > 0
            and (creator_norm in str(p.parent).lower() or creator_norm in str(p).lower())
        ]

    cand = list({str(p): p for p in cand}.values())
    cand = sorted(cand, key=lambda p: p.stat().st_mtime, reverse=True)[:max_items]
    return cand


# =============================================================================
# 6) MAIN LOCAL PIPELINE
#    Requires your existing function:
#      compute_three_change_metrics_for_video(video_path, max_frames=...)
# =============================================================================

def run_local_cached_pipeline_for_creators(
    creator_list,
    cache_dir: str = CACHE_DIR,
    max_reels_per_creator: int = MAX_REELS_PER_CREATOR,
    max_frames_per_reel: int = MAX_FRAMES_PER_REEL,
    run_whisper: bool = True,
    force_whisper: bool = False,
    run_gemini: bool = True,
    force_gemini: bool = False,
):
    rows = []

    for creator in creator_list:
        creator_norm = str(creator).strip().lstrip("@").lower()
        vids = list_cached_videos_for_creator(cache_dir, creator_norm, max_items=max_reels_per_creator)

        print(f"\n=== LOCAL PIPELINE: {creator_norm} ===")
        print(f"Found {len(vids)} cached videos under {cache_dir}")

        if not vids:
            continue

        for i, p in enumerate(vids):
            video_path = str(p)
            print(f"\n  ‚ñ∂ Reel {i}: {video_path}")

            # Load text bundle (caption/comments/url) for Gemini prompt
            reel_url, caption, flat_comments = load_text_sidecar(video_path)

            # 1) change metrics
            try:
                change_metrics = compute_three_change_metrics_for_video(
                    video_path,
                    max_frames=max_frames_per_reel
                )
            except Exception as e:
                print(f"    ‚úó Change metric failed: {e}")
                change_metrics = {
                    "n_frames_used": 0,
                    "scene_change_count": 0,
                    "scene_change_density": 0.0,
                    "scene_score_0_10": 0.0,
                    "mean_clip_dist": 0.0,
                    "std_clip_dist": 0.0,
                    "clip_score_0_10": 0.0,
                    "mean_hist_dist": 0.0,
                    "std_hist_dist": 0.0,
                    "hist_score_0_10": 0.0,
                }

            # 2) whisper transcript
            transcript = ""
            if run_whisper:
                transcript = transcribe_reel(video_path, reel_url=reel_url, force=force_whisper)

            # 3) gemini (cached)
            gemini_raw = ""
            if run_gemini:
                if not force_gemini:
                    gemini_raw = load_gemini_cache(video_path)

                if gemini_raw:
                    print("    ‚úÖ Gemini cache hit")
                else:
                    print("    ü§ñ Calling Gemini ...")
                    gemini_raw = call_gemini_for_reel(
                        caption=caption,
                        transcript=transcript,
                        comments=flat_comments,
                        temperature=0.1,
                    )
                    save_gemini_cache(video_path, gemini_raw)
                    print("    üíæ Saved Gemini cache")

            row = {
                "creator": creator_norm,
                "reel_idx": i,
                "reel_url": reel_url,
                "caption": caption,
                "video_path": video_path,
                "transcript": transcript,
                "gemini_raw": gemini_raw,
            }
            row.update(change_metrics)
            rows.append(row)

    df_reels_all = pd.DataFrame(rows)

    if not df_reels_all.empty:
        df_creator_all = (
            df_reels_all
            .groupby("creator", as_index=False)
            .agg(
                n_reels=("reel_idx", "count"),
                mean_scene_score=("scene_score_0_10", "mean"),
                mean_clip_score=("clip_score_0_10", "mean"),
                mean_hist_score=("hist_score_0_10", "mean"),
                max_scene_score=("scene_score_0_10", "max"),
                max_clip_score=("clip_score_0_10", "max"),
                max_hist_score=("hist_score_0_10", "max"),
            )
        )
    else:
        df_creator_all = pd.DataFrame()

    return df_reels_all, df_creator_all


# =============================================================================
# RUN
# =============================================================================
df_reels_all, df_creator_all = run_local_cached_pipeline_for_creators(
    CREATOR_LIST,
    cache_dir=CACHE_DIR,
    max_reels_per_creator=MAX_REELS_PER_CREATOR,
    max_frames_per_reel=MAX_FRAMES_PER_REEL,
    run_whisper=True,
    run_gemini=True,
    force_whisper=False,
    force_gemini=False,
)

display(df_reels_all.head(10))
display(df_creator_all)


Loading Whisper model: medium ...
Whisper model loaded.

=== LOCAL PIPELINE: badassbrownbeauty ===
Found 5 cached videos under reel_cache

  ‚ñ∂ Reel 0: reel_cache\badassbrownbeauty\DR9Y07wkX8b.mp4
    ‚úó Change metric failed: name 'compute_three_change_metrics_for_video' is not defined
    ‚úÖ Gemini cache hit

  ‚ñ∂ Reel 1: reel_cache\badassbrownbeauty\DRo-zZAEf55.mp4
    ‚úó Change metric failed: name 'compute_three_change_metrics_for_video' is not defined
    ‚úÖ Gemini cache hit

  ‚ñ∂ Reel 2: reel_cache\badassbrownbeauty\DR4oT2ijOhC.mp4
    ‚úó Change metric failed: name 'compute_three_change_metrics_for_video' is not defined
    ‚úÖ Gemini cache hit

  ‚ñ∂ Reel 3: reel_cache\badassbrownbeauty\DRxArMLDBOa.mp4
    ‚úó Change metric failed: name 'compute_three_change_metrics_for_video' is not defined
    ‚úÖ Gemini cache hit

  ‚ñ∂ Reel 4: reel_cache\badassbrownbeauty\DRo_FVpEWjD.mp4
    ‚úó Change metric failed: name 'compute_three_change_metrics_for_video' is not defined
    ‚úÖ

Unnamed: 0,creator,reel_idx,reel_url,caption,video_path,transcript,gemini_raw,n_frames_used,scene_change_count,scene_change_density,scene_score_0_10,mean_clip_dist,std_clip_dist,clip_score_0_10,mean_hist_dist,std_hist_dist,hist_score_0_10
0,badassbrownbeauty,0,,,reel_cache\badassbrownbeauty\DR9Y07wkX8b.mp4,Imagine this being the vibe every time you tak...,"{""genz_word_count"": 1, ""is_marketing"": 1, ""is_...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,badassbrownbeauty,1,,,reel_cache\badassbrownbeauty\DRo-zZAEf55.mp4,"I'm just gonna come out and say it, I think th...","{""genz_word_count"": 0, ""is_marketing"": 1, ""is_...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,badassbrownbeauty,2,,,reel_cache\badassbrownbeauty\DR4oT2ijOhC.mp4,If you're as obsessed as I am with beautiful g...,"{""genz_word_count"": 0, ""is_marketing"": 1, ""is_...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,badassbrownbeauty,3,,,reel_cache\badassbrownbeauty\DRxArMLDBOa.mp4,Here's everything I bought from the NYCA Pink ...,"{""genz_word_count"": 0, ""is_marketing"": 1, ""is_...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,badassbrownbeauty,4,,,reel_cache\badassbrownbeauty\DRo_FVpEWjD.mp4,"I'm just gonna come out and say it, I think th...","{""genz_word_count"": 0, ""is_marketing"": 1, ""is_...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,museumofsoum,0,,,reel_cache\museumofsoum\DPbcj8rjONK.mp4,,"{""genz_word_count"": 0, ""is_marketing"": 0, ""is_...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,museumofsoum,1,,,reel_cache\museumofsoum\C_5FVUuql9u.mp4,[(The Andy Dre Ver.),"{""genz_word_count"": 0, ""is_marketing"": 0, ""is_...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,museumofsoum,2,,,reel_cache\museumofsoum\DOOlRBMk3Rj.mp4,,"{""genz_word_count"": 0, ""is_marketing"": 0, ""is_...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,museumofsoum,3,,,reel_cache\museumofsoum\DPyJR1jkx3i.mp4,Thanks for watching!,"{""genz_word_count"": 0, ""is_marketing"": 0, ""is_...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,museumofsoum,4,,,reel_cache\museumofsoum\DPbb36Ok0zb.mp4,"Words don't affect me that much. I told you, C...","{""genz_word_count"": 0, ""is_marketing"": 0, ""is_...",0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,creator,n_reels,mean_scene_score,mean_clip_score,mean_hist_score,max_scene_score,max_clip_score,max_hist_score
0,badassbrownbeauty,5,0.0,0.0,0.0,0.0,0.0,0.0
1,mahiekasharma,5,0.0,0.0,0.0,0.0,0.0,0.0
2,museumofsoum,5,0.0,0.0,0.0,0.0,0.0,0.0
3,nevaforevaa,5,0.0,0.0,0.0,0.0,0.0,0.0
4,riapalkar,5,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
import pandas as pd
import numpy as np
import json
import re

# ---- 1) Labels (train_data.csv) ----
df_train = pd.read_csv("train_data.csv")

def norm_col(c):
    c = str(c).replace("\r", "").replace("\n", "").replace("\t", "")
    c = c.strip().lower()
    c = re.sub(r"\s+", "_", c)
    c = re.sub(r"_+", "_", c)
    return c

df_train.columns = [norm_col(c) for c in df_train.columns]
df_train["creator_norm"] = df_train["creator"].astype(str).str.strip().str.lstrip("@").str.lower()

traits = ["cool", "aspirational", "relatable", "credible", "communication", "story_telling"]
missing_traits = [t for t in traits if t not in df_train.columns]
if missing_traits:
    raise ValueError(f"Missing trait cols {missing_traits}. Available: {list(df_train.columns)}")

df_labels = (
    df_train.groupby("creator_norm", as_index=False)
    .agg(**{t: (t, "mean") for t in traits})
)

# ---- 2) Parse gemini_raw from df_reels_all ----
if "df_reels_all" not in globals():
    raise ValueError("df_reels_all not found. Run your pipeline first to create df_reels_all.")

df_g = df_reels_all.copy()
df_g["creator_norm"] = df_g["creator"].astype(str).str.strip().str.lstrip("@").str.lower()

def parse_gemini_raw(s):
    if not isinstance(s, str) or not s.strip():
        return {}
    try:
        d = json.loads(s)
        if not isinstance(d, dict):
            return {}
    except Exception:
        return {}

    out = {}
    # top-level fields
    for k, v in d.items():
        if k == "comment_sentiment_counts":
            continue
        out[k] = v

    # flatten comment sentiment counts
    csc = d.get("comment_sentiment_counts") or {}
    if isinstance(csc, dict):
        for kk, vv in csc.items():
            out[f"comment_{kk}"] = vv

    return out

df_gem = pd.json_normalize(df_g["gemini_raw"].apply(parse_gemini_raw))
df_g2 = pd.concat([df_g[["creator_norm"]], df_gem], axis=1)

# numeric coercion
for c in df_g2.columns:
    if c != "creator_norm":
        df_g2[c] = pd.to_numeric(df_g2[c], errors="coerce")

gemini_metrics = [c for c in df_g2.columns if c != "creator_norm"]
if not gemini_metrics:
    raise ValueError("No Gemini metrics parsed from gemini_raw.")

# ---- 3) Aggregate gemini metrics per creator + merge ----
df_gem_creator = (
    df_g2.groupby("creator_norm", as_index=False)
    .agg(**{m: (m, "mean") for m in gemini_metrics})
)

df_m = df_labels.merge(df_gem_creator, on="creator_norm", how="inner")

print("Creators matched:", len(df_m))

# ---- 4) Correlation matrix (Pearson) ----
corr_matrix = df_m[traits + gemini_metrics].corr(method="pearson").loc[traits, gemini_metrics]

display(corr_matrix)


Creators matched: 5


Unnamed: 0,genz_word_count,is_marketing,is_educational,is_vlog,has_humour,comment_questioning,comment_agreeing,comment_appreciating,comment_negative,comment_neutral,is_arts_culture
cool,0.1392715,0.010012,,0.909718,-0.60351,,,,,,0.324967
aspirational,0.1611646,0.026068,,0.888235,-0.564076,,,,,,0.241747
relatable,1.266863e-16,0.885937,,-0.372678,-0.456435,,,,,,-0.456435
credible,0.04029115,0.88632,,-0.394771,-0.443203,,,,,,-0.241747
communication,0.4262465,0.956037,,0.208817,-0.596745,,,,,,-0.085249
story_telling,0.2274294,0.956449,,-0.061898,-0.606478,,,,,,-0.037905


In [5]:
df_gem_creator

Unnamed: 0,creator_norm,genz_word_count,is_marketing,is_educational,is_vlog,has_humour,comment_questioning,comment_agreeing,comment_appreciating,comment_negative,comment_neutral,is_arts_culture
0,badassbrownbeauty,0.2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,mahiekasharma,0.2,0.4,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,museumofsoum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,nevaforevaa,0.2,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
4,riapalkar,0.2,0.2,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.2
