In [4]:
import json
import pandas as pd
from datetime import datetime

# Load JSON data
with open('hm_reels.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten nested structures
# json_normalize will automatically expand nested dicts; lists remain as lists
df = pd.json_normalize(
    data,
    sep='.'  # use dot to separate nested keys
)

# Drop columns with all missing values
df.dropna(axis=1, how='all', inplace=True)

# Convert Unix timestamps to datetime for start_date and end_date
# for col in ['start_date', 'end_date']:
#     if col in df.columns:
#         df[col] = pd.to_datetime(df[col], unit='s', errors='coerce')

# Handle missing values
# For numeric columns, fill missing with 0
num_cols = df.select_dtypes(include=['number']).columns
df[num_cols] = df[num_cols].fillna(0)

# For object/string columns, fill missing with empty string
obj_cols = df.select_dtypes(include=['object']).columns
df[obj_cols] = df[obj_cols].fillna('')

# Example: if there are list columns you may want to convert them to strings
list_cols = [c for c in df.columns if df[c].apply(lambda x: isinstance(x, list)).any()]
for col in list_cols:
    df[col] = df[col].apply(lambda x: ','.join(map(str, x)) if isinstance(x, list) else x)

print(df.head())


                       inputUrl                   id   type    shortCode  \
0  https://www.instagram.com/hm  3631580891738343489  Video  DJl95SBKQhB   
1  https://www.instagram.com/hm  3627140706824916001  Video  DJWMUFkqIwh   
2  https://www.instagram.com/hm  3623604822922045578  Video  DJJoWNEqSiK   
3  https://www.instagram.com/hm  3627835858547757447  Video  DJYqX4EKdGH   
4  https://www.instagram.com/hm  3625841975412565105  Video  DJRlBDsKqRx   

                                             caption hashtags     mentions  \
0  Summer dreaming. The H&M Summer 2025 collectio...                         
1  The looks you give when someone says life isn’...                         
2                                A walk in the park.                         
3  The H&M Studio Resort Capsule pays homage to t...           eyesrodgers   
4  Electric blue is everything. The H&M Studio Re...           eyesrodgers   

                                        url  commentsCount     firstCommen

In [11]:
# Prints an Index of all column names
print(df.columns)

Index(['inputUrl', 'id', 'type', 'shortCode', 'caption', 'hashtags',
       'mentions', 'url', 'commentsCount', 'firstComment', 'latestComments',
       'dimensionsHeight', 'dimensionsWidth', 'displayUrl', 'images',
       'videoUrl', 'likesCount', 'videoViewCount', 'videoPlayCount',
       'timestamp', 'childPosts', 'ownerFullName', 'ownerUsername', 'ownerId',
       'productType', 'videoDuration', 'isSponsored', 'isCommentsDisabled',
       'musicInfo.artist_name', 'musicInfo.song_name',
       'musicInfo.uses_original_audio', 'musicInfo.should_mute_audio',
       'musicInfo.should_mute_audio_reason', 'musicInfo.audio_id',
       'taggedUsers', 'coauthorProducers'],
      dtype='object')


In [None]:
# Using a list comprehension for a bit more control
all_tags = df['musicInfo.song_name'].tolist()
non_empty_unique = list({tag for tag in all_tags if tag not in (None, '', float('nan'))})
print(non_empty_unique)

In [3]:
import pandas as pd
hm = pd.read_json('hm_filtered_reels.json')
columns_to_keep = ['caption', 'hashtags',
       'url', 'commentsCount', 'latestComments',
       'displayUrl', 'videoUrl', 'likesCount', 'videoViewCount', 'videoPlayCount',
       'timestamp', 'childPosts', 'ownerFullName', 'ownerUsername']

# Filter the DataFrame
hm = hm[columns_to_keep]
hm.head(10)

Unnamed: 0,caption,hashtags,url,commentsCount,latestComments,displayUrl,videoUrl,likesCount,videoViewCount,videoPlayCount,timestamp,childPosts,ownerFullName,ownerUsername
0,Summer dreaming. The H&M Summer 2025 collectio...,[],https://www.instagram.com/p/DJl95SBKQhB/,33,"[{'id': '18063384263039109', 'text': '❤️❤️❤️❤️...",https://scontent-for2-2.cdninstagram.com/v/t51...,https://scontent-for2-1.cdninstagram.com/o1/v/...,2656,0,267902,2025-05-13 12:11:19,[],H&M,hm
1,The looks you give when someone says life isn’...,[],https://www.instagram.com/p/DJWMUFkqIwh/,71,"[{'id': '18096455674562438', 'text': '😮😮😮', 'o...",https://scontent-ord5-2.cdninstagram.com/v/t51...,https://scontent-ord5-3.cdninstagram.com/o1/v/...,3870,0,2316217,2025-05-07 09:09:18,[],H&M,hm
2,The H&M Studio Resort Capsule pays homage to t...,[],https://www.instagram.com/p/DJYqX4EKdGH/,73,"[{'id': '18514462213016502', 'text': 'Love thi...",https://scontent-hou1-1.cdninstagram.com/v/t51...,https://scontent-hou1-1.cdninstagram.com/o1/v/...,4047,0,2687868,2025-05-08 08:09:56,[],H&M,hm
3,Electric blue is everything. The H&M Studio Re...,[],https://www.instagram.com/p/DJRlBDsKqRx/,77,"[{'id': '18016492700713949', 'text': '😍', 'own...",https://scontent-iad3-1.cdninstagram.com/v/t51...,https://scontent-iad3-1.cdninstagram.com/o1/v/...,5433,84533,17820992,2025-05-05 14:09:27,[],H&M,hm
4,Waterside fit check.,[],https://www.instagram.com/p/DJEDGgfq4O5/,131,"[{'id': '18063063457887718', 'text': '😍😍😍', 'o...",https://z-p4-instagram.fjsr1-1.fna.fbcdn.net/v...,https://z-p4-instagram.fjsr1-2.fna.fbcdn.net/o...,9066,866016,8851011,2025-04-30 08:01:58,[],H&M,hm
5,Soft launching summer.,[],https://www.instagram.com/p/DI_AkfsKPKB/,56,"[{'id': '17921496396066159', 'text': '❤️', 'ow...",https://instagram.fhdd1-1.fna.fbcdn.net/v/t51....,https://instagram.fhdd1-1.fna.fbcdn.net/o1/v/t...,12298,548014,8183462,2025-04-28 09:03:33,[],H&M,hm
6,“Be your own muse.” @magdabutrym 🌹 \n\nMagda B...,[],https://www.instagram.com/p/DIvcusQKBxD/,102,"[{'id': '18038938685634165', 'text': '👌🏽👌🏽👌🏽❤️...",https://scontent-atl3-1.cdninstagram.com/v/t51...,https://scontent-atl3-3.cdninstagram.com/o1/v/...,5287,143306,1562263,2025-04-22 08:02:16,[],H&M,hm
7,Coming soon: Magda Butrym H&M 🌹 “Fashion serve...,[],https://www.instagram.com/p/DIin0DUKH1W/,111,"[{'id': '18268150939285473', 'text': 'Tutto so...",https://scontent-sof1-2.cdninstagram.com/v/t51...,https://scontent-sof1-2.cdninstagram.com/o1/v/...,7686,473832,3758602,2025-04-17 08:29:27,[],H&M,hm
8,"“The rose is an ongoing source of inspiration,...",[],https://www.instagram.com/p/DIyEWPqKHKo/,215,"[{'id': '17931596457029987', 'text': '🔥', 'own...",https://scontent-del2-2.cdninstagram.com/v/t51...,https://scontent-del2-2.cdninstagram.com/o1/v/...,15575,335550,2606363,2025-04-23 08:26:57,[],H&M,hm
9,The H&M Studio Resort Capsule is almost here. ...,[],https://www.instagram.com/p/DJTlNk7q_fS/,65,"[{'id': '18031589135377685', 'text': '❤️❤️', '...",https://scontent-dfw5-3.cdninstagram.com/v/t51...,https://scontent-dfw5-3.cdninstagram.com/o1/v/...,4029,0,4768782,2025-05-06 08:50:56,[],H&M,hm


In [4]:
hm.describe()

Unnamed: 0,commentsCount,likesCount,videoViewCount,videoPlayCount,timestamp
count,57.0,57.0,57.0,57.0,57
mean,154.473684,10920.824561,622756.4,6774806.0,2025-02-28 08:19:09.649123072
min,17.0,2431.0,0.0,267902.0,2024-10-31 16:04:02
25%,56.0,5433.0,175673.0,1571704.0,2025-01-09 10:32:23
50%,77.0,8213.0,393760.0,3981593.0,2025-03-31 12:11:50
75%,135.0,13028.0,719793.0,11221430.0,2025-04-10 17:30:08
max,1151.0,63305.0,3363985.0,21848330.0,2025-05-13 12:11:19
std,218.930771,9927.622732,704384.4,6093128.0,


In [9]:
hm_ft = hm[(hm['likesCount']>=8000) & (hm['commentsCount']>=75)]
hm_ft.shape

(20, 14)

In [None]:
records = hm_ft.to_dict

In [12]:
import json
# Convert Timestamp columns to string
hm_ft['timestamp'] = hm_ft['timestamp'].astype(str)
output_path = 'hm_top_reels.json'
with open(output_path, 'w', encoding='utf-8') as fout:
    for rec in hm_ft.to_dict(orient='records'):
        json.dump(rec, fout, ensure_ascii=False, indent=4)
        fout.write('\n\n')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hm_ft['timestamp'] = hm_ft['timestamp'].astype(str)


In [None]:
#!/usr/bin/env python3
import os
import re
import time
import json
import tempfile
import requests
import numpy as np
import pandas as pd
import cv2
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch

from google import genai
from google.genai.errors import ClientError
from ratelimit import limits, sleep_and_retry
import backoff
from dotenv import load_dotenv

load_dotenv()

# ─── CONFIG ────────────────────────────────────────────────────────────────────
API_KEY     = os.getenv('GEMINI_API_KEY')
MODEL_NAME  = "gemini-2.0-flash-001"
INPUT_FILE  = "hm_reels.json"
OUTPUT_FILE = "hm_filtered_reels(1).json"
DEVICE      = "cuda" if torch.cuda.is_available() else "cpu"
MAX_RETRIES = 5
RETRY_DELAY = 5
# ────────────────────────────────────────────────────────────────────────────────

# Instantiate Gemini client (text-only)  
client = genai.Client(api_key=API_KEY)

# CLIP setup for vision fallback  
_clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
_clip_model     = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")


# ─── 1. TEXT FILTER: simple regex ────────────────────────────────────────────────
def regex_filter(caption: str, hashtags: list) -> bool:
    text = " ".join([caption or ""] + (hashtags or []))
    return bool(re.search(r'\bpolo\b|\bt[\s-]?shirt\b', text.lower()))


# ─── 2. TEXT FILTER: Gemini zero-shot ──────────────────────────────────────────
_ONE_MINUTE = 60

@sleep_and_retry
@limits(calls=15, period=_ONE_MINUTE)
@backoff.on_exception(
    backoff.expo,
    ClientError,
    max_time=60,
    giveup=lambda e: not hasattr(e, 'status_code')
)
def gemini_filter(caption: str, hashtags: list, video_url: str) -> bool:
    """
    Zero-shot text call to Gemini. We include the video URL
    in the prompt as text, not as a binary part.
    """
    prompt = (
        "You are a classifier. Reply with exactly 'Yes' or 'No'.\n\n"
        "Question: Does this Instagram reel feature a Polo shirt or a Polo t-shirt?\n"
        f"Video URL: {video_url}\n"
        f"Caption: {caption}\n"
        f"Hashtags: {' '.join(hashtags)}\n"
        "Answer 'Yes' or 'No'."
    )

    attempts = 0
    while attempts < MAX_RETRIES:
        try:
            resp = client.models.generate_content(
                model=MODEL_NAME,
                contents=prompt,
                # You can still pass config if desired:
                # config=types.GenerateContentConfig(temperature=0.0)
            )
            text = resp.text.strip().lower()
            return text.startswith("yes")

        except ClientError as e:
            code = getattr(e, 'status_code', None)
            # Retry only on 429 (rate limit) or 503 (unavailable)
            if code in (429, 503):
                attempts += 1
                wait = RETRY_DELAY * attempts
                print(f"[Gemini] transient error {code}, retry {attempts}/{MAX_RETRIES} after {wait}s")
                time.sleep(wait)
                continue
            # For other errors, don't retry
            raise

    # If we exhaust retries, default to False (or choose to return True)
    print(f"[Gemini] failed after {MAX_RETRIES} attempts, skipping LLM filter")
    return False

# ─── 3. VISION FILTER: CLIP fallback ────────────────────────────────────────────
def extract_frames(video_url: str, num_frames: int = 3) -> list:
    resp = requests.get(video_url, timeout=30)
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.write(resp.content)
    tmp.flush()

    cap = cv2.VideoCapture(tmp.name)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []
    if total > 0:
        for i in np.linspace(0, total - 1, num_frames, dtype=int):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
            ok, f = cap.read()
            if not ok: continue
            rgb = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(rgb))
    cap.release()
    return frames

def vision_filter(frames: list) -> bool:
    if not frames:
        return False
    labels = ["a photo of a polo shirt", "a photo of a t-shirt", "other"]
    inputs = _clip_processor(text=labels, images=frames, return_tensors="pt", padding=True)
    out    = _clip_model(**inputs)
    probs  = out.logits_per_image.softmax(dim=-1).mean(dim=0)
    return int(probs.argmax()) != 2




import ast
import pandas as pd

def is_relevant(row: pd.Series) -> bool:
    caption  = row.get("caption", "") or ""
    hashtags = row.get("hashtags") if isinstance(row.get("hashtags"), list) else []
    video    = row.get("videoUrl", "") or ""

    # 1) quick regex
    if regex_filter(caption, hashtags):
        return True

    # 2) zero-shot Gemini text filter (includes video URL as text)
    if gemini_filter(caption, hashtags, video):
        return True

    # 3) CLIP-based vision fallback
    return vision_filter(extract_frames(video))


def filter_reels(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Fix and sanitize hashtags
    for i in range(len(df)):
        raw_hashtags = df.at[i, "hashtags"]
        if isinstance(raw_hashtags, list):
            continue
        try:
            parsed = json.loads(raw_hashtags)
            df.at[i, "hashtags"] = parsed if isinstance(parsed, list) else []
        except Exception:
            try:
                parsed = ast.literal_eval(raw_hashtags)
                df.at[i, "hashtags"] = parsed if isinstance(parsed, list) else []
            except Exception:
                df.at[i, "hashtags"] = []

    # Run is_relevant step-by-step
    is_relevant_flags = []
    for i in range(len(df)):
        try:
            row = df.iloc[i]
            relevant = is_relevant(row)
        except Exception:
            relevant = False
        is_relevant_flags.append(relevant)
    df["is_relevant"] = is_relevant_flags

    # Filter
    df = df[df["is_relevant"]].reset_index(drop=True)
    return df


if __name__ == "__main__":
    df = pd.read_json(INPUT_FILE, orient="records")

    # Apply filtering
    filtered = filter_reels(df)
    
    # Save and report
    filtered.to_json(OUTPUT_FILE, orient="records", indent=2)
    print(f"Kept {len(filtered)}/{len(df)} relevant reels.")


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Kept 95/100 relevant reels.


In [14]:
with open(OUTPUT_FILE, "r+", encoding="utf-8") as f:
    txt = f.read().replace(r"\/", "/")
    f.seek(0)
    f.write(txt)
    f.truncate()

In [9]:
#!/usr/bin/env python3
import os
import re
import time
import json
import tempfile
import requests
import numpy as np
import pandas as pd
import cv2
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch

from google import genai
from google.genai.errors import ClientError
from ratelimit import limits, sleep_and_retry
import backoff
from dotenv import load_dotenv

load_dotenv()

# ─── CONFIG ────────────────────────────────────────────────────────────────────
API_KEY     = os.getenv('GEMINI_API_KEY')
MODEL_NAME  = "gemini-2.0-flash-001"
INPUT_FILE  = "snitch.co.in_reels.json"
OUTPUT_FILE = "snitch_filtered_reels(3).json"
DEVICE      = "cuda" if torch.cuda.is_available() else "cpu"
MAX_RETRIES = 5
RETRY_DELAY = 5
# ────────────────────────────────────────────────────────────────────────────────

# Instantiate Gemini client (text-only)  
client = genai.Client(api_key=API_KEY)

# CLIP setup for vision fallback  
_clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
_clip_model     = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")


# ─── 1. TEXT FILTER: simple regex ────────────────────────────────────────────────
def regex_filter(caption: str, hashtags: list) -> bool:
    text = " ".join([caption or ""] + (hashtags or []))
    return bool(re.search(r'\bpolo\b|\bt[\s-]?shirt\b', text.lower()))


#─── 2. TEXT FILTER: Gemini zero-shot ──────────────────────────────────────────
_ONE_MINUTE = 60

@sleep_and_retry
@limits(calls=15, period=_ONE_MINUTE)
@backoff.on_exception(
    backoff.expo,
    ClientError,
    max_time=60,
    giveup=lambda e: not hasattr(e, 'status_code')
)
def gemini_filter(caption: str, hashtags: list, video_url: str) -> bool:
    """
    Zero-shot text call to Gemini. We include the video URL
    in the prompt as text, not as a binary part.
    """
    prompt = (
        "You are a classifier. Reply with exactly 'Yes' or 'No'.\n\n"
        "Question: Does this Instagram reel feature a Polo shirt or a Polo t-shirt?\n"
        f"Video URL: {video_url}\n"
        f"Caption: {caption}\n"
        f"Hashtags: {' '.join(hashtags)}\n"
        "Answer 'Yes' or 'No'."
    )

    attempts = 0
    while attempts < MAX_RETRIES:
        try:
            resp = client.models.generate_content(
                model=MODEL_NAME,
                contents=prompt,
                # You can still pass config if desired:
                # config=types.GenerateContentConfig(temperature=0.0)
            )
            text = resp.text.strip().lower()
            return text.startswith("yes")

        except ClientError as e:
            code = getattr(e, 'status_code', None)
            # Retry only on 429 (rate limit) or 503 (unavailable)
            if code in (429, 503):
                attempts += 1
                wait = RETRY_DELAY * attempts
                print(f"[Gemini] transient error {code}, retry {attempts}/{MAX_RETRIES} after {wait}s")
                time.sleep(wait)
                continue
            # For other errors, don't retry
            raise

    # If we exhaust retries, default to False (or choose to return True)
    print(f"[Gemini] failed after {MAX_RETRIES} attempts, skipping LLM filter")
    return False

# ─── 3. VISION FILTER: CLIP fallback ────────────────────────────────────────────
def extract_frames(video_url: str, num_frames: int = 3) -> list:
    resp = requests.get(video_url, timeout=30)
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.write(resp.content)
    tmp.flush()

    cap = cv2.VideoCapture(tmp.name)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []
    if total > 0:
        for i in np.linspace(0, total - 1, num_frames, dtype=int):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
            ok, f = cap.read()
            if not ok: continue
            rgb = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(rgb))
    cap.release()
    return frames

def vision_filter(frames: list) -> bool:
    if not frames:
        return False
    labels = ["a photo of a polo shirt", "a photo of a t-shirt", "other"]
    inputs = _clip_processor(text=labels, images=frames, return_tensors="pt", padding=True)
    out    = _clip_model(**inputs)
    probs  = out.logits_per_image.softmax(dim=-1).mean(dim=0)
    return int(probs.argmax()) != 2




import ast
import pandas as pd

def is_relevant(row: pd.Series) -> bool:
    caption  = row.get("caption", "") or ""
    hashtags = row.get("hashtags") if isinstance(row.get("hashtags"), list) else []
    video    = row.get("videoUrl", "") or ""

    # # 1) quick regex
    if regex_filter(caption, hashtags):
        return True

    # 2) zero-shot Gemini text filter (includes video URL as text)
    # if gemini_filter(caption, hashtags, video):
    #     return True

    # 3) CLIP-based vision fallback
    return vision_filter(extract_frames(video))


def filter_reels(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Fix and sanitize hashtags
    for i in range(len(df)):
        raw_hashtags = df.at[i, "hashtags"]
        if isinstance(raw_hashtags, list):
            continue
        try:
            parsed = json.loads(raw_hashtags)
            df.at[i, "hashtags"] = parsed if isinstance(parsed, list) else []
        except Exception:
            try:
                parsed = ast.literal_eval(raw_hashtags)
                df.at[i, "hashtags"] = parsed if isinstance(parsed, list) else []
            except Exception:
                df.at[i, "hashtags"] = []

    # Run is_relevant step-by-step
    is_relevant_flags = []
    for i in range(len(df)):
        try:
            row = df.iloc[i]
            relevant = is_relevant(row)
        except Exception:
            relevant = False
        is_relevant_flags.append(relevant)
    df["is_relevant"] = is_relevant_flags

    # Filter
    df = df[df["is_relevant"]].reset_index(drop=True)
    return df


if __name__ == "__main__":
    df = pd.read_json(INPUT_FILE, orient="records")

    # Apply filtering
    filtered = filter_reels(df)
    
    records = filtered.to_dict(orient="records")
    for record in records:
        if 'timestamp' in record:
            record['timestamp'] = str(record['timestamp'])

    # 2) Serialize with ensure_ascii=False (so emojis stay emojis)
    #    and indent=2 for pretty‑printing.
    s = json.dumps(records, ensure_ascii=False, indent=2)

    # 3) (Optional) If you still want to un‑escape forward slashes:
    s = s.replace(r"\/", "/")

    # 4) Write it out in one go (this truncates and overwrites)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.write(s)
    
    print(f"Kept {len(filtered)}/{len(df)} relevant reels.")


Kept 66/100 relevant reels.
