In [33]:
import cv2
import numpy as np
import os
import glob
from openai import OpenAI
import csv
from PIL import Image
import base64
from io import BytesIO
from dotenv import load_dotenv
import pandas as pd

# ---------------------------- Configuration ---------------------------- #

# Load environment variables before the client is created; the OpenAI client
# below is constructed with no explicit key, so OPENAI_API_KEY must already be
# present in the environment (e.g. supplied via a local .env file).
load_dotenv()

# Shared OpenAI client used by identify_objects() and get_final_products().
# No arguments are passed, so credentials come from the environment.
client = OpenAI()

# CSV file holding one row per processed video ("video", "products").
# It is read at start-up to find already-processed videos and rewritten
# after each newly processed video.
OUTPUT_CSV = "detected_objects.csv"



# ---------------------------- Setup: Load Previous Results ---------------------------- #

# Resume support: start from an empty table on a first run, otherwise reuse
# the rows written by earlier runs and remember which videos they cover.
if not os.path.exists(OUTPUT_CSV):
    existing_df = pd.DataFrame(columns=["video", "products"])
    processed_videos = set()
else:
    existing_df = pd.read_csv(OUTPUT_CSV)
    processed_videos = set(existing_df['video'].tolist())



# ---------------------------- Scene Change Detection ---------------------------- #

def is_scene_change(prev_frame, curr_frame, threshold):
    """
    Decide whether two grayscale frames belong to different scenes.

    A 256-bin intensity histogram is built for each frame, normalized in
    place, and the two histograms are compared with the correlation metric.

    Args:
        prev_frame: Grayscale image of the earlier frame.
        curr_frame: Grayscale image of the later frame.
        threshold: Correlation value below which the frames are treated as
            belonging to different scenes.

    Returns:
        True when the histogram correlation is strictly below `threshold`.
    """
    histograms = []
    for gray_image in (prev_frame, curr_frame):
        # Single channel (index 0), no mask, 256 bins over intensities [0, 256)
        hist = cv2.calcHist([gray_image], [0], None, [256], [0, 256])
        # Normalize in place so both histograms are on a comparable scale
        cv2.normalize(hist, hist)
        histograms.append(hist)

    # Correlation close to 1 means the two intensity distributions are similar
    correlation = cv2.compareHist(histograms[0], histograms[1], cv2.HISTCMP_CORREL)

    return correlation < threshold



# --------------------- Function to convert image to base64 --------------------- #

def image_to_base64(image):
    """
    Encode an OpenCV image as JPEG and return it as base64 text.

    Args:
        image (np.ndarray): OpenCV image (BGR format).

    Returns:
        str: Base64 text of the JPEG bytes, or None when anything in the
        conversion fails (the error is printed, not raised).
    """
    try:
        # imencode returns (success flag, encoded buffer)
        ok, encoded = cv2.imencode('.jpg', image)
        if ok:
            # buffer -> raw bytes -> base64 bytes -> str
            return base64.b64encode(encoded.tobytes()).decode('utf-8')
        raise ValueError("Image encoding to JPEG failed.")
    except Exception as err:
        print(f"⚠️ Error in image_to_base64: {err}")
        return None



# ---------------------------- Vision Model (OpenAI) ---------------------------- #

def identify_objects(frame):
    """
    Sends an image frame to the OpenAI vision model to detect branded products.

    Args:
        frame (np.ndarray): BGR image frame from OpenCV.

    Returns:
        str | None: The model's reply with surrounding whitespace removed
        (the prompt asks for the literal text "None" when nothing is
        visible), or None if the frame could not be encoded, the API call
        failed, or the reply had no text content.
    """

    # === Prompt to use ===
    # Each fragment ends with a space: adjacent string literals are joined
    # with no separator, so without it sentences run together
    # ("...in this image.Don't give...").
    prompt = (
        "Identify and list all branded or commercial products visible in this image. "
        "Don't give a complete description of what you have identified. Just return the product with brand name. "
        "for example - 'Pepsi can', 'Doritos chips', 'Reebok tracksuit', 'Fruity loops cereal', etc. "
        "Use your best judgment, but avoid listing products that are not visibly present. "
        "If a product is partially obscured but clearly identifiable by packaging, include it. "
        "Do not guess products that are not visually indicated. "
        "If there is no product visible in the image, simply return None"
    )

    image_b64 = image_to_base64(frame)  # Convert to base64 for API
    if image_b64 is None:
        # Encoding failed; do not spend an API call on an invalid data URL
        # ("data:image/jpeg;base64,None").
        print("⚠️ Skipping frame in identify_objects: image could not be encoded.")
        return None

    try:
        response = client.chat.completions.create(
            model="gpt-4o",  # Vision-capable model
            messages=[
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {
                        "url": f"data:image/jpeg;base64,{image_b64}"
                    }}
                ]}
            ],
            temperature=0.2
        )

        content = response.choices[0].message.content
        # content may be missing; treat that like a failed detection
        return content.strip() if content else None

    except Exception as e:
        # Best-effort: one failed frame must not abort the whole video,
        # but the failure should be visible rather than silently dropped.
        print(f"⚠️ Error in identify_objects: {e}")
        return None



# ---------------------------- Output Cleaning ---------------------------- #

def get_final_products(all_products):
    """
    Deduplicates raw detections and returns a clean string of unique products.

    Entries that carry no information (None, empty/whitespace-only strings,
    "None", "N/A") are dropped locally. If nothing remains, an empty string is
    returned without calling the model. Otherwise the remaining entries are
    sent to the model, which is asked to merge, normalize, and sort them.

    Args:
        all_products (list of str | str): Raw model responses, one per
            analysed frame; entries may be None. A single string is treated
            as a one-element list.

    Returns:
        str | None: Comma-separated product names as returned by the model,
        "" when there were no usable detections, or None if the API call
        failed.
    """

    # A bare string would otherwise be iterated character by character
    if isinstance(all_products, str):
        all_products = [all_products]

    # Drop placeholders up front so they are not sent to the model
    placeholders = {"", "none", "n/a"}
    cleaned = [
        str(item).strip()
        for item in (all_products or [])
        if item is not None and str(item).strip().lower() not in placeholders
    ]

    if not cleaned:
        # Nothing was detected: skip the API call instead of asking the
        # model to summarise an empty list.
        return ""

    prompt2 = f'''

    You are given a list or block of text that may contain repeated, messy, or slightly varied product names.

    Your task is to extract a clean, alphabetically sorted list of unique product names from it.

    Instructions:

    Ignore values like "None", "N/A", or empty entries.

    Split multiple products if they appear in one string separated by commas or dashes.

    Normalize duplicates with slight variations. For example, "Maybelline Master Chrome" and "Maybelline Master Chrome Highlighter" should be merged as "Maybelline Master Chrome Highlighter".

    Do not include vague labels like just "Maybelline Fit Me" — prefer specific ones like "Maybelline Fit Me Blush" or "Maybelline Fit Me Dewy + Smooth Primer".

    Do not make up products. Only include what's visibly or clearly listed.

    Return the final result as a plain list of comma separated strings (not as a JSON object or explanation).

    List: {cleaned}
    '''

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "user", "content": prompt2}
            ],
            temperature=0.2
        )

        answer = response.choices[0].message.content.strip()
        return answer

    except Exception as e:
        # Report the failure instead of hiding it; None tells the caller
        # that the clean-up step did not produce a result.
        print(f"⚠️ Error in get_final_products: {e}")
        return None



# ---------------------------- Main Pipeline ---------------------------- #

def _collect_scene_products(video_path, threshold=0.8):
    """
    Scan one video and run product detection on the first frame of each scene.

    The first readable frame starts the first scene; afterwards a new scene
    is assumed whenever is_scene_change() reports that the grayscale
    histogram correlation between consecutive frames is below `threshold`.

    Args:
        video_path (str): Path of the video file to read.
        threshold (float): Similarity threshold passed to is_scene_change().

    Returns:
        list: One identify_objects() result per detected scene (entries may
        be None when detection failed for that frame).

    Raises:
        OSError: If the video file cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        # Without this check an unreadable file would look like an empty
        # video and be recorded as processed with no products.
        raise OSError(f"Could not open video file: {video_path}")

    detections = []
    prev_gray = None  # Previous frame (grayscale) for comparison
    frame_id = 0      # 0-based index of the frame currently being examined
    scene_id = 0      # Number of scenes detected so far

    try:
        while True:
            ret, frame = cap.read()  # Read next frame
            if not ret:
                break  # Video ended or a frame failed to read

            # Histogram comparison works on grayscale frames
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # The very first frame opens the first scene, so it is analysed
            # too; otherwise a video without cuts would yield no products.
            new_scene = prev_gray is None or is_scene_change(
                prev_gray, gray, threshold=threshold
            )
            if new_scene:
                print(f"Scene changed at frame {frame_id}")
                detections.append(identify_objects(frame))

                # # Uncomment (and create the 'scenes' directory) to save scene images
                # cv2.imwrite(f"scenes/scene_{scene_id:03d}.jpg", frame)

                scene_id += 1

            prev_gray = gray
            frame_id += 1
    finally:
        # Always free the capture handle, even if a frame raised
        cap.release()

    return detections


vid_dir = os.path.join(os.getcwd(), 'videos')

for vid in sorted(glob.glob(os.path.join(vid_dir, '*.mp4'))):

    if vid in processed_videos:
        print(f"✅ Skipping already processed video: {vid}")
        continue

    try:
        all_products = _collect_scene_products(vid)

        # Get final list of products
        final_output = get_final_products(all_products)
        if final_output is None:
            # The clean-up call failed. Do not record the video, so that it
            # is retried on the next run instead of being skipped forever.
            print(f"❌ Failed to process {vid}: could not build final product list")
            continue

        # Append the result and persist immediately so progress survives a crash
        new_entry = pd.DataFrame([[vid, final_output]], columns=["video", "products"])
        existing_df = pd.concat([existing_df, new_entry], ignore_index=True)
        existing_df.to_csv(OUTPUT_CSV, index=False)
        processed_videos.add(vid)

    except Exception as e:
        # Top-level boundary: report and move on to the next video
        print(f"❌ Failed to process {vid}: {e}")

✅ Skipping already processed video: /Users/shrutiagarwal/Desktop/objects_extractor/videos/maybelline.mp4
