In [None]:
# gap_analyzer.py
import os
from supabase import create_client, Client
from typing import List, Dict
import pandas as pd
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import json


In [5]:
# Initialize Supabase

# SUPABASE_URL = os.getenv("SUPABASE_URL", )
# SUPABASE_KEY = os.getenv("SUPABASE_KEY", )
print(SUPABASE_URL)
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


https://shpsvnybkezljaoippva.supabase.co


In [6]:


def fetch_all_videos_with_summaries() -> pd.DataFrame:
    """
    Fetch all videos and their summaries from Supabase.
    Returns a DataFrame with video metadata and aggregated summaries.
    """
    print("Fetching videos from Supabase...")

    # Fetch videos
    videos_response = supabase.table('videos').select('*').execute()
    videos = videos_response.data

    # Fetch all summaries
    summaries_response = supabase.table(
        'video_summaries').select('*').execute()
    summaries = summaries_response.data

    # Group summaries by video_id
    summaries_by_video = {}
    for summary in summaries:
        video_id = summary['video_id']
        if video_id not in summaries_by_video:
            summaries_by_video[video_id] = []
        summaries_by_video[video_id].append(summary)

    # Combine data
    video_data = []
    for video in videos:
        video_id = video['id']
        video_summaries = summaries_by_video.get(video_id, [])

        video_data.append({
            'video_id': video_id,
            'title': video.get('title', ''),
            'duration': video['duration'],
            'status': video['status'],
            'key_topics': video.get('key_topics', ''),
            'frame_interval': video['frame_interval'],
            'total_frames': video['total_frames'],
            'created_at': video['created_at'],
            'summaries': video_summaries
        })

    df = pd.DataFrame(video_data)
    print(f"Fetched {len(df)} videos with summaries")
    return df


def parse_duration(duration_str: str) -> float:
    """
    Parse duration string (e.g., '5:30', '1:15:45') to seconds.
    """
    parts = duration_str.split(':')
    if len(parts) == 2:  # MM:SS
        return int(parts[0]) * 60 + int(parts[1])
    elif len(parts) == 3:  # HH:MM:SS
        return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
    else:
        return 0.0


def extract_all_descriptions(summaries: List[Dict]) -> str:
    """
    Concatenate all frame descriptions into a single text.
    """
    if not summaries:
        return ""
    return " ".join([s['description'] for s in summaries])


def calculate_scene_changes(summaries: List[Dict]) -> int:
    """
    Estimate scene changes by comparing consecutive descriptions.
    Simple heuristic: count when description similarity drops.
    """
    if len(summaries) < 2:
        return 0

    changes = 0
    for i in range(1, len(summaries)):
        prev_desc = set(summaries[i-1]['description'].lower().split())
        curr_desc = set(summaries[i]['description'].lower().split())

        # If less than 30% overlap, consider it a scene change
        if len(prev_desc) > 0:
            overlap = len(prev_desc & curr_desc) / len(prev_desc)
            if overlap < 0.3:
                changes += 1

    return changes


def extract_topics_from_text(text: str, top_n: int = 10) -> List[str]:
    """
    Extract key topics/keywords from text using simple frequency.
    """
    if not text:
        return []

    # Remove common stop words
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                  'is', 'was', 'are', 'were', 'has', 'have', 'had', 'this', 'that',
                  'with', 'from', 'by', 'of', 'as', 'it', 'be', 'can', 'will'}

    words = text.lower().split()
    filtered_words = [w for w in words if w not in stop_words and len(w) > 3]

    word_counts = Counter(filtered_words)
    return [word for word, count in word_counts.most_common(top_n)]


# gap_analyzer.py (continued)

def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Engineer features from raw video data for WoodWide clustering.
    """
    print("Engineering features...")

    features = []

    for idx, row in df.iterrows():
        summaries = row['summaries']
        duration_seconds = parse_duration(row['duration'])

        # Concatenate all descriptions
        all_descriptions = extract_all_descriptions(summaries)

        # Extract topics
        topics_from_desc = extract_topics_from_text(all_descriptions, top_n=5)
        topics_from_key = row['key_topics'].split(
            ',') if row['key_topics'] else []
        all_topics = list(set(topics_from_desc + topics_from_key))

        # Feature calculations
        feature_dict = {
            'video_id': row['video_id'],
            'title': row['title'],

            # Duration features
            'duration_seconds': duration_seconds,
            'duration_minutes': duration_seconds / 60,

            # Frame density
            'total_frames': row['total_frames'],
            'frames_per_minute': row['total_frames'] / (duration_seconds / 60) if duration_seconds > 0 else 0,

            # Content complexity
            'unique_topics_count': len(all_topics),
            'topic_density': len(all_topics) / (duration_seconds / 60) if duration_seconds > 0 else 0,

            # Description analysis
            'avg_description_length': np.mean([len(s['description']) for s in summaries]) if summaries else 0,
            'total_description_length': sum([len(s['description']) for s in summaries]),
            'description_variance': np.var([len(s['description']) for s in summaries]) if len(summaries) > 1 else 0,

            # Scene dynamics
            'scene_changes': calculate_scene_changes(summaries),
            'scene_change_rate': calculate_scene_changes(summaries) / (duration_seconds / 60) if duration_seconds > 0 else 0,

            # Pacing
            'frame_interval': row['frame_interval'],

            # Store topics as JSON string for later analysis
            'topics_json': json.dumps(all_topics),
            'key_topics': row['key_topics'],
        }

        features.append(feature_dict)

    features_df = pd.DataFrame(features)
    print(
        f"Engineered {len(features_df.columns)} features for {len(features_df)} videos")
    return features_df


def create_topic_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create additional features based on topic patterns.
    This helps identify content gaps.
    """
    print("Creating topic-based features...")

    # Extract all unique topics across all videos
    all_topics = []
    for topics_json in df['topics_json']:
        all_topics.extend(json.loads(topics_json))

    # Get top 20 most common topics
    topic_counts = Counter(all_topics)
    top_topics = [topic for topic, count in topic_counts.most_common(20)]

    print(f"Top topics in your content: {top_topics[:10]}")

    # Create binary features for each top topic
    for topic in top_topics:
        df[f'has_{topic}'] = df['topics_json'].apply(
            lambda x: 1 if topic in json.loads(x) else 0
        )

    # Topic diversity score (entropy-like measure)
    df['topic_diversity'] = df['topics_json'].apply(
        lambda x: len(set(json.loads(x)))
    )

    return df


def prepare_clustering_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare final dataset for WoodWide clustering.
    Select only numeric features suitable for clustering.
    """
    print("Preparing clustering dataset...")

    # Select numeric features for clustering
    clustering_features = [
        'duration_minutes',
        'frames_per_minute',
        'unique_topics_count',
        'topic_density',
        'avg_description_length',
        'description_variance',
        'scene_change_rate',
        'topic_diversity',
    ]

    # Add topic binary features (has_*)
    topic_cols = [col for col in df.columns if col.startswith('has_')]
    clustering_features.extend(topic_cols)

    # Create final dataset
    cluster_df = df[['video_id', 'title'] + clustering_features].copy()

    # Handle any NaN values
    cluster_df = cluster_df.fillna(0)

    print(f"Clustering dataset shape: {cluster_df.shape}")
    print(f"Features: {clustering_features[:5]}...")

    return cluster_df


In [7]:
import time
from woodwide import WoodWide

In [20]:
# gap_analyzer.py (continued)




def save_to_csv(df: pd.DataFrame, filename: str) -> str:
    """Save DataFrame to CSV file."""
    filepath = f"/tmp/{filename}"
    df.to_csv(filepath, index=False)
    print(f"Saved to {filepath}")
    return filepath


def train_content_gap_model(
    features_df: pd.DataFrame,
    api_key: str,
    base_url: str = "https://beta.woodwide.ai/"
) -> str:
    """
    Train a WoodWide clustering model to identify content patterns.
    
    Returns:
        model_id: The trained model ID
    """
    print("\n" + "="*50)
    print("TRAINING WOODWIDE CLUSTERING MODEL")
    print("="*50 + "\n")
    
    # Initialize WoodWide client
    client = WoodWide(api_key=api_key, base_url=base_url)
    
    # Prepare clustering dataset (only numeric features)
    cluster_df = prepare_clustering_dataset(features_df)
    
    # Save to CSV (remove video_id and title for training)
    training_df = cluster_df.drop(['video_id', 'title'], axis=1)
    csv_path = save_to_csv(training_df, 'content_clustering.csv')
    
    try:
        # Step 1: Upload dataset
        print("Step 1: Uploading dataset to WoodWide...")
        dataset_name = f"content_patterns_{int(time.time())}"
        
        with open(csv_path, 'rb') as f:
            dataset = client.api.datasets.upload(
                file=f,
                name=dataset_name,
                overwrite=True
            )
        
        dataset_id = dataset.id
        print(f"âœ“ Dataset uploaded. ID: {dataset_id}\n")
        
        # Step 2: Train clustering model
        print("Step 2: Training clustering model...")
        model_name = f"gap_analyzer_{int(time.time())}"
        
        # Use raw HTTP request for clustering endpoint
        response = client._client.post(
            "/api/models/clustering/train",
            params={"dataset_name": dataset_name},
            data={
                "model_name": model_name,
                "overwrite": "true",
            },
            headers=client.auth_headers,
        )
        
        if response.status_code != 200:
            print(f"Error starting training: {response.status_code}")
            print(response.text)
            raise Exception("Training failed to start")
        
        response_json = response.json()
        model_id = response_json.get("id")
        
        if not model_id:
            raise Exception("No model ID returned")
        
        print(f"âœ“ Training started. Model ID: {model_id}\n")
        
        # Step 3: Wait for training to complete
        print("Step 3: Waiting for training to complete...")
        start_time = time.time()
        timeout = 600  # 10 minutes
        
        while True:
            model = client.api.models.retrieve(model_id)
            training_status = model.training_status
            
            if training_status == "COMPLETE":
                elapsed = time.time() - start_time
                print(f"âœ“ Training complete! (took {elapsed:.2f}s)\n")
                break
            elif training_status == "FAILED":
                print("âœ— Training failed!")
                print(model)
                raise Exception("Training failed")
            
            elapsed = time.time() - start_time
            if elapsed >= timeout:
                raise Exception(f"Training timeout after {timeout}s")
            
            print(f"  Status: {training_status}... ({elapsed:.0f}s elapsed)")
            time.sleep(5)
        
        print("\n" + "="*50)
        print("MODEL TRAINING SUCCESSFUL")
        print("="*50 + "\n")
        
        return model_id
        
    finally:
        # Cleanup
        if os.path.exists(csv_path):
            os.remove(csv_path)

def run_clustering_inference(
    model_id: str,
    features_df: pd.DataFrame,
    api_key: str,
    base_url: str = "https://beta.woodwide.ai/"
) -> pd.DataFrame:
    """
    Run inference on the clustering model to assign clusters.
    
    Returns:
        DataFrame with cluster assignments and cluster descriptions
    """
    print("\n" + "="*50)
    print("RUNNING CLUSTERING INFERENCE")
    print("="*50 + "\n")
    
    client = WoodWide(api_key=api_key, base_url=base_url)
    
    # Prepare same features as training
    cluster_df = prepare_clustering_dataset(features_df)
    
    # Save inference dataset
    inference_df = cluster_df.drop(['video_id', 'title'], axis=1)
    csv_path = save_to_csv(inference_df, 'inference_data.csv')
    
    try:
        # Upload inference dataset
        print("Uploading inference dataset...")
        inference_dataset_name = f"inference_{int(time.time())}"
        
        with open(csv_path, 'rb') as f:
            inference_dataset = client.api.datasets.upload(
                file=f,
                name=inference_dataset_name,
                overwrite=True
            )
        
        inference_dataset_id = inference_dataset.id
        print(f"âœ“ Dataset uploaded. ID: {inference_dataset_id}\n")
        
        # Run inference
        print("Running clustering inference...")
        result = client.api.models.clustering.infer(
            model_id=model_id,
            dataset_id=inference_dataset_id
        )
        
        print("âœ“ Inference complete!\n")
        
        # Parse WoodWide clustering response
        # Format: {"cluster_label": {"0": 2, "1": 3, ...}, "cluster_descriptions": {...}}
        if hasattr(result, 'model_dump'):
            result_dict = result.model_dump()
        elif isinstance(result, dict):
            result_dict = result
        else:
            raise ValueError(f"Unexpected result type: {type(result)}")
        
        # Extract cluster labels (mapping: row_index -> cluster_id)
        cluster_label_dict = result_dict.get('cluster_label', {})
        cluster_descriptions = result_dict.get('cluster_descriptions', {})
        
        if not cluster_label_dict:
            raise ValueError("No cluster_label found in response")
        
        # Convert cluster labels to list (sorted by index)
        clusters = []
        for i in range(len(cluster_df)):
            cluster_id = cluster_label_dict.get(str(i))
            if cluster_id is None:
                raise ValueError(f"Missing cluster assignment for row {i}")
            clusters.append(cluster_id)
        
        print(f"Successfully extracted {len(clusters)} cluster assignments")
        print(f"Found {len(cluster_descriptions)} unique clusters\n")
        
        # Create final dataframe with cluster assignments
        final_df = features_df[['video_id', 'title']].copy()
        final_df['cluster'] = clusters
        
        # Add cluster descriptions
        final_df['cluster_description'] = final_df['cluster'].apply(
            lambda x: cluster_descriptions.get(str(x), 'No description')
        )
        
        # Add key metrics for analysis
        final_df['duration_minutes'] = features_df['duration_minutes']
        final_df['unique_topics'] = features_df['unique_topics_count']
        final_df['scene_change_rate'] = features_df['scene_change_rate']
        final_df['topics'] = features_df['topics_json']
        
        print(f"Cluster distribution:")
        print(final_df['cluster'].value_counts().sort_index())
        print()
        
        # Save cluster descriptions separately
        cluster_info = {
            'cluster_descriptions': cluster_descriptions,
            'cluster_counts': final_df['cluster'].value_counts().to_dict()
        }
        
        with open('cluster_info.json', 'w') as f:
            json.dump(cluster_info, f, indent=2)
        print("Cluster descriptions saved to cluster_info.json\n")
        
        return final_df
        
    finally:
        if os.path.exists(csv_path):
            os.remove(csv_path)
def run_clustering_inference_debug(
    model_id: str,
    features_df: pd.DataFrame,
    api_key: str,
    base_url: str = "https://beta.woodwide.ai/"
):
    """
    Debug version to see what WoodWide returns.
    """
    import requests
    
    client = WoodWide(api_key=api_key, base_url=base_url)
    
    # Prepare dataset
    cluster_df = prepare_clustering_dataset(features_df)
    inference_df = cluster_df.drop(['video_id', 'title'], axis=1)
    csv_path = save_to_csv(inference_df, 'inference_data.csv')
    
    try:
        # Upload dataset
        print("Uploading dataset...")
        with open(csv_path, 'rb') as f:
            inference_dataset = client.api.datasets.upload(
                file=f,
                name=f"debug_inference_{int(time.time())}",
                overwrite=True
            )
        
        inference_dataset_id = inference_dataset.id
        print(f"Dataset ID: {inference_dataset_id}")
        
        # Make raw API call to see response
        print("\nMaking raw API call...")
        url = f"{base_url}/api/models/clustering/{model_id}/infer?dataset_id={inference_dataset_id}"
        
        headers = {
            "accept": "application/json",
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/x-www-form-urlencoded"
        }
        
        response = requests.post(url, headers=headers)
        
        print(f"Status Code: {response.status_code}")
        print(f"Response Headers: {dict(response.headers)}")
        print(f"\nRaw Response Text:")
        print(response.text)
        print()
        
        if response.status_code == 200:
            try:
                json_response = response.json()
                print(f"Parsed JSON:")
                print(json.dumps(json_response, indent=2))
            except:
                print("Could not parse as JSON")
        
        return response
        
    finally:
        if os.path.exists(csv_path):
            os.remove(csv_path)

# Run this instead
# debug_response = run_clustering_inference_debug(
#     model_id,
#     features_df,
#     api_key=args["api_key"]
#)

In [23]:
# gap_analyzer.py (continued)

def analyze_clusters(clustered_df: pd.DataFrame) -> Dict:
    """
    Analyze clusters to identify content patterns and gaps.
    """
    print("\n" + "="*50)
    print("CLUSTER ANALYSIS")
    print("="*50 + "\n")
    
    analysis = {
        'total_videos': len(clustered_df),
        'num_clusters': clustered_df['cluster'].nunique(),
        'clusters': {}
    }
    
    for cluster_id in sorted(clustered_df['cluster'].unique()):
        cluster_videos = clustered_df[clustered_df['cluster'] == cluster_id]
        
        # Aggregate topics in this cluster
        all_topics = []
        for topics_json in cluster_videos['topics']:
            all_topics.extend(json.loads(topics_json))
        topic_counts = Counter(all_topics)
        top_topics = [t for t, c in topic_counts.most_common(5)]
        
        cluster_info = {
            'video_count': len(cluster_videos),
            'percentage': len(cluster_videos) / len(clustered_df) * 100,
            'avg_duration': cluster_videos['duration_minutes'].mean(),
            'avg_topics': cluster_videos['unique_topics'].mean(),
            'avg_scene_change_rate': cluster_videos['scene_change_rate'].mean(),
            'top_topics': top_topics,
            'video_titles': cluster_videos['title'].tolist()
        }
        
        analysis['clusters'][f'cluster_{cluster_id}'] = cluster_info
        
        # Print cluster summary
        print(f"Cluster {cluster_id}: ({cluster_info['video_count']} videos, {cluster_info['percentage']:.1f}%)")
        print(f"  Avg Duration: {cluster_info['avg_duration']:.1f} min")
        print(f"  Top Topics: {', '.join(top_topics[:3])}")
        print(f"  Sample Videos:")
        for title in cluster_info['video_titles'][:3]:
            print(f"    - {title}")
        print()
    
    return analysis


def identify_content_gaps(analysis: Dict) -> Dict:
    """
    Identify content gaps based on cluster distribution.
    """
    print("\n" + "="*50)
    print("CONTENT GAP IDENTIFICATION")
    print("="*50 + "\n")
    
    clusters = analysis['clusters']
    
    # Find underrepresented clusters
    avg_percentage = 100 / len(clusters)
    
    gaps = {
        'underrepresented_patterns': [],
        'overrepresented_patterns': [],
        'recommendations': []
    }
    
    for cluster_name, cluster_data in clusters.items():
        percentage = cluster_data['percentage']
        
        if percentage < avg_percentage * 0.5:  # Less than 50% of average
            gaps['underrepresented_patterns'].append({
                'cluster': cluster_name,
                'current_count': cluster_data['video_count'],
                'percentage': percentage,
                'pattern': {
                    'duration': cluster_data['avg_duration'],
                    'topics': cluster_data['top_topics']
                }
            })
        elif percentage > avg_percentage * 1.5:  # More than 150% of average
            gaps['overrepresented_patterns'].append({
                'cluster': cluster_name,
                'current_count': cluster_data['video_count'],
                'percentage': percentage,
                'pattern': {
                    'duration': cluster_data['avg_duration'],
                    'topics': cluster_data['top_topics']
                }
            })
    
    # Generate recommendations
    for gap in gaps['underrepresented_patterns']:
        recommendation = f"Create more {gap['pattern']['duration']:.0f}-minute videos about: {', '.join(gap['pattern']['topics'][:3])}"
        gaps['recommendations'].append(recommendation)
    
    # Print gaps
    print("ðŸ”´ UNDERREPRESENTED CONTENT PATTERNS:")
    for gap in gaps['underrepresented_patterns']:
        print(f"\n  {gap['cluster']}: Only {gap['current_count']} videos ({gap['percentage']:.1f}%)")
        print(f"    Typical Duration: {gap['pattern']['duration']:.1f} minutes")
        print(f"    Topics: {', '.join(gap['pattern']['topics'])}")
    
    print("\n\nðŸŸ¢ OVERREPRESENTED CONTENT PATTERNS:")
    for gap in gaps['overrepresented_patterns']:
        print(f"\n  {gap['cluster']}: {gap['current_count']} videos ({gap['percentage']:.1f}%)")
        print(f"    Typical Duration: {gap['pattern']['duration']:.1f} minutes")
        print(f"    Topics: {', '.join(gap['pattern']['topics'])}")
    
    print("\n\nðŸ’¡ RECOMMENDATIONS:")
    for i, rec in enumerate(gaps['recommendations'], 1):
        print(f"  {i}. {rec}")
    
    print()
    
    return gaps

In [27]:
args = {
    "api_key": "sk_QXg4S5ZpXNkonM9i-QB2Td5pEWCcmnZuLGYZT5YYtEE",
    "train": True,
    "model_id": "Run Test 1",
    "output": "gap_analysis.json"
}

In [10]:
df = fetch_all_videos_with_summaries()

if len(df) == 0:
    print("No videos found in database!")



Fetching videos from Supabase...
Fetched 14 videos with summaries


In [11]:
# Step 2: Engineer features
features_df = engineer_features(df)
features_df = create_topic_features(features_df)

Engineering features...
Engineered 16 features for 14 videos
Creating topic-based features...
Top topics in your content: ['image', 'white', 'scene', 'black', 'boat', 'The image depicts a scene from a commercial for Pringles potato chips. The setting is a kitchen', ' The image shows...', ' including a stove', ' a refrigerator', 'potato']


In [12]:
features_df.to_csv('video_features.csv', index=False)
print(f"Features saved to video_features.csv\n")

Features saved to video_features.csv



In [15]:

# Step 3: Train or use existing model
if True:
    print("Training model...")
    model_id = train_content_gap_model(
        features_df,
        api_key=args["api_key"]
    )
    print(f"\nâœ“ Model trained successfully!")
    print(f"Model ID: {model_id}")
    print(f"Save this ID for future inference!\n")
else:
    if not args.model_id:
        print("Error: Either --train or --model-id must be provided")
    model_id = args.model_id


Training model...

TRAINING WOODWIDE CLUSTERING MODEL

Preparing clustering dataset...
Clustering dataset shape: (14, 30)
Features: ['duration_minutes', 'frames_per_minute', 'unique_topics_count', 'topic_density', 'avg_description_length']...
Saved to /tmp/content_clustering.csv
Step 1: Uploading dataset to WoodWide...
âœ“ Dataset uploaded. ID: M7HBmqtvXfI5G0ANnHNG

Step 2: Training clustering model...
âœ“ Training started. Model ID: 99qKv5bY7hcobPraftoj

Step 3: Waiting for training to complete...
  Status: PENDING... (0s elapsed)
  Status: PENDING... (5s elapsed)
  Status: PENDING... (11s elapsed)
  Status: PENDING... (16s elapsed)
  Status: PENDING... (21s elapsed)
  Status: PENDING... (26s elapsed)
  Status: PENDING... (32s elapsed)
  Status: RUNNING... (37s elapsed)
  Status: RUNNING... (42s elapsed)
  Status: RUNNING... (47s elapsed)
  Status: RUNNING... (52s elapsed)
âœ“ Training complete! (took 57.68s)


MODEL TRAINING SUCCESSFUL


âœ“ Model trained successfully!
Model ID: 99qK

In [21]:
clustered_df = run_clustering_inference(
    model_id,
    features_df,
    api_key=args["api_key"]
)
# debug_response = run_clustering_inference_debug(
#     model_id,
#     features_df,
#     api_key=args["api_key"]
# )

# Save clustered results
clustered_df.to_csv('clustered_videos.csv', index=False)
print(f"Clustered videos saved to clustered_videos.csv\n")



RUNNING CLUSTERING INFERENCE

Preparing clustering dataset...
Clustering dataset shape: (14, 30)
Features: ['duration_minutes', 'frames_per_minute', 'unique_topics_count', 'topic_density', 'avg_description_length']...
Saved to /tmp/inference_data.csv
Uploading inference dataset...
âœ“ Dataset uploaded. ID: GsUOh1dgGKfogAi15bde

Running clustering inference...
âœ“ Inference complete!

Successfully extracted 14 cluster assignments
Found 8 unique clusters

Cluster distribution:
cluster
0    4
1    2
2    3
3    1
4    1
5    1
6    1
7    1
Name: count, dtype: int64

Cluster descriptions saved to cluster_info.json

Clustered videos saved to clustered_videos.csv



In [30]:
analysis = analyze_clusters(clustered_df)
gaps = identify_content_gaps(analysis)

# Step 6: Save final results
results = {
    'model_id': model_id,
    'analysis': analysis,
    'gaps': gaps,
    'clustered_videos': clustered_df.to_dict(orient='records')
}

with open("gap_analysis.json", 'w') as f:
    json.dump(results, f, indent=2)

#print(f"\nâœ“ Analysis complete! Results saved to {args.output}\n")


CLUSTER ANALYSIS

Cluster 0: (4 videos, 28.6%)
  Avg Duration: 0.0 min
  Top Topics: 
  Sample Videos:
    - TEST MODEL??
    - TEST MODEL??
    - TEST MODEL??

Cluster 1: (2 videos, 14.3%)
  Avg Duration: 0.5 min
  Top Topics: The image depicts a scene from a commercial for Pringles potato chips. The setting is a kitchen,  The image shows..., image
  Sample Videos:
    - Pringles Ad
    - Pringles Ad 2

Cluster 2: (3 videos, 21.4%)
  Avg Duration: 0.2 min
  Top Topics: image, white, boat
  Sample Videos:
    - Rahul Motorcycle
    - Animation Video
    - Model 123

Cluster 3: (1 videos, 7.1%)
  Avg Duration: 0.1 min
  Top Topics:  wearing a helmet and a dark shirt. The motorcycle is parked on the street, motorcycle,  also wearing a helmet and sunglasses. In the background
  Sample Videos:
    - Rahul Motorcycle

Cluster 4: (1 videos, 7.1%)
  Avg Duration: 0.0 min
  Top Topics: image,  which is part of the house where the movie is set. The hallway is well-lit,  portrayed by the actor 

In [None]:
"""
Main execution function.
"""
import argparse

# parser = argparse.ArgumentParser(description='Content Gap Analyzer using WoodWide AI')
# parser.add_argument('--api-key', required=True, help='WoodWide API key')
# parser.add_argument('--supabase-url', required=True, help='Supabase URL')
# parser.add_argument('--supabase-key', required=True, help='Supabase API key')
# parser.add_argument('--train', action='store_true', help='Train a new model')
# parser.add_argument('--model-id', help='Existing model ID for inference')
# parser.add_argument('--output', default='gap_analysis.json', help='Output file for results')

#args = parser.parse_args()



# Set environment variables
# os.environ['SUPABASE_URL'] = args.supabase_url
# os.environ['SUPABASE_KEY'] = args.supabase_key

# Step 1: Fetch data from Supabase



# Save features for inspection


# Step 4: Run clustering inference

# Step 5: Analyze clusters and identify gaps

