In [1]:
# Import required modules
import sys
sys.path.append('./')  # Add the current directory to the Python path

from utils import download_video, get_transcript_vtt
from pathlib import Path
import json
import os

# Download the YouTube video and its transcript into `video_dir`.
# `video_dir`, `video_filepath` and `video_transcript_filepath` are reused by
# the frame-extraction cell further down, so run this cell first.
video_url = "https://www.youtube.com/watch?v=7Hcg-rLYwdM"  # replace with the URL of the video to process
video_dir = "./shared_data/videos/video"
# Both helpers come from the local `utils` module; each returns the path of
# the file it saved, which is what gets printed below.
video_filepath = download_video(video_url, video_dir)
video_transcript_filepath = get_transcript_vtt(video_url, video_dir)

# Show where the downloaded files ended up
print(f"Video saved to: {video_filepath}")
print(f"Transcript saved to: {video_transcript_filepath}")


Getting video information for https://www.youtube.com/watch?v=7Hcg-rLYwdM
Downloading video from YouTube...


Downloading video from YouTube: 100%|██████████| 4578531/4578531 [00:00<00:00, 5297510.46bytes/s]


Video saved to: ./shared_data/videos/video\Welcome back to Planet Earth.mp4
Transcript saved to: ./shared_data/videos/video\captions.vtt


In [2]:
%pip install webvtt-py

from utils import str2time, maintain_aspect_ratio_resize
import webvtt
import cv2

# Define paths
extracted_frames_path = os.path.join(video_dir, 'extracted_frames')
metadata_path = video_dir
Path(extracted_frames_path).mkdir(parents=True, exist_ok=True)

# Function to extract frames and metadata
def extract_frames_and_metadata(video_path, transcript_path, frames_path, metadata_output_path):
    """Save one frame per transcript caption and write a JSON index of them.

    For every caption in the WebVTT transcript, the video is positioned at the
    midpoint of the caption's start/end times, one frame is read, resized with
    ``maintain_aspect_ratio_resize(frame, height=350)`` and written to
    ``frames_path`` as ``frame_{idx}.jpg``. A metadata entry is recorded for
    each frame that was successfully read and written.

    Args:
        video_path: Path of the video file to read frames from.
        transcript_path: Path of the WebVTT transcript for that video.
        frames_path: Directory that receives the extracted JPEG frames
            (created if missing).
        metadata_output_path: Directory that receives ``metadata.json``
            (created if missing).

    Returns:
        A list of dicts with the keys ``frame_path``, ``transcript``,
        ``mid_time`` and ``video_path``; the same list is dumped to
        ``metadata.json``.

    Raises:
        IOError: If OpenCV cannot open ``video_path``. Without this check
            every read would fail and an empty ``metadata.json`` would be
            written over any previous one.
    """
    # Make the function usable without relying on the caller having created
    # the output folders; empty strings mean "current directory" and are skipped.
    for directory in (frames_path, metadata_output_path):
        if directory:
            os.makedirs(directory, exist_ok=True)

    metadata = []
    video = cv2.VideoCapture(video_path)
    try:
        if not video.isOpened():
            raise IOError(f"Could not open video file: {video_path}")

        transcript = webvtt.read(transcript_path)

        for idx, caption in enumerate(transcript):
            start_time = str2time(caption.start)
            end_time = str2time(caption.end)
            # NOTE(review): the value is passed to CAP_PROP_POS_MSEC, so this
            # assumes str2time returns milliseconds -- confirm in utils.
            mid_time = (start_time + end_time) / 2

            video.set(cv2.CAP_PROP_POS_MSEC, mid_time)
            success, frame = video.read()

            if not success:
                print(f"Could not extract frame at idx: {idx}")
                continue

            frame_path = os.path.join(frames_path, f"frame_{idx}.jpg")
            resized_frame = maintain_aspect_ratio_resize(frame, height=350)
            # imwrite reports failure through its return value rather than an
            # exception; skip the entry so metadata never points at a missing file.
            if not cv2.imwrite(frame_path, resized_frame):
                print(f"Could not write frame at idx: {idx}")
                continue

            metadata.append({
                'frame_path': frame_path,
                'transcript': caption.text.replace("\n", " "),
                'mid_time': mid_time,
                'video_path': video_path
            })
    finally:
        # Always free the capture handle, even if transcript parsing or frame
        # processing raises part-way through.
        video.release()

    # Save metadata as JSON
    metadata_file = os.path.join(metadata_output_path, 'metadata.json')
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f)
    return metadata

# Extract frames and transcript metadata for the downloaded video.
# The returned list stays in `metadata` for the ingestion cell below.
metadata = extract_frames_and_metadata(
    video_path=video_filepath,
    transcript_path=video_transcript_filepath,
    frames_path=extracted_frames_path,
    metadata_output_path=metadata_path,
)
print("Metadata extraction completed.")


Defaulting to user installation because normal site-packages is not writeable
Collecting webvtt-py
  Downloading webvtt_py-0.5.1-py3-none-any.whl.metadata (3.4 kB)
Downloading webvtt_py-0.5.1-py3-none-any.whl (19 kB)
Installing collected packages: webvtt-py
Successfully installed webvtt-py-0.5.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Metadata extraction completed.


In [16]:
import os
import warnings

import lancedb
from mm_rag.embeddings.bridgetower_embeddings import BridgeTowerEmbeddings
from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB

# LanceDB setup
LANCEDB_HOST_FILE = "./shared_data/.lancedb"
TABLE_NAME = "video_data"

# The last saved run of this cell sat at 0/26 for about a minute and then
# failed with "No api_key provided or in environment". Warn up front when the
# environment variable named in that error is missing. This is a warning, not
# an error, in case the key is supplied some other way -- verify against the
# embeddings helper.
if not os.environ.get("PREDICTIONGUARD_API_KEY"):
    warnings.warn(
        "PREDICTIONGUARD_API_KEY is not set in the environment; "
        "BridgeTower embedding calls are likely to fail."
    )

# Initialize BridgeTower embeddings once and share the instance between the
# vector store handle and the ingestion call below.
embedder = BridgeTowerEmbeddings()

# Setup LanceDB vector store
vector_store = MultimodalLanceDB(uri=LANCEDB_HOST_FILE, embedding=embedder, table_name=TABLE_NAME)

# Prepare text and image data for ingestion
texts = [data["transcript"] for data in metadata]
image_paths = [data["frame_path"] for data in metadata]
metadatas = metadata  # Each metadata dictionary already contains necessary fields

# Use from_text_image_pairs for ingestion into LanceDB
_ = MultimodalLanceDB.from_text_image_pairs(
    texts=texts,
    image_paths=image_paths,
    embedding=embedder,
    metadatas=metadatas,
    connection=lancedb.connect(LANCEDB_HOST_FILE),  # Connection to LanceDB
    table_name=TABLE_NAME,
    mode="overwrite"  # Choose "append" if adding to an existing table
)

print("Data ingested into LanceDB.")




  0%|          | 0/26 [00:59<?, ?it/s]


ValueError: No api_key provided or in environment. Please provide the api_key as client = PredictionGuard(api_key=<your_api_key>) or as PREDICTIONGUARD_API_KEY in your environment.