# Link to Working Colab Notebook:

If this notebook doesn't run in your local environment, open the working copy in Google Colab:

https://colab.research.google.com/drive/1T7lFV7xyOlWHkG1KkKEBnuvrpVgjGF3a?usp=sharing

# Load Dataset

In [1]:
from datasets import load_dataset

In [2]:
# Load the project dataset by its repository id; the result is indexed by split name in the next cell.
video_dataset = load_dataset("JohnVitz/CV_Final_Project_Video_With_Chunked_Captions_Bert_Topic_1")

In [3]:
# Keep only the 'train' split. The name is rebound in place, so this cell is not
# idempotent: re-run the load cell above before running this one a second time.
video_dataset = video_dataset['train']

# Initialize Embedding Model

In [4]:
import torch
from sentence_transformers import SentenceTransformer

# Preferred device: leave as "cpu", or set to "cuda" to request the GPU.
device = "cpu"
# Honour a "cuda" request only when CUDA is actually usable. Otherwise fall back
# to the CPU so the device handed to the model always matches the message printed
# (previously a "cuda" request without CUDA printed "Using CPU" but still passed
# "cuda" to the model).
if device == "cuda" and torch.cuda.is_available():
    print("Using GPU (CUDA)")
else:
    device = "cpu"
    print("Using CPU")

# Load pre-trained Sentence Transformer model
sentence_model = SentenceTransformer('all-mpnet-base-v2', device=device)

Using CPU


# Initialize Qdrant Store

In [5]:
# Qdrant Vector Database
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

In [6]:
# Qdrant client created with location=":memory:", i.e. a local in-memory store
# rather than a remote server; both search collections below are created in it.
qdrant_client = QdrantClient(location=":memory:")

In [7]:
# Collection names. The "short" collection is filled from the dataset's
# `segments` / `segment_embeddings` columns and the "long" collection from
# `segments2` / `segment_embeddings2` (see the loader functions below).
MY_SHORT_COLLECTION = "short_search_collection"
MY_LONG_COLLECTION = "long_search_collection"

# Take the vector size from the loaded sentence-transformer so the collections
# always match the model that embeds the queries.
embedding_size = sentence_model.get_sentence_embedding_dimension()

first_collection = qdrant_client.create_collection(
    collection_name=MY_SHORT_COLLECTION,
    vectors_config=VectorParams(size=embedding_size, # Dimension of the sentence_model embeddings
                                distance=Distance.COSINE), # Cosine similarity for vector search
)

second_collection = qdrant_client.create_collection(
    collection_name=MY_LONG_COLLECTION,
    vectors_config=VectorParams(size=embedding_size, # Dimension of the sentence_model embeddings
                                distance=Distance.COSINE), # Cosine similarity for vector search
)

In [8]:
# Sanity check: show the configuration of the short collection (it holds no points until the upload below).
qdrant_client.get_collection(collection_name=MY_SHORT_COLLECTION)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=0, points_count=0, segments_count=1, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=None, sharding_method=None, replication_factor=None, write_consistency_factor=None, read_fan_out_factor=None, on_disk_payload=None, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=None, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), 

In [9]:
# Sanity check: show the configuration of the long collection (it holds no points until the upload below).
qdrant_client.get_collection(collection_name=MY_LONG_COLLECTION)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=0, points_count=0, segments_count=1, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=None, sharding_method=None, replication_factor=None, write_consistency_factor=None, read_fan_out_factor=None, on_disk_payload=None, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=None, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), 

# Upload the Segmentation Subsets

In [10]:
def load_qdrant_store_subset_short():
    """Upload every short-segment embedding in `video_dataset` to MY_SHORT_COLLECTION.

    Reads the `video_id`, `segments` and `segment_embeddings` columns of each
    dataset row, builds one point per (segment, embedding) pair and upserts
    all points in a single batch.

    Point ids are consecutive integers starting at 0, in dataset order. Each
    payload carries the video id, the segment's index within its video and the
    segment's `topic`, `start`, `end`, `text` and `Name` fields.

    Returns:
        None. The only effect is the upsert into the Qdrant collection.
    """
    from qdrant_client.models import Batch

    vectors = []
    payloads = []

    for example in video_dataset:
        vid = example["video_id"]
        segments = example["segments"]               # List[{"topic","start","end","text","Name"}]
        embeddings = example["segment_embeddings"]   # List[List[float]]

        # One point per segment. zip() stops at the shorter list, so a segment
        # without a matching embedding (or vice versa) is left out.
        for idx, (seg, emb) in enumerate(zip(segments, embeddings)):
            vectors.append(emb)
            payloads.append({
                "video_id": vid,
                "segment_index": idx,
                "topic": seg["topic"],
                "start": seg["start"],
                "end": seg["end"],
                "text": seg["text"],
                "Name": seg["Name"],
            })

    # Integer ids: one per collected point, numbered across all videos.
    ids = list(range(len(vectors)))

    qdrant_client.upsert(
        collection_name=MY_SHORT_COLLECTION,
        points=Batch(ids=ids, vectors=vectors, payloads=payloads),
    )

In [11]:
def load_qdrant_store_subset_long():
    """Upload every long-segment embedding in `video_dataset` to MY_LONG_COLLECTION.

    Reads the `video_id`, `segments2` and `segment_embeddings2` columns of each
    dataset row, builds one point per (segment, embedding) pair and upserts
    all points in a single batch.

    Point ids are consecutive integers starting at 0, in dataset order. Each
    payload carries the video id, the segment's index within its video and the
    segment's `topic`, `start`, `end`, `text` and `Name` fields.

    Returns:
        None. The only effect is the upsert into the Qdrant collection.
    """
    from qdrant_client.models import Batch

    vectors = []
    payloads = []

    for example in video_dataset:
        vid = example["video_id"]
        segments = example["segments2"]               # List[{"topic","start","end","text","Name"}]
        embeddings = example["segment_embeddings2"]   # List[List[float]]

        # One point per segment. zip() stops at the shorter list, so a segment
        # without a matching embedding (or vice versa) is left out.
        for idx, (seg, emb) in enumerate(zip(segments, embeddings)):
            vectors.append(emb)
            payloads.append({
                "video_id": vid,
                "segment_index": idx,
                "topic": seg["topic"],
                "start": seg["start"],
                "end": seg["end"],
                "text": seg["text"],
                "Name": seg["Name"],
            })

    # Integer ids: one per collected point, numbered across all videos.
    ids = list(range(len(vectors)))

    qdrant_client.upsert(
        collection_name=MY_LONG_COLLECTION,
        points=Batch(ids=ids, vectors=vectors, payloads=payloads),
    )

In [12]:
# Populate the short collection (one point per entry in each video's `segments`).
load_qdrant_store_subset_short()

In [13]:
# Populate the long collection (one point per entry in each video's `segments2`).
load_qdrant_store_subset_long()

In [14]:
def get_query_embedding(text):
    """Embed `text` with the notebook's sentence-transformer model.

    Args:
        text: The query to embed; passed unchanged to `sentence_model.encode`.

    Returns:
        Whatever `sentence_model.encode` returns for `text`.
    """
    # NOTE(review): the previous comment said the embedding is normalized, but
    # this call does not request normalization explicitly; confirm that the
    # loaded model normalizes its output before relying on it.
    query_vector = sentence_model.encode(text)
    return query_vector

# Gets closest similarity score chunks
def get_context_chunks(norm_query_embedding, num_chunks=1, collection='short'):
    """Return the stored segments most similar to a query embedding.

    Args:
        norm_query_embedding: Query vector, as produced by `get_query_embedding`.
        num_chunks: Maximum number of hits to return.
        collection: 'long' searches MY_LONG_COLLECTION; any other value
            (including the default 'short') searches MY_SHORT_COLLECTION.

    Returns:
        The `.points` list of the Qdrant `query_points` response.
    """
    # Fix: the two branches used to point at each other's collection, so asking
    # for 'long' searched the short segments and vice versa.
    if collection == 'long':
        collection_name = MY_LONG_COLLECTION
    else:
        collection_name = MY_SHORT_COLLECTION

    response = qdrant_client.query_points(
        collection_name=collection_name,
        query=norm_query_embedding,
        limit=num_chunks,
    )
    return response.points

In [None]:
import os
import gradio as gr
import tempfile
from datetime import datetime
from moviepy import VideoFileClip

def to_seconds(ts: str) -> float:
    """Convert an 'HH:MM:SS.sss' or 'HH:MM:SS' timestamp to seconds.

    Args:
        ts: Timestamp string, with or without a fractional-seconds part.

    Returns:
        The number of seconds since 00:00:00 as a float.

    Raises:
        ValueError: If `ts` matches neither accepted format.
    """
    # Try the fractional form first so existing inputs take the same path as
    # before; the second format accepts timestamps with no fractional part.
    for fmt in ("%H:%M:%S.%f", "%H:%M:%S"):
        try:
            dt = datetime.strptime(ts, fmt)
        except ValueError:
            continue
        return dt.hour*3600 + dt.minute*60 + dt.second + dt.microsecond/1e6
    raise ValueError(f"Timestamp {ts!r} is not in 'HH:MM:SS' or 'HH:MM:SS.sss' format")

def get_payload_based_on_question(question, collection='short'):
    """Retrieve the best-matching stored segment for a question.

    Args:
        question: Free-text question to embed and search with.
        collection: Which collection to search; forwarded to
            `get_context_chunks` ('short' or 'long').

    Returns:
        A tuple `(text, video_id, start_ts, end_ts)` taken from the top hit's
        payload: the caption text, the id of the video in `video_dataset`, and
        the segment's start and end timestamps.

    Raises:
        IndexError: If the search returns no hits.
    """
    # 1) Embed & get top hit.
    # Fix: forward the caller's choice; it used to be hard-coded to 'short',
    # so the `collection` argument had no effect.
    question_embedding = get_query_embedding(question)
    hits = get_context_chunks(question_embedding, collection=collection)
    if not hits:
        # Same exception type as indexing an empty list, with a clearer message.
        raise IndexError(f"No segment found in the '{collection}' collection for this question.")
    payload = hits[0].payload

    # 2) Extract fields from payload
    text     = payload['text']
    video_id = payload['video_id']
    start_ts = payload['start']
    end_ts   = payload['end']

    return text, video_id, start_ts, end_ts

def serve_video_and_text(question, collection):
    """Cut the video segment that best answers `question` and return it with its caption.

    Args:
        question: Free-text question used to search the stored segments.
        collection: 'short' or 'long'; forwarded to `get_payload_based_on_question`.

    Returns:
        A tuple `(clip_path, text)`: the path of a temporary .mp4 file holding
        the cut segment, and the caption text of that segment. The clip file is
        deliberately left on disk so the caller (the Gradio UI) can serve it.

    Raises:
        ValueError: If no dataset row has the retrieved video id.
    """
    # 1) Pull the best-matching segment payload
    text, video_id, start_ts, end_ts = get_payload_based_on_question(question, collection=collection)

    # 2) Look up the video data from the video_dataset using video_id
    video_row = video_dataset.filter(lambda x: x['video_id'] == video_id)
    if len(video_row) == 0:
        raise ValueError(f"Video with video_id {video_id} not found in the dataset.")

    video_data = video_row['mp4'][0]  # Binary mp4 content of the matching row

    # 3) Write the full video to a temporary file so it can be opened by path
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_file:
        temp_file.write(video_data)
        temp_video_path = temp_file.name

    # 4) Reserve a path for the cut segment. Only the name is needed, so the
    #    handle is closed straight away instead of being left open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as out_file:
        output_path = out_file.name

    clip = None
    segment = None
    try:
        # 5) Extract the subclip and write it to the reserved path
        clip = VideoFileClip(temp_video_path)
        segment = clip.subclipped(to_seconds(start_ts), to_seconds(end_ts))
        segment.write_videofile(output_path, audio_codec="aac", logger=None)
    except Exception:
        # Do not leave an unusable output file behind when cutting fails.
        if os.path.exists(output_path):
            os.remove(output_path)
        raise
    finally:
        # Release the clip objects before deleting the file they were opened on.
        if segment is not None:
            segment.close()
        if clip is not None:
            clip.close()
        # The full-length source copy is only needed while cutting; without this
        # every request left a complete copy of the video in the temp directory.
        if os.path.exists(temp_video_path):
            os.remove(temp_video_path)

    return output_path, text


In [None]:
def create_video_interface():
    """Build the Gradio demo and launch it with a public share link.

    The page has a question box and a short/long selector as inputs, and a
    video player plus a markdown area as outputs. Pressing "Show" calls
    `serve_video_and_text` with the two inputs and routes its two return
    values to the outputs.
    """
    length_prompt = "Would you prefer your retrieved videos to be shorter (25 second floor), or longer (50 second floor)?"

    with gr.Blocks() as app:
        gr.Markdown("### Video and Text Display Based on Question")

        # Inputs
        question_box = gr.Textbox(
            label="Ask a Question",
            placeholder="Enter a question related to the videos…",
        )
        length_choice = gr.Radio(choices=['short', 'long'], value='short', label=length_prompt)

        # Outputs
        clip_player = gr.Video()
        caption_view = gr.Markdown()

        # Wire the button to the retrieval function
        run_button = gr.Button("Show")
        run_button.click(
            fn=serve_video_and_text,
            inputs=[question_box, length_choice],
            outputs=[clip_player, caption_view],
        )

    app.launch(share=True)

# Build the UI and start the Gradio server; launch(share=True) also requests a public share URL.
create_video_interface()

* Running on local URL:  http://127.0.0.1:7870
* Running on public URL: https://304d094d675bdc7bd6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


DEBUG payload: {'video_id': 'FCQ-rih6cHY', 'segment_index': 4, 'topic': 6, 'start': '00:05:14.990', 'end': '00:07:34.550', 'text': "and from from now on we'll refer to to those networks as rest nets. all right, so those networks as rest nets. all right, so in order for us to understand what is in order for us to understand what is going on with rest Nets, or what I'm going on with rest Nets, or what I'm going to do now is I'm going to draw uh, going to do now is I'm going to draw uh, a very small rest architecture just a very small rest architecture just consisting of three consisting of three units: U and, if I may draw the units: U and, if I may draw the architecture will. that's my uh unit, architecture will. that's my uh unit, which I'll abstract with the letter which I'll abstract with the letter F1, so the input to the. so I'll use a bit F1, so the input to the. so I'll use a bit different terminology from what I was different terminology from what I was used kind of earlier. so 

Filter:   0%|          | 0/8 [00:00<?, ? examples/s]

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'encoder': 'Lavf58.76.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1920, 1080], 'bitrate': 1652, 'fps': 30.0, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 127, 'metadata': {'Metadata': '', 'handler_name': 'SoundHandler', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 1532.93, 'bitrate': 1788, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(High)', 'video_size': [1920, 1080], 'video_bitrate': 1652, 'video_fps': 30.0, 'default_audio_input_number': 

Filter:   0%|          | 0/8 [00:00<?, ? examples/s]

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'encoder': 'Lavf58.76.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1920, 1080], 'bitrate': 1652, 'fps': 30.0, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 127, 'metadata': {'Metadata': '', 'handler_name': 'SoundHandler', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 1602.78, 'bitrate': 1787, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(High)', 'video_size': [1920, 1080], 'video_bitrate': 1652, 'video_fps': 30.0, 'default_audio_input_number': 