# Load Dataset

In [5]:
from datasets import load_dataset

In [6]:
video_dataset = load_dataset("JohnVitz/CV_Final_Project_Video_With_Chunked_Captions_Bert_Topic_1")

README.md:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

train-00000-of-00005.parquet:   0%|          | 0.00/182M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/472M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/453M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/631M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/358M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8 [00:00<?, ? examples/s]

In [18]:
video_dataset = video_dataset['train']

# Initialize Embedding Model

In [7]:
import torch
from sentence_transformers import SentenceTransformer

# Select device ('cpu' or 'cuda' if available)
device = "cpu"  # Set to "cuda" if you want to use GPU (change to "cuda" to force GPU usage)
# Check if CUDA is available and if the user wants to use it
if torch.cuda.is_available() and device == "cuda":
    device = "cuda"
    print("Using GPU (CUDA)")
else:
    print("Using CPU")

# Load pre-trained Sentence Transformer model
sentence_model = SentenceTransformer('all-mpnet-base-v2', device=device)

Using CPU


# Initialize Qdrant Store

In [8]:
# Qdrant Vector Database
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

In [9]:
qdrant_client = QdrantClient(location=":memory:")

In [10]:
MY_SHORT_COLLECTION = "short_search_collection"
MY_LONG_COLLECTION = "long_search_collection"

embedding_size = sentence_model.get_sentence_embedding_dimension()

first_collection = qdrant_client.create_collection(
    collection_name=MY_SHORT_COLLECTION,
    vectors_config=VectorParams(size=embedding_size, # Size of Snowflake Embedding Dimensions
                                distance=Distance.COSINE), # Cosine similarity for vector search
)

second_collection = qdrant_client.create_collection(
    collection_name=MY_LONG_COLLECTION,
    vectors_config=VectorParams(size=embedding_size, # Size of Snowflake Embedding Dimensions
                                distance=Distance.COSINE), # Cosine similarity for vector search
)

In [11]:
qdrant_client.get_collection(collection_name=MY_SHORT_COLLECTION)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=0, points_count=0, segments_count=1, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=None, sharding_method=None, replication_factor=None, write_consistency_factor=None, read_fan_out_factor=None, on_disk_payload=None, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=None, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), 

In [12]:
qdrant_client.get_collection(collection_name=MY_LONG_COLLECTION)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=0, points_count=0, segments_count=1, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=None, sharding_method=None, replication_factor=None, write_consistency_factor=None, read_fan_out_factor=None, on_disk_payload=None, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=None, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), 

# Upload the Segmentation Subsets

In [19]:
def load_qdrant_store_subset_short():

    # assuming `dataset` has columns: video_id, segments, segment_embeddings
    records = []

    for example in video_dataset:
        vid       = example["video_id"]
        segments  = example["segments"]            # List[{"topic","start","end","text","Name"}]
        embeddings = example["segment_embeddings"] # List[List[float]]

        # one record per segment
        for idx, (seg, emb) in enumerate(zip(segments, embeddings)):
            rec_id = f"{vid}_{idx}"              # concat video_id + segment index
            payload = {
                "video_id": vid,
                "segment_index": idx,
                "topic": seg["topic"],
                "start": seg["start"],
                "end": seg["end"],
                "text": seg["text"],
                "Name": seg["Name"],
            }
            records.append({
                #"id": rec_id,
                "vector": emb,
                "payload": payload
            })

    # Insert points into the collection
    from qdrant_client.models import Batch

    # Create a Batch object

    ids      = list(range(len(records)))
    vectors  = [r["vector"]  for r in records]
    payloads = [r["payload"] for r in records]

    my_batch = Batch(ids=ids, vectors=vectors, payloads=payloads)

    qdrant_client.upsert(
        collection_name=MY_SHORT_COLLECTION,
        points=my_batch
)

In [20]:
def load_qdrant_store_subset_long():

    # assuming `dataset` has columns: video_id, segments, segment_embeddings
    records = []

    for example in video_dataset:
        vid       = example["video_id"]
        segments  = example["segments2"]            # List[{"topic","start","end","text","Name"}]
        embeddings = example["segment_embeddings2"] # List[List[float]]

        # one record per segment
        for idx, (seg, emb) in enumerate(zip(segments, embeddings)):
            rec_id = f"{vid}_{idx}"              # concat video_id + segment index
            payload = {
                "video_id": vid,
                "segment_index": idx,
                "topic": seg["topic"],
                "start": seg["start"],
                "end": seg["end"],
                "text": seg["text"],
                "Name": seg["Name"],
            }
            records.append({
                #"id": rec_id,
                "vector": emb,
                "payload": payload
            })

    # Insert points into the collection
    from qdrant_client.models import Batch

    # Create a Batch object

    ids      = list(range(len(records)))
    vectors  = [r["vector"]  for r in records]
    payloads = [r["payload"] for r in records]

    my_batch = Batch(ids=ids, vectors=vectors, payloads=payloads)

    qdrant_client.upsert(
        collection_name=MY_LONG_COLLECTION,
        points=my_batch
)

In [21]:
load_qdrant_store_subset_short()

In [22]:
load_qdrant_store_subset_long()

In [29]:
def get_query_embedding(text):
    return sentence_model.encode(text)  # Encode the question (this returns a normalized embedding)

# Gets closest similarity score chunks
def get_context_chunks(norm_query_embedding, num_chunks=1, collection='short'):

    if collection == 'long':

        context_chunks = qdrant_client.query_points(
            collection_name=MY_SHORT_COLLECTION,
            query=norm_query_embedding,
            limit=num_chunks
        ).points
    else:
        context_chunks = qdrant_client.query_points(
            collection_name=MY_LONG_COLLECTION,
            query=norm_query_embedding,
            limit=num_chunks
        ).points


    #print(context_chunks)

    return context_chunks

In [30]:
import os
import gradio as gr
import tempfile
from datetime import datetime
from moviepy import VideoFileClip

def to_seconds(ts: str) -> float:
    """Convert 'HH:MM:SS.sss' to seconds."""
    dt = datetime.strptime(ts, "%H:%M:%S.%f")
    return dt.hour*3600 + dt.minute*60 + dt.second + dt.microsecond/1e6

def get_payload_based_on_question(question, collection='short'):
    """
    Returns:
      text (str),                — the retrieved caption text  
      video_file_path (str),     — local path to the full MP4  
      start_ts (str),            — segment start timestamp  
      end_ts (str)               — segment end timestamp  
    """
    # 1) Embed & get top hit
    question_embedding = get_query_embedding(question)
    hit = get_context_chunks(question_embedding, collection='short')[0]
    payload = hit.payload
    print("DEBUG payload:", payload)  # you can remove this after verifying

    # 2) Extract fields
    text     = payload['text']
    video_id = payload['video_id']
    start_ts = payload['start']
    end_ts   = payload['end']

    # 3) Rebuild the path to your full video file
    video_file_path = video_id

    return text, video_file_path, start_ts, end_ts

# Serve video + text with dynamic subclip extraction
def serve_video_and_text(question, collection, folderpath='full_videos'):#, collection='short'):

    # 1) Pull the best‐matching segment payload
    text, video_path, start_ts, end_ts = get_payload_based_on_question(question, collection=collection)

    video_path = video_path + '_full_video.mp4'

    # print("Retrieved text:")
    # print(text, "\n")
    # print(f"Video file: {video_path}")
    # print(f"Time window: {start_ts} → {end_ts}\n")

    # # Extract the subclipped
    # print("Extracting subclip with `subclipped` function…")

    final_path = os.path.join(folderpath, video_path)
    print(final_path)

    segment = VideoFileClip(final_path).subclipped(
        to_seconds(start_ts), 
        to_seconds(end_ts)
    )

    # Save to a temp file
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    segment.write_videofile(tmp.name, audio_codec="aac", logger=None)
    segment.close()
    
    return tmp.name, text

# Gradio app
def create_video_interface():
    with gr.Blocks() as demo:
        gr.Markdown("### Video and Text Display Based on Question")

        # Create a textbox for the user's question
        question_input = gr.Textbox(label="Ask a Question", placeholder="Enter a question related to the videos...")
        
        # Define the video and text outputs
        video_output = gr.Video()
        text_output = gr.Markdown()
        
        # Button to submit the question and show the relevant video and text
        gr.Button("Show").click(
            fn=serve_video_and_text,
            inputs=[question_input],
            outputs=[video_output, text_output]
        )
    demo.launch(share=False)

In [None]:
create_video_interface()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


DEBUG payload: {'video_id': 'WXoOohWU28Y', 'segment_index': 32, 'topic': 4, 'start': '00:11:37.110', 'end': '00:12:20.990', 'text': 'write over here, if I kind of try to squeeze it: so this is our squeeze it: so this is our image that we have seen earlier, with', 'Name': '4_image_kind image_exactly uh_positioned'}
full_videos/WXoOohWU28Y_full_video.mp4
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'encoder': 'Lavf61.1.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1920, 1080], 'bitrate': 1103, 'fps': 30.0, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]', 'encoder': 'Lavc61.3.100 libx264'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 128,

In [32]:
def create_video_interface():
    with gr.Blocks() as demo:
        gr.Markdown("### Video and Text Display Based on Question")

        question_input   = gr.Textbox(label="Ask a Question",
                                      placeholder="Enter a question related to the videos…")
        collection_input = gr.Radio(
            choices=['short', 'long'],
            value='short',
            label="Would you perfer your retrieved videos to be short, or long?"
        )

        video_output = gr.Video()
        text_output  = gr.Markdown()

        show_btn = gr.Button("Show")
        show_btn.click(
            fn=serve_video_and_text,
            inputs=[question_input, collection_input],
            outputs=[video_output, text_output]
        )

    demo.launch(share=True)

create_video_interface()

* Running on local URL:  http://127.0.0.1:7864
* Running on public URL: https://56a319d613c6426f4a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
