In [1]:
from datasets import Dataset

# Directory the dataset is read from (this cell only loads; nothing is written here)
save_directory = './final_video_dataset'

# Load the dataset stored in that directory
loaded_video_dataset = Dataset.load_from_disk(save_directory)

# Sanity check: a slice comes back as a dict of column name -> list of values
print(loaded_video_dataset[:5])  # first 5 rows

  from .autonotebook import tqdm as notebook_tqdm


{'video_id': ['9CGGh6ivg68', '9CGGh6ivg68', '9CGGh6ivg68', '9CGGh6ivg68', '9CGGh6ivg68'], 'start': ['00:00:02.990', '00:00:31.390', '00:01:50.389', '00:02:23.350', '00:04:26.510'], 'end': ['00:00:31.390', '00:01:50.389', '00:02:23.350', '00:04:26.510', '00:04:46.680'], 'segmentation_group_id': [0, 1, 2, 3, 4], 'text': ['in this video I would like to start the discussion about convolutional new networks which is another architecture of uh neural networks that we are going to see specifically kind of engineered uh to um address problems that we are facing in computer vision I want to start this discussion with um just showing you a picture and uh if I ask you uh to tell me what would actually be the first object that you pay attention to then most people will probably', "to then most people will probably respond to with yellow cab and it's not really accident that lot of cabs in a lot of capitals are painted uh yellow U and of course that attention uh to bright colors originates from tho

In [2]:
# Inspect the clip path stored in the first row
loaded_video_dataset[0]['video_file_path']

'./video_zips/9CGGh6ivg68/chunk_1_2.99_31.39.mp4'

In [3]:
import os

In [4]:
import gradio as gr
import os
from IPython.display import display
import random

# Look up one dataset row and hand back its video clip
def serve_video_from_row(row_idx):
    """Return a ``gr.Video`` for the clip referenced by row ``row_idx``.

    Returns the string "Invalid row index." when ``row_idx`` is negative or
    not smaller than the dataset length, and the string
    "Video file not found or invalid path." when the row has no
    ``video_file_path`` or that path does not exist on disk.
    The resolved path is printed as a side effect.
    """
    if row_idx < 0 or row_idx >= len(loaded_video_dataset):
        return "Invalid row index."

    record = loaded_video_dataset[row_idx]
    clip_path = record.get("video_file_path")
    print(clip_path)

    # Guard clause: bail out unless the path is set and present on disk
    if not clip_path or not os.path.exists(clip_path):
        return "Video file not found or invalid path."
    return gr.Video(clip_path)

In [5]:
# Smoke test: build the video component for the first row
serve_video_from_row(0)

./video_zips/9CGGh6ivg68/chunk_1_2.99_31.39.mp4


<gradio.components.video.Video at 0x7959784752b0>

In [6]:
def serve_text_from_row(row_idx):
    """Return a ``gr.Markdown`` holding the ``text`` field of row ``row_idx``.

    Returns the string "Invalid row index." when ``row_idx`` is negative or
    not smaller than the dataset length. Rows without a ``text`` key render
    "No text available.".
    """
    if row_idx < 0 or row_idx >= len(loaded_video_dataset):
        return "Invalid row index."

    record = loaded_video_dataset[row_idx]
    transcript = record.get("text", "No text available.")
    return gr.Markdown(transcript)

In [7]:
# Smoke test: build the markdown component for the first row
serve_text_from_row(0)

<gradio.components.markdown.Markdown at 0x795935691130>

In [8]:
# Convenience wrapper: fetch the clip and its text for the same row
def serve_video_and_text(row_idx):
    """Return ``(video, text)`` for row ``row_idx``.

    The first element is whatever ``serve_video_from_row`` returns and the
    second is whatever ``serve_text_from_row`` returns, evaluated in that
    order.
    """
    return serve_video_from_row(row_idx), serve_text_from_row(row_idx)

In [9]:
# Smoke test: fetch both the video and the text for the first row
serve_video_and_text(0)

./video_zips/9CGGh6ivg68/chunk_1_2.99_31.39.mp4


(<gradio.components.video.Video at 0x795978475c70>,
 <gradio.components.markdown.Markdown at 0x79593590ea50>)

In [10]:
import gradio as gr
import os

# Function to serve a video from a valid row index
def serve_video_from_row(row_idx):
    """Return a ``gr.Video`` for the clip referenced by row ``row_idx``.

    Args:
        row_idx: Row position in ``loaded_video_dataset``. Anything ``int()``
            accepts is allowed (e.g. the string value of a textbox).

    Returns:
        A ``gr.Video`` built from the row's ``video_file_path`` when that
        file exists. Otherwise one of the message strings
        "Invalid row index." (index not convertible to int or out of range)
        or "Video file not found or invalid path.".
    """
    # A textbox can deliver "", "abc" or None; report those the same way as
    # an out-of-range index instead of letting int() raise.
    try:
        row_idx = int(row_idx)
    except (TypeError, ValueError, OverflowError):
        return "Invalid row index."

    # Ensure the row index is valid
    if row_idx < 0 or row_idx >= len(loaded_video_dataset):
        return "Invalid row index."

    example = loaded_video_dataset[row_idx]
    video_file_path = example.get("video_file_path")

    if video_file_path and os.path.exists(video_file_path):
        return gr.Video(video_file_path)
    return "Video file not found or invalid path."

# Function to serve text from a valid row index
def serve_text_from_row(row_idx):
    """Return a ``gr.Markdown`` holding the ``text`` field of row ``row_idx``.

    Args:
        row_idx: Row position in ``loaded_video_dataset``. Anything ``int()``
            accepts is allowed (e.g. the string value of a textbox).

    Returns:
        A ``gr.Markdown`` with the row's text ("No text available." when the
        row has no ``text`` key), or the string "Invalid row index." when
        the index cannot be converted to int or is out of range.
    """
    # A textbox can deliver "", "abc" or None; report those the same way as
    # an out-of-range index instead of letting int() raise.
    try:
        row_idx = int(row_idx)
    except (TypeError, ValueError, OverflowError):
        return "Invalid row index."

    # Ensure the row index is valid
    if row_idx < 0 or row_idx >= len(loaded_video_dataset):
        return "Invalid row index."

    example = loaded_video_dataset[row_idx]
    text = example.get("text", "No text available.")

    return gr.Markdown(text)

# Wrapper function that calls both video and text functions
def serve_video_and_text(row_idx):
    """Return ``(video, text)`` for row ``row_idx``, suitable as Gradio outputs.

    Args:
        row_idx: Row position, forwarded unchanged to
            ``serve_video_from_row`` and ``serve_text_from_row``.

    Returns:
        A 2-tuple. On success it is the video component and the text
        component. When the video lookup reports a problem (it returns a
        message string), the tuple is ``(None, message)`` so the message is
        shown in the text slot and the video slot is left empty.
    """
    video = serve_video_from_row(row_idx)
    text = serve_text_from_row(row_idx)

    # A plain string sent to a Video output is interpreted as a file path,
    # so an error message must not be returned in the video position.
    if isinstance(video, str):
        return None, video
    return video, text

# Create a Gradio interface
def create_video_interface(row_idx=0):
    """Build and launch a Blocks demo that shows one dataset row.

    Args:
        row_idx: Index of the row whose video and text are shown when the
            button is clicked. Defaults to 0, the value previously
            hard-coded for testing.

    The demo is launched before returning; nothing is returned.
    """
    with gr.Blocks() as demo:
        gr.Markdown("### Video and Text Display")

        with gr.Row():
            video_output = gr.Video()
            text_output = gr.Markdown()

        # Hidden textbox that carries the row index to the callback.
        # Textbox values are text, so the index is converted explicitly.
        row_idx_input = gr.Textbox(value=str(row_idx), visible=False)

        # When the button is clicked, both video and text will be updated
        gr.Button("Show Video and Text").click(
            fn=serve_video_and_text,  # Call the wrapper function
            inputs=[row_idx_input],
            outputs=[video_output, text_output],
        )

    demo.launch()

# Example usage: build the demo and start serving it
create_video_interface()


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


In [11]:
# Shell out to nvidia-smi to show the GPU, driver/CUDA version and memory usage
!nvidia-smi

Mon Apr 28 08:21:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120                Driver Version: 550.120        CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3080        Off |   00000000:01:00.0  On |                  N/A |
| 58%   59C    P5             54W /  400W |     700MiB /  12288MiB |     41%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [12]:
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
import numpy as np

# Preferred device. Set to "cpu" to force CPU even when a GPU is present.
device = "cuda"

# Fall back to CPU when CUDA was requested but is not available, so the
# device passed to the model always matches the message printed below.
if device == "cuda" and not torch.cuda.is_available():
    device = "cpu"
print("Using GPU (CUDA)" if device == "cuda" else "Using CPU")

# Load pre-trained Sentence Transformer model on the resolved device
model = SentenceTransformer('all-mpnet-base-v2', device=device)

# Collect the text of every row in the dataset
texts = [entry['text'] for entry in loaded_video_dataset]

# Precompute embeddings for all texts in one batched call; `encode` draws
# its own progress bar, so no manual tqdm loop is needed.
text_embeddings = np.asarray(model.encode(texts, show_progress_bar=True))

Using GPU (CUDA)


Encoding texts: 100%|██████████| 128/128 [00:02<00:00, 59.65it/s]


In [13]:
import gradio as gr
import os

# Function to get row index based on semantic similarity
def get_row_idx_based_on_question(question):
    """Return the index of the dataset text most similar to ``question``.

    The question is embedded with the module-level ``model`` and compared
    with every row of the module-level ``text_embeddings`` using cosine
    similarity (dot product divided by the product of the vector norms).

    Args:
        question: The user's question as text.

    Returns:
        The row index of the best match as a plain ``int``, or ``-1`` when
        the question is missing/blank or there are no embeddings to search.
    """
    # Nothing to match against a blank question
    if question is None or not str(question).strip():
        return -1

    corpus = np.asarray(text_embeddings, dtype=float)
    if corpus.ndim != 2 or corpus.shape[0] == 0:
        return -1

    # encode([question]) yields a single row; flatten it to a 1-D vector
    query = np.asarray(model.encode([question]), dtype=float).reshape(-1)

    # Normalise so the score is a true cosine similarity even when the
    # embeddings are not unit length.
    norms = np.linalg.norm(corpus, axis=1) * np.linalg.norm(query)
    # A zero vector has a dot product of 0; dividing by 1 keeps its score
    # at 0 and avoids a division by zero.
    safe_norms = np.where(norms > 0, norms, 1.0)
    similarities = (corpus @ query) / safe_norms

    return int(np.argmax(similarities))

# Resolve a row index to its video component
def serve_video_from_row(row_idx):
    """Return a ``gr.Video`` for the clip referenced by row ``row_idx``.

    ``row_idx`` is passed through ``int()`` first. The function returns the
    string "Invalid row index." when the index is outside the dataset and
    "Video file not found or invalid path." when the row's
    ``video_file_path`` is missing or not present on disk.
    """
    idx = int(row_idx)  # accepts numpy integers and numeric strings alike

    if idx < 0 or idx >= len(loaded_video_dataset):
        return "Invalid row index."

    clip_path = loaded_video_dataset[idx].get("video_file_path")
    clip_available = bool(clip_path) and os.path.exists(clip_path)

    return gr.Video(clip_path) if clip_available else "Video file not found or invalid path."

# Resolve a row index to its text component
def serve_text_from_row(row_idx):
    """Return a ``gr.Markdown`` holding the ``text`` field of row ``row_idx``.

    ``row_idx`` is passed through ``int()`` first. The function returns the
    string "Invalid row index." when the index is outside the dataset; rows
    without a ``text`` key render "No text available.".
    """
    idx = int(row_idx)  # accepts numpy integers and numeric strings alike

    out_of_range = idx < 0 or idx >= len(loaded_video_dataset)
    if out_of_range:
        return "Invalid row index."

    body = loaded_video_dataset[idx].get("text", "No text available.")
    return gr.Markdown(body)

# Wrapper function that calls both video and text functions
def serve_video_and_text(question):
    """Return ``(video, text)`` for the row that best matches ``question``.

    Args:
        question: The user's question as text.

    Returns:
        A 2-tuple for the ``[video, text]`` outputs. On success it is the
        video component and the text component of the matched row. When no
        row matches (index ``-1``) or the video lookup reports a problem
        (it returns a message string), the tuple is ``(None, message)`` so
        the message is shown in the text slot and the video slot is left
        empty.
    """
    # Get the row index based on the user's question
    row_idx = get_row_idx_based_on_question(question)

    # A plain string sent to a Video output is interpreted as a file path,
    # so messages are always returned in the text position.
    if row_idx == -1:
        return None, "No matching video found for your question."

    video = serve_video_from_row(row_idx)
    text = serve_text_from_row(row_idx)

    if isinstance(video, str):
        return None, video
    return video, text

# Assemble and launch the question-driven demo
def create_video_interface():
    """Build and launch a Blocks demo that answers a question with a clip.

    The layout is a heading, a question textbox, a video output, a markdown
    output and a button. Clicking the button calls ``serve_video_and_text``
    with the question and fills both outputs. The demo is launched before
    returning; nothing is returned.
    """
    demo = gr.Blocks()
    with demo:
        gr.Markdown("### Video and Text Display Based on Question")

        # Free-text question from the user
        question_input = gr.Textbox(
            label="Ask a Question",
            placeholder="Enter a question related to the videos...",
        )

        # Output slots for the matched clip and its text
        video_output = gr.Video()
        text_output = gr.Markdown()

        # Clicking the button runs the lookup and fills both outputs
        show_button = gr.Button("Show Video and Text")
        show_button.click(
            fn=serve_video_and_text,
            inputs=[question_input],
            outputs=[video_output, text_output],
        )

    demo.launch()

# Example usage: build the question-driven demo and start serving it
create_video_interface()

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.
