In [1]:
from datasets import Dataset

# Specify the directory where you want to save it
save_directory = './final_video_dataset'

# Load the saved dataset
loaded_video_dataset = Dataset.load_from_disk(save_directory)

In [2]:
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
import numpy as np

# Select device ('cpu' or 'cuda' if available)
device = "cpu"  # Set to "cuda" if you want to use GPU (change to "cuda" to force GPU usage)
# Check if CUDA is available and if the user wants to use it
if torch.cuda.is_available() and device == "cuda":
    device = "cuda"
    print("Using GPU (CUDA)")
else:
    print("Using CPU")

# Load pre-trained Sentence Transformer model
model = SentenceTransformer('all-mpnet-base-v2', device=device)

# Precompute embeddings for the texts in the dataset
texts = [entry['text'] for entry in loaded_video_dataset]
#text_embeddings = model.encode(texts, show_progress_bar=True)

# If the `SentenceTransformer.encode` method doesn't show a progress bar, you can manually wrap the encoding process with tqdm
text_embeddings = []
for text in tqdm(texts, desc="Encoding texts"):
    text_embeddings.append(model.encode(text))

# Convert list to numpy array or whatever format you need
text_embeddings = np.array(text_embeddings)

Using CPU


Encoding texts: 100%|██████████| 128/128 [00:26<00:00,  4.90it/s]


In [3]:
embedding_size = model.get_sentence_embedding_dimension()

# Initialize Qdrant Store and Populate It

In [4]:
# Qdrant Vector Database
from qdrant_client import QdrantClient
from qdrant_client import models
from qdrant_client.models import CollectionStatus
from qdrant_client.models import Distance, VectorParams

In [5]:
qdrant_client = QdrantClient(location=":memory:")

In [6]:
MY_COLLECTION = "search_collection"

first_collection = qdrant_client.create_collection(
    collection_name=MY_COLLECTION,
    vectors_config=VectorParams(size=embedding_size, # Size of Snowflake Embedding Dimensions
                                distance=Distance.COSINE), # Cosine similarity for vector search
)

In [7]:
collection_info = qdrant_client.get_collection(collection_name=MY_COLLECTION)
list(collection_info)

[('status', <CollectionStatus.GREEN: 'green'>),
 ('optimizer_status', <OptimizersStatusOneOf.OK: 'ok'>),
 ('vectors_count', None),
 ('indexed_vectors_count', 0),
 ('points_count', 0),
 ('segments_count', 1),
 ('config',
  CollectionConfig(params=CollectionParams(vectors=VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=None, sharding_method=None, replication_factor=None, write_consistency_factor=None, read_fan_out_factor=None, on_disk_payload=None, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=None, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb

In [8]:
# Load pre-trained Sentence Transformer model
model = SentenceTransformer('all-mpnet-base-v2', device=device)

# Precompute embeddings for the texts in the dataset
texts = [entry['text'] for entry in loaded_video_dataset]
#text_embeddings = model.encode(texts, show_progress_bar=True)

# If the `SentenceTransformer.encode` method doesn't show a progress bar, you can manually wrap the encoding process with tqdm
text_embeddings = []
for text in tqdm(texts, desc="Encoding texts"):
    text_embeddings.append(model.encode(text))

# Convert list to numpy array or whatever format you need
text_embeddings = np.array(text_embeddings)

Encoding texts: 100%|██████████| 128/128 [00:24<00:00,  5.23it/s]


In [9]:
def get_embeddings(example):
    example['embeddings'] = model.encode(example['text'])

    return example

In [10]:
loaded_video_dataset = loaded_video_dataset.map(get_embeddings)

In [11]:
loaded_video_dataset

Dataset({
    features: ['video_id', 'start', 'end', 'segmentation_group_id', 'text', 'video_file_path', 'embeddings'],
    num_rows: 128
})

# Coerce to Qdrant Format

In [12]:
ids = list(range(0, len(loaded_video_dataset)))
print(len(ids))
vectors = loaded_video_dataset['embeddings']

128


In [13]:
payloads = []

for row in loaded_video_dataset:

    payload = {
        'video_id': row['video_id'],
        'start': row['start'],
        'end': row['end'],
        'segmentation_group_id': row['segmentation_group_id'],
        'text': row['text'],
        'video_file_path': row['video_file_path']
    }

    payloads.append(payload)

In [14]:
# Insert points into the collection
from qdrant_client.models import PointStruct
from qdrant_client.models import Batch

# Create a Batch object
my_batch = Batch(ids=ids, vectors=vectors, payloads=payloads)

first_collection = qdrant_client.upsert(
    collection_name=MY_COLLECTION,
    points=my_batch
)

In [15]:
retrieved_points = qdrant_client.scroll(collection_name=MY_COLLECTION, with_vectors=True, limit=100)

# Check if vectors are part of the payload
for point in retrieved_points[0][:10]:
    print(f"ID: {point.id}")
    print(f"Vector: {point.vector[:5]}")  # Print first 5 elements of the vector for quick check
    print(f"Payload: {point.payload}")

ID: 0
Vector: [-0.003908757120370865, 0.00175646657589823, -0.018195865675807, 0.036522842943668365, -0.0037507042288780212]
Payload: {'video_id': '9CGGh6ivg68', 'start': '00:00:02.990', 'end': '00:00:31.390', 'segmentation_group_id': 0, 'text': 'in this video I would like to start the discussion about convolutional new networks which is another architecture of uh neural networks that we are going to see specifically kind of engineered uh to um address problems that we are facing in computer vision I want to start this discussion with um just showing you a picture and uh if I ask you uh to tell me what would actually be the first object that you pay attention to then most people will probably', 'video_file_path': './video_zips/9CGGh6ivg68/chunk_1_2.99_31.39.mp4'}
ID: 1
Vector: [0.02774043008685112, -0.01656246744096279, -0.021778088063001633, 0.004310879856348038, 0.02978379838168621]
Payload: {'video_id': '9CGGh6ivg68', 'start': '00:00:31.390', 'end': '00:01:50.389', 'segmentation_gro

In [16]:
def get_query_embedding(text):
    return model.encode(text)  # Encode the question (this returns a normalized embedding)

In [17]:
# Gets closest similarity score chunks
def get_context_chunks(norm_query_embedding, num_chunks=1):

    context_chunks = qdrant_client.query_points(
                      collection_name=MY_COLLECTION,
                      query=norm_query_embedding,
                      limit=num_chunks
    ).points

    #print(context_chunks)

    return context_chunks

In [18]:
import gradio as gr
import os

# Function to get payload based on semantic similarity
def get_payload_based_on_question(question):
    question_embedding = get_query_embedding(question)  # Encode the question
    payload = get_context_chunks(question_embedding)[0].payload
    text = payload['text']
    video_file_path = payload['video_file_path']
    
    #print(video_file_path)

    return text, video_file_path

# Wrapper function that calls both video and text functions
def serve_video_and_text(question):
    # Get the row index based on the user's question
    text_string, video_file_path = get_payload_based_on_question(question)
        
    video = video_file_path
    text = text_string
    
    return gr.Video(video), gr.Markdown(text)

# Create a Gradio interface
def create_video_interface():
    with gr.Blocks() as demo:
        gr.Markdown("### Video and Text Display Based on Question")
        
        # Create a textbox for the user's question
        question_input = gr.Textbox(label="Ask a Question", placeholder="Enter a question related to the videos...")
        
        # Define the video and text outputs
        video_output = gr.Video()
        text_output = gr.Markdown()
        
        # Button to submit the question and show the relevant video and text
        gr.Button("Show Video and Text").click(
            fn=serve_video_and_text,  # Call the function to serve video and text
            inputs=[question_input],  # Input: user question
            outputs=[video_output, text_output]  # Outputs: video and text
        )
        
    demo.launch(share=True)

# Example usage
create_video_interface()

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://e1bfa1d83eb830f71a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Apply Bert Topic -> Use high noise chunk embedding to get a good expansion window start point -> 

Bert topic group each caption, use the single caption to refer to a whole bert topic group

Use high noise chunk embedding to get a good expansion window start point -> : 

query -> original chunk

Now use original chunk to examine neighboring chunks (can do it against original query also or original chunks)

chunk_t-2 - chunk_t-1 - original chunk - chunk_t1 - chunk_t2