In [1]:
import os
import numpy as np
import librosa
import torch
import cv2
from sentence_transformers import SentenceTransformer
from torchvision import transforms
from PIL import Image
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams, CollectionConfig
import clip

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
qdrant = QdrantClient('localhost', port=6333)

collection_name = "video_features"
if not qdrant.collection_exists(collection_name):
   qdrant.create_collection(
      collection_name=collection_name,
      vectors_config={
        "title_vector": VectorParams(size=384, distance=Distance.COSINE),
        "image_vector": VectorParams(size=512, distance=Distance.COSINE),
        "audio_vector": VectorParams(size=128, distance=Distance.COSINE),
    }
   )

device = "cuda" if torch.cuda.is_available() else "cpu"

text_model = SentenceTransformer("all-MiniLM-L6-v2")
clip_model, preprocess = clip.load("ViT-B/32", device=device)

In [20]:
# Function to extract text embeddings (e.g., using filename as text)
def extract_text_embedding(text):
    """Generate text vector using Sentence Transformers"""
    return text_model.encode(text).tolist()

# Function to extract video frame embeddings
def extract_video_embedding(video_path, num_frames=5):
    """Extracts frames from a video and computes an average embedding"""
    
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    for i in np.linspace(0, frame_count - 1, num_frames).astype(int):  # Sample evenly spaced frames
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if not ret:
            continue
        
        # Convert to PIL image and preprocess
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        preprocess = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ])
        image_tensor = preprocess(image).unsqueeze(0).to(device)
        
        # Encode using CLIP
        with torch.no_grad():
            image_features = clip_model.encode_image(image_tensor).cpu().numpy().flatten()
        
        frames.append(image_features)

    cap.release()
    
    if frames:
        return np.mean(frames, axis=0).tolist()  # Average frame embeddings
    else:
        return [0] * 512  # Default vector if no frames extracted

# Function to extract audio embeddings
def extract_audio_embedding(video_path):
    """Extracts audio from video and computes an MFCC-based embedding"""
    y, sr = librosa.load(video_path, sr=16000, duration=10)  # Load first 10 seconds
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=128)
    audio_vector = np.mean(mfcc, axis=1)  # Take mean across time

    return audio_vector.tolist()

# Function to process and store in Qdrant
def process_video(video_path):
    """Extract features and store them in Qdrant"""
    video_id = os.path.basename(video_path).split(".")[0]  # Use filename as ID
    video_title = os.path.basename(video_path)  # Use filename as title

    # Extract features
    title_vector = extract_text_embedding(video_title)
    video_vector = extract_video_embedding(video_path)
    audio_vector = extract_audio_embedding(video_path)

    # Store in Qdrant
    qdrant.upsert(
        collection_name=collection_name,
        points=[
            PointStruct(
                id=int(hash(video_id) % 1_000_000),  # Unique but simple ID
                vector={
                    "title_vector": title_vector,
                    "video_vector": video_vector,
                    "audio_vector": audio_vector
                },
                payload={"title": video_title}  # Store title as metadata
            )
        ]
    )

    print(f"✅ Processed and stored: {video_title}")

In [21]:
video_files = [os.path.join('data', 'video', filename) for filename in os.listdir('data/video')]
video_files

['data\\video\\As It Was (Harry Styles) - NELL X Sungha Jung [O9-CQndjtEQ].mp4',
 'data\\video\\Classical Gas🔥 [cx6Nva2yUjk].mp4',
 "data\\video\\Don't Stop Me Now (Queen) - Sungha Jung [pqQf1uD5m90].mp4",
 'data\\video\\Flaming (With Kotaro Oshio) - Sungha Jung (Official Music Video) [tJIkyeTqExE].mp4',
 'data\\video\\Seventh #9 - Sungha Jung [i3-mzP_KYVc].mp4',
 'data\\video\\Sukidakara (Yuika) [-Iz-dnVCfto].mp4']

In [22]:
for video_file in video_files:
    process_video(video_file)

  y, sr = librosa.load(video_path, sr=16000, duration=10)  # Load first 10 seconds
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


✅ Processed and stored: As It Was (Harry Styles) - NELL X Sungha Jung [O9-CQndjtEQ].mp4


  y, sr = librosa.load(video_path, sr=16000, duration=10)  # Load first 10 seconds
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


✅ Processed and stored: Classical Gas🔥 [cx6Nva2yUjk].mp4
✅ Processed and stored: Don't Stop Me Now (Queen) - Sungha Jung [pqQf1uD5m90].mp4
✅ Processed and stored: Flaming (With Kotaro Oshio) - Sungha Jung (Official Music Video) [tJIkyeTqExE].mp4
✅ Processed and stored: Seventh #9 - Sungha Jung [i3-mzP_KYVc].mp4
✅ Processed and stored: Sukidakara (Yuika) [-Iz-dnVCfto].mp4


In [3]:

def fetch_all_vectors():
    result = qdrant.scroll(
        collection_name=collection_name,
        limit=100,  # Adjust to fetch more points
        with_vectors=True,   # Include stored vectors
        with_payload=True    # Include metadata (e.g., title)
    )
    
    for point in result[0]:  # Qdrant returns (data, next_offset)
        print(f"🆔 ID: {point.id}")
        print(f"📌 Title: {point.payload.get('title', 'No Title')}")
        print(f"🖼️ Title Vector (First 5): {point.vector.get('title_vector')[:5]}...")
        print(f"📹 Video Vector (First 5): {point.vector.get('video_vector')[:5]}...")
        print(f"🎵 Audio Vector (First 5): {point.vector.get('audio_vector')[:5]}...")
        print("=" * 60)


fetch_all_vectors()

In [10]:
def find_similar_videos(video_id, top_k=5):
    # Fetch the selected video's vector
    response = qdrant.retrieve(collection_name, [video_id], with_vectors=True)
    if not response:
        print("⚠️ Video not found in the collection!")
        return
    
    video_vector = response[0].vector["video_vector"]  # Extract video vector

    # Search for top-k similar videos
    search_results = qdrant.search(
        collection_name=collection_name,
        query_vector=("video_vector", video_vector),
        limit=top_k + 1,  # +1 to exclude itself
    )

    # Print the results
    print(f"🎯 Searching for videos similar to Video ID {video_id}\n")
    for result in search_results:
        if result.id == video_id:  # Skip the query video itself
            continue
        print(f"🆔 Video ID: {result.id}")
        print(f"🔍 Similarity Score: {result.score:.4f}")
        print(f"📌 Title: {result.payload.get('title', 'No Title')}")
        print("-" * 50)


find_similar_videos(video_id=963694, top_k=3)  # Replace with a real video ID

🎯 Searching for videos similar to Video ID 963694

🆔 Video ID: 760266
🔍 Similarity Score: 0.9537
📌 Title: Sukidakara (Yuika) [-Iz-dnVCfto].mp4
--------------------------------------------------
🆔 Video ID: 171697
🔍 Similarity Score: 0.9514
📌 Title: Seventh #9 - Sungha Jung [i3-mzP_KYVc].mp4
--------------------------------------------------
🆔 Video ID: 284614
🔍 Similarity Score: 0.9383
📌 Title: Don't Stop Me Now (Queen) - Sungha Jung [pqQf1uD5m90].mp4
--------------------------------------------------


  search_results = qdrant.search(
