In [1]:
from qdrant_client import QdrantClient

# Open the Qdrant database stored at this path. The path is relative, so it is
# resolved against the notebook's current working directory.
client = QdrantClient(path="../video_embeddings.db")

In [2]:
# Fetch the description Qdrant holds for the "segment_video_embeddings" collection
# and print it as a quick sanity check before running any queries against it.
collection = client.get_collection("segment_video_embeddings")
print(collection)



In [3]:
import asyncio
import base64
import json
from pathlib import Path
from typing import Literal

import boto3


def encode_b64(file_path) -> str:
    """Return the contents of ``file_path`` as a base64-encoded string.

    Args:
        file_path: Location of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to ``str`` as UTF-8.
    """
    with open(file_path, "rb") as stream:
        raw_bytes = stream.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


class Nova2OmniEmbeddings:
    """Async helper that requests embeddings from a Nova model through Bedrock.

    Each ``embed_*`` coroutine builds a ``SINGLE_EMBEDDING`` request for one
    modality (text, video or audio) and returns the normalised response
    produced by :meth:`_invoke_model`.
    """

    # Bedrock model identifier used for every request made by this class.
    MODEL_ID = "amazon.nova-2-multimodal-embeddings-v1:0"

    # Values accepted for ``embedding_purpose``; shared by all ``embed_*`` methods
    # so the three signatures cannot drift apart.
    EmbeddingPurpose = Literal[
        "GENERIC_INDEX",
        "TEXT_RETRIEVAL",
        "IMAGE_RETRIEVAL",
        "VIDEO_RETRIEVAL",
        "AUDIO_RETRIEVAL",
        "DOCUMENT_RETRIEVAL",
        "GENERIC_RETRIEVAL",
        "CLASSIFICATION",
        "CLUSTERING",
    ]

    def __init__(self, region_name: str = "us-east-1"):
        """Create the ``bedrock-runtime`` client used for all requests.

        Args:
            region_name: AWS region of the Bedrock endpoint. Defaults to
                ``"us-east-1"``, the value that was previously hard-coded.
        """
        self.client = boto3.client(
            service_name="bedrock-runtime",
            region_name=region_name,
        )

    @staticmethod
    def _media_format(media_path: Path) -> str:
        """Return the file extension without the dot, lowercased (``a.MP4`` -> ``mp4``)."""
        return media_path.suffix[1:].lower()

    async def _invoke_model(self, request_body: dict) -> dict:
        """Send ``request_body`` to the model and normalise the response.

        The boto3 call is synchronous, so it runs in a worker thread via
        ``asyncio.to_thread`` to avoid blocking the event loop.

        Args:
            request_body: JSON-serialisable request payload.

        Returns:
            A dictionary with the following structure (any additional keys the
            service puts on an embedding entry are passed through unchanged):
            {
                "request_id": str | None,
                "embeddings": [
                    {
                        "embedding_type": str,
                        "embedding": list[float],
                    }
                ]
            }
        """
        response = await asyncio.to_thread(
            self.client.invoke_model,
            body=json.dumps(request_body),
            modelId=self.MODEL_ID,
            accept="application/json",
            contentType="application/json",
        )
        # Tolerate a response without metadata instead of failing on ``None.get``.
        request_id = response.get("ResponseMetadata", {}).get("RequestId")
        response_body = json.loads(response.get("body").read())

        results = response_body["embeddings"]
        for result in results:
            # Rename the camelCase key in place so other keys on the entry survive.
            result["embedding_type"] = result.pop("embeddingType")

        return {
            "request_id": request_id,
            "embeddings": results,
        }

    async def embed_text(
        self,
        text: str,
        embedding_purpose: EmbeddingPurpose = "GENERIC_INDEX",
        embedding_dimension: int = 3072,
        truncation_mode: Literal["START", "END", "NONE"] = "NONE",
    ):
        """Embed a piece of text.

        Args:
            text: The text to embed.
            embedding_purpose: Sent as ``embeddingPurpose`` in the request.
            embedding_dimension: Sent as ``embeddingDimension`` in the request.
            truncation_mode: Sent as the text's ``truncationMode``.

        Returns:
            The normalised response described in :meth:`_invoke_model`.
        """
        request_body = {
            "taskType": "SINGLE_EMBEDDING",
            "singleEmbeddingParams": {
                "embeddingPurpose": embedding_purpose,
                "embeddingDimension": embedding_dimension,
                "text": {"truncationMode": truncation_mode, "value": text},
            },
        }
        return await self._invoke_model(request_body)

    async def embed_video(
        self,
        video_path: str,
        embedding_purpose: EmbeddingPurpose = "GENERIC_INDEX",
        embedding_dimension: int = 3072,
        embedding_mode: Literal[
            "AUDIO_VIDEO_COMBINED", "AUDIO_VIDEO_SEPARATE"
        ] = "AUDIO_VIDEO_COMBINED",
    ):
        """Embed a video file.

        The video format sent to the model is the file's extension, lowercased.

        Args:
            video_path: Path of the video file; its bytes are sent base64-encoded.
            embedding_purpose: Sent as ``embeddingPurpose`` in the request.
            embedding_dimension: Sent as ``embeddingDimension`` in the request.
            embedding_mode: Literal["AUDIO_VIDEO_COMBINED", "AUDIO_VIDEO_SEPARATE"]
                - "AUDIO_VIDEO_COMBINED" - Will produce a single embedding combining both audible and visual content.
                - "AUDIO_VIDEO_SEPARATE" - Will produce two embeddings, one for the audible content and one for the visual content.

        Returns:
            The normalised response described in :meth:`_invoke_model`.
        """
        video_path = Path(video_path)
        video_format = self._media_format(video_path)
        # Reading and encoding the file is blocking work; keep it off the event loop.
        video_b64 = await asyncio.to_thread(encode_b64, video_path)

        request_body = {
            "taskType": "SINGLE_EMBEDDING",
            "singleEmbeddingParams": {
                "embeddingPurpose": embedding_purpose,
                "embeddingDimension": embedding_dimension,
                "video": {
                    "format": video_format,
                    "embeddingMode": embedding_mode,
                    "source": {"bytes": video_b64},
                },
            },
        }
        return await self._invoke_model(request_body)

    async def embed_audio(
        self,
        audio_path: str,
        embedding_purpose: EmbeddingPurpose = "GENERIC_INDEX",
        embedding_dimension: int = 3072,
    ):
        """Embed an audio file.

        The audio format sent to the model is the file's extension, lowercased.

        Args:
            audio_path: Path of the audio file; its bytes are sent base64-encoded.
            embedding_purpose: Sent as ``embeddingPurpose`` in the request.
            embedding_dimension: Sent as ``embeddingDimension`` in the request.

        Returns:
            The normalised response described in :meth:`_invoke_model`.
        """
        audio_path = Path(audio_path)
        audio_format = self._media_format(audio_path)
        # Reading and encoding the file is blocking work; keep it off the event loop.
        audio_b64 = await asyncio.to_thread(encode_b64, audio_path)

        request_body = {
            "taskType": "SINGLE_EMBEDDING",
            "singleEmbeddingParams": {
                "embeddingPurpose": embedding_purpose,
                "embeddingDimension": embedding_dimension,
                "audio": {"format": audio_format, "source": {"bytes": audio_b64}},
            },
        }
        return await self._invoke_model(request_body)


In [4]:
# Instantiate the embedding wrapper; its constructor creates the Bedrock runtime client.
embed_model = Nova2OmniEmbeddings()

In [8]:
# Alternative query (disabled): embed an audio clip instead of a text prompt.
# query_audio_path = "../datasets/explore/songs/renwoxing/clips_and_lyrics/0032_117.590-122.310.wav"

# rst = await embed_model.embed_audio(query_audio_path, embedding_purpose="VIDEO_RETRIEVAL")

# Embed the text prompt with the VIDEO_RETRIEVAL purpose, since the vector is used
# below to search a collection of video segment embeddings.
rst = await embed_model.embed_text("mouse praying", embedding_purpose="VIDEO_RETRIEVAL")

# The response holds a list of embeddings; use the vector of the first entry as the query.
query = rst["embeddings"][0]["embedding"]

In [9]:
# Search the video-segment collection with the query vector, keeping at most 10 results.
hits = client.query_points(
    "segment_video_embeddings",
    query=query,
    limit=10,
)

In [10]:
# Print each returned point's payload (segment id and path) followed by its score,
# in the order Qdrant returned them.
for hit in hits.points:
    print(hit.payload, hit.score)

{'segment_id': 'video_1-S1940E01-Scene-043', 'segment_path': 'datasets/explore/videos/video_1/S1940E01-Scene-043.mp4'} 0.3810055213929495
{'segment_id': 'video_3-Tom and Jerry (1940) - S1950E15 - Nit-Witty Kitty (1080p AMZN WEB-DL x265 Ghost)-Scene-094', 'segment_path': 'datasets/explore/videos/video_3/Tom and Jerry (1940) - S1950E15 - Nit-Witty Kitty (1080p AMZN WEB-DL x265 Ghost)-Scene-094.mp4'} 0.3689353452822861
{'segment_id': 'video_1-S1940E01-Scene-056', 'segment_path': 'datasets/explore/videos/video_1/S1940E01-Scene-056.mp4'} 0.3684040031310308
{'segment_id': 'video_3-Tom and Jerry (1940) - S1950E15 - Nit-Witty Kitty (1080p AMZN WEB-DL x265 Ghost)-Scene-079', 'segment_path': 'datasets/explore/videos/video_3/Tom and Jerry (1940) - S1950E15 - Nit-Witty Kitty (1080p AMZN WEB-DL x265 Ghost)-Scene-079.mp4'} 0.3679864581110246
{'segment_id': 'video_1-S1940E01-Scene-076', 'segment_path': 'datasets/explore/videos/video_1/S1940E01-Scene-076.mp4'} 0.36663800933535273
{'segment_id': 'video