In [1]:
import asyncio
import base64
from pathlib import Path
import boto3
import json
from typing import Literal


def encode_b64(file_path) -> str:
    """Return the contents of a file as a base64-encoded string.

    Args:
        file_path: Location of the file to read; handed directly to ``open``.

    Returns:
        The file's raw bytes, base64-encoded and decoded to ``str`` as UTF-8.
    """
    with open(file_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")


class Nova2OmniEmbeddings:
    """Async helper for requesting embeddings from the Nova multimodal embeddings model on Bedrock.

    Each ``embed_*`` coroutine builds a ``SINGLE_EMBEDDING`` request body for one
    input (text, video file, or audio file), sends it through the
    ``bedrock-runtime`` ``invoke_model`` call, and returns a dictionary of the
    form::

        {
            "request_id": str,
            "embeddings": [
                {"embedding_type": str, "embedding": list[float]},
                ...
            ],
        }

    Blocking work (the boto3 call and reading/encoding media files) is run in a
    worker thread via ``asyncio.to_thread`` so the event loop stays responsive.
    """

    # Model identifier sent with every invoke_model call.
    MODEL_ID = "amazon.nova-2-multimodal-embeddings-v1:0"

    # Accepted values for the "embeddingPurpose" request field; shared by all
    # embed_* methods instead of repeating the Literal in every signature.
    EmbeddingPurpose = Literal[
        "GENERIC_INDEX",
        "TEXT_RETRIEVAL",
        "IMAGE_RETRIEVAL",
        "VIDEO_RETRIEVAL",
        "AUDIO_RETRIEVAL",
        "DOCUMENT_RETRIEVAL",
        "GENERIC_RETRIEVAL",
        "CLASSIFICATION",
        "CLUSTERING",
    ]

    def __init__(self, region_name: str = "us-east-1"):
        """Create the ``bedrock-runtime`` client used for all requests.

        Args:
            region_name: AWS region of the Bedrock endpoint. Defaults to
                ``"us-east-1"``, the value this class previously hard-coded.
        """
        self.client = boto3.client(
            service_name="bedrock-runtime",
            region_name=region_name,
        )

    async def _invoke_model(self, request_body):
        """Send ``request_body`` to the model and normalise the response.

        Args:
            request_body: JSON-serialisable dict holding the request payload.

        Returns:
            A dictionary with the following structure:
            {
                "request_id": str,
                "embeddings": [
                    {
                        "embedding_type": str,
                        "embedding": list[float],
                    }
                ]
            }

        Raises:
            KeyError: If the response body has no ``"embeddings"`` list or an
                entry lacks ``"embeddingType"``.
            Exception: Whatever ``invoke_model`` raises (e.g. the service's
                ValidationException) propagates unchanged.
        """
        response = await asyncio.to_thread(
            self.client.invoke_model,
            # Compact JSON: pretty-printing only adds bytes to the payload.
            body=json.dumps(request_body),
            modelId=self.MODEL_ID,
            accept="application/json",
            contentType="application/json",
        )
        # Tolerate a response without metadata instead of failing on None.get.
        request_id = response.get("ResponseMetadata", {}).get("RequestId")
        response_body = json.loads(response["body"].read())

        results = response_body["embeddings"]
        for result in results:
            # Expose the type under a snake_case key for Python callers.
            result["embedding_type"] = result.pop("embeddingType")

        return {
            "request_id": request_id,
            "embeddings": results,
        }

    @staticmethod
    def _media_format(media_path: Path) -> str:
        """Derive the request ``format`` value from a file's extension.

        The extension is lower-cased so ``clip.WAV`` and ``clip.wav`` yield the
        same value.

        Raises:
            ValueError: If the path has no extension to derive a format from.
        """
        media_format = media_path.suffix[1:].lower()
        if not media_format:
            raise ValueError(
                f"Cannot infer media format: '{media_path}' has no file extension"
            )
        return media_format

    async def _load_media(self, media_path):
        """Return ``(format, base64_payload)`` for a media file.

        The format is validated first so a bad path fails before any file I/O.
        """
        path = Path(media_path)
        media_format = self._media_format(path)
        # Reading and encoding the file is blocking; keep it off the event loop.
        media_b64 = await asyncio.to_thread(encode_b64, path)
        return media_format, media_b64

    async def embed_text(
        self,
        text: str,
        embedding_purpose: EmbeddingPurpose = "GENERIC_INDEX",
        embedding_dimension: int = 3072,
        truncation_mode: Literal["START", "END", "NONE"] = "NONE",
    ):
        """Embed a text string.

        Args:
            text: The text to embed.
            embedding_purpose: Value for the request's ``embeddingPurpose`` field.
            embedding_dimension: Value for the request's ``embeddingDimension`` field.
            truncation_mode: Value for the text's ``truncationMode`` field.

        Returns:
            The normalised response dictionary described on ``_invoke_model``.
        """
        request_body = {
            "taskType": "SINGLE_EMBEDDING",
            "singleEmbeddingParams": {
                "embeddingPurpose": embedding_purpose,
                "embeddingDimension": embedding_dimension,
                "text": {"truncationMode": truncation_mode, "value": text},
            },
        }
        return await self._invoke_model(request_body)

    async def embed_video(
        self,
        video_path: str,
        embedding_purpose: EmbeddingPurpose = "GENERIC_INDEX",
        embedding_dimension: int = 3072,
        embedding_mode: Literal[
            "AUDIO_VIDEO_COMBINED", "AUDIO_VIDEO_SEPARATE"
        ] = "AUDIO_VIDEO_COMBINED",
    ):
        """Embed a video file sent inline as base64.

        Args:
            video_path: Path to the video file (``str`` or ``pathlib.Path``);
                its lower-cased extension is sent as the video ``format``.
            embedding_purpose: Value for the request's ``embeddingPurpose`` field.
            embedding_dimension: Value for the request's ``embeddingDimension`` field.
            embedding_mode: Literal["AUDIO_VIDEO_COMBINED", "AUDIO_VIDEO_SEPARATE"]
                - "AUDIO_VIDEO_COMBINED" - Will produce a single embedding combining both audible and visual content.
                - "AUDIO_VIDEO_SEPARATE" - Will produce two embeddings, one for the audible content and one for the visual content.

        Returns:
            The normalised response dictionary described on ``_invoke_model``.

        Raises:
            ValueError: If ``video_path`` has no file extension.
        """
        video_format, video_b64 = await self._load_media(video_path)

        request_body = {
            "taskType": "SINGLE_EMBEDDING",
            "singleEmbeddingParams": {
                "embeddingPurpose": embedding_purpose,
                "embeddingDimension": embedding_dimension,
                "video": {
                    "format": video_format,
                    "embeddingMode": embedding_mode,
                    "source": {"bytes": video_b64},
                },
            },
        }
        return await self._invoke_model(request_body)

    async def embed_audio(
        self,
        audio_path: str,
        embedding_purpose: EmbeddingPurpose = "GENERIC_INDEX",
        embedding_dimension: int = 3072,
    ):
        """Embed an audio file sent inline as base64.

        Args:
            audio_path: Path to the audio file (``str`` or ``pathlib.Path``);
                its lower-cased extension is sent as the audio ``format``.
            embedding_purpose: Value for the request's ``embeddingPurpose`` field.
            embedding_dimension: Value for the request's ``embeddingDimension`` field.

        Returns:
            The normalised response dictionary described on ``_invoke_model``.

        Raises:
            ValueError: If ``audio_path`` has no file extension.
        """
        audio_format, audio_b64 = await self._load_media(audio_path)

        request_body = {
            "taskType": "SINGLE_EMBEDDING",
            "singleEmbeddingParams": {
                "embeddingPurpose": embedding_purpose,
                "embeddingDimension": embedding_dimension,
                "audio": {"format": audio_format, "source": {"bytes": audio_b64}},
            },
        }
        return await self._invoke_model(request_body)

In [2]:
# Build the embeddings wrapper once; its constructor creates the boto3
# "bedrock-runtime" client that the cells below reuse.
embed_model = Nova2OmniEmbeddings()

In [7]:
# Embed a single 16 kHz mono WAV clip (per its filename) for generic indexing.
# NOTE(review): in the recorded run this call raised ValidationException from
# InvokeModel ("Invalid Input"). The filename suggests the clip spans only
# ~0.83 s (185.020-185.850) — presumably too short or otherwise unsupported;
# TODO confirm against the model user guide.
# NOTE(review): the absolute path is specific to one machine; point it at a
# shared data directory before re-running elsewhere.
rst = await embed_model.embed_audio(
    audio_path="/Users/scottcui/projects/mv_synthesis/outputs/tmp_audio/0058_185.020-185.850_16k_mono.wav",
    embedding_purpose="GENERIC_INDEX",
)

ValidationException: An error occurred (ValidationException) when calling the InvokeModel operation: Invalid Input: The input does not adhere to the expected standards. Please refer to the model user guide and adjust the input before trying again.

In [5]:
all_audio_files = list(Path("/Users/scottcui/projects/mv_synthesis/datasets/ds1/songs/counting_stars/clips_and_lyrics").glob("*.wav"))
for audio_file in all_audio_files:
    print(f"Processing {audio_file}")
    rst = await embed_model.embed_audio(
        audio_path=audio_file,
        embedding_purpose="GENERIC_INDEX",
    )


Processing /Users/scottcui/projects/mv_synthesis/datasets/ds1/songs/counting_stars/clips_and_lyrics/0043_133.440-137.840.wav
Processing /Users/scottcui/projects/mv_synthesis/datasets/ds1/songs/counting_stars/clips_and_lyrics/0079_220.540-222.480.wav
Processing /Users/scottcui/projects/mv_synthesis/datasets/ds1/songs/counting_stars/clips_and_lyrics/0044_137.840-141.280.wav
Processing /Users/scottcui/projects/mv_synthesis/datasets/ds1/songs/counting_stars/clips_and_lyrics/0066_192.830-193.860.wav
Processing /Users/scottcui/projects/mv_synthesis/datasets/ds1/songs/counting_stars/clips_and_lyrics/0051_162.610-164.720.wav
Processing /Users/scottcui/projects/mv_synthesis/datasets/ds1/songs/counting_stars/clips_and_lyrics/0027_085.750-089.760.wav
Processing /Users/scottcui/projects/mv_synthesis/datasets/ds1/songs/counting_stars/clips_and_lyrics/0012_040.670-042.580.wav
Processing /Users/scottcui/projects/mv_synthesis/datasets/ds1/songs/counting_stars/clips_and_lyrics/0035_116.750-119.450.wav


  return compile(source, filename, mode, flags,


ValidationException: An error occurred (ValidationException) when calling the InvokeModel operation: Invalid Input: The input does not adhere to the expected standards. Please refer to the model user guide and adjust the input before trying again.

In [6]:
# Embed one MP4 scene with separate audio and video embeddings.
# The path is relative, so it resolves against the notebook's working directory.
video_rst = await embed_model.embed_video(
    video_path="../datasets/explore/videos/video_1/S1940E01-Scene-055.mp4",
    embedding_purpose="GENERIC_INDEX",
    embedding_mode="AUDIO_VIDEO_SEPARATE",
)

In [7]:
# List the type of each embedding returned for the video; the recorded run
# printed AUDIO then VIDEO, i.e. one embedding per modality.
# NOTE(review): the loop variable rebinds `rst`, which earlier cells use for a
# whole response dict.
for rst in video_rst["embeddings"]:
    print(rst["embedding_type"])

AUDIO
VIDEO
