# Multimodal demo

This is an example of how to simulate a video- and audio-aware model using existing LLM vision models (that take text and images as input, and generate text as output).

In [1]:
import os
from pathlib import Path

import dotenv
from openai import OpenAI

from media_extractor import split_video
import datauri

In [11]:
# Load OpenAI API key from .env file
dotenv.load_dotenv()
if os.environ.get("API_KEY") is None:
    raise ValueError("API_KEY not found in .env file")

key_val = os.getenv("API_KEY")
client = OpenAI(api_key = key_val)

This is the input video that we'll turn into the user prompt.

In [3]:
video_file = "input.mp4"

from IPython.display import Video
Video(video_file, width=320)

At the time of this writing, the GPT-4o API doesn't directly support video or audio input. Instead, we'll decode the video into frames and feed them to the model as images, and decode the audio into text and feed it to the model as text.

In [4]:
audio_uri, image_uris = split_video(video_file)
audio_uri[:50]

'data:audio/mpeg;base64,SUQzBAAAAAAAf1RYWFgAAAASAAA'

Decode the audio file into text, using OpenAI's `whisper-1` model. The result will serve as the text prompt for the LLM.

In [5]:
with datauri.as_tempfile(audio_uri) as audio_file:
    transcription = client.audio.transcriptions.create(
        model="whisper-1", file=Path(audio_file),
        prompt = "Transcribe it to English Text"
        
    )

user_prompt = transcription.text
user_prompt

transcription

Transcription(text="What am I supposed to do? Am I supposed to chase after them? How am I gonna look after them? I've got no legs!")

In [6]:
'''user_prompt = f"Translate the following English text to Chinese: {transcription.text}"

# Send the prompt to the OpenAI language model to translate since "translations" class isn't working in the cell
# above
translation = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": user_prompt}],
    max_tokens=50
)

user_prompt = translation.choices[0].message.content'''

'user_prompt = f"Translate the following English text to Chinese: {transcription.text}"\n\n# Send the prompt to the OpenAI language model to translate since "translations" class isn\'t working in the cell\n# above\ntranslation = client.chat.completions.create(\n    model="gpt-3.5-turbo",\n    messages=[{"role": "user", "content": user_prompt}],\n    max_tokens=50\n)\n\nuser_prompt = translation.choices[0].message.content'

We're ready to talk to the LLM: use the text and images as input, and get generated text back.

In [7]:
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                *[
                    {
                        "type": "image_url",
                        "image_url": {"url": image_uri, "detail": "auto"},
                    }
                    for image_uri in image_uris
                ],
            ],
        },
        {
            "role": "system",
            "content": Path("system_prompt.txt").read_text(),
        },
    ],
)
response_text = response.choices[0].message.content
response_text

"Hey there! It looks like you're in quite the predicament in the video. If you're talking about someone or something running off, it's okay to feel stuck in the moment. Maybe you can get creative with how you look after them from the car. Can you call for help or find another way to keep an eye on things? Remember, you're not alone, and there are always options."

Use OpenAI's text-to-speech model to turn the generated text into audio.

In [8]:
audio = client.audio.speech.create(
    model="tts-1",
    voice="nova",
    input=response_text,
    response_format="mp3",
)
response_audio_uri = datauri.from_bytes(audio.read(), "audio/mpeg")

In [9]:
with datauri.as_tempfile(response_audio_uri) as response_audio_file:
    from IPython.display import Audio
    display(Audio(response_audio_file))