In [12]:
# AudioJudge with Hugging Face Datasets Demo


In [2]:
# Import required libraries
import os
import tempfile
import soundfile as sf
from datasets import load_dataset
from IPython.display import Audio, display
import IPython.display as ipd

# Import AudioJudge
from audiojudge import AudioJudge
from audiojudge.utils import AudioExample


In [None]:
# API keys are read from the environment so that real credentials never have to
# be typed into (and saved with) this notebook. Export them before starting
# Jupyter:
#   export OPENAI_API_KEY=...
#   export GOOGLE_API_KEY=...
# When a variable is unset, the original placeholder string is passed instead,
# so the constructor call is unchanged for readers who have not set any keys.
openai_api_key = os.environ.get("OPENAI_API_KEY", "your-openai-key")
google_api_key = os.environ.get("GOOGLE_API_KEY", "your-google-key")

# Initialize AudioJudge.
# `temp_audio_demo` is also the folder the cells below write their .wav files to.
judge = AudioJudge(
    openai_api_key=openai_api_key,
    google_api_key=google_api_key,
    temp_dir="temp_audio_demo",
    signal_folder="signal_audios_demo",
    cache_dir=".demo_cache"
)


In [None]:
# Load the "train" split of the speaker-comparison dataset.
# Later cells index it as ds[i]["audio_a"] / ds[i]["audio_b"], each of which
# exposes an "array" and a "sampling_rate".
dataset_name = "anonymous/paralinguistic-judge-speaker"
ds = load_dataset(dataset_name, split="train")

### Zero-shot evaluation method

In [6]:
# Example1: Same speaker
ex1_audio_a = ds[0]["audio_a"]
ex1_audio_b = ds[0]["audio_b"]

# judge_audio takes file paths, so the clips are saved as .wav files first.
# Create the output folder explicitly instead of relying on an earlier step
# having created it.
os.makedirs("temp_audio_demo", exist_ok=True)
ex1_audio_a_path = "temp_audio_demo/ex1_audio_a.wav"
ex1_audio_b_path = "temp_audio_demo/ex1_audio_b.wav"

# For each clip: show an inline player in the notebook, then write it to disk.
for clip, wav_path in (
    (ex1_audio_a, ex1_audio_a_path),
    (ex1_audio_b, ex1_audio_b_path),
):
    ipd.display(ipd.Audio(clip["array"], rate=clip["sampling_rate"]))
    sf.write(wav_path, clip["array"], clip["sampling_rate"])

# Check if the audio is the same speaker (zero-shot: no `examples` are passed)
result = judge.judge_audio(
    audio1_path=ex1_audio_a_path,
    audio2_path=ex1_audio_b_path,
    system_prompt="Are these two audio clips from the same speaker?",
    model="gpt-4o-audio-preview"
)
print(result)

Cache hit for _cached_model_call
{'success': True, 'response': "I'm unable to determine if the speakers are the same based on the audio clips.", 'model': 'gpt-4o-audio-preview', 'concatenation_method': 'examples_and_test_concatenation', 'audio1_path': 'temp_audio_demo/ex1_audio_a.wav', 'audio2_path': 'temp_audio_demo/ex1_audio_b.wav', 'instruction_path': None, 'num_examples': 0}


In [7]:
# Example2: Different speakers
ex2_audio_a = ds[1]["audio_a"]
ex2_audio_b = ds[1]["audio_b"]

# judge_audio takes file paths, so the clips are saved as .wav files first.
# Create the output folder explicitly instead of relying on an earlier step
# having created it.
os.makedirs("temp_audio_demo", exist_ok=True)
ex2_audio_a_path = "temp_audio_demo/ex2_audio_a.wav"
ex2_audio_b_path = "temp_audio_demo/ex2_audio_b.wav"

# For each clip: show an inline player in the notebook, then write it to disk.
for clip, wav_path in (
    (ex2_audio_a, ex2_audio_a_path),
    (ex2_audio_b, ex2_audio_b_path),
):
    ipd.display(ipd.Audio(clip["array"], rate=clip["sampling_rate"]))
    sf.write(wav_path, clip["array"], clip["sampling_rate"])

# Check if the audio is the same speaker (zero-shot: no `examples` are passed)
result = judge.judge_audio(
    audio1_path=ex2_audio_a_path,
    audio2_path=ex2_audio_b_path,
    system_prompt="Are these two audio clips from the same speaker?",
    model="gpt-4o-audio-preview"
)
print(result)

Cache hit for _cached_model_call
{'success': True, 'response': 'Based on the sound of the two clips, it does seem like they are from the same speaker. The voice characteristics, such as tone and pitch, appear consistent across both clips.', 'model': 'gpt-4o-audio-preview', 'concatenation_method': 'examples_and_test_concatenation', 'audio1_path': 'temp_audio_demo/ex2_audio_a.wav', 'audio2_path': 'temp_audio_demo/ex2_audio_b.wav', 'instruction_path': None, 'num_examples': 0}


### ICL + Audio Concatenation

In [8]:
# icl_examples
ex_icl1 = ds[2] # same speaker
ex_icl2 = ds[3] # same speaker
ex_icl3 = ds[4] # different speaker
ex_icl4 = ds[6] # different speaker


def _save_pair(example, stem, out_dir="temp_audio_demo"):
    """Write one dataset row's two clips to .wav files and return their paths.

    Args:
        example: A dataset row with "audio_a" and "audio_b" entries, each
            providing "array" and "sampling_rate".
        stem: File-name prefix; files are named "<stem>_audio_a.wav" and
            "<stem>_audio_b.wav".
        out_dir: Folder to write into; created if it does not exist yet.

    Returns:
        A tuple (audio_a_path, audio_b_path).
    """
    os.makedirs(out_dir, exist_ok=True)
    written = []
    for side in ("a", "b"):
        clip = example[f"audio_{side}"]
        wav_path = f"{out_dir}/{stem}_audio_{side}.wav"
        sf.write(wav_path, clip["array"], clip["sampling_rate"])
        written.append(wav_path)
    return tuple(written)


# save the audio to file in `temp_audio_demo`
# The individual path variables are kept because the next cell refers to them
# by name when building the in-context examples.
ex_icl1_audio_a_path, ex_icl1_audio_b_path = _save_pair(ex_icl1, "ex_icl1")
ex_icl2_audio_a_path, ex_icl2_audio_b_path = _save_pair(ex_icl2, "ex_icl2")
ex_icl3_audio_a_path, ex_icl3_audio_b_path = _save_pair(ex_icl3, "ex_icl3")
ex_icl4_audio_a_path, ex_icl4_audio_b_path = _save_pair(ex_icl4, "ex_icl4")

In [9]:
# create ICL examples
# The answer strings are reproduced exactly as they were; they are the `output`
# attached to each in-context example.
same_speaker_label = "Two audio clips are from the same speaker."
different_speaker_label = "Two audio clips are from the different speaker."

# One row per demonstration: (first clip, second clip, expected answer).
# The order alternates same / different speaker.
labelled_pairs = [
    (ex_icl1_audio_a_path, ex_icl1_audio_b_path, same_speaker_label),
    (ex_icl3_audio_a_path, ex_icl3_audio_b_path, different_speaker_label),
    (ex_icl2_audio_a_path, ex_icl2_audio_b_path, same_speaker_label),
    (ex_icl4_audio_a_path, ex_icl4_audio_b_path, different_speaker_label),
]

examples = [
    AudioExample(audio1_path=first_path, audio2_path=second_path, output=label)
    for first_path, second_path, label in labelled_pairs
]

In [10]:
# Example1: Same speaker
ex1_audio_a = ds[0]["audio_a"]
ex1_audio_b = ds[0]["audio_b"]

# save the audio to file in `temp_audio_demo`
# The clips are written again here so this cell does not depend on the
# zero-shot cell having been run; the folder is created if it is missing.
os.makedirs("temp_audio_demo", exist_ok=True)
ex1_audio_a_path = "temp_audio_demo/ex1_audio_a.wav"
ex1_audio_b_path = "temp_audio_demo/ex1_audio_b.wav"
for clip, wav_path in (
    (ex1_audio_a, ex1_audio_a_path),
    (ex1_audio_b, ex1_audio_b_path),
):
    sf.write(wav_path, clip["array"], clip["sampling_rate"])

# Check if the audio is the same speaker, this time passing the ICL `examples`
result = judge.judge_audio(
    audio1_path=ex1_audio_a_path,
    audio2_path=ex1_audio_b_path,
    examples=examples,
    system_prompt="Are these two audio clips from the same speaker?",
    model="gpt-4o-audio-preview"
)
print(result)

Cache hit for _cached_model_call
{'success': True, 'response': 'Based on the short duration and limited content of the second clip, it is difficult to make a definitive judgment. However, from the general tone and style, they do sound similar. So, I would say they are likely from the same speaker.', 'model': 'gpt-4o-audio-preview', 'concatenation_method': 'examples_and_test_concatenation', 'audio1_path': 'temp_audio_demo/ex1_audio_a.wav', 'audio2_path': 'temp_audio_demo/ex1_audio_b.wav', 'instruction_path': None, 'num_examples': 4}


In [11]:
# Example2: Different speakers
ex2_audio_a = ds[1]["audio_a"]
ex2_audio_b = ds[1]["audio_b"]

# save the audio to file in `temp_audio_demo`
# The clips are written again here so this cell does not depend on the
# zero-shot cell having been run; the folder is created if it is missing.
os.makedirs("temp_audio_demo", exist_ok=True)
ex2_audio_a_path = "temp_audio_demo/ex2_audio_a.wav"
ex2_audio_b_path = "temp_audio_demo/ex2_audio_b.wav"
for clip, wav_path in (
    (ex2_audio_a, ex2_audio_a_path),
    (ex2_audio_b, ex2_audio_b_path),
):
    sf.write(wav_path, clip["array"], clip["sampling_rate"])

# Check if the audio is the same speaker, this time passing the ICL `examples`
result = judge.judge_audio(
    audio1_path=ex2_audio_a_path,
    audio2_path=ex2_audio_b_path,
    examples=examples,
    system_prompt="Are these two audio clips from the same speaker?",
    model="gpt-4o-audio-preview"
)
print(result)

Cache hit for _cached_model_call
{'success': True, 'response': 'Based on the audio clips provided, it sounds like the two speakers have different vocal characteristics. Therefore, they are likely from different speakers.', 'model': 'gpt-4o-audio-preview', 'concatenation_method': 'examples_and_test_concatenation', 'audio1_path': 'temp_audio_demo/ex2_audio_a.wav', 'audio2_path': 'temp_audio_demo/ex2_audio_b.wav', 'instruction_path': None, 'num_examples': 4}
