# Audio Generation Notebook, Spring Rotation 2025, SALT Lab
Steven Dillmann, Stanford University, stevendi@stanford.edu

In [1]:
# External imports
import os
from dotenv import load_dotenv
import openai
import base64
import IPython.display as ipd


import numpy as np
# NOTE(review): re-creates the `np.Infinity` alias, which NumPy 2.0 removed.
# Presumably one of the packages imported below still references it —
# TODO confirm which one and drop this shim once that dependency is updated.
np.Infinity = np.inf

# Internal imports
from utils.voice_changer import VoiceChanger
from utils.word_replacer import WordReplacer
from utils.contrastive_replies_builder import ContrastiveRepliesBuilder

# API Keys
# load_dotenv() runs first so that OPENAI_API_KEY can be supplied through a
# local .env file (presumably — confirm a .env is expected next to the notebook).
# os.getenv returns None when the variable is unset; nothing here checks for
# that, so a missing key only surfaces on the first API call.
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_key



ModuleNotFoundError: No module named 'utils'

In [None]:
def prompt_and_speak(input, instructions, voice='alloy', speed = 1, output_file='output.wav', play_audio=True):
    """
    Sends text to the OpenAI TTS API, saves the audio as WAV and optionally plays it.

    Args:
        input (str): Text to convert to speech.
        instructions (str): Free-form delivery guidance (emotion, stress, pacing, ...)
            forwarded to the API as `instructions`.
        voice (str): Voice to use. Options include 'alloy', 'ash', 'ballad', 'coral', 'echo', 'fable', 'onyx', 'nova', 'sage', 'shimmer', 'verse'.
        speed (float): Speech speed multiplier. Only forwarded to the API when it
            differs from 1, so calls that rely on the default send the same
            request as before.
        output_file (str): Path to save the resulting audio.
        play_audio (bool): If True, return an audio player for the saved file.

    Returns:
        IPython.display.Audio for `output_file` when `play_audio` is True
        (make the call the last expression of a cell to render it), otherwise None.
    """
    request = {
        "model": "gpt-4o-mini-tts",
        "voice": voice,
        "input": input,
        "instructions": instructions,
        "response_format": "wav",
    }
    # Previously `speed` was accepted but silently dropped.
    # NOTE(review): confirm the selected model honours `speed`; if it does not,
    # the API error is still preferable to silently returning unmodified audio.
    if speed != 1:
        request["speed"] = speed

    response = openai.audio.speech.create(**request)

    # Save audio to file
    with open(output_file, "wb") as f:
        f.write(response.content)

    if play_audio:
        return ipd.Audio(output_file)
    return None
    
# Smoke test: a two-sentence input whose instructions request a different
# emotion for each sentence (friendly whisper, then angry shouting).
text = "Hello Steven, your voice assistant is now active. But I am also very angry at the world."
instructions = "Emotion Sentence 1: friendly and whispering, Emotion Sentence 2: Angry and very loud shouting."
prompt_and_speak(text, instructions, voice='ash', output_file='output.wav', play_audio=True)

# Voice Affect: Energetic and animated; dynamic with variations in pitch and tone.

# Tone: Excited and enthusiastic, conveying an upbeat and thrilling atmosphere. 

# Pacing: Rapid delivery when describing the game or the key moments (e.g., "an overtime thriller," "pull off an unbelievable win") to convey the intensity and build excitement.

# Slightly slower during dramatic pauses to let key points sink in.

# Emotion: Intensely focused, and excited. Giving off positive energy.

# Personality: Relatable and engaging. 

# Pauses: Short, purposeful pauses after key moments in the game.


# Tone: Sarcastic, disinterested, and melancholic, with a hint of passive-aggressiveness.

# Emotion: Apathy mixed with reluctant engagement.

# Delivery: Monotone with occasional sighs, drawn-out words, and subtle disdain, evoking a classic emo teenager attitude.

# Delivery: Exaggerated and theatrical, with dramatic pauses, sudden outbursts, and gleeful cackling.

# Voice: High-energy, eccentric, and slightly unhinged, with a manic enthusiasm that rises and falls unpredictably.

# Tone: Excited, chaotic, and grandiose, as if reveling in the brilliance of a mad experiment.

# Pronunciation: Sharp and expressive, with elongated vowels, sudden inflections, and an emphasis on big words to sound more diabolical.

# Voice: Laid-back, mellow, and effortlessly cool, like a surfer who's never in a rush.

# Tone: Relaxed and reassuring, keeping things light even when the customer is frustrated.

# Speech Mannerisms: Uses casual, friendly phrasing with surfer slang like dude, gnarly, and boom to keep the conversation chill.

# Pronunciation: Soft and drawn-out, with slightly stretched vowels and a naturally wavy rhythm in speech.

# Tempo: Slow and easygoing, with a natural flow that never feels rushed, creating a calming effect.

# Affect: A gentle, curious narrator with a British accent, guiding a magical, child-friendly adventure through a fairy tale world.

# Tone: Magical, warm, and inviting, creating a sense of wonder and excitement for young listeners.

# Pacing: Steady and measured, with slight pauses to emphasize magical moments and maintain the storytelling flow.

# Emotion: Wonder, curiosity, and a sense of adventure, with a lighthearted and positive vibe throughout.

# Pronunciation: Clear and precise, with an emphasis on storytelling, ensuring the words are easy to follow and enchanting to listen to.

import whisper_timestamped
from pydub import AudioSegment

def split_audio_at_fullstop(
    input_wav_path,
    output_path_part1="output_part1.wav",
    output_path_part2="output_part2.wav",
    fullstop_index=1,  # which full stop to split at (1 = first, 2 = second, etc.)
    punctuation=".",  # mark that counts as a sentence end; pass "?" to split after a question
):
    """
    Splits an audio file at the end of the word carrying the `fullstop_index`th
    `punctuation` mark in the Whisper transcription of the audio.

    Args:
        input_wav_path (str): Path to the input WAV file.
        output_path_part1 (str): Path to save the first audio segment.
        output_path_part2 (str): Path to save the second audio segment.
        fullstop_index (int): Which occurrence to split at (1 = first).
        punctuation (str): Text looked for inside each transcribed word to
            detect a sentence end. Defaults to a full stop.

    Returns:
        tuple[str, str]: (output_path_part1, output_path_part2).

    Raises:
        ValueError: If the transcription contains fewer than `fullstop_index`
            words carrying `punctuation`.
    """
    # Load Whisper-timestamped model and transcribe
    model = whisper_timestamped.load_model("tiny")
    result = whisper_timestamped.transcribe(model, input_wav_path)

    # The transcription may be broken into several segments; collect the words
    # of all of them so sentence ends after the first segment are not missed.
    words = [
        word
        for segment in result["segments"]
        for word in segment.get("words", [])
    ]

    # Words that close a sentence, in spoken order
    sentence_ends = [word for word in words if punctuation in word["text"]]
    if fullstop_index < 1 or fullstop_index > len(sentence_ends):
        raise ValueError(f"No {punctuation!r} found at position {fullstop_index}")

    # Word timestamps are in seconds; pydub slices in milliseconds
    split_time_ms = int(sentence_ends[fullstop_index - 1]["end"] * 1000)

    # Load and split the audio
    audio = AudioSegment.from_wav(input_wav_path)
    audio_part1 = audio[:split_time_ms]
    audio_part2 = audio[split_time_ms:]

    # Export both parts
    audio_part1.export(output_path_part1, format="wav")
    audio_part2.export(output_path_part2, format="wav")

    return output_path_part1, output_path_part2

from pydub import AudioSegment
import whisper_timestamped

def _normalize_token(token):
    """Lower-case a token and strip surrounding sentence punctuation so transcript and Whisper words compare equal."""
    return token.lower().strip(".,!?")


def _last_token_of_nth_match(tokens, split_on, split_index):
    """Return the index of the transcript token that ends the N-th occurrence of `split_on` (a mark, word or phrase)."""
    # A phrase is matched token by token; a single mark/word is a one-token phrase.
    phrase = split_on.split() or [split_on]
    span = len(phrase)
    seen = 0
    for start in range(len(tokens) - span + 1):
        window = tokens[start:start + span]
        # Substring test per token, so "." matches "money." and "it" matches "it."
        if all(part in token for part, token in zip(phrase, window)):
            seen += 1
            if seen == split_index:
                return start + span - 1
    raise ValueError(f"'{split_on}' not found {split_index} times in transcript.")


def split_audio_using_transcript(
    input_wav_path,
    full_transcript,
    split_on=".",  # or "?"/"She just borrowed it"
    split_index=1,
    output_path_part1="output_part1.wav",
    output_path_part2="output_part2.wav"
):
    """
    Splits an audio file right before the word that follows the N-th `split_on`
    occurrence in the transcript.

    The split position is located in `full_transcript`; the word following it is
    then looked up in the Whisper word timestamps, and its start time is used as
    the cut point.

    Args:
        input_wav_path (str): Path to the input audio file.
        full_transcript (str): Full transcript of the audio (must match timing).
        split_on (str): The token/punctuation or phrase to split after (e.g., '.', '?', or a word).
            A multi-word phrase is matched against consecutive transcript words.
        split_index (int): The N-th occurrence to split after (1 = first).
        output_path_part1 (str): Output path for the first audio segment.
        output_path_part2 (str): Output path for the second audio segment.

    Returns:
        tuple[str, str]: (output_path_part1, output_path_part2).

    Raises:
        ValueError: If `split_on` does not occur `split_index` times in the
            transcript, or if no word follows the split position / that word
            cannot be found in the timestamped transcription.
    """
    # Load audio transcription with word-level timestamps
    model = whisper_timestamped.load_model("tiny")
    result = whisper_timestamped.transcribe(model, input_wav_path)

    # Collect words from every segment: the word after a sentence boundary
    # frequently lives in a later segment than the sentence it follows.
    words = [
        word
        for segment in result["segments"]
        for word in segment.get("words", [])
    ]

    # Find split position in transcript (last token of the Nth `split_on` match)
    transcript_tokens = full_transcript.split()
    split_token_idx = _last_token_of_nth_match(transcript_tokens, split_on, split_index)

    # The cut happens at the start of the word right after the split
    next_idx = split_token_idx + 1
    next_word = (
        _normalize_token(transcript_tokens[next_idx])
        if next_idx < len(transcript_tokens)
        else None
    )
    if not next_word:
        raise ValueError("Could not find matching word in timestamped transcript.")

    # The same word may occur several times (e.g. "She ... . She ..."), so take
    # the occurrence whose position is closest to where the transcript expects
    # it instead of the first one in the audio.
    candidates = [
        position
        for position, word in enumerate(words)
        if _normalize_token(word["text"]) == next_word
    ]
    if not candidates:
        raise ValueError("Could not find matching word in timestamped transcript.")
    best = min(candidates, key=lambda position: abs(position - next_idx))

    # Word timestamps are in seconds; pydub slices in milliseconds
    split_time_ms = int(words[best]["start"] * 1000)

    # Split and export
    audio = AudioSegment.from_wav(input_wav_path)
    audio[:split_time_ms].export(output_path_part1, format="wav")
    audio[split_time_ms:].export(output_path_part2, format="wav")

    return output_path_part1, output_path_part2



Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



## Lexical Stress Shift

In [4]:
# Voice shared by the example cells of this section (they pass voice=voice).
voice = 'ash'

In [5]:
# Contrast on the verbs: stress STEAL vs. BORROWED.
text = "She didn't steal the money. She just borrowed it."
instructions = "Stress on STEAL and BORROWED to convey the contrast between the two actions. "
prompt_and_speak(text, instructions, voice=voice, output_file='output.wav', play_audio=True)

In [12]:
# Contrast on the objects: stress MONEY vs. CAR.
text = "She didn't steal the money. Much worse, she stole the car."
# Typos fixed in the prompt sent to the model ("still" -> "steal", doubled "the").
instructions = "Stress the word MONEY and CAR to indicate that she didn't steal the MONEY but much worse she stole the CAR."
prompt_and_speak(text, instructions, voice=voice, output_file='output.wav', play_audio=True)


In [28]:
# Contrast on the subjects: stress SHE vs. THEY.
text = "She didn't steal the money. They did."
instructions = "Stress the word SHE and THEY to convey that SHE didn't steal it but THEY did."
prompt_and_speak(text, instructions, voice=voice, output_file='output.wav', play_audio=True)

In [29]:
# Split example
# Cuts output.wav after the first "." of the transcript. The transcript passed
# here must be the text of whichever cell last wrote output.wav (the cell
# directly above uses the same sentence). The two halves go to the function's
# default paths, output_part1.wav and output_part2.wav; the first is played here.
split_audio_using_transcript(
    input_wav_path="output.wav",
    full_transcript="She didn't steal the money. They did.",
    split_on=".",
    split_index=1
)
ipd.Audio("output_part1.wav")

Detected language: English


100%|██████████| 240/240 [00:00<00:00, 1081.81frames/s]


In [30]:
# Play the second half; output_part2.wav is the default second output path of the split functions.
ipd.Audio('output_part2.wav')

## Vocal Mode

In [None]:
# Voice shared by the example cells of this section (they pass voice=voice).
voice = 'ash'

In [None]:
# Same opening question as the next two cells, delivered angrily (shouting).
text = "Why did you do that? You're such an idiot! "
instructions = "Emotion: Angry, Delivery: Shouting"
prompt_and_speak(text, instructions, voice=voice, output_file='output.wav', play_audio=True)

In [None]:
# Same opening question as the neighbouring cells, delivered while laughing.
text = "Why did you do that? You look so funny now."
instructions = "Emotion: Funny and kind and cute and laughing, Delivery: Can't even say the words without laughing"
prompt_and_speak(text, instructions, voice=voice, output_file='output.wav', play_audio=True)

In [None]:
# Same opening question as the two cells above, delivered sadly (crying).
text = "Why did you do that? You knew how much it meant to me."
instructions = "Emotion: Sad, Delivery: Crying"
prompt_and_speak(text, instructions, voice=voice, output_file='output.wav', play_audio=True)