# Voice-Enabled AI Assistant Demo

**Pipeline:** Audio -> STT (Whisper) -> LLM (GPT-4o-mini) -> TTS (OpenAI) -> Audio

In [3]:
import os
from openai import OpenAI
from dotenv import load_dotenv
from IPython.display import Audio, display

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
print("OpenAI client initialized")

OpenAI client initialized


In [4]:
# 1. STT - Speech to Text
def transcribe_audio(audio_file_path: str) -> str:
    with open(audio_file_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text"
        )
    return transcript

In [5]:
# 2. LLM Processing
def process_with_llm(user_text: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful customer service assistant. Keep responses concise (2-3 sentences)."},
            {"role": "user", "content": user_text}
        ],
        max_tokens=150
    )
    return response.choices[0].message.content

In [None]:
#def rag_pipeline():

In [6]:
# 3. TTS - Text to Speech
def text_to_speech(text: str, output_path: str = "response.mp3") -> str:
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text
    )
    response.stream_to_file(output_path)
    return output_path

In [7]:
# Full Pipeline
def voice_assistant(audio_path: str = None, text_input: str = None):
    # Step 1: STT or use text directly
    if audio_path:
        user_text = transcribe_audio(audio_path)
    else:
        user_text = text_input
    print(f"User: {user_text}")
    
    # Step 2: LLM
    response_text = process_with_llm(user_text)
    print(f"Assistant: {response_text}")
    
    # Step 3: TTS
    audio_file = text_to_speech(response_text)
    display(Audio(audio_file, autoplay=True))
    
    return {"user": user_text, "assistant": response_text}

In [8]:
# Test with text
voice_assistant(text_input="What are your store hours?")

User: What are your store hours?
Assistant: Our store hours are Monday to Friday from 9 AM to 8 PM, and Saturday to Sunday from 10 AM to 6 PM. If you need more specific information, please check our website or contact us directly.


  response.stream_to_file(output_path)


{'user': 'What are your store hours?',
 'assistant': 'Our store hours are Monday to Friday from 9 AM to 8 PM, and Saturday to Sunday from 10 AM to 6 PM. If you need more specific information, please check our website or contact us directly.'}

In [None]:
# Test with audio file (when available)
# voice_assistant(audio_path="recording.wav")