In [2]:
# Scripted HR side of the conversation, one string per turn. Later cells pass
# each entry to a text-to-speech function and save it as "<index>.mp3" /
# "<index>.wav" under hr_directory.
# NOTE(review): several literals below contain character runs that look like
# mis-decoded typographic apostrophes, dashes and ellipses - confirm the file
# encoding before re-synthesizing audio from them.
hr_lines = [
    "Hi John, thanks for joining. Please have a seat.",
    "I know this might not be an easy conversation, but I want to be direct and respectful. After a thorough review, the management has made the decision to end your employment with the company, effective immediately.",
    "Yes. I understand this may come as a shock. This decision was not taken lightly.",
    "I completely understand your concern. The decision was based on a combination of factors, including ongoing feedback regarding project deadlines, communication with the team, and alignment with company goals. This has been discussed over the past few months during your check-ins.",
    "You‚Äôre right that we didn‚Äôt issue a formal Performance Improvement Plan. However, your last two evaluations did raise several red flags. The management team discussed this and decided not to proceed with a PIP but rather make a direct decision.",
    "That was also a factor. The company is going through changes, and unfortunately, that means reducing roles in some departments.",
    "Your access to company systems will end by the end of today. You‚Äôll receive two months‚Äô severance pay, continued health benefits for 30 days, and we‚Äôll offer outplacement support if you‚Äôre interested.",
    "Yes. I‚Äôm happy to provide a neutral reference confirming your role and time here. For a more detailed recommendation, I suggest reaching out to your former manager directly.",
    "Just your laptop and access card. You can leave them with IT today or tomorrow. We‚Äôll email a checklist.",
    "I truly wish you all the best, John. If you need anything over the next few days, don‚Äôt hesitate to reach out."
]

# Scripted employee replies; the simulation cells speak employee_lines[i]
# after HR turn i, so the two lists are index-aligned.
employee_lines = [
    "Sure, thanks Mary.",
    "‚Ä¶Wait‚ÄîI'm being let go?",
    "Can I ask why? I thought my performance had been solid lately.",
    "But no one told me I was at risk of being fired. I was never formally warned.",
    "I‚Äôm still surprised. I‚Äôve been trying to improve. Is this related to the recent restructuring?",
    "So, what happens now?",
    "Will I be able to get a reference?",
    "Alright. Do I need to return anything?",
    "I see. Well‚Ä¶ this is not how I imagined today would go. But I appreciate the clarity.",
    "Thanks, Mary. I‚Äôll take some time to process this."
]
# Folders that receive the per-turn audio files (absolute Windows paths;
# later cells append "\<index>.mp3" or "\<index>.wav" to them).
hr_directory = r"E:\Desktop\AI Stack\conversation-resource\hr"
employee_directory = r"E:\Desktop\AI Stack\conversation-resource\employee"

In [4]:
import openai
import os
from dotenv import load_dotenv

# Load variables from a .env file (python-dotenv), then give the openai module
# the key stored under OPENAI_COACH_PROJECT_KEY. os.getenv returns None when
# the variable is missing, so a missing key is not reported here.
load_dotenv()
openai.api_key = os.getenv("OPENAI_COACH_PROJECT_KEY")

In [5]:
import openai
import time


def tts_openai_invoke(input_text, output_path, model="tts-1", voice="alloy"):
    """Synthesize ``input_text`` with the OpenAI speech API and save the audio.

    Args:
        input_text: Text to convert to speech.
        output_path: Destination file; the response bytes are written to it
            unchanged.
        model: OpenAI TTS model name, e.g. "tts-1" or "tts-1-hd".
        voice: Voice preset, e.g. alloy, echo, fable, onyx, nova, shimmer.

    Returns:
        None. Prints the saved file's base name and the elapsed wall time.
    """
    start = time.time()
    response = openai.audio.speech.create(
        model=model,
        voice=voice,
        input=input_text,
    )

    with open(output_path, "wb") as f:
        f.write(response.content)

    # The elapsed time covers both the API call and the file write.
    print(f"‚úÖ Audio saved to {os.path.basename(output_path)} within {time.time() - start:.2f} seconds")

In [7]:
import os
from google.cloud import texttospeech
from dotenv import load_dotenv
import time

# Load .env values before the client is constructed.
# NOTE(review): presumably the Google credentials reach the client through the
# environment - confirm which variable is expected.
load_dotenv()

# Module-level Text-to-Speech client; tts_google_invoke reads this global.
client = texttospeech.TextToSpeechClient()


def tts_google_invoke(input_text, output_path):
    """Synthesize ``input_text`` with Google Cloud Text-to-Speech and save it.

    The audio encoding follows the file name: paths ending in "mp3" request
    MP3, every other path requests LINEAR16. The request always uses the
    "en-US-Wavenet-D" voice through the module-level ``client``.

    Args:
        input_text: Text to convert to speech.
        output_path: Destination file for the returned audio bytes.

    Returns:
        None. Prints the output path and the elapsed wall time.
    """
    wants_mp3 = output_path.endswith("mp3")
    encoding = (
        texttospeech.AudioEncoding.MP3
        if wants_mp3
        else texttospeech.AudioEncoding.LINEAR16
    )
    started_at = time.time()

    # Assemble the three request parts in the same order as they are sent.
    request_parts = {
        "input": texttospeech.SynthesisInput(text=input_text),
        "voice": texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Wavenet-D",
            ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
        ),
        "audio_config": texttospeech.AudioConfig(audio_encoding=encoding),
    }
    result = client.synthesize_speech(**request_parts)

    with open(output_path, "wb") as sink:
        sink.write(result.audio_content)
        # Reported while the file is still open, so the timing excludes the close.
        print(f"‚úÖ Audio saved to {output_path} within {time.time() - started_at:.2f} seconds")


In [8]:
# Synthesize every scripted HR line with OpenAI TTS; turn i is written to
# "<hr_directory>\<i>.mp3" (the raw f-string keeps the backslash literal).
for i, line in enumerate(hr_lines):
    tts_openai_invoke(input_text=line, output_path=rf"{hr_directory}\{i}.mp3")

‚úÖ Audio saved to 0.mp3 within 2.48 seconds
‚úÖ Audio saved to 1.mp3 within 2.94 seconds
‚úÖ Audio saved to 2.mp3 within 1.87 seconds
‚úÖ Audio saved to 3.mp3 within 4.10 seconds
‚úÖ Audio saved to 4.mp3 within 2.71 seconds
‚úÖ Audio saved to 5.mp3 within 1.88 seconds
‚úÖ Audio saved to 6.mp3 within 2.57 seconds
‚úÖ Audio saved to 7.mp3 within 2.34 seconds
‚úÖ Audio saved to 8.mp3 within 2.13 seconds
‚úÖ Audio saved to 9.mp3 within 2.39 seconds


In [9]:
# Synthesize the same HR lines with Google TTS; the ".wav" suffix makes
# tts_google_invoke request LINEAR16 audio instead of MP3.
for i, line in enumerate(hr_lines):
    tts_google_invoke(input_text=line, output_path=rf"{hr_directory}\{i}.wav")

‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\hr\0.wav within 0.44 seconds
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\hr\1.wav within 0.59 seconds
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\hr\2.wav within 0.61 seconds
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\hr\3.wav within 0.55 seconds
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\hr\4.wav within 1.01 seconds
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\hr\5.wav within 0.50 seconds
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\hr\6.wav within 0.49 seconds
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\hr\7.wav within 0.46 seconds
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\hr\8.wav within 0.71 seconds
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\hr\9.wav within 0.38 seconds


In [10]:
import os
from groq import Groq


def stt_groq_invoke(audio_path, model="whisper-large-v3"):
    """Transcribe an audio file with Groq's transcription endpoint.

    Args:
        audio_path: Path of the audio file to upload; the whole file is read
            into memory and sent under this name.
        model: Transcription model name. Defaults to "whisper-large-v3";
            "whisper-large-v3-turbo" is the alternative noted in the original code.

    Returns:
        The ``text`` attribute of the transcription response.
    """
    # A fresh client per call; named so it does not shadow the notebook's
    # module-level Google TTS ``client``.
    groq_client = Groq()

    start = time.time()
    with open(audio_path, "rb") as file:
        transcription = groq_client.audio.transcriptions.create(
            file=(audio_path, file.read()),
            model=model,
            response_format="verbose_json",
        )
    print(f"‚úÖ Total time convertion mp3: {time.time() - start}")
    return transcription.text



In [11]:
import pygame


def play_mp3(file_path):
    """Play an audio file through pygame's mixer and block until it finishes.

    Args:
        file_path: Path of the audio file to load and play.

    Returns:
        None.
    """
    pygame.mixer.init()
    pygame.mixer.music.load(file_path)
    pygame.mixer.music.play()
    try:
        # Poll at a short interval instead of spinning: a bare `continue`
        # loop keeps a CPU core fully busy for the whole playback.
        while pygame.mixer.music.get_busy():
            pygame.time.wait(50)
    finally:
        # Release the loaded file so the same path can be rewritten later
        # (music.unload is available from pygame 2.0.0).
        pygame.mixer.music.unload()


pygame 2.6.1 (SDL 2.28.4, Python 3.10.16)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [13]:
# from playsound import playsound
import time


def simulation(pause_seconds=0.7):
    """Replay the scripted conversation turn by turn.

    For each turn the pre-generated HR clip "<hr_directory>\\<i>.mp3" is
    played and transcribed, the transcript is printed, then the employee reply
    is synthesized to "<employee_directory>\\<i>.mp3" and played.

    Args:
        pause_seconds: Pause inserted after each speaker, in seconds.

    Returns:
        None.
    """
    # Derive the number of turns from the scripts instead of a literal 10 so
    # editing either list cannot cause an IndexError or skipped turns.
    turn_count = min(len(hr_lines), len(employee_lines))
    for i in range(turn_count):
        hr_mp3_path = rf"{hr_directory}\{i}.mp3"
        play_mp3(hr_mp3_path)
        text = stt_groq_invoke(hr_mp3_path)
        print(text)
        time.sleep(pause_seconds)

        employee_mp3_path = rf"{employee_directory}\{i}.mp3"
        tts_google_invoke(employee_lines[i], employee_mp3_path)
        play_mp3(employee_mp3_path)
        time.sleep(pause_seconds)


In [14]:
# Run the playback-based simulation defined in the previous cell.
simulation()

‚úÖ Total time convertion mp3: 0.4945547580718994
 Hi, John. Thanks for joining. Please have a seat.
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\employee\0.mp3 within 0.21 seconds
‚úÖ Total time convertion mp3: 0.8157632350921631
 I know this might not be an easy conversation, but I want to be direct and respectful. After a thorough review, the management has made the decision to end your employment with the company, effective immediately.
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\employee\1.mp3 within 0.20 seconds
‚úÖ Total time convertion mp3: 0.5018312931060791
 Yes, I understand this may come as a shock. This decision was not taken lightly.
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\employee\2.mp3 within 0.45 seconds
‚úÖ Total time convertion mp3: 0.9863526821136475
 I completely understand your concern. The decision was based on a combination of factors, including ongoing feedback regarding project deadlines, communication with

In [21]:
import sounddevice as sd
import numpy as np
from scipy.io.wavfile import write
from pydub import AudioSegment
import os
import time

import sounddevice as sd
import numpy as np
from scipy.io.wavfile import write
from pydub import AudioSegment
import os
import threading

def record_to_mp3_on_keypress(output_mp3_path, fs=16000):
    """
    Records microphone input and stops when Enter is pressed.
    Saves the audio to an MP3 file.

    Args:
        output_mp3_path: Destination path of the exported MP3.
        fs: Sample rate in Hz used for the input stream and the intermediate WAV.

    Returns:
        None. Returns early, after printing a message, when the stream fails
        or when no audio chunk was captured; nothing is written in that case.
    """
    import tempfile  # local import: only needed for the intermediate WAV file

    print("üéôÔ∏è Recording... Press ENTER to stop.")

    recording = []
    stop_flag = threading.Event()

    def input_listener():
        # Blocks on stdin in a background thread; Enter flips the stop flag.
        input("üëâ Press ENTER to stop recording...\n")
        stop_flag.set()

    def callback(indata, frames, time_info, status):
        # Invoked by the input stream for each captured chunk.
        if stop_flag.is_set():
            raise sd.CallbackStop()
        recording.append(indata.copy())

    # Start keyboard listener thread
    threading.Thread(target=input_listener, daemon=True).start()

    try:
        with sd.InputStream(samplerate=fs, channels=1, callback=callback):
            while not stop_flag.is_set():
                sd.sleep(100)
    except sd.CallbackStop:
        pass
    except Exception as e:
        print(f"‚ùå Recording error: {e}")
        return

    print("üõë Recording stopped.")

    if not recording:
        print("‚ö†Ô∏è No audio was recorded.")
        return

    # Combine the chunks and convert WAV -> MP3 through a unique temporary
    # file. A fixed "temp.wav" in the working directory could collide with
    # another run and was left behind whenever decoding or export failed.
    audio_data = np.concatenate(recording, axis=0)
    fd, temp_wav = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # the writer reopens the file by path
    try:
        write(temp_wav, fs, audio_data)
        sound = AudioSegment.from_wav(temp_wav)
        sound.export(output_mp3_path, format="mp3")
    finally:
        if os.path.exists(temp_wav):
            os.remove(temp_wav)

    print(f"‚úÖ Recording saved to {output_mp3_path}")


In [23]:
# Manual check of the recorder: capture one clip into dummy.mp3.
record_to_mp3_on_keypress(r"E:\Desktop\AI Stack\conversation-resource\dummy.mp3")

üéôÔ∏è Recording... Press ENTER to stop.
üõë Recording stopped.
‚úÖ Recording saved to E:\Desktop\AI Stack\conversation-resource\dummy.mp3


In [25]:
def simulation(pause_seconds=0.7):
    """Run the conversation with a live speaker in the HR role.

    For each scripted employee reply, the user records the HR line into
    "<hr_directory>\\<i>.mp3" (overwriting any existing file there), the
    recording is transcribed and printed, then the employee reply is
    synthesized to "<employee_directory>\\<i>.mp3" and played.

    Args:
        pause_seconds: Pause inserted after each speaker, in seconds.

    Returns:
        None.
    """
    # One turn per scripted reply instead of a literal 10, so the loop follows
    # the script length.
    for i, employee_line in enumerate(employee_lines):
        # The user's recording replaces playback of the HR audio.
        hr_mp3_path = rf"{hr_directory}\{i}.mp3"
        record_to_mp3_on_keypress(hr_mp3_path)

        # Speech-to-text on the recording.
        text = stt_groq_invoke(hr_mp3_path)
        print(f"HR said: {text}")
        time.sleep(pause_seconds)

        # Text-to-speech for the employee's reply.
        employee_mp3_path = rf"{employee_directory}\{i}.mp3"
        tts_google_invoke(employee_line, employee_mp3_path)
        play_mp3(employee_mp3_path)
        time.sleep(pause_seconds)

In [26]:
# Run the recording-based simulation; this uses the most recent definition of
# simulation(), which replaces the playback version defined earlier.
simulation()

üéôÔ∏è Recording... Press ENTER to stop.
üõë Recording stopped.
‚úÖ Recording saved to E:\Desktop\AI Stack\conversation-resource\hr\0.mp3
‚úÖ Total time convertion mp3: 0.41703152656555176
HR said:  Hi John, thanks for joining. Please have a seat.
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\employee\0.mp3 within 0.25 seconds
üéôÔ∏è Recording... Press ENTER to stop.
üõë Recording stopped.
‚úÖ Recording saved to E:\Desktop\AI Stack\conversation-resource\hr\1.mp3
‚úÖ Total time convertion mp3: 0.5667593479156494
HR said:  I know this might be not an easy conversation, but I want to be direct and respectful. After a thorough review, the management has made the decision to end your employment with the company efficiently immediately.
‚úÖ Audio saved to E:\Desktop\AI Stack\conversation-resource\employee\1.mp3 within 0.22 seconds
üéôÔ∏è Recording... Press ENTER to stop.
üõë Recording stopped.
‚úÖ Recording saved to E:\Desktop\AI Stack\conversation-resource\hr\2.mp3
‚úÖ 