In [18]:
import os
import subprocess
import threading
import tkinter as tk
from tkinter import filedialog, messagebox

import openai
import pyttsx3
import speech_recognition as sr
import yaml
from google.cloud import texttospeech

In [19]:
# Load API settings from config.yaml (expected next to the notebook).
with open('config.yaml', 'r') as file:
    # safe_load returns None for an empty file; fall back to an empty mapping
    # so the .get() calls below do not raise AttributeError.
    config = yaml.safe_load(file) or {}

# OpenAI key is stored under the 'api' key of the config file.
openai.api_key = config.get('api')

# Service-account file for Google Cloud Text-to-Speech. An optional
# 'google_credentials' entry in config.yaml overrides the built-in default path.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get(
    'google_credentials',
    '/Users/w_yy_ccc/Desktop/ProjectTracking/google_api.json',
)
tts_client = texttospeech.TextToSpeechClient()

In [39]:
# GUI callbacks: image selection, audio recording, and summarization
def select_image():
    """Open a file-picker for .jpg/.png files and store the choice in ``image_path``.

    Nothing is changed when the dialog returns an empty selection.
    """
    chosen = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.png")])
    if not chosen:
        return
    image_path.set(chosen)

# def select_audio():
#     file_path = filedialog.askopenfilename(filetypes=[("Audio files", "*.mp3")])
#     if file_path:
#         audio_path.set(file_path)

# Record audio and transcribe it
def record_audio():
    """Record one utterance from the microphone and put its transcript in ``audio_transcription``.

    Info dialogs are shown before listening starts and after it ends. When
    recognition fails, an explanatory message is stored instead of a transcript.
    """
    rec = sr.Recognizer()
    with sr.Microphone() as mic:
        messagebox.showinfo("Recording", "Please start speaking...")
        try:
            captured = rec.listen(mic)
            messagebox.showinfo("Recording", "Recording finished.")
            # Speech-to-text on the captured audio
            outcome = rec.recognize_google(captured)
        except sr.UnknownValueError:
            outcome = "Sorry, I couldn't understand the audio. Please try again."
        except sr.RequestError as err:
            outcome = f"Error with the speech recognition service: {err}"
        audio_transcription.set(outcome)

def summarize():
    """Summarize the customer's requirements from the current GUI inputs.

    Reads the selected image path, the typed text, and the audio transcription,
    sends them to the chat model in one prompt, and writes the reply to
    ``summary_output``. An error dialog is shown when all inputs are empty or
    when the request fails.

    Note: only the image *path* string is placed in the prompt; the image file
    itself is not read or uploaded by this function.
    """
    img = image_path.get()
    txt = text_input.get("1.0", tk.END).strip()
    aud_txt = audio_transcription.get()

    if not (img or aud_txt or txt):
        messagebox.showerror("Input Error", "Please provide at least one input (image, audio, or text).")
        return

    prompt = (
        f"Image: {img}\nAudio: {aud_txt}\nText: {txt}\n"
        "Please conclude the customer's requirements from the information provided. "
        "Answer as short as possible."
    )

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ]
        )
        summary = response['choices'][0]['message']['content'].strip()
    except Exception as exc:
        # This runs as a Tk button callback: an uncaught exception would only
        # produce a traceback on stderr and leave the window unchanged, so
        # report network/auth/response-shape failures to the user instead.
        messagebox.showerror("Summary Error", f"Could not generate a summary: {exc}")
        return

    summary_output.set(summary)



In [40]:

# Function to play synthesized speech
def play_audio(filename):
    """Play an audio file with the ``afplay`` command, blocking until it exits.

    Args:
        filename: Path of the audio file to play.

    Returns:
        None. A failure to launch the player is printed rather than raised.
    """
    try:
        # Argument-list form (no shell): paths containing spaces or shell
        # metacharacters reach afplay verbatim instead of being interpreted.
        subprocess.run(["afplay", filename], check=False)
    except OSError as exc:
        # e.g. afplay is not installed; stay non-raising like the os.system version.
        print(f"Could not play {filename!r}: {exc}")
# Function to synthesize speech using Google Cloud Text-to-Speech
def synthesize_speech(text, filename='output.mp3', voice_name='en-US-Journey-D', language_code='en-US'):
    """Synthesize ``text`` to an MP3 file with Google Cloud Text-to-Speech.

    Args:
        text: Plain text to speak.
        filename: Destination path for the MP3 data; overwritten if it exists.
        voice_name: Name of the Text-to-Speech voice to request.
        language_code: Language tag sent with the voice selection. Defaults to
            "en-US"; pass a matching tag when ``voice_name`` belongs to
            another locale.

    Returns:
        ``filename``, so the result can be passed straight to ``play_audio``.
    """
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name,
        # NOTE(review): gender stays fixed at MALE even for a custom voice_name;
        # presumably the named voice takes precedence — confirm against the API docs.
        ssml_gender=texttospeech.SsmlVoiceGender.MALE
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )
    response = tts_client.synthesize_speech(
        input=synthesis_input,
        voice=voice,
        audio_config=audio_config
    )
    with open(filename, 'wb') as out:
        out.write(response.audio_content)
    return filename

# Function to handle voice interaction

def voice_interaction():
    """Run a spoken question/answer loop until ``voice_interaction_active`` is cleared.

    Sets the module-level flag, speaks a greeting, then repeatedly listens on
    the microphone, transcribes what was said, asks the chat model for a reply,
    and speaks that reply. Recognition failures are answered with a spoken
    error message and the loop continues.

    Listening uses a bounded wait so the stop flag is re-checked every few
    seconds even when nobody speaks; without it, clearing the flag would have
    no effect until the next utterance.
    """
    global voice_interaction_active
    voice_interaction_active = True
    recognizer = sr.Recognizer()
    listen_timeout = 5  # seconds to wait for speech to start before re-checking the flag

    initial_question = "Hello! What are we up to today?"
    play_audio(synthesize_speech(initial_question))

    with sr.Microphone() as source:
        announce = True  # suppresses repeated "Listening..." lines during silence
        while voice_interaction_active:
            try:
                if announce:
                    print("Listening for user input...")
                announce = True
                audio_data = recognizer.listen(source, timeout=listen_timeout)
                user_input = recognizer.recognize_google(audio_data)
                print(f"User said: {user_input}")

                # The flag may have been cleared while we were listening/transcribing.
                if not voice_interaction_active:
                    break

                prompt = f"The user said: {user_input}. Please respond appropriately."
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": "You are a sweet, concise assistant. You always add a sweet care message to the user at the end of each answer."},
                        {"role": "user", "content": prompt}
                    ]
                )
                ai_response = response['choices'][0]['message']['content'].strip()
                print(f"AI response: {ai_response}")

                play_audio(synthesize_speech(ai_response))

            except sr.WaitTimeoutError:
                # Nobody spoke within the timeout: loop around quietly so the
                # while-condition can observe a stop request.
                announce = False
                continue
            except sr.UnknownValueError:
                error_message = "Sorry, I didn't catch that. Please try again."
                play_audio(synthesize_speech(error_message))
            except sr.RequestError:
                error_message = "Sorry, there was an error with the speech recognition service."
                play_audio(synthesize_speech(error_message))

def start_voice_interaction():
    """Start the voice loop on a background thread, at most one at a time.

    If a worker started by an earlier call is still alive, no second thread is
    created (two loops would compete for the microphone); the stop flag is
    simply set back to True so that worker keeps running. The worker is a
    daemon thread so it cannot keep the process alive after the GUI exits.
    """
    global voice_interaction_active, _voice_thread
    # _voice_thread does not exist until the first call, hence the dict lookup.
    existing = globals().get("_voice_thread")
    if existing is not None and existing.is_alive():
        voice_interaction_active = True
        return
    _voice_thread = threading.Thread(target=voice_interaction, daemon=True)
    _voice_thread.start()

def end_voice_interaction():
    """Request that the voice loop stop by clearing ``voice_interaction_active``.

    The flag is only set here; the loop in ``voice_interaction`` notices it the
    next time it checks.
    """
    global voice_interaction_active
    voice_interaction_active = False


In [41]:
# Create the main window
root = tk.Tk()
root.title("Summarizer")

# Tk variables shared between the widgets and the callback functions
image_path = tk.StringVar()
audio_path = tk.StringVar()
# Read by record_audio() and summarize(); must exist before the label below
# binds to it, otherwise this cell fails with NameError on a fresh kernel.
audio_transcription = tk.StringVar()
summary_output = tk.StringVar()

# Row 0: image selection
tk.Label(root, text="Select Image:").grid(row=0, column=0, padx=10, pady=10)
tk.Entry(root, textvariable=image_path, width=50).grid(row=0, column=1, padx=10, pady=10)
tk.Button(root, text="Browse", command=select_image).grid(row=0, column=2, padx=10, pady=10)

# Row 1: audio transcription. The transcript label sits in column 1 and the
# record button in column 2 (same layout as row 0) so they do not overlap.
tk.Label(root, text="Audio Transcription:").grid(row=1, column=0, padx=10, pady=10)
tk.Label(root, textvariable=audio_transcription, wraplength=400, justify="left").grid(row=1, column=1, padx=10, pady=10)
tk.Button(root, text="Record Audio", command=record_audio).grid(row=1, column=2, padx=10, pady=10)

# Row 2: free-text input
tk.Label(root, text="Enter Text:").grid(row=2, column=0, padx=10, pady=10)
text_input = tk.Text(root, height=10, width=50)
text_input.grid(row=2, column=1, columnspan=2, padx=10, pady=10)

# Row 3: trigger the summary
tk.Button(root, text="Summary", command=summarize).grid(row=3, column=1, padx=10, pady=20)

# Row 4: summary result
tk.Label(root, text="Summary Output:").grid(row=4, column=0, padx=10, pady=10)
tk.Label(root, textvariable=summary_output, wraplength=400, justify="left").grid(row=4, column=1, columnspan=2, padx=10, pady=10)

# Rows 5-6: voice interaction buttons
tk.Button(root, text="Start Voice Interaction", command=start_voice_interaction).grid(row=5, column=1, padx=10, pady=10)
tk.Button(root, text="End Voice Interaction", command=end_voice_interaction).grid(row=6, column=1, padx=10, pady=10)

In [42]:
# Enter the Tk event loop; this cell keeps running until the main window is closed.
root.mainloop()