In [1]:
import os
import numpy as np
import torch
import whisper
from pydub import AudioSegment, effects
import noisereduce as nr
import soundfile as sf
from speechbrain.inference import EncoderClassifier
import pyttsx3
import sounddevice as sd
import tempfile
import time

from langchain import LLMChain
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

from langdetect import detect, DetectorFactory, LangDetectException

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DetectorFactory.seed = 0

In [3]:
class SessionState:
    def __init__(self, objective, target_language):
        self.objective = objective
        self.target_language = target_language
        self.history = []  # List of dictionaries: {'user': ..., 'assistant': ...}
        self.current_status = 'ongoing'  # Can be 'ongoing', 'fulfilled', 'failed'

    def add_interaction(self, user_text, assistant_response):
        self.history.append({"user": user_text, "assistant": assistant_response})

    def update_status(self, status):
        self.current_status = status

    def get_summary(self):
        return "\n".join([f"User: {item['user']}\nAssistant: {item['assistant']}" for item in self.history])

In [4]:
class AudioHandler:
    def __init__(self, sample_rate=16000, duration=5):
        self.sample_rate = sample_rate
        self.duration = duration

    def record_audio(self):
        print("Recording...")
        try:
            recording = sd.rec(int(self.duration * self.sample_rate), samplerate=self.sample_rate, channels=1, dtype='float32')
            sd.wait()  # Wait until recording is finished
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
                sf.write(tmpfile.name, recording, self.sample_rate)
                print(f"Audio recorded and saved to {tmpfile.name}")
                return tmpfile.name
        except Exception as e:
            print(f"Failed to record audio: {e}")
            return None

    def preprocess_audio(self, input_path):
        normalized_path = "normalized_audio.wav"
        denoised_path = "denoised_audio.wav"
        
        # Normalize Audio
        audio = AudioSegment.from_file(input_path)
        normalized_audio = effects.normalize(audio, headroom=-20.0)
        normalized_audio.export(normalized_path, format="wav")
        print(f"Normalized audio saved to {normalized_path}")

        # Reduce Noise
        data, rate = sf.read(normalized_path)
        reduced_noise = nr.reduce_noise(y=data, sr=rate)
        sf.write(denoised_path, reduced_noise, rate)
        print(f"Noise-reduced audio saved to {denoised_path}")

        return denoised_path

In [5]:
class TranscriptionHandler:
    def __init__(self, model_name='large', device=None):
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        print("Loading Whisper model...")
        self.model = whisper.load_model(model_name).to(self.device)
        print("Whisper model loaded.")

    def transcribe(self, audio_path):
        print("Transcribing with Whisper...")
        result = self.model.transcribe(audio_path)
        return result['text']

In [6]:
class LanguageProcessor:
    def __init__(self, llm):
        self.llm = llm  # Instance of ChatGoogleGenerativeAI or similar

    def detect_language(self, text):
        try:
            language = detect(text)
            print(f"Detected language: {language}")
            return language
        except LangDetectException:
            print("Could not detect language.")
            return None

    def translate_text(self, text, source_lang, target_lang):
        prompt_template = PromptTemplate(
            input_variables=["text", "source_lang", "target_lang"],
            template="""
You are a proficient translator.

Source Language: {source_lang}
Target Language: {target_lang}

Please translate the following text from {source_lang} to {target_lang}:

"{text}"

Translation:
"""
        )
        chain = LLMChain(llm=self.llm, prompt=prompt_template)
        translation = chain.run(text=text, source_lang=source_lang, target_lang=target_lang)
        return translation.strip()

In [7]:
class IntentRecognizer:
    def __init__(self, llm):
        self.llm = llm

    def recognize_intent(self, text):
        prompt_template = PromptTemplate(
            input_variables=["text"],
            template="""
You are an assistant that extracts the user's intent from their input.

User Input: "{text}"

Determine the user's intent and extract relevant information.

Intent and Information:
"""
        )
        chain = LLMChain(llm=self.llm, prompt=prompt_template)
        response = chain.run(text=text)
        return response.strip()

In [8]:
class ConversationManager:
    def __init__(self, llm, session):
        self.llm = llm
        self.session = session
        self.intent_recognizer = IntentRecognizer(llm)
        self.language_processor = LanguageProcessor(llm)

    def evaluate_objective(self, user_text, assistant_response):
        # Define how to evaluate if the objective is met
        # This could be based on keywords, specific responses, or a more complex analysis
        # For simplicity, let's assume if the assistant confirms the objective, it's fulfilled
        if any(keyword in assistant_response.lower() for keyword in ["completed", "achieved", "done"]):
            self.session.update_status('fulfilled')
            return True
        return False

    def generate_response(self, translated_text):
        prompt_template = PromptTemplate(
            input_variables=["translated_text", "objective"],
            template="""
You are an assistant tasked with helping the user achieve their objective.

Objective: {objective}

User Input: "{translated_text}"

Generate a response that moves towards achieving the user's objective.
"""
        )
        chain = LLMChain(llm=self.llm, prompt=prompt_template)
        response = chain.run(translated_text=translated_text, objective=self.session.objective)
        return response.strip()

    def manage_conversation(self, user_text):
        # Detect language
        source_lang = self.language_processor.detect_language(user_text)
        if not source_lang:
            return "Sorry, I couldn't detect the language of your input.", False

        # Translate to target language
        translated_text = self.language_processor.translate_text(user_text, source_lang, self.session.target_language)
        print(f"Translated Text: {translated_text}")

        # Recognize intent (if needed)
        intent_info = self.intent_recognizer.recognize_intent(translated_text)
        print(f"Recognized Intent: {intent_info}")

        # Generate assistant response
        assistant_response = self.generate_response(translated_text)
        print(f"Assistant Response: {assistant_response}")

        # Optionally, translate assistant response back to user's language
        final_response = self.language_processor.translate_text(assistant_response, self.session.target_language, source_lang)
        print(f"Final Response (Translated Back): {final_response}")

        # Add to session history
        self.session.add_interaction(user_text, final_response)

        # Evaluate if objective is met
        if self.evaluate_objective(user_text, assistant_response):
            return final_response, True  # Objective fulfilled
        else:
            return final_response, False  # Continue conversation

In [9]:
class SummaryGenerator:
    def __init__(self, llm):
        self.llm = llm

    def generate_summary(self, history):
        summary_text = "Conversation History:\n" + "\n".join(
            [f"User: {item['user']}\nAssistant: {item['assistant']}" for item in history]
        )
        prompt_template = PromptTemplate(
            input_variables=["history"],
            template="""
You are an assistant that summarizes conversations.

{history}

Generate a concise summary of the conversation, focusing on the objectives achieved and any unresolved issues.

Summary:
"""
        )
        chain = LLMChain(llm=self.llm, prompt=prompt_template)
        summary = chain.run(history=summary_text)
        return summary.strip()

In [10]:
# Utility Function for User Settings
LANGUAGE_CODE_MAP = {
    'english': 'en',
    'spanish': 'es',
    'french': 'fr',
    # Add more mappings as needed
}

def get_user_settings():
    print("Welcome to the Translation App!")
    objective = input("Please enter your objective (e.g., Schedule a meeting): ").strip()
    target_language_input = input("Please enter the target language (e.g., English, Spanish): ").strip().lower()
    target_language = LANGUAGE_CODE_MAP.get(target_language_input, 'en')  # Default to English
    return objective, target_language

# %%
# Language Model Initialization
def initialize_language_model():
    model = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=os.getenv("GEMINI_API_KEY"),
        temperature=0.5
    )
    return model

In [11]:
def main():
    # Initialize Language Model
    try:
        llm = initialize_language_model()
    except Exception as e:
        print(f"Failed to initialize language model: {e}")
        # Handle the error appropriately (e.g., notify the user)
        return

    # Get User Settings
    try:
        objective, target_language = get_user_settings()
    except Exception as e:
        print(f"Failed to get user settings: {e}")
        # Handle the error appropriately
        return

    # Initialize Session State
    session = SessionState(objective=objective, target_language=target_language)

    # Initialize Conversation Manager
    conversation_manager = ConversationManager(llm=llm, session=session)

    # Initialize Audio Handler
    audio_handler = AudioHandler(duration=5, sample_rate=16000)

    # Initialize Transcription Handler
    transcription_handler = TranscriptionHandler()

    # Initialize Summary Generator
    summary_generator = SummaryGenerator(llm=llm)

    print(f"Objective set to: {objective}")
    print(f"Target language: {target_language}")

    while session.current_status == 'ongoing':
        print("\nPlease speak your input. Press Ctrl+C to exit.")
        try:
            # Record Audio
            input_audio = audio_handler.record_audio()
            if not input_audio:
                print("Failed to record audio. Please try again.")
                continue

            # Preprocess Audio
            denoised_audio = audio_handler.preprocess_audio(input_audio)

            # Transcribe Audio
            user_text = transcription_handler.transcribe(denoised_audio)
            print(f"**Transcribed Text:** {user_text}")

            # Manage Conversation
            assistant_response, fulfilled = conversation_manager.manage_conversation(user_text)
            print(f"**Assistant Response:** {assistant_response}")

            # Optionally, implement TTS here later

            if fulfilled:
                print("Objective fulfilled. Ending conversation.")
                session.update_status('fulfilled')
                break

            # Cleanup temporary audio files
            os.remove(input_audio)
            os.remove(denoised_audio)

        except KeyboardInterrupt:
            print("\nConversation terminated by user.")
            session.update_status('failed')
            break
        except Exception as e:
            print(f"An error occurred: {e}")
            # Handle other exceptions appropriately

    # Generate Summary if Conversation Ended
    if session.history:
        summary = summary_generator.generate_summary(session.history)
        print("\n**Conversation Summary:**")
        print(summary)
        # Optionally, implement TTS for summary here later

if __name__ == "__main__":
    main()

Welcome to the Translation App!
Using device: cpu
Loading Whisper model...


  checkpoint = torch.load(fp, map_location=device)


Whisper model loaded.
Objective set to: Negotiate price of taxi to 4 dollars
Target language: es

Please speak your input. Press Ctrl+C to exit.
Recording...
Audio recorded and saved to C:\Users\Owenc\AppData\Local\Temp\tmpj848l1vv.wav
Normalized audio saved to normalized_audio.wav
Noise-reduced audio saved to denoised_audio.wav
Transcribing with Whisper...




**Transcribed Text:**  Продолжение следует...
Detected language: ru


  if ismodule(module) and hasattr(module, '__file__'):
  chain = LLMChain(llm=self.llm, prompt=prompt_template)
  translation = chain.run(text=text, source_lang=source_lang, target_lang=target_lang)


Translated Text: "Continuará..."
Recognized Intent: ## Intent and Information:

**Intent:**  To indicate that the current story, conversation, or event is not finished and will continue at a later time.

**Information:**  The user is using a common phrase in Spanish, "Continuará...", which translates to "To be continued..." in English. This implies a continuation of the current topic.
Assistant Response: I understand you want to negotiate the price of a taxi down to $4.  "Continuará..." doesn't give me much to work with.  To help you, I need more information! 

Tell me:

* **Where are you?** (Country, city, or even just a general location helps)
* **What is the current price the taxi driver is asking?**
* **What is your destination?** (This can help me understand if the price is reasonable)
* **What is your strategy for negotiating?** (Are you friendly and polite, or are you more assertive?)

Once I have this information, I can give you specific advice on how to negotiate the price dow



**Transcribed Text:**  Oh
Detected language: de
Translated Text: "Ah"
Recognized Intent: ## Intent and Information:

**Intent:**  It's difficult to determine a clear intent from the input "Ah". It could be interpreted in several ways, depending on context:

* **Acknowledgement:** The user might be acknowledging something previously said.
* **Surprise:** The user might be expressing surprise or astonishment.
* **Disappointment:** The user might be expressing disappointment or frustration.
* **Pain:** The user might be expressing pain or discomfort.
* **Hesitation:** The user might be hesitating before continuing a thought.

**Information:** There is no relevant information to extract from this input alone. 

**Recommendation:**  To understand the user's intent, more context is needed. For example, you could ask:

* "What's that about?"
* "What happened?"
* "Can you tell me more?"
Assistant Response: "Ah, that's a bit steep! Could you do $4?"
Final Response (Translated Back): "Ach, das i




Conversation terminated by user.

**Conversation Summary:**
The user is attempting to negotiate a taxi fare down to $4. The assistant is requesting more information to provide helpful advice, but the user has not provided any details about their location, the current price, their destination, or their negotiation style. The conversation ends with the user simply saying "Oh", leaving the negotiation unresolved.
