In [None]:
# %pip install --quiet torch openai-whisper pydub noisereduce soundfile speechbrain transformers pyttsx3 langchain langchain_google_genai

In [1]:
import os
import numpy as np
import torch
import whisper
from pydub import AudioSegment, effects
import noisereduce as nr
import soundfile as sf
from speechbrain.inference import EncoderClassifier
import pyttsx3

from langchain import LLMChain
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Single module-level pyttsx3 engine; setup_tts() configures it and speak_text() speaks through it.
tts_engine = pyttsx3.init()

def setup_tts(rate=130, volume=0.9, voice_index=0):
    """Configure the shared ``tts_engine`` before speaking.

    Args:
        rate: Speaking rate in words per minute (pyttsx3's ``rate`` property).
        volume: Output volume, from 0.0 to 1.0.
        voice_index: Position in the list of voices installed on this machine.
            Which voice sits at which index depends on the platform/driver.

    If no voice exists at ``voice_index`` (for example, no voices are
    installed), the engine's current voice is left unchanged instead of
    raising ``IndexError``.
    """
    tts_engine.setProperty('rate', rate)
    tts_engine.setProperty('volume', volume)
    voices = tts_engine.getProperty('voices')
    if voices and 0 <= voice_index < len(voices):
        tts_engine.setProperty('voice', voices[voice_index].id)

def speak_text(text):
    """Speak ``text`` through the shared ``tts_engine``.

    The utterance is queued with ``say`` and then played by ``runAndWait``.
    """
    engine = tts_engine
    engine.say(text)       # queue the utterance
    engine.runAndWait()    # process the queue

In [5]:
# Smoke test: speak a fixed phrase through the TTS engine.
speak_text("hello there friend")

In [6]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device: " + device)

Using device: cpu


In [7]:
# Load the Whisper checkpoint once, then move it to the selected device.
print("Loading Whisper model...")
whisper_model = whisper.load_model("large")  # Options: ['base', 'small', 'medium', 'large']
whisper_model = whisper_model.to(device)
print("Whisper model loaded.")

Loading Whisper model...


  checkpoint = torch.load(fp, map_location=device)


Whisper model loaded.


In [8]:
def normalize_audio(input_path, output_path, target_dBFS=-20.0):
    """Bring an audio file to a target loudness and save it as WAV.

    Args:
        input_path: Path of the audio file to read (any format pydub can open).
        output_path: Path of the WAV file to write.
        target_dBFS: Desired loudness in dBFS (a negative number; 0 is full scale).

    The gain applied is ``target_dBFS - audio.dBFS``. pydub's
    ``effects.normalize`` is not used because its ``headroom`` argument is a
    positive number of dB below full scale; giving it a negative dBFS target
    asks for a peak above full scale and clips the signal.
    """
    audio = AudioSegment.from_file(input_path)
    if audio.dBFS == float("-inf"):
        # Pure silence has no finite loudness, so no gain can reach the target.
        normalized_audio = audio
    else:
        normalized_audio = audio.apply_gain(target_dBFS - audio.dBFS)
    normalized_audio.export(output_path, format="wav")
    print(f"Normalized audio saved to {output_path}")

def reduce_noise(input_path, output_path):
    """Run noise reduction on an audio file and write the result.

    Args:
        input_path: Path of the audio file to read with soundfile.
        output_path: Path of the file to write at the same sample rate.

    Mono and multi-channel files are both supported.
    """
    data, rate = sf.read(input_path)
    if data.ndim > 1:
        # soundfile lays audio out as (frames, channels) while noisereduce
        # expects (channels, frames); transpose on the way in and back out.
        reduced_noise = nr.reduce_noise(y=data.T, sr=rate).T
    else:
        reduced_noise = nr.reduce_noise(y=data, sr=rate)
    sf.write(output_path, reduced_noise, rate)
    print(f"Noise-reduced audio saved to {output_path}")

def preprocess_audio(input_path, normalized_path, denoised_path):
    """Normalize ``input_path`` and then denoise the normalized result.

    Stage 1 writes ``normalized_path``; stage 2 reads that file and writes
    ``denoised_path``.
    """
    # Stage 1: loudness normalization.
    normalize_audio(input_path, normalized_path)

    # Stage 2: noise reduction on the normalized file.
    reduce_noise(normalized_path, denoised_path)

In [9]:
def transcribe_whisper(audio_path):
    """Transcribe ``audio_path`` with the module-level ``whisper_model``.

    Returns the value stored under the ``'text'`` key of the transcription result.
    """
    print("Transcribing with Whisper...")
    return whisper_model.transcribe(audio_path)['text']

In [10]:
def initialize_language_model():
    """Create the Gemini chat model used by the assistant.

    The API key is read from the ``GEMINI_API_KEY`` environment variable.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    return ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=api_key,
        temperature=0.5,
    )

In [11]:
def recognize_intent(llm, text):
    """Ask the language model to describe the intent behind ``text``.

    Args:
        llm: LangChain language model (chat model or plain LLM).
        text: The user's utterance, e.g. a speech transcription.

    Returns:
        The model's reply as a string with surrounding whitespace removed.
    """
    prompt_template = PromptTemplate(
        input_variables=["text"],
        template="""
You are an assistant that extracts the user's intent from their input.

User Input: "{text}"

Determine the user's intent and extract relevant information.

Intent and Information:"""
    )
    # `prompt | llm` + invoke() replaces the deprecated LLMChain(...).run(...).
    result = (prompt_template | llm).invoke({"text": text})
    # Chat models return a message object with `.content`; plain LLMs return a string.
    content = getattr(result, "content", result)
    return str(content).strip()

# Generate assistant response using the LLM
def generate_response(llm, intent_info):
    """Ask the language model for a reply that advances the user's objective.

    Args:
        llm: LangChain language model (chat model or plain LLM).
        intent_info: Text describing the user's intent and extracted details.

    Returns:
        The model's reply as a string with surrounding whitespace removed.
    """
    prompt_template = PromptTemplate(
        input_variables=["intent_info"],
        template="""
You are an assistant tasked with helping the user achieve their objectives based on the following intent and information.

Intent and Information:
{intent_info}

Generate a response that moves towards achieving the user's objective.
"""
    )
    # `prompt | llm` + invoke() replaces the deprecated LLMChain(...).run(...).
    result = (prompt_template | llm).invoke({"intent_info": intent_info})
    # Chat models return a message object with `.content`; plain LLMs return a string.
    content = getattr(result, "content", result)
    return str(content).strip()

class SessionState:
    """In-memory record of one conversation session.

    ``history`` is a list of ``{"user": ..., "assistant": ...}`` dicts in the
    order they were added; ``objectives`` maps an objective to its details.
    """

    def __init__(self):
        self.history, self.objectives = [], {}

    def add_interaction(self, user_text, assistant_response):
        """Append one user/assistant exchange to the history."""
        turn = dict(user=user_text, assistant=assistant_response)
        self.history.append(turn)

    def set_objective(self, objective, details):
        """Store (or overwrite) the details recorded for ``objective``."""
        self.objectives.update({objective: details})

    def get_history(self):
        """Return the live history list (not a copy)."""
        return self.history

    def get_objectives(self):
        """Return the live objectives dict (not a copy)."""
        return self.objectives

In [12]:
def generate_summary(llm, session):
    """Summarize the conversation recorded in ``session``.

    Args:
        llm: LangChain language model (chat model or plain LLM).
        session: Object whose ``get_history()`` returns a list of dicts with
            ``"user"`` and ``"assistant"`` keys.

    Returns:
        The model's summary as a string with surrounding whitespace removed.
    """
    # Build the transcript in one pass instead of repeated string concatenation.
    transcript_lines = ["Conversation History:"]
    transcript_lines.extend(
        f"User: {interaction['user']}\nAssistant: {interaction['assistant']}"
        for interaction in session.get_history()
    )
    summary_text = "\n".join(transcript_lines) + "\n"

    prompt_template = PromptTemplate(
        input_variables=["history"],
        template="""
You are an assistant that summarizes conversations.

{history}

Generate a concise summary of the conversation, focusing on the objectives achieved and any unresolved issues.

Summary:"""
    )
    # `prompt | llm` + invoke() replaces the deprecated LLMChain(...).run(...).
    result = (prompt_template | llm).invoke({"history": summary_text})
    # Chat models return a message object with `.content`; plain LLMs return a string.
    content = getattr(result, "content", result)
    return str(content).strip()

In [13]:
def main(input_audio="testing.wav",
         normalized_audio="normalized_audio.wav",
         denoised_audio="denoised_audio.wav"):
    """Run one voice-assistant turn: audio in, spoken reply and summary out.

    Args:
        input_audio: Path of the recording to process.
        normalized_audio: Where the loudness-normalized WAV is written.
        denoised_audio: Where the noise-reduced WAV is written; this is the
            file that gets transcribed.

    Steps: preprocess the audio, transcribe it, recognize the intent,
    generate a response, record the exchange, then summarize the session.
    The transcription, response and summary are printed and spoken.
    """
    setup_tts()
    llm = initialize_language_model()
    session = SessionState()

    preprocess_audio(input_audio, normalized_audio, denoised_audio)

    # Transcribe the file preprocess_audio just wrote, not a hard-coded name.
    whisper_text = transcribe_whisper(denoised_audio)
    print("\n**Whisper Transcription:**")
    print(whisper_text)
    speak_text(whisper_text)

    intent_info = recognize_intent(llm, whisper_text)
    print("\n**Recognized Intent and Information:**")
    print(intent_info)

    assistant_response = generate_response(llm, intent_info)
    print("\n**Assistant Response:**")
    print(assistant_response)
    speak_text(assistant_response)  # Speak the assistant's response

    session.add_interaction(whisper_text, assistant_response)

    # Check if an objective is achieved or if there is trouble
    # For demonstration, we'll generate a summary after each interaction
    summary = generate_summary(llm, session)
    print("\n**Summary:**")
    print(summary)
    speak_text(summary)

if __name__ == "__main__":
    main()

Normalized audio saved to normalized_audio.wav
Noise-reduced audio saved to denoised_audio.wav
Transcribing with Whisper...





**Whisper Transcription:**
 Hello, this is a very nice test. Can you understand me MMS?


  if ismodule(module) and hasattr(module, '__file__'):
  chain = LLMChain(llm=llm, prompt=prompt_template)
  response = chain.run(text=text)



**Recognized Intent and Information:**
## Intent and Information:

**Intent:**  The user is testing the assistant's ability to understand natural language.

**Information:**

* The user is using a friendly greeting ("Hello").
* The user is providing a positive evaluation of the test ("this is a very nice test").
* The user is explicitly asking if the assistant can understand them ("Can you understand me").
* The user is using an abbreviation ("MMS") which could be interpreted as "Multi-Media Service" or just a random string of letters. However, without context, it's difficult to determine the exact meaning.

**Assistant Response:**
Hello!  It's great to hear you think this is a nice test. 😊  And yes, I can understand you!  

I'm still learning, so I'm not always perfect.  Could you tell me a little more about what you meant by "MMS"?  That will help me understand you even better.

**Summary:**
Summary:

The user initiated the conversation by expressing positive feedback about the test

In [2]:
# Reuse the factory defined earlier in the notebook so the model name, API key
# source and temperature are configured in exactly one place.
MODEL = initialize_language_model()

In [3]:
# Send a one-off prompt to MODEL and print the text of its reply.
result = MODEL.invoke("Write a ballad about LangChain")
print(result.content)

(Verse 1)
In realms of code, where data flows,
A knight of knowledge, LangChain grows.
With chains of thought, it weaves a spell,
To unlock secrets, stories to tell.

(Chorus)
LangChain, LangChain, a tool so grand,
Connecting models, hand in hand.
From LLMs vast, to memories stored,
It builds a bridge, where wisdom is poured.

(Verse 2)
With prompts it whispers, questions it asks,
To chains of reasoning, it gently tasks.
From simple queries, to complex schemes,
It finds the answers, fulfilling dreams.

(Chorus)
LangChain, LangChain, a tool so grand,
Connecting models, hand in hand.
From LLMs vast, to memories stored,
It builds a bridge, where wisdom is poured.

(Verse 3)
It gathers facts, from sources diverse,
And weaves them into a narrative verse.
With agents it acts, in the real world's fray,
Solving problems, in a clever way.

(Chorus)
LangChain, LangChain, a tool so grand,
Connecting models, hand in hand.
From LLMs vast, to memories stored,
It builds a bridge, where wisdom is pour