# Import packages

In [1]:
import os
import json
from langchain import ConversationChain, LLMChain, PromptTemplate
from langchain.memory import ConversationBufferWindowMemory
from langchain_groq import ChatGroq
import speech_recognition as sr
from gtts import gTTS
from playsound import playsound
from IPython.display import Audio
import librosa.display

playsound is relying on another python subprocess. Please use `pip install pygobject` if you want playsound to run more efficiently.


# Load the secret keys

In [4]:
# Load the secret keys from the JSON file
def load_secret_keys(file_path):
    with open(file_path, 'r') as file:
        secret_keys = json.load(file)
    return secret_keys
keys =load_secret_keys('secrets.json')

In [5]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = keys['Langchain Smith']
os.environ["COHERE_API_KEY"] = keys['cohere']
os.environ["GROQ_API_KEY"] = keys['Groq']

# Set the template for Chatbot inputs

In [47]:
template = """Assistant is a based on large language model.

Assistant is designed to be able to assist with a wide range of tasks, 
from answering simple questions to providing accurate explanations on a wide range of topics. 
In the case, if you don't know, answer that currently I don't know I will update it soon.

As a language model, it should be able to generate human-like text based on the input it receives, 
allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.

Assistant is constantly learning and improving, and its capabilities are constantly evolving. 
It is able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions. 
Additionally, Assistant is able to generate its own text based on the input it receives, allowing it to engage in discussions and provide concise explanations on a wide range of topics.

Assistant is aware that human input is being transcribed from audio and as such there may be some errors in the transcription. It will attempt to account for some words being swapped with similar-sounding words or phrases. 
Assistant must be accurate, concise and not more that 5 sentences, because human attention spans are more limited over the audio channel since it takes time to listen to a response.

{history}
Human: {input}
AI:
"""

# Prompt for User input in chatbots

In [48]:
prompt = PromptTemplate(input_variables=["history", "human_input"], template=template)

# Load the LLM Model

In [49]:
llm= ChatGroq(model="llama3-8b-8192")

# Build the Text based Chatbot chain

In [50]:
chatbot_chain = ConversationChain(
    llm= llm,
    prompt=prompt,
    verbose=True,
    memory=ConversationBufferWindowMemory(k=1),
)

In [29]:
chatbot_chain("What's embedding?")



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mAssistant is a based on large language model.

Assistant is designed to be able to assist with a wide range of tasks, 
from answering simple questions to providing accurate explanations on a wide range of topics. 
In the case, if you don't know, answer that currently I don't know I will update it soon.

As a language model, it should be able to generate human-like text based on the input it receives, 
allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.

Assistant is constantly learning and improving, and its capabilities are constantly evolving. 
It is able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions. 
Additionally, Assistant is able to generate its own text based on the input it receives, allowing it to engage in 

{'input': "What's embedding?",
 'history': '',
 'response': 'Embedding is a technique used in natural language processing and machine learning to convert words, phrases, or sentences into numerical vectors that capture their semantic meaning.'}

# Speech Recognizations Setup

In [16]:
# Speech Recognizations
recognizer = sr.Recognizer()
microphone = sr.Microphone()

ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'


In [17]:
def get_voice_input():
    recognizer = sr.Recognizer()
    
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source)
        print("Listening ...")
        audio = recognizer.listen(source)
    
    try:
        voice_text = recognizer.recognize_google(audio, language="en-IN")
        print(f"You said: {voice_text}")
    except sr.UnknownValueError:
        voice_text = None
        print("Sorry, I did not get that. Can you please ask again?")
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")    
    
    return voice_text

# Call the function to test
get_voice_input()

# Text to Speech

In [22]:
from IPython.display import Audio

def speak_text(text):
    tts = gTTS(text=text, lang='en') 
    tts.save("response.mp3")
    #os.system("response.mp3")
    #Audio("response.mp3", rate=16000)
    display(Audio('response.mp3', rate=16000,autoplay=True))
    #playsound("response.mp3")

In [23]:
def run():
    speak_text('Hi, I am your intelligent voice assistant. How can I help you?')
    
    while True:
        user_input = get_voice_input()
        if user_input == None:
            speak_text("Sorry, I did not get that, Can you please ask again.")
        elif user_input.lower()=="thanks":
            speak_text("Thanks, It was great time to assist you.")
            return "Thanks, It was great time to assist you. !!"
        else:
            print('Querying ...')
            response = chatbot_chain(user_input)
            
            # Remove special characters using regex
            clean_text = re.sub(r'[!*#]', '', response['response'])
            
            print(f"Response: {response['response']}")
            speak_text(clean_text)
            
        # Speak 'Thanks' to exit
        print('Speak "Thanks" To Exit')
run()

ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib 

Listening ...
You said: how can I help you
Querying ...


NameError: name 'chatbot_chain' is not defined

In [5]:
import pygame
import time

# Initialize Pygame mixer
pygame.mixer.init()

# Load the sound file
sound = pygame.mixer.Sound('Response.mp3')

# Play the sound
def play_sound():
    sound.play()

# Pause the sound
def pause_sound():
    pygame.mixer.pause()

# Unpause the sound
def unpause_sound():
    pygame.mixer.unpause()

# Stop the sound
def stop_sound():
    sound.stop()

# Example usage
if __name__ == "__main__":
    print("Playing sound...")
    play_sound()
    time.sleep(2)  # Wait for 2 seconds

    print("Pausing sound...")
    pause_sound()
    time.sleep(2)  # Wait for 2 seconds

    print("Unpausing sound...")
    unpause_sound()
    time.sleep(2)  # Wait for 2 seconds

    print("Stopping sound...")
    stop_sound()

Playing sound...
Pausing sound...
Unpausing sound...
Stopping sound...


In [12]:
import os
import json
import re
import speech_recognition as sr
from gtts import gTTS
from playsound import playsound
from langchain import ConversationChain, PromptTemplate
from langchain.memory import ConversationBufferWindowMemory
from langchain_groq import ChatGroq
import gradio as gr

class VoiceChatbot:
    def __init__(self, secrets_file):
        self.keys = self.load_secret_keys(secrets_file)
        self.setup_environment()
        self.recognizer = sr.Recognizer()
        self.llm = ChatGroq(model="llama3-8b-8192")
        self.prompt_template = self.create_prompt_template()
        self.chatbot_chain = self.create_chatbot_chain()

    def load_secret_keys(self, file_path):
        with open(file_path, 'r') as file:
            secret_keys = json.load(file)
        return secret_keys

    def setup_environment(self):
        os.environ["LANGCHAIN_TRACING_V2"] = "true"
        os.environ["LANGCHAIN_API_KEY"] = self.keys['Langchain Smith']
        os.environ["GROQ_API_KEY"] = self.keys['Groq']

    def create_prompt_template(self):
        template = """Voice Assistant is a based on large language model.

                    It is designed to be able to assist with a wide range of tasks, from answering simple questions to providing accurate explanations on a wide range of topics. 
                    In the case, if you don't know, answer that currently I don't know I will update it soon.
                    
                    As a language model, it should be able to generate human-like text based on the input it receives, 
                    allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.

                    Assistant is aware that human input is being transcribed from audio and as such there may be some errors in the transcription. It will attempt to account for some words being swapped with similar-sounding words or phrases. 
                    Assistant must be accurate, concise and not more that 5 sentences, because human attention spans are more limited over the audio channel since it takes time to listen to a response.

                    {history}
                    Human: {input}
                    AI:
                    """

        return PromptTemplate(input_variables=["history", "human_input"], template=template)

    def create_chatbot_chain(self):
        chatbot_chain = ConversationChain(
            llm=self.llm,
            prompt=self.prompt_template,
            verbose=False,
            memory=ConversationBufferWindowMemory(k=1),
        )
        return chatbot_chain

    def speak_text(self, text):
        tts = gTTS(text=text, lang='en')
        file_path="response.mp3"
        tts.save(file_path)
        return file_path

    def process_audio(self, audio):
        audio_data = sr.AudioFile(audio)
        with audio_data as source:
            self.recognizer.adjust_for_ambient_noise(source)
            audio_data = self.recognizer.record(source)
        user_input = self.recognizer.recognize_google(audio_data, language="en-IN")
        if user_input is None:
            user_input="Sorry, I did not get that. Can you please ask again?"
            path = self.speak_text(user_input)
            return user_input, path, response
        else:
            print('Querying ...')
            response = self.chatbot_chain.predict(input=user_input)
            clean_text = re.sub(r'[!*#]', '', response)
            path = self.speak_text(clean_text)
            return user_input,path,response


def run_gradio():
    chatbot = VoiceChatbot('secrets.json')
    interface = gr.Interface(
        fn=chatbot.process_audio,
        inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speak Here"),
        outputs=[
            gr.Textbox(label="You Said:", placeholder="Recording..."),
            gr.Audio(type='filepath', label="Generated Audio", autoplay=True),
            gr.Textbox(label="Assistant's Response:", placeholder="Waiting for response..."),
        ],
        live=True,
        title="Voice Assistant",
        description="Speak into the microphone and interact with the Voice Assistant.",
        theme="compact",
    )
    interface.launch(inbrowser=True)

if __name__ == "__main__":
    run_gradio()


Sorry, we can't find the page you are looking for.


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Opening in existing browser session.
Querying ...


In [1]:
from transformers import pipeline
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"

classifier = pipeline(
    "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
)



from transformers.pipelines.audio_utils import ffmpeg_microphone_live


def stop_fn(
    stop_word="stop",
    prob_threshold=0.5,
    chunk_length_s=2.0,
    stream_chunk_s=0.25,
    debug=False,
):


    sampling_rate = classifier.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Listening for stop word...")
    for prediction in classifier(mic):
        prediction = prediction[0]
        if debug:
            print(prediction)
        if prediction["label"] == stop_word:
            if prediction["score"] > prob_threshold:
                return True
            else:
                return False

stop_fn(debug=True)

2024-07-06 23:59:18.791586: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-06 23:59:18.792582: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-06 23:59:18.794827: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-06 23:59:18.801926: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-06 23:59:18.815682: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registe

Listening for stop word...


  waveform = torch.from_numpy(waveform).unsqueeze(0)


{'score': 0.05440586432814598, 'label': 'no'}
{'score': 0.29025253653526306, 'label': 'four'}
{'score': 0.06330902874469757, 'label': 'down'}
{'score': 0.06710086017847061, 'label': 'down'}
{'score': 0.07421767711639404, 'label': 'down'}
{'score': 0.07411103695631027, 'label': 'down'}
{'score': 0.07411103695631027, 'label': 'down'}
{'score': 0.07411103695631027, 'label': 'down'}
{'score': 0.052969492971897125, 'label': 'up'}
{'score': 0.1634492129087448, 'label': 'seven'}
{'score': 0.283249169588089, 'label': 'down'}
{'score': 0.283249169588089, 'label': 'down'}
{'score': 0.283249169588089, 'label': 'down'}
{'score': 0.283249169588089, 'label': 'down'}
{'score': 0.5797058343887329, 'label': 'four'}
{'score': 0.48479413986206055, 'label': 'four'}
{'score': 0.48479413986206055, 'label': 'four'}
{'score': 0.48479413986206055, 'label': 'four'}
{'score': 0.48479413986206055, 'label': 'four'}
{'score': 0.9936321377754211, 'label': 'stop'}


True

In [4]:
import os
import json
import speech_recognition as sr
from gtts import gTTS
import re
from playsound import playsound
from langchain import ConversationChain, PromptTemplate
from langchain.memory import ConversationBufferWindowMemory
from langchain_groq import ChatGroq
from IPython.display import Audio, display
import pygame
#import librosa.display

class VoiceChatbot:
    def __init__(self, secrets_file):
        self.keys = self.load_secret_keys(secrets_file)
        self.setup_environment()
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()
        self.intrupted = False
        self.llm = ChatGroq(model="llama3-8b-8192")
        self.prompt_template = self.create_prompt_template()
        self.chatbot_chain = self.create_chatbot_chain()

    def load_secret_keys(self, file_path):
        with open(file_path, 'r') as file:
            secret_keys = json.load(file)
        return secret_keys

    def setup_environment(self):
        os.environ["LANGCHAIN_TRACING_V2"] = "true"
        os.environ["LANGCHAIN_API_KEY"] = self.keys['Langchain Smith']
        os.environ["GROQ_API_KEY"] = self.keys['Groq']

    def create_prompt_template(self):
        template = """Voice Assistant is a based on large language model.

                    It is designed to be able to assist with a wide range of tasks, from answering simple questions to providing accurate explanations on a wide range of topics. 
                    In the case, if you don't know, answer that currently I don't know I will update it soon.
                    
                    As a language model, it should be able to generate human-like text based on the input it receives, 
                    allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.

                    Assistant is aware that human input is being transcribed from audio and as such there may be some errors in the transcription. It will attempt to account for some words being swapped with similar-sounding words or phrases. 
                    Assistant must be accurate, concise and not more that 5 sentences, because human attention spans are more limited over the audio channel since it takes time to listen to a response.

                    {history}
                    Human: {input}
                    AI:
                    """

        return PromptTemplate(input_variables=["history", "human_input"], template=template)

    def create_chatbot_chain(self):
        chatbot_chain = ConversationChain(
            llm= self.llm,
            prompt=self.prompt_template,
            verbose=True,
            memory=ConversationBufferWindowMemory(k=1),
        )
        return chatbot_chain

    def get_voice_input(self):
        with self.microphone as source:
            self.recognizer.adjust_for_ambient_noise(source)
            print("Listening ...")
            audio = self.recognizer.listen(source)
        try:
            voice_text = self.recognizer.recognize_google(audio,language = "en-IN")
            print(f"You said: {voice_text}")
        except sr.UnknownValueError:
                voice_text=None
                print("Sorry, I did not get that, Can you please ask again, !!")
        except sr.RequestError as e:
            print(f"Could not request results from Google Speech Recognition service; {e}")    
        return voice_text

    def speak_text(self, text):
        tts = gTTS(text=text, lang='en')
        tts.save("response.mp3")
        display(Audio('response.mp3', rate=16000))
        # Initialize Pygame mixer
        pygame.mixer.init()
        # Load the sound file
        sound = pygame.mixer.Sound('response.mp3')
        return sound

    def run(self):
        self.speak_text('Hi, I am your intelligent voice assistant. How can I help you?')
        
        while True:
            user_input = self.get_voice_input()
            if user_input == None:
                sound=self.speak_text("Sorry, I did not get that, Can you please ask again.")
                sound.play()
            elif user_input.lower()=="stop":
                sound=self.speak_text("Thanks, It was great time to assist you.")
                sound.play()
                return "Thanks, It was great time to assist you. !!"
            else:
                print('Querying ...')
                response = self.chatbot_chain(user_input)
                
                # Remove special characters using regex
                clean_text = re.sub(r'[!*#]', '', response['response'])
                
                print(f"Response: {response['response']}")
                sound = self.speak_text(clean_text)
                sound.play()
                if self.intrupted:
                    sound.stop()
                
            # Speak 'Thanks' to exit
            print('Speak "STOP" To Exit')

if __name__ == "__main__":
    chatbot = VoiceChatbot('secrets.json')
    chatbot.run()

ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'


ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'


Listening ...
You said: what is embedding
Querying ...


[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mVoice Assistant is a based on large language model.

                    It is designed to be able to assist with a wide range of tasks, from answering simple questions to providing accurate explanations on a wide range of topics. 
                    In the case, if you don't know, answer that currently I don't know I will update it soon.
                    
                    As a language model, it should be able to generate human-like text based on the input it receives, 
                    allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.

                    Assistant is aware that human input is being transcribed from audio and as such there may be some errors in the transcription. It will attempt to account for some words being swapped with similar-soun

Speak "Thanks" To Exit


ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'


Listening ...


KeyboardInterrupt: 

In [5]:
#input("press ENTER to stop playback")
p.terminate()

In [1]:
from transformers.pipelines.audio_utils import ffmpeg_microphone_live
from transformers import pipeline
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"

classifier = pipeline(
    "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
)

def excite_fn(
    excite_word="stop",
    prob_threshold=0.5,
    chunk_length_s=0.1,
    stream_chunk_s=0.25,
    debug=False,
):

    sampling_rate = classifier.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("speak 'YES' to intrupt or 'STOP' to stop")
    count=0
    for prediction in classifier(mic):
        prediction = prediction[0]
        if debug:
            print(prediction)
        elif prediction["label"] == excite_word:
            if prediction["score"] > prob_threshold:
                return 'stop'
        count+=1
        if count==5:
            return

excite_fn(debug=True)

2024-07-08 10:01:42.857598: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-08 10:01:42.864430: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-08 10:01:42.921717: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-08 10:01:42.986647: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 10:01:43.051555: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registe

speak 'YES' to intrupt or 'STOP' to stop


  waveform = torch.from_numpy(waveform).unsqueeze(0)


{'score': 0.04524710774421692, 'label': 'no'}
{'score': 0.04299049451947212, 'label': 'two'}
{'score': 0.04376319423317909, 'label': 'two'}
{'score': 0.045104604214429855, 'label': 'two'}
{'score': 0.041769806295633316, 'label': 'no'}


In [7]:
playsound("Response.mp3")

In [6]:
import os
import json
import speech_recognition as sr
from gtts import gTTS
import re
from playsound import playsound
from langchain import ConversationChain, PromptTemplate
from langchain.memory import ConversationBufferWindowMemory
from langchain_groq import ChatGroq
from IPython.display import Audio, display
from transformers.pipelines.audio_utils import ffmpeg_microphone_live
from transformers import pipeline
import torch
import time
from pydub import AudioSegment
from pydub.playback import play

class VoiceChatbot:
    def __init__(self, secrets_file):
        """
        Initialize the VoiceChatbot with necessary configurations and models.
        """
        self.keys = self.load_secret_keys(secrets_file)
        self.setup_environment()
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device=self.device)
        self.classifier = pipeline("audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=self.device)
        self.llm = ChatGroq(model="llama3-8b-8192")
        self.prompt_template = self.create_prompt_template()
        self.chatbot_chain = self.create_chatbot_chain()

    def load_secret_keys(self, file_path):
        """
        Load secret keys from a JSON file.
        """
        with open(file_path, 'r') as file:
            secret_keys = json.load(file)
        return secret_keys

    def setup_environment(self):
        """
        Set up the environment variables required for Langchain and Groq.
        """
        os.environ["LANGCHAIN_TRACING_V2"] = "true"
        os.environ["LANGCHAIN_API_KEY"] = self.keys['Langchain Smith']
        os.environ["GROQ_API_KEY"] = self.keys['Groq']

    def create_prompt_template(self):
        """
        Create the prompt template for the chatbot.
        """
        template = """Voice Assistant is based on a large language model.

                    It is designed to assist with a wide range of tasks, from answering simple questions to providing accurate explanations on various topics. 
                    If you don't know the answer, respond with "I don't know, I will update it soon."
                    
                    The language model generates human-like text based on the input it receives, 
                    allowing it to engage in natural-sounding conversations and provide coherent and relevant responses.

                    The assistant is aware that human input is transcribed from audio, which may contain errors. It will attempt to account for similar-sounding words or phrases. 
                    Responses must be accurate, concise, and no more than 7 sentences, considering human attention spans are limited over audio as listening takes time.

                    {history}
                    Human: {input}
                    AI:
                    """
        return PromptTemplate(input_variables=["history", "human_input"], template=template)

    def create_chatbot_chain(self):
        """
        Create the chatbot chain with memory for maintaining conversation context.
        """
        chatbot_chain = ConversationChain(
            llm=self.llm,
            prompt=self.prompt_template,
            verbose=False,
            memory=ConversationBufferWindowMemory(k=1),
        )
        return chatbot_chain

    def get_voice_input(self):
        """
        Capture voice input from the microphone and convert it to text.
        """
        with self.microphone as source:
            self.recognizer.adjust_for_ambient_noise(source)
            print("Listening ...")
            audio = self.recognizer.listen(source)
        try:
            voice_text = self.recognizer.recognize_google(audio, language="en-IN")
            print(f"You said: {voice_text}")
        except sr.UnknownValueError:
            voice_text = None
            print("Sorry, I did not get that. Can you please ask again?")
        except sr.RequestError as e:
            print(f"Could not request results from Google Speech Recognition service; {e}")    
        return voice_text

    def speak_text(self, text, speed=1.0, Play= True):
        """
        Convert text to speech and play it with optional speed adjustment.
        """
        tts = gTTS(text=text, lang='en')
        audio_response_filepath= "response.wav"
        tts.save(audio_response_filepath)
        display(Audio(audio_response_filepath, rate=16000))
        if Play:
            # Load the audio file
            audio = AudioSegment.from_file(audio_response_filepath)
            
            # Function to change playback speed
            def change_playback_speed(sound, speed=1.0):
                sound_with_altered_frame_rate = sound._spawn(sound.raw_data, overrides={
                    "frame_rate": int(sound.frame_rate * speed)
                })
                return sound_with_altered_frame_rate.set_frame_rate(sound.frame_rate)
            
            speed_up_audio = change_playback_speed(audio, speed)
            play(speed_up_audio)
        return audio_response_filepath

    def excite_fn(self, excite_word, debug=False):
        """
        Listen for a specific trigger word and return 'stop' if detected.
        """
        prob_threshold = 0.5
        sampling_rate = self.classifier.feature_extractor.sampling_rate
        time.sleep(3)
        mic = ffmpeg_microphone_live(sampling_rate=sampling_rate, chunk_length_s=0.2, stream_chunk_s=0.25)
        
        self.speak_text("Speak 'STOP' to stop the conversation, just after it.", speed=1.3)
        count = 0
        for prediction in self.classifier(mic):
            prediction = prediction[0]
            if debug:
                print(prediction)
            elif prediction["label"] == excite_word and prediction["score"] > prob_threshold:
                return 'stop'
            count += 1
            if count == 50:
                return
    def get_response(self, user_input):
        if user_input is not None:
            return self.chatbot_chain(user_input)
        return 'No User Input'

    def run(self):
        """
        Main function to run the voice assistant.
        """
        self.speak_text('Hi, I am your intelligent voice assistant. How can I help you?', speed=1.1)
        # Set i to count the empty user_input
        i=0
        while True and i<3:
            user_input = self.get_voice_input()
            if user_input is None:
                self.speak_text("Sorry, I did not get that. Can you please ask again?")
                i+=1
            else:
                print('Querying ...')
                response = self.chatbot_chain(user_input)
                
                # Remove special characters using regex
                clean_text = re.sub(r'[!*#]', '', response['response'])
                
                print(f"Response: {response['response']}")
                self.speak_text(clean_text)
            
                if self.excite_fn(excite_word='stop') == "stop":
                    self.speak_text("Thanks, it was great assisting you.")
                    return "Thanks, it was great assisting you!"
                # Reset the empty input
                i=0

if __name__ == "__main__":
    chatbot = VoiceChatbot('secrets.json')
    chatbot.run()

2024-07-08 14:19:19.056754: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-08 14:19:19.057651: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-08 14:19:19.059776: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-08 14:19:19.066480: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 14:19:19.080187: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registe

ALSA lib pcm_dsnoop.c:601:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm_dsnoop.c:601:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664

Listening ...
You said: what is embedding
Querying ...


  warn_deprecated(


Response: Embedding is a concept in machine learning and natural language processing that refers to the process of converting data, such as text or audio, into a numerical representation that can be processed by a computer. This representation is called an embedding, and it allows the computer to analyze and understand the data in a more efficient and accurate way.


KeyboardInterrupt: 

In [8]:
# Speech Recognizations
recognizer = sr.Recognizer()
microphone = sr.Microphone()

def get_voice_input(audio_file=None):
    """
    Capture voice input from the microphone and convert it to text.
    """
    if audio_file is None:
        with self.microphone as source:
            recognizer.adjust_for_ambient_noise(source)
            print("Listening ...")
            audio = recognizer.listen(source)
    else:
        try:
            with sr.AudioFile(audio_file) as source:
                #audio_data = recognizer.record(source)
                voice_text = recognizer.recognize_google(audio_data, language="en-IN")
                print(f"You said: {voice_text}")
                return voice_text
        except sr.UnknownValueError:
            voice_text = None
            print("Sorry, I did not get that. Can you please ask again?")
        except sr.RequestError as e:
            print(f"Could not request results from Google Speech Recognition service; {e}")    
    return voice_text

get_voice_input(audio_file='response.wav')

ALSA lib pcm_dsnoop.c:601:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib pcm_dmix.c:1032:(snd_pcm_dmix_open) unable to open slave


ValueError: Audio file could not be read as PCM WAV, AIFF/AIFF-C, or Native FLAC; check if file is corrupted or in another format

In [11]:
import librosa
audio_array, sampling_rate = librosa.load('response.wav')
recognizer.recognize_google(audio_array, language="en-IN")

ValueError: ``audio_data`` must be audio data

In [None]:
import os
import json
import speech_recognition as sr
from gtts import gTTS
import re
from playsound import playsound
from langchain import ConversationChain, PromptTemplate
from langchain.memory import ConversationBufferWindowMemory
from langchain_groq import ChatGroq
from IPython.display import Audio, display
from transformers.pipelines.audio_utils import ffmpeg_microphone_live
from transformers import pipeline
import torch
import time
from pydub import AudioSegment
from pydub.playback import play

class VoiceChatbot:
    def __init__(self, secrets_file):
        """
        Initialize the VoiceChatbot with necessary configurations and models.
        """
        self.keys = self.load_secret_keys(secrets_file)
        self.setup_environment()
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.classifier = pipeline("audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=self.device)
        self.llm = ChatGroq(model="llama3-8b-8192")
        self.prompt_template = self.create_prompt_template()
        self.chatbot_chain = self.create_chatbot_chain()

    def load_secret_keys(self, file_path):
        """
        Load secret keys from a JSON file.
        """
        with open(file_path, 'r') as file:
            secret_keys = json.load(file)
        return secret_keys

    def setup_environment(self):
        """
        Set up the environment variables required for Langchain and Groq.
        """
        os.environ["LANGCHAIN_TRACING_V2"] = "true"
        os.environ["LANGCHAIN_API_KEY"] = self.keys['Langchain Smith']
        os.environ["GROQ_API_KEY"] = self.keys['Groq']

    def create_prompt_template(self):
        """
        Create the prompt template for the chatbot.
        """
        template = """Voice Assistant is based on a large language model.

                    It is designed to assist with a wide range of tasks, from answering simple questions to providing accurate explanations on various topics. 
                    If you don't know the answer, respond with "I don't know, I will update it soon."
                    
                    The language model generates human-like text based on the input it receives, 
                    allowing it to engage in natural-sounding conversations and provide coherent and relevant responses.

                    The assistant is aware that human input is transcribed from audio, which may contain errors. It will attempt to account for similar-sounding words or phrases. 
                    Responses must be accurate, concise, and no more than 7 sentences, considering human attention spans are limited over audio as listening takes time.

                    {history}
                    Human: {input}
                    AI:
                    """
        return PromptTemplate(input_variables=["history", "human_input"], template=template)

    def create_chatbot_chain(self):
        """
        Create the chatbot chain with memory for maintaining conversation context.
        """
        chatbot_chain = ConversationChain(
            llm=self.llm,
            prompt=self.prompt_template,
            verbose=False,
            memory=ConversationBufferWindowMemory(k=1),
        )
        return chatbot_chain

    def get_voice_input(self, audio_file=None):
        """
        Capture voice input from the microphone and convert it to text.
        """
        if audio_file is None:
            with self.microphone as source:
                self.recognizer.adjust_for_ambient_noise(source)
                print("Listening ...")
                audio = self.recognizer.listen(source)
        else:
            try:
                voice_text = self.recognizer.recognize_google(audio, language="en-IN")
                print(f"You said: {voice_text}")
            except sr.UnknownValueError:
                voice_text = None
                print("Sorry, I did not get that. Can you please ask again?")
            except sr.RequestError as e:
                print(f"Could not request results from Google Speech Recognition service; {e}")    
        return voice_text

    def speak_text(self, text, speed=1.0, Play= True):
        """
        Convert text to speech and play it with optional speed adjustment.
        """
        tts = gTTS(text=text, lang='en')
        audio_response_filepath= "response.mp3"
        tts.save(audio_response_filepath)
        display(Audio(audio_response_filepath, rate=16000))
        if Play:
            # Load the audio file
            audio = AudioSegment.from_file(audio_response_filepath)
            
            # Function to change playback speed
            def change_playback_speed(sound, speed=1.0):
                sound_with_altered_frame_rate = sound._spawn(sound.raw_data, overrides={
                    "frame_rate": int(sound.frame_rate * speed)
                })
                return sound_with_altered_frame_rate.set_frame_rate(sound.frame_rate)
            
            speed_up_audio = change_playback_speed(audio, speed)
            play(speed_up_audio)
        return audio_response_filepath

    def excite_fn(self, excite_word, debug=False):
        """
        Listen for a specific trigger word and return 'stop' if detected.
        """
        prob_threshold = 0.5
        sampling_rate = self.classifier.feature_extractor.sampling_rate
        time.sleep(3)
        mic = ffmpeg_microphone_live(sampling_rate=sampling_rate, chunk_length_s=0.2, stream_chunk_s=0.25)
        
        self.speak_text("Speak 'STOP' to stop the conversation, just after it.", speed=1.3)
        count = 0
        for prediction in self.classifier(mic):
            prediction = prediction[0]
            if debug:
                print(prediction)
            elif prediction["label"] == excite_word and prediction["score"] > prob_threshold:
                return 'stop'
            count += 1
            if count == 50:
                return
    def get_response(self, user_input):
        if user_input is not None:
            return self.chatbot_chain(user_input)
        return 'No User Input'

    def run(self):
        """
        Main function to run the voice assistant.
        """
        self.speak_text('Hi, I am your intelligent voice assistant. How can I help you?', speed=1.1)
        # Set i to count the empty user_input
        i=0
        while True and i<3:
            user_input = self.get_voice_input()
            if user_input is None:
                self.speak_text("Sorry, I did not get that. Can you please ask again?")
                i+=1
            else:
                print('Querying ...')
                response = self.chatbot_chain(user_input)
                
                # Remove special characters using regex
                clean_text = re.sub(r'[!*#]', '', response['response'])
                
                print(f"Response: {response['response']}")
                self.speak_text(clean_text)
            
                if self.excite_fn(excite_word='stop') == "stop":
                    self.speak_text("Thanks, it was great assisting you.")
                    return "Thanks, it was great assisting you!"
                # Reset the empty input
                i=0

if __name__ == "__main__":
    chatbot = VoiceChatbot('secrets.json')
    chatbot.run()

In [21]:
import speech_recognition as sr
from gtts import gTTS
import os

# Function to transcribe audio file to text
def transcribe_audio(audio_file):
    # Initialize recognizer class (for recognizing the speech)
    recognizer = sr.Recognizer()
    
    # Load audio file
    try:
        with sr.AudioFile(audio_file) as source:
            audio_data = recognizer.record(source)  # Read the entire audio file
    except FileNotFoundError:
        return f"File '{audio_file}' not found"
    except Exception as e:
        return f"Error loading audio file: {e}"
    
    # Recognize speech using Google Speech Recognition
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand audio"
    except sr.RequestError as e:
        return f"Could not request results from Google Speech Recognition service; {e}"

# Example usage
if __name__ == "__main__":
    audio_file = "response.wav"  # Replace with your converted audio file path
    transcribed_text = transcribe_audio(audio_file)
    print(f"Transcribed Text: {transcribed_text}")
    
    # Convert text to speech using gTTS
    if transcribed_text:
        tts = gTTS(text=transcribed_text, lang='en')
        tts.save("transcribed_text.mp3")  # Save as MP3 file
        os.system("start transcribed_text.mp3")  # Play the MP3 file (Windows)

Transcribed Text: Error loading audio file: Audio file could not be read as PCM WAV, AIFF/AIFF-C, or Native FLAC; check if file is corrupted or in another format


sh: 1: start: not found


In [23]:
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

In [26]:
transcriber('Response.mp3')

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


{'text': ' A great question embedding is a fundamental concept in machine learning in natural language. Processing. In a nutshell, an embedding is a way to represent words, phrases, or concepts as dense vectors in a high-dimensional space. These vectors, also known as embeddings, capture the semantic meaning and relationships between words. allowing machines to understand language more effectively. For example, the word, dog, and puppy may have similar embeddings because they are semantically close.'}

In [None]:
import os
import json
import speech_recognition as sr
from gtts import gTTS
import re
from playsound import playsound
from langchain import ConversationChain, PromptTemplate
from langchain.memory import ConversationBufferWindowMemory
from langchain_groq import ChatGroq
from IPython.display import Audio, display
from transformers.pipelines.audio_utils import ffmpeg_microphone_live
from transformers import pipeline
import torch
import time
from pydub import AudioSegment
from pydub.playback import play

class VoiceChatbot:
    def __init__(self, secrets_file):
        """
        Initialize the VoiceChatbot with necessary configurations and models.
        """
        self.keys = self.load_secret_keys(secrets_file)
        self.setup_environment()
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.transcriber=pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device=self.device)
        self.classifier = pipeline("audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=self.device)
        self.llm = ChatGroq(model="llama3-8b-8192")
        self.prompt_template = self.create_prompt_template()
        self.chatbot_chain = self.create_chatbot_chain()

    def load_secret_keys(self, file_path):
        """
        Load secret keys from a JSON file.
        """
        with open(file_path, 'r') as file:
            secret_keys = json.load(file)
        return secret_keys

    def setup_environment(self):
        """
        Set up the environment variables required for Langchain and Groq.
        """
        os.environ["LANGCHAIN_TRACING_V2"] = "true"
        os.environ["LANGCHAIN_API_KEY"] = self.keys['Langchain Smith']
        os.environ["GROQ_API_KEY"] = self.keys['Groq']

    def create_prompt_template(self):
        """
        Create the prompt template for the chatbot.
        """
        template = """Voice Assistant is based on a large language model.

                    It is designed to assist with a wide range of tasks, from answering simple questions to providing accurate explanations on various topics. 
                    If you don't know the answer, respond with "I don't know, I will update it soon."
                    
                    The language model generates human-like text based on the input it receives, 
                    allowing it to engage in natural-sounding conversations and provide coherent and relevant responses.

                    The assistant is aware that human input is transcribed from audio, which may contain errors. It will attempt to account for similar-sounding words or phrases. 
                    Responses must be accurate, concise, and no more than 7 sentences, considering human attention spans are limited over audio as listening takes time.

                    {history}
                    Human: {input}
                    AI:
                    """
        return PromptTemplate(input_variables=["history", "human_input"], template=template)

    def create_chatbot_chain(self):
        """
        Create the chatbot chain with memory for maintaining conversation context.
        """
        chatbot_chain = ConversationChain(
            llm=self.llm,
            prompt=self.prompt_template,
            verbose=False,
            memory=ConversationBufferWindowMemory(k=1),
        )
        return chatbot_chain
    def convert_voice_to_text(self,audio_file_path):
        # convert voice to text
        voice_text= self.transcriber(audio_file_path)
        return voice_text
    def get_voice_input(self):
        """
        Capture voice input from the microphone and convert it to text.
        """
        with self.microphone as source:
            self.recognizer.adjust_for_ambient_noise(source)
            print("Listening ...")
            audio = self.recognizer.listen(source)
        try:
            voice_text = self.recognizer.recognize_google(audio, language="en-IN")
            print(f"You said: {voice_text}")
        except sr.UnknownValueError:
            voice_text = None
            print("Sorry, I did not get that. Can you please ask again?")
        except sr.RequestError as e:
            print(f"Could not request results from Google Speech Recognition service; {e}")    
        return voice_text

    def speak_text(self, text, speed=1.0, Play= True):
        """
        Convert text to speech and play it with optional speed adjustment.
        """
        tts = gTTS(text=text, lang='en')
        audio_response_filepath= "response.mp3"
        tts.save(audio_response_filepath)
        display(Audio(audio_response_filepath, rate=16000))
        if Play:
            # Load the audio file
            audio = AudioSegment.from_file(audio_response_filepath)
            
            # Function to change playback speed
            def change_playback_speed(sound, speed=1.0):
                sound_with_altered_frame_rate = sound._spawn(sound.raw_data, overrides={
                    "frame_rate": int(sound.frame_rate * speed)
                })
                return sound_with_altered_frame_rate.set_frame_rate(sound.frame_rate)
            
            speed_up_audio = change_playback_speed(audio, speed)
            play(speed_up_audio)
        return audio_response_filepath

    def excite_fn(self, excite_word, debug=False):
        """
        Listen for a specific trigger word and return 'stop' if detected.
        """
        prob_threshold = 0.5
        sampling_rate = self.classifier.feature_extractor.sampling_rate
        time.sleep(3)
        mic = ffmpeg_microphone_live(sampling_rate=sampling_rate, chunk_length_s=0.2, stream_chunk_s=0.25)
        
        self.speak_text("Speak 'STOP' to stop the conversation, just after it.", speed=1.3)
        count = 0
        for prediction in self.classifier(mic):
            prediction = prediction[0]
            if debug:
                print(prediction)
            elif prediction["label"] == excite_word and prediction["score"] > prob_threshold:
                return 'stop'
            count += 1
            if count == 50:
                return
    def get_response(self, user_input):
        if user_input is not None:
            return self.chatbot_chain(user_input)
        return 'No User Input'

    def run(self):
        """
        Main function to run the voice assistant.
        """
        self.speak_text('Hi, I am your intelligent voice assistant. How can I help you?', speed=1.1)
        # Set i to count the empty user_input
        i=0
        while True and i<3:
            user_input = self.get_voice_input()
            if user_input is None:
                self.speak_text("Sorry, I did not get that. Can you please ask again?")
                i+=1
            else:
                print('Querying ...')
                response = self.chatbot_chain(user_input)
                
                # Remove special characters using regex
                clean_text = re.sub(r'[!*#]', '', response['response'])
                
                print(f"Response: {response['response']}")
                self.speak_text(clean_text)
            
                if self.excite_fn(excite_word='stop') == "stop":
                    self.speak_text("Thanks, it was great assisting you.")
                    return "Thanks, it was great assisting you!"
                # Reset the empty input
                i=0

if __name__ == "__main__":
    chatbot = VoiceChatbot('secrets.json')
    chatbot.run()

In [27]:
import tempfile
tempfile.mkdtemp()

'/tmp/tmpwe1bsm21'

In [28]:
import speech_recognition as sr

def transcribe(file):
    recognizer = sr.Recognizer()
    
    with sr.AudioFile(file) as source:
        audio = recognizer.record(source)
    
    try:
        # Transcribe audio file using Google Web Speech API
        text = recognizer.recognize_google(audio)
        print("\nFull Transcription:\n", text)
        return text
    except sr.UnknownValueError:
        print("Google Web Speech API could not understand the audio")
    except sr.RequestError as e:
        print(f"Could not request results from Google Web Speech API; {e}")

# Call the transcribe function
transcription = transcribe('temp/recording.wav')

ValueError: Audio file could not be read as PCM WAV, AIFF/AIFF-C, or Native FLAC; check if file is corrupted or in another format