# AI Assistant

In [None]:
# Importing necessary modules and libraries
import speech_recognition as sr  # for speech recognition
import tensorflow as tf          # deep learning library
import cv2                       # computer vision library
import matplotlib.pyplot as plt  # for plotting
from gtts import gTTS            # text to speech
import pygame                    # multimedia library
import webbrowser                # for web operations

In [None]:
# Uncomment the line below if PyAudio is not installed.
# It is required for the microphone functionality of the speech_recognition library.
# %pip install pyaudio

In [None]:
# List the available microphones together with their device indices.
# Use the index printed next to the microphone you want when opening
# `sr.Microphone(device_index=...)` later in the notebook.
# `list_microphone_names` is a static method, so it is called on the class itself:
# constructing `sr.Microphone()` only to list devices also queries the default input
# device and can fail on a machine that has none.
for index, name in enumerate(sr.Microphone.list_microphone_names()):
    print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name))


In [None]:
def listen_to_user(device_index=3):
    """
    Records one utterance from a microphone and transcribes it with Google's Speech Recognition.

    Args:
        device_index (int or None): Index of the microphone to record from, as printed by
            `sr.Microphone.list_microphone_names()`. Defaults to 3, the value this notebook
            was written with; pass None to use the system's default microphone.

    Returns:
        str or None: The recognized text, or None if the speech could not be understood
        or the recognition service could not be reached.
    """

    # The recognizer does both the recording and the transcription.
    r = sr.Recognizer()

    # Keep the microphone open only while recording.
    with sr.Microphone(device_index=device_index) as source:

        print("listening...")  # Notify the user that the system is ready to listen.

        # Captures audio from the microphone until silence is detected.
        audio = r.listen(source)

    # The microphone is already released here, so the (potentially slow) network
    # round trip to the recognition service does not keep the device busy.
    try:
        # Send the captured audio to Google's speech recognition API.
        return r.recognize_google(audio)

    except sr.UnknownValueError:
        # Raised when the speech is unclear or cannot be recognized.
        # The message is romanized Hindi, roughly "I did not understand".
        print("Muje samaj nahi aaya")
        return None

    except sr.RequestError as error:
        # Raised when the API cannot be reached (no network, quota, bad response).
        # Report it and fail soft instead of crashing the assistant.
        print(f"Speech recognition service is unavailable: {error}")
        return None


**Explanation**:

1. `sr.Recognizer()`: This creates a new recognizer instance which is the main workhorse of the `speech_recognition` library.

2. `sr.Microphone(device_index=3)`: The library supports multiple microphones. Here, we're selecting the microphone with index 3. The right index differs from system to system, so run the microphone-listing cell above and pick the index printed next to the device you want to use.

3. `r.listen(source)`: The recognizer listens to the source (in this case, our microphone) and captures the audio.

4. `r.recognize_google(audio)`: This sends the audio data to Google's Web Speech API for recognition. Google then returns the recognized text.

5. `except sr.UnknownValueError`: If Google's API is unable to recognize the speech, this error is raised. In this code, a message is printed to indicate this, but in a production system, you'd probably want more sophisticated error handling.

In [None]:
# Test the function to see if it captures our voice correctly.
# listen_to_user()

In [None]:
# Build the EfficientNetB3 classifier with its ImageNet-pretrained weights.
# The resulting model object is kept in a module-level name so later cells can reuse it.
efficientnet = tf.keras.applications.EfficientNetB3(
    weights='imagenet',
)

In [None]:
def describe_image(camera_index=2):
    """
    Captures an image using the camera, preprocesses it,
    and uses EfficientNetB3 to predict and describe the image's content.

    Args:
        camera_index (int): Index of the camera to open with OpenCV. Defaults to 2
            (the third camera, since indexing starts at 0), the value this notebook
            was written with; change it if your camera has a different index.

    Returns:
        str: A description of the content in the captured image, or a short
        message saying that no image could be captured.
    """

    # Open the camera and grab a single frame. The capture is released in `finally`
    # so the device is freed even if reading the frame raises.
    cap = cv2.VideoCapture(camera_index)
    try:
        # 'flag' indicates if the capture was successful, 'img' contains the captured frame.
        flag, img = cap.read()
    finally:
        cap.release()

    # A missing/busy camera yields flag == False and img == None; report that instead
    # of failing later when the frame is indexed.
    if not flag or img is None:
        return 'I could not capture an image from the camera'

    # OpenCV delivers frames as BGR; matplotlib and the model expect RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Display the captured image using matplotlib.
    plt.imshow(img)

    # Resize to 300x300 pixels, the input size this notebook feeds to EfficientNetB3.
    img_array = cv2.resize(img, (300, 300))

    # The network expects a batch of images, so turn the single image into a batch of one.
    img_array = tf.expand_dims(img_array, axis=0)

    # Apply the model family's preprocessing hook before prediction.
    img_array = tf.keras.applications.efficientnet.preprocess_input(img_array)

    # Predict a probability distribution over the model's categories.
    prediction = efficientnet.predict(img_array)

    # Keep only the top prediction: [0] = first image in the batch, [0] = best match,
    # [1] = the human-readable label of that match.
    decoded_pred = tf.keras.applications.efficientnet.decode_predictions(prediction, top=1)[0][0][1]

    # Return a descriptive string with the predicted label.
    return f'I think the image contains {decoded_pred}'


In [None]:
# Test the function by describing an image captured by our camera.
# describe_image()

In [None]:
def no_gen():
    """
    Endless counter implemented as a generator.

    Yields:
        int: 1 on the first request, then 2, 3, ... with no upper bound.
    """

    # Start one below the first value so the loop can bump first and yield second.
    counter = 0

    # No exit condition: the generator hands out numbers for as long as it is asked.
    while True:
        counter += 1

        # Execution pauses here and resumes at the top of the loop on the next request.
        yield counter


# Module-level counter instance. Nothing runs until `next(gen)` is called.
gen = no_gen()

def speak(text):
    """
    Converts the provided text into speech using Google Text-to-Speech (gTTS)
    and plays the resulting audio using the pygame library.

    The call blocks until playback has finished. Empty or whitespace-only text
    is ignored, because there is nothing to synthesize.

    Args:
        text (str): The text to be converted into speech.
    """

    # Nothing to say: skip synthesis entirely rather than sending empty text to gTTS.
    if not text or not text.strip():
        return

    # Synthesize the text as English speech.
    tts = gTTS(text=text, lang='en')

    # Take the next number from the module-level counter so each audio file
    # gets its own name.
    resp_no = next(gen)
    audio_file = f'response{resp_no}.mp3'

    # Save the speech audio into an mp3 file.
    tts.save(audio_file)

    # Prepare pygame for audio playback, then load and start the saved file.
    pygame.mixer.init()
    pygame.mixer.music.load(audio_file)
    pygame.mixer.music.play()

    # Block until playback is over. Sleeping between polls keeps the wait from
    # spinning a CPU core at 100% the way a bare `continue` loop does.
    while pygame.mixer.music.get_busy():
        pygame.time.wait(100)


In [None]:
# Test the function by making it speak "how are you".
# speak("how are you")

**Explanation**:

1. `no_gen()`: This is a generator function. It's a special type of function that can pause its execution and resume from where it left off. Here, it's used to produce an endless sequence of integers. It's a neat way of generating unique identifiers without having to keep track of state outside the function.

2. `gen = no_gen()`: Creates a generator object. Now, every time you call `next(gen)`, you'll get the next number in the sequence.

3. `speak(text)`: This function is responsible for converting the given text into speech and then playing that speech. It uses the Google Text-to-Speech (gTTS) service to create an audio file from the text, and then the `pygame` library to play that audio. The use of the generator function ensures that each generated audio file has a unique name.

In [None]:
def main():
    """
    The main function to drive the entire operation.
    Listens to the user, performs the matching action, and responds aloud.

    Supported phrases (matched case-insensitively anywhere in the utterance):
        - "describe the image": capture a picture and describe it.
        - "play my song": open YouTube in the default browser.
    Anything else gets a spoken "don't know" reply. If nothing was recognized,
    the function returns without responding.
    """

    # Capture the user's voice and try to convert it into text.
    user_input = listen_to_user()

    # Nothing recognized (None or empty string): there is nothing to respond to.
    if not user_input:
        return

    # Normalize once so every command check is case-insensitive.
    command = user_input.lower()

    if 'describe the image' in command:
        # Capture an image with the camera and describe its content.
        response = describe_image()

    elif 'play my song' in command:
        # Open YouTube in the default browser.
        webbrowser.open('https://www.youtube.com/')
        response = "playing your song"

    else:
        # Without this branch `response` would be unbound below and the
        # assistant would crash on any phrase it does not know.
        response = "Sorry, I don't know how to do that yet"

    # Print out what the assistant will respond with.
    print('Assistant Resp : ', response)

    # Convert the response into speech and play it.
    speak(response)


In [None]:

# Uncomment the following line to run the main function and start the voice assistant.
# main()
