<a href="https://colab.research.google.com/github/Palak1593/Audio_to_text/blob/main/Audio_To_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install SpeechRecognition # allows you to recognize and transcribe speech from various audio sources, such as microphones or audio files,
!pip install pydub # simplifies working with audio files.Perform various operations on audio, such as splitting, merging, adjusting volume,convert audio formats, applying effects, and more.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.0-py2.py3-none-any.whl (32.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [2]:
import speech_recognition as sr
import os #provides a way to interact with the operating system,
          #allowing you to perform various operations related to file management, directory manipulation, environment variables, and more.
import subprocess
from google.colab import files # used in Google Colab notebooks to interact with files and user input/output
from pydub import AudioSegment  #imports the AudioSegment class from the PyDub library in Python.
                               #can easily load, manipulate, and export audio files in your Python code.
from pydub.silence import split_on_silence #imports the split_on_silence function from the pydub.silence module in the PyDub library.
                                           #This function allows you to split an audio file into segments based on periods of silence.


In [None]:
# Upload the audio file from the local machine into the Google Colab environment.
# The result of files.upload() is bound to a name so the notebook does not echo the
# returned value as the cell output; only the uploaded filenames are printed.
# NOTE(review): assumes files.upload() returns a mapping keyed by filename — confirm.
uploaded = files.upload()
print("Uploaded:", list(uploaded))

In [9]:
# Convert the uploaded mp3 file to wav.
# -y lets ffmpeg overwrite an existing wav_file.wav, so re-running this cell does not
# stop at ffmpeg's overwrite prompt. check_call returns 0 on success (same cell output
# as before) and raises CalledProcessError on a non-zero exit instead of ignoring it.
subprocess.check_call(['ffmpeg', '-y', '-i', 'hello (1).mp3', 'wav_file.wav'])

0

In [24]:
# initialize the recognizer
r = sr.Recognizer()       # Module-level Recognizer instance from the speech_recognition module.
                          # get_large_audio_transcription() uses it (r.record / r.recognize_google) for every chunk.

# a function that splits the audio file into chunks
# and applies speech recognition

def get_large_audio_transcription(path, *, min_silence_len=700, silence_offset_db=14,
                                  keep_silence=700, folder_name="audio-chunks"):
    """
    Split a large WAV file into chunks on silence and apply speech recognition to each chunk.

    Every chunk is exported as ``chunk<i>.wav`` (numbered from 1) inside ``folder_name``
    and transcribed with the module-level recognizer ``r``.

    Args:
        path: Path of the WAV file to transcribe.
        min_silence_len: Minimum duration of silence, in milliseconds, treated as a split point.
        silence_offset_db: How far below the audio's overall level (``sound.dBFS``) the
            silence threshold is placed.
        keep_silence: Value forwarded to ``split_on_silence`` as ``keep_silence``
            (milliseconds of silence kept around each chunk).
        folder_name: Directory that receives the exported chunk files; created if missing.

    Returns:
        The recognized text of all chunks, each capitalized and followed by ". ",
        concatenated in chunk order. Chunks that raise ``sr.UnknownValueError`` are
        reported on stdout and skipped.

    Only ``sr.UnknownValueError`` is handled here; any other exception raised while
    loading, exporting or recognizing propagates to the caller.
    """
    # open the audio file using pydub
    sound = AudioSegment.from_wav(path)

    # split the audio wherever the silence lasts `min_silence_len` ms or more
    chunks = split_on_silence(
        sound,
        min_silence_len=min_silence_len,
        # threshold is relative to the level of this particular recording
        silence_thresh=sound.dBFS - silence_offset_db,
        keep_silence=keep_silence,
    )

    # create the directory that stores the audio chunks (no error if it already exists)
    os.makedirs(folder_name, exist_ok=True)

    recognized = []

    # process each chunk; numbering starts at 1 so the files are chunk1.wav, chunk2.wav, ...
    for i, audio_chunk in enumerate(chunks, start=1):
        # export the chunk as a WAV file inside `folder_name`
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")

        # read the exported chunk back and recognize it
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)

            try:
                text = r.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                # The exception may carry no message; fall back to one that names the
                # chunk so the printed line is never just "Error:".
                print("Error:", str(e) or f"could not recognize speech in {chunk_filename}")
            else:
                # capitalize the recognized text and terminate it with a period and a space
                text = f"{text.capitalize()}. "
                print(chunk_filename, ":", text)
                recognized.append(text)

    # return the text for all chunks detected
    return "".join(recognized)


In [25]:
# Use the same relative name the ffmpeg conversion cell writes to; the hard-coded
# /content/wav_file.wav only points at that file when the working directory is /content.
path = "wav_file.wav"
full_text = get_large_audio_transcription(path)
print("\nFull text:", full_text)


audio-chunks/chunk1.wav : Try and keep on trying. 
audio-chunks/chunk2.wav : The rule that must be followed underscore to become an expert in anything. 

Full text: Try and keep on trying. The rule that must be followed underscore to become an expert in anything. 
