### Hugging Face speech-to-text model (Whisper)

In [6]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from pydub import AudioSegment
import torchaudio
import torchaudio.transforms as transforms
import torch
import os

# Function to convert .m4a to .wav
def convert_m4a_to_wav(audio_path):
    """Convert an audio file to .wav and return the path of the new file.

    The output is written next to the input, with only the final file
    extension swapped for ``.wav``.

    Parameters:
        audio_path (str): Path to the source audio file (typically .m4a).

    Returns:
        str: Path of the exported .wav file, or ``audio_path`` unchanged when
        swapping the extension yields the same path (input is already .wav).

    Raises:
        FileNotFoundError: If ``audio_path`` does not exist.
    """
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"The file {audio_path} does not exist.")

    # Swap only the trailing extension. A plain str.replace(".m4a", ".wav")
    # would also rewrite ".m4a" inside directory names and would miss
    # upper-case extensions such as ".M4A".
    stem, _ = os.path.splitext(audio_path)
    wav_path = stem + ".wav"

    # Nothing to convert; exporting here would overwrite the source in place.
    if wav_path == audio_path:
        return audio_path

    audio = AudioSegment.from_file(audio_path)
    audio.export(wav_path, format="wav")
    return wav_path

# Function to load the Whisper model and processor
def load_whisper_model(model_name="openai/whisper-tiny.en"):
    """Fetch a pretrained Whisper checkpoint together with its processor.

    Parameters:
        model_name (str): Checkpoint identifier handed to ``from_pretrained``
            for both the processor and the model.

    Returns:
        tuple: ``(processor, model)``, where the model's config has
        ``forced_decoder_ids`` reset to ``None``.
    """
    whisper_processor = WhisperProcessor.from_pretrained(model_name)
    whisper_model = WhisperForConditionalGeneration.from_pretrained(model_name)

    # Drop whatever decoder ids the checkpoint's config would force.
    model_config = whisper_model.config
    model_config.forced_decoder_ids = None

    return whisper_processor, whisper_model

# Function to preprocess and transcribe audio
def transcribe_audio(audio_path, processor, model):
    """Transcribe an audio file to text with a Whisper processor/model pair.

    Parameters:
        audio_path (str): Path to the audio file. Paths ending in ".m4a" are
            converted to .wav first via ``convert_m4a_to_wav``.
        processor: Callable that turns a 1-D waveform into input features and
            also provides ``batch_decode`` (e.g. the processor returned by
            ``load_whisper_model``).
        model: Object exposing ``generate(input_features)`` (e.g. the model
            returned by ``load_whisper_model``).

    Returns:
        str: The first decoded transcription, with special tokens removed.
    """
    target_rate = 16000  # sampling rate handed to the processor

    # Convert .m4a to .wav if necessary
    if audio_path.endswith(".m4a"):
        audio_path = convert_m4a_to_wav(audio_path)

    # Load the audio file; torchaudio returns (channels, frames) by default.
    waveform, sample_rate = torchaudio.load(audio_path)

    # Downmix multi-channel audio to mono. squeeze() only removes a size-1
    # channel axis, so without this a stereo file would reach the processor
    # as a 2-D array instead of a single waveform.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to the target rate if needed
    if sample_rate != target_rate:
        resampler = transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)

    # Convert audio to input features
    # NOTE(review): Whisper presumably pads/truncates input to a fixed 30 s
    # window, so longer recordings may be cut off - confirm and chunk if needed.
    input_features = processor(
        waveform.squeeze().numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
    ).input_features

    # Generate token ids using the model's default generation settings
    predicted_ids = model.generate(input_features)

    # Decode token ids to text; only the first sequence is returned
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription

# Load the Whisper model and processor
processor, model = load_whisper_model()

# Path to the audio file, resolved against the notebook's working directory
# instead of a machine-specific absolute path so the cell runs on any checkout.
audio_file = os.path.join(os.getcwd(), "recordings", "my_voice.wav")

# Transcribe the audio
transcription = transcribe_audio(audio_file, processor, model)
print("Transcription:", transcription)


Transcription:  I'm a vegetarian. I love cooked vegetables, but I don't know what's cooked. If I give me some recipe, I can use


### Recording audio (just to test the model)

In [21]:

import sounddevice as sd
from scipy.io.wavfile import write
import os

def record_audio(duration, save_path, sampling_rate=16000):
    """
    Records audio from the microphone and saves it as a .wav file.

    Parameters:
        duration (int or float): Duration of the recording in seconds.
        save_path (str): Path where the recorded audio will be saved
            (including filename). A bare filename is written to the current
            working directory.
        sampling_rate (int): Samples per second used for both recording and
            the saved file. Defaults to 16000 (16 kHz is commonly used for
            speech).
    """
    # Ensure the save directory exists. dirname() is "" for a bare filename,
    # and os.makedirs("") raises, so only create a directory when one is named.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    print("Recording... Speak into the microphone.")
    # Record a single (mono) channel of 16-bit samples
    audio_data = sd.rec(int(duration * sampling_rate), samplerate=sampling_rate, channels=1, dtype='int16')
    sd.wait()  # Wait until recording is finished
    print("Recording complete.")

    # Save the audio file
    write(save_path, sampling_rate, audio_data)
    print(f"Audio saved to: {save_path}")

# Demo: capture a short clip when this cell is executed directly
if __name__ == "__main__":
    save_file_path = "recordings/my_voice.wav"  # output directory and filename
    duration_in_seconds = 10  # length of the recording, in seconds
    record_audio(duration=duration_in_seconds, save_path=save_file_path)


Recording... Speak into the microphone.
Recording complete.
Audio saved to: recordings/my_voice.wav


In [4]:
# Build the recording's path from the notebook's current working directory
current_directory = os.getcwd()
audio_file = os.path.join(current_directory, "recordings", "my_voice.wav")

# Show both values, one per line
print(current_directory, audio_file, sep="\n")

c:\Users\USER\Desktop\Projects\Recipe_Generator\Notebooks
c:\Users\USER\Desktop\Projects\Recipe_Generator\Notebooks\recordings\my_voice.wav


## Finally: record and transcribe in one go


Recording... Speak into the microphone.
Recording complete.
Transcription:  I'm gonna go to the hospital.
