# import vosk
import pyaudio
import json
import os
from gtts import gTTS
import wave
from transformers import MarianMTModel, MarianTokenizer

### Getting all the input devices

In [3]:
# Initialize PyAudio
p = pyaudio.PyAudio()

# Get the number of audio input devices
device_count = p.get_device_count()

print("Available Audio Input Devices:")
for i in range(device_count):
    info = p.get_device_info_by_index(i)
    if info['maxInputChannels'] > 0:  # Check if it's an input device
        print(f"Device {i}: {info['name']}")

# Terminate PyAudio
p.terminate()


Available Audio Input Devices:
Device 0: Microsoft Sound Mapper - Input
Device 1: Microphone (Razer Seiren Mini)
Device 2: Microphone (HD Webcam eMeet C96
Device 8: Primary Sound Capture Driver
Device 9: Microphone (Razer Seiren Mini)
Device 10: Microphone (HD Webcam eMeet C960)
Device 20: Microphone (HD Webcam eMeet C960)
Device 21: Microphone (Razer Seiren Mini)
Device 23: Stereo Mix (Realtek HD Audio Stereo input)
Device 25: Line In (Realtek HD Audio Line input)
Device 27: Microphone (Realtek HD Audio Mic input)
Device 30: Input (OCULUSVAD Wave Speaker Headphone)
Device 31: Headset Microphone (OCULUSVAD Wave Microphone Headphone)
Device 32: Microphone (Razer Seiren Mini)
Device 35: SteelSeries Sonar - Stream (SteelSeries_Sonar_VAD Stream Wave)
Device 37: SteelSeries Sonar - Microphone (SteelSeries_Sonar_VAD Chat Capture Wave)
Device 41: Microphone (VDVAD Wave)
Device 45: Microphone (HD Webcam eMeet C960)
Device 47: Headset (@System32\drivers\bthhfenum.sys,#2;%1 Hands-Free%0
;(WH-100

### setting index to set defalt input device

In [4]:
device_index = 1

## Using Model

In [None]:
model_path = "./vosk-model-en-us-0.42-gigaspeech"

# # Check if the model path exists
if not os.path.exists(model_path):
    print("Model path does not exist.")
else:
    try:
        # Initialize the model
        # model = vosk.Model(model_path)
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Failed to create a model: {e}")

Model loaded successfully.


In [6]:
import pyaudio
import json
from vosk import Model, KaldiRecognizer

def record_audio(device_index=0, model_path="model"):
    """Records audio from the microphone, recognizes speech until 'stop' is detected, 
       and removes the final 'stop' word from the result."""
    
    # Initialize the recognizer model (make sure model_path points to your Vosk model)
    model = Model(model_path)
    rec = KaldiRecognizer(model, 16000)
    
    # Initialize the audio stream
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    input_device_index=device_index,
                    frames_per_buffer=8192)
    
    print(f"Using microphone: {p.get_device_info_by_index(device_index)['name']}")
    recognized_text = ""

    # Stream audio and recognize speech
    print("Listening for speech. Say 'Stop' to stop.")
    while True:
        data = stream.read(4096, exception_on_overflow=False)
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            text = result.get("text", "")
            recognized_text += text + " "
            print(text)
            
            # Stop condition
            if "stop" in text.lower():
                print("Stop keyword detected. Stopping...")
                break

    # Clean up audio stream
    stream.stop_stream()
    stream.close()
    p.terminate()
    
    # Remove the final 'stop' word
    words = recognized_text.strip().split()
    if words and words[-1].lower() == "stop":
        words.pop()
        words.pop()
    
    cleaned_text = " ".join(words)
    print("Last word 'stop' removed successfully!")
    
    return cleaned_text  # Return the cleaned recognized text


In [7]:
#model = vosk.Model(lang="en-in")

In [8]:
# # Create a recognizer
# rec = vosk.KaldiRecognizer(model, 16000)

In [9]:
# # Open the microphone stream
# p = pyaudio.PyAudio()
# stream = p.open(format=pyaudio.paInt16,
#                 channels=1,
#                 rate=16000,
#                 input=True,
#                 input_device_index= device_index,
#                 frames_per_buffer=8192)
# print(f"Using microphone: {p.get_device_info_by_index(device_index)['name']}")

In [10]:
# # Specify the path for the output text file
# output_file_path = "recognized_text.txt"

In [11]:
# # Open a text file in write mode using a 'with' block
# with open(output_file_path, "w") as output_file:
#     print("Listening for speech. Say 'Stop' to stop.")
#     # Start streaming and recognize speech
#     while True:
#         data = stream.read(4096,exception_on_overflow= False)#read in chunks of 4096 bytes
#         if rec.AcceptWaveform(data):#accept waveform of input voice
#             # Parse the JSON result and get the recognized text
#             result = json.loads(rec.Result())
#             recognized_text = result['text']
            
#             # Write recognized text to the file
#             output_file.write(recognized_text + "\n")
#             print(recognized_text)
            
#             # Check for the termination keyword
#             if "stop" in recognized_text.lower():
#                 print("Stop keyword detected. Stopping...")
#                 break

In [12]:
# # Stop and close the stream
# stream.stop_stream()
# stream.close()

In [13]:
# # Terminate the PyAudio object
# p.terminate()

### checking my own recording

In [14]:


# # Set parameters for audio stream
# FORMAT = pyaudio.paInt16  # Audio format (16-bit PCM)
# CHANNELS = 1              # Number of audio channels
# RATE = 16000              # Sample rate (16 kHz)
# CHUNK = 1024              # Buffer size (number of frames per buffer)

# # Create a PyAudio object
# p = pyaudio.PyAudio()

# # Open a stream to capture audio
# stream = p.open(format=FORMAT,
#                  channels=CHANNELS,
#                  rate=RATE,
#                  input=True,
#                  frames_per_buffer=CHUNK)

# print("Recording...")

# frames = []

# # Record audio for a certain duration (e.g., 5 seconds)
# for _ in range(0, int(RATE / CHUNK * 5)):  # Change 5 to the desired duration
#     data = stream.read(CHUNK)
#     frames.append(data)

# print("Finished recording.")

# # Stop and close the stream
# stream.stop_stream()
# stream.close()
# p.terminate()

# # Save the recorded audio to a WAV file
# output_file = "output.wav"
# with wave.open(output_file, 'wb') as wf:
#     wf.setnchannels(CHANNELS)
#     wf.setsampwidth(p.get_sample_size(FORMAT))
#     wf.setframerate(RATE)
#     wf.writeframes(b''.join(frames))

# print(f"Audio saved to {output_file}.")


### Converting into string

In [15]:
# with open ("recognized_text.txt", 'r') as file:
#     text = "".join(line.rstrip() for line in file)
#     print(text)

In [16]:


def speech_to_text(audio_data, model_path="model"):
    """Converts recorded audio data to text using Vosk speech recognition model.
    
    Args:
        audio_data (bytes): Audio data to process.
        model_path (str): Path to the Vosk model directory.

    Returns:
        str: Recognized text from the audio data.
    """
    
    # Initialize the recognizer model
    model = Model(model_path)
    rec = KaldiRecognizer(model, 16000)
    
    # Recognize speech from audio data
    recognized_text = ""
    
    # Feed the audio data in chunks to the recognizer
    if rec.AcceptWaveform(audio_data):
        result = json.loads(rec.Result())
        recognized_text = result.get("text", "")
    
    print("Recognized Text:", recognized_text)
    return recognized_text


In [17]:

file_path = "recognized_text.txt"


with open(file_path, "r") as file:
    text = file.read()


words = text.split()  # Split the text into a list of words
if words:  # Ensure there's at least one word
    words.pop()
    words.pop()# Remove the last word


with open(file_path, "w") as file:
    file.write(" ".join(words))  

print("Last word deleted successfully!") # deleting the last word as it is stop
print(text)

Last word deleted successfully!



In [18]:
text

''

### English to French

In [19]:
# english to french

import torch
print(torch.__version__)

src_text = text 
model_name = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors = 'pt', padding = True))
tgt_text = [tokenizer.decode(t, skip_special_tokens = True) for t in translated]

2.5.1+cpu




In [20]:
tgt_text

["Le présent règlement entre en vigueur le jour suivant celui de sa publication au Journal officiel de l'Union européenne."]

### French to English

In [21]:
# french to english


src_text = tgt_text
model_name = "Helsinki-NLP/opus-mt-fr-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors = 'pt', padding = True))
tgt_text = [tokenizer.decode(t, skip_special_tokens = True) for t in translated]

In [22]:
tgt_text

['This Regulation shall enter into force on the day following its publication in the Official Journal of the European Union.']

### English to Spanish

In [23]:
# English to Spanish
src_text = text
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors='pt', padding=True))
tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]


In [24]:
tgt_text

['- No, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no.']

### Spanish to English

In [25]:
# Spanish to English
src_text = tgt_text
model_name = "Helsinki-NLP/opus-mt-es-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors='pt', padding=True))
tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]


In [26]:
tgt_text

['- No, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no.']

In [27]:
from transformers import MarianMTModel, MarianTokenizer

def translate_text(src_text, target_language_model):
    """Translates the source text to the desired language using the specified model.

    Args:
        src_text (str): The source text in the original language.
        target_language_model (str): The name of the MarianMT model for the target language translation.

    Returns:
        str: Translated text in the target language.
    """
    
    # Initialize the model and tokenizer for the target language
    tokenizer = MarianTokenizer.from_pretrained(target_language_model)
    model = MarianMTModel.from_pretrained(target_language_model)
    
    # Tokenize the source text and generate the translation
    translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
    
    # Decode and return the translated text
    tgt_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    
    return tgt_text


### Text to Speech using gTTS

In [28]:
# my_text = "".join(tgt_text)
# language = 'es'
# # Passing the text and language to the engine, 
# # here we have marked slow=False. Which tells 
# # the module that the converted audio should 
# # have a high speed
# myobj = gTTS(text = my_text, lang = language, slow = False)
# myobj.save("target.mp3")
# os.system("target.mp3")

In [29]:
from gtts import gTTS
from io import BytesIO

def text_to_speech(text, language_code):
    """Converts text to speech in the specified language and returns the audio data.
    
    Args:
        text (str): The text to convert to speech.
        language_code (str): The language code for the desired speech language (e.g., "en" for English, "fr" for French).
        
    Returns:
        BytesIO: In-memory audio file with the spoken text.
    """
    
    # Convert text to speech
    tts = gTTS(text=text, lang=language_code)
    
    # Save audio to an in-memory file
    audio_fp = BytesIO()
    tts.write_to_fp(audio_fp)
    audio_fp.seek(0)  # Reset file pointer to the beginning
    
    return audio_fp


### Streamlit

In [3]:
!streamlit run streamlit.py
# import streamlit as st
# from io import BytesIO

# # Assuming the following functions exist and are directly callable:
# # - record_audio() -> records English audio and returns audio data
# # - speech_to_text(audio_data) -> converts recorded audio to text
# # - translate_text(src_text, target_language_model) -> translates text to the desired language
# # - text_to_speech(text, language_code) -> converts text to speech and returns audio file

# # Language options dictionary, mapping language names to translation model names and language codes
# language_options = {
#     "French": ("Helsinki-NLP/opus-mt-en-fr", "fr"),
#     "Spanish": ("Helsinki-NLP/opus-mt-en-es", "es")
# }

# # Streamlit GUI setup
# st.title("Multilingual Audio Translator")

# # Step 1: Record Audio
# st.header("Step 1: Record English Audio")
# if st.button("Start Recording"):
#     st.info("Recording... Please speak now.")
#     audio_data = record_audio()  # Use your function to record audio
#     st.success("Recording complete.")
    
#     # Convert recorded audio to text (Speech-to-Text)
#     try:
#         src_text = speech_to_text(audio_data)  # Convert to text using your function
#         st.write("Recognized Text:", src_text)
#     except Exception as e:
#         st.error(f"Error: {e}")

#     # Step 2: Choose Translation Language
#     st.header("Step 2: Translate Text")
#     target_language = st.selectbox("Select target language:", list(language_options.keys()))
    
#     if target_language and src_text:
#         model_name, lang_code = language_options[target_language]
#         translation = translate_text(src_text, model_name)  # Translate using your function
#         st.write("Translated Text:", translation)

#         # Step 3: Text-to-Speech
#         st.header("Step 3: Play Translated Speech")
#         audio_fp = text_to_speech(translation, lang_code)  # Convert to speech using your function
#         st.audio(audio_fp, format="audio/mp3")




^C


In [None]:
# import streamlit as st
# from transformers import MarianMTModel, MarianTokenizer

# # Translation function
# def translate_text(src_text, model_name):
#     tokenizer = MarianTokenizer.from_pretrained(model_name)
#     model = MarianMTModel.from_pretrained(model_name)
#     translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
#     tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
#     return tgt_text[0]

# # Streamlit GUI
# st.title("Multilingual Translator")

# # Language options
# language_options = {
#     "English to French": "Helsinki-NLP/opus-mt-en-fr",
#     "French to English": "Helsinki-NLP/opus-mt-fr-en",
#     "Spanish to English": "Helsinki-NLP/opus-mt-es-en",
#     "English to Spanish": "Helsinki-NLP/opus-mt-en-es",
# }

# # User inputs
# st.subheader("Select Translation Direction")
# translation_direction = st.selectbox("Choose a translation direction:", list(language_options.keys()))

# st.subheader("Enter Text to Translate")
# src_text = st.text_area("Input text here")

# # Translate button
# if st.button("Translate"):
#     if src_text:
#         model_name = language_options[translation_direction]
#         translation = translate_text(src_text, model_name)
#         st.subheader("Translated Text")
#         st.write(translation)
#     else:
#         st.warning("Please enter text to translate.")
