# Lab Exercise 04

In [None]:
# Notebook author. The name is stored as proper Unicode text; the previous
# value was mojibake (UTF-8 bytes that had been decoded as Latin-1).
__author__ = "Víctor Vega Sobral"

---

# Local Part

---

## 1. Local Speech to Text 

In [2]:
import whisper

### 1.1 Loading the Whisper Turbo model

In [3]:
# Load the Whisper checkpoint registered under the name "turbo".
# No device is passed here, so Whisper's own default device choice applies.
model = whisper.load_model(name="turbo")

  checkpoint = torch.load(fp, map_location=device)


### 1.2 Padding to 30 seconds

In [4]:
# Read the clip and fit it to the 30-second window in a single step:
# pad_or_trim pads or trims the loaded audio to that length.
audio = whisper.pad_or_trim(whisper.load_audio("55_radio_47.mp3"))


### 1.3 Log-Mel spectrogram

In [5]:
# Build the log-Mel spectrogram using the number of Mel bins the loaded model
# declares, then move it to the device that holds the model.
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels)
mel = mel.to(model.device)

# Spoken-language detection: `probs` is looked up by language key, so the
# detected language is the key with the highest score.
_, probs = model.detect_language(mel)
language_code = max(probs, key=probs.get)
print(f"Detected language: {language_code}")

Detected language: en


### 1.4 Decoding the audio

In [6]:
# Decode the spectrogram into text.
# DecodingOptions requests half precision (fp16) by default, which is not
# reliably supported on CPU. Ask for fp16 only when the model actually lives
# on a CUDA device so this cell also runs on CPU-only machines.
options = whisper.DecodingOptions(fp16=(model.device.type == "cuda"))
result = whisper.decode(model, mel, options)

# Show the recognized text
print(result.text)

Rain expected in 4 minutes. What is the rain intensity? 2 to 3.


---

## 2. Local Text to Speech

Using ``Coqui TTS``, a fork of Mozilla TTS.

In [None]:
from TTS.api import TTS

### 2.1 Initializing TTS model

In [None]:
# Coqui TTS with the English LJSpeech Tacotron2-DDC model.
# The progress bar is enabled and GPU use is switched off.
tts = TTS(
    model_name="tts_models/en/ljspeech/tacotron2-DDC",
    progress_bar=True,
    gpu=False,
)

### 2.2 Converting text to speech

In [None]:
from pathlib import Path

# Make sure the target folder exists before synthesis: writing into a missing
# "outputs/" directory fails on a fresh checkout.
Path("outputs").mkdir(parents=True, exist_ok=True)

# NOTE(review): Coqui TTS appears to write WAV-encoded data regardless of the
# file extension — confirm. The path is kept as-is because the Google
# Speech-to-Text cell reads this exact file.
tts.tts_to_file(
    text="Rain expected in 4 minutes. What is the rain intensity? 2 to 3.",
    file_path="outputs/output.mp3",
)

---

# External API Services Part

### 1.1 Google Speech to Text External API

In [None]:
# pip install SpeechRecognition
import speech_recognition as sr

# Record the audio from the previously synthesized file. The file is only
# needed while recording, so the `with` block ends before recognition starts.
r = sr.Recognizer()
with sr.AudioFile("outputs/output.mp3") as source:
    audio = r.record(source)

# This function calls the Google external API
text = r.recognize_google(audio)
print(text)

### 1.2 Google Text to Speech External API

In [None]:
# pip install gTTS
from gtts import gTTS

from pathlib import Path

# Use a dedicated name for the gTTS request: elsewhere in this notebook `tts`
# holds the Coqui TTS model, and rebinding it here would break re-running the
# Coqui synthesis cell (a gTTS object has a different API).
google_tts = gTTS("This is a test for Google text to speech external api", lang="en")

# Create the output folder if needed so save() does not fail on a missing
# directory.
Path("outputs").mkdir(parents=True, exist_ok=True)
google_tts.save("outputs/google_tts_output.mp3")

---

# Selected Speech Task: speech to speech translation

An easy pipeline for this task would be the following:

1. First, use Whisper to transcribe the audio to text.

2. Then, translate the text (Spanish to English).

3. Finally, use Coqui TTS (a fork of Mozilla TTS) to create the new audio.

However, this approach adds a lot of latency because it chains three models and can be slow, so I will try ``NVIDIA NeMo``, a PyTorch-based toolkit with robust support for speech-to-speech models.

## Update: changing to first approach

NVIDIA NeMo is causing many problems because it depends on compiled tools that require WSL or a Linux distribution. Therefore, I will fall back to the first approach, even though it is not the most efficient one.

In [None]:
import torch
import whisper
from transformers import MarianMTModel, MarianTokenizer
from TTS.api import TTS

In [None]:
# Configuration: paths and model names
INPUT_AUDIO = "spanish_audio.mp3"            # Path to the Spanish input audio file
# Coqui TTS writes WAV-encoded audio, so the output path uses a .wav
# extension; an .mp3 name would label WAV data as MP3.
OUTPUT_AUDIO = "english_translation.wav"     # Path to save the translated English audio
WHISPER_MODEL = "turbo"                      # Whisper model name, e.g. tiny, base, small, medium, large, turbo
MT_MODEL = "Helsinki-NLP/opus-mt-es-en"      # Spanish -> English MarianMT checkpoint
TTS_MODEL = "tts_models/en/ljspeech/tacotron2-DDC"  # Coqui TTS English voice


In [None]:
# Pick the compute device once: CUDA when PyTorch reports it as available,
# otherwise the CPU. USE_GPU keeps the boolean, DEVICE the matching name.
USE_GPU = torch.cuda.is_available()
if USE_GPU:
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"[INFO] Using device: {DEVICE}")

In [None]:
# 1) Whisper ASR model, loaded by its configured name onto the selected device
asr_model = whisper.load_model(name=WHISPER_MODEL, device=DEVICE)

In [None]:
# 2) MarianMT: the tokenizer and the translation model come from the same
#    checkpoint; the model is then moved onto DEVICE.
tokenizer = MarianTokenizer.from_pretrained(MT_MODEL)
mt_model = MarianMTModel.from_pretrained(MT_MODEL)
mt_model = mt_model.to(DEVICE)

In [None]:
# 3) Coqui TTS model for the English output; GPU use follows USE_GPU and the
#    progress bar is disabled.
tts = TTS(
    model_name=TTS_MODEL,
    progress_bar=False,
    gpu=USE_GPU,
)


In [None]:
# -- ASR: Spanish speech -> Spanish text --
print(f"[ASR] Transcribing {INPUT_AUDIO}...")

# The language is passed explicitly as Spanish ("es").
asr_result = asr_model.transcribe(INPUT_AUDIO, language="es")

# Keep only the transcript text, without leading/trailing whitespace.
raw_transcript = asr_result["text"]
spanish_text = raw_transcript.strip()
print(f"[ASR] Detected Spanish Text: {spanish_text}")



In [None]:
import re

# -- MT: Translate Spanish text to English --
print("[MT] Translating to English...")

# Translate sentence by sentence instead of feeding the whole transcript as
# one sequence: a long transcript can exceed the translation model's maximum
# input length. Splitting happens after sentence-ending punctuation; if no
# non-empty piece results, the original text is used as the single input.
sentences = [s for s in re.split(r"(?<=[.!?])\s+", spanish_text) if s] or [spanish_text]

# truncation=True is a safety net for a single over-long sentence.
batch = tokenizer(
    sentences,
    return_tensors="pt",
    padding=True,
    truncation=True,
).to(DEVICE)
generated = mt_model.generate(**batch)

# Decode every translated sentence and stitch them back into one string.
english_text = " ".join(tokenizer.batch_decode(generated, skip_special_tokens=True))
print(f"[MT] Translated English Text: {english_text}")


In [None]:
# -- TTS: English text -> English speech --
print(f"[TTS] Generating English speech to {OUTPUT_AUDIO}...")

# Synthesize the translated text straight into the configured output file.
tts.tts_to_file(
    text=english_text,
    file_path=OUTPUT_AUDIO,
)
print(f"[TTS] Saved English audio at {OUTPUT_AUDIO}")

# Final marker for the ASR -> MT -> TTS pipeline.
print("[INFO] Translation pipeline complete.")