# Lab Exercise 04

In [1]:
__author__ = "Víctor Vega Sobral"

---

# Local Part

---

## 1. Local Speech to Text 

In [2]:
import whisper

### 1.1 Loading the Whisper Turbo model

In [3]:
# Load the Whisper checkpoint named "turbo". The resulting `model` object is
# reused by the following cells (mel-bin count, target device, language
# detection and decoding).
model = whisper.load_model("turbo")

  checkpoint = torch.load(fp, map_location=device)


### 1.2 Padding to 30 seconds

In [4]:
# Read the recording and bring it to the 30-second length expected downstream
# (shorter clips are padded, longer ones are trimmed).
audio = whisper.pad_or_trim(whisper.load_audio("55_radio_47.mp3"))


### 1.3 Log-Mel spectrogram

In [5]:
# Compute the log-Mel spectrogram using the model's own mel-bin count,
# then place it on the device the model lives on.
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels)
mel = mel.to(model.device)

# Language identification: `probs` is keyed by language code, so the key with
# the largest value is the detected language.
_, probs = model.detect_language(mel)
detected_language = max(probs, key=probs.get)
print(f"Detected language: {detected_language}")

Detected language: en


### 1.4 Decoding the audio

In [6]:
# Decode the spectrogram computed above (`mel`) with the loaded `model`,
# using a DecodingOptions object built with all of its defaults.
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)
# Print the recognized text held in the decoding result
print(result.text)

Rain expected in 4 minutes. What is the rain intensity? 2 to 3.


---

## 2. Local Text to Speech

Using ``Coqui TTS``, a fork of Mozilla TTS.

In [12]:
from TTS.api import TTS

### 2.1 Initializing TTS model

In [13]:
# Tacotron2-DDC English model (LJSpeech). The log printed below shows that Coqui
# pairs it with the hifigan_v2 vocoder. gpu=False keeps synthesis on the CPU.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=True, gpu=False)

 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio P

  return torch.load(f, map_location=map_location, **kwargs)


Removing weight norm...


### 2.2 Converting text to speech

In [14]:
from pathlib import Path

# Make sure the target folder exists first, so this cell also works on a fresh
# checkout (Restart Kernel -> Run All) where "outputs/" has not been created yet.
Path("outputs").mkdir(parents=True, exist_ok=True)

# Synthesize the sentence transcribed by Whisper above and write it to disk.
# The call is the last expression of the cell, so the output path is displayed.
# NOTE(review): a later cell opens this same file with sr.AudioFile, which
# suggests the data written here is WAV despite the ".mp3" extension -- confirm,
# and consider renaming the path to ".wav" in both cells together.
tts.tts_to_file(text = "Rain expected in 4 minutes. What is the rain intensity? 2 to 3.", file_path="outputs/output.mp3")

 > Text splitted to sentences.
['Rain expected in 4 minutes.', 'What is the rain intensity?', '2 to 3.']
 > Processing time: 3.6516635417938232
 > Real-time factor: 0.5822571813646433


'outputs/output.mp3'

---

# External API Services Part

### 1.1 Google Speech to Text External API

In [15]:
# pip install SpeechRecognition
import speech_recognition as sr

r = sr.Recognizer()

# Read the whole synthesized file into memory. A dedicated name is used for the
# recording so that the Whisper waveform stored in `audio` by the local
# speech-to-text section is not overwritten (re-running those cells after this
# one would otherwise fail).
with sr.AudioFile("outputs/output.mp3") as source:
    tts_audio = r.record(source)

# The recording is fully loaded at this point, so the file is already closed
# while the (potentially slow) network request runs.
# This function calls the Google external API
text = r.recognize_google(tts_audio)
print(text)

rain expected in 4 minutes what is the rain intensity 2203


### 1.2 Google Text to Speech External API

In [16]:
# pip install gTTS
from pathlib import Path

from gtts import gTTS

# Use a dedicated name: `tts` already holds the Coqui TTS model created in the
# local section, and rebinding it to a gTTS object would break a re-run of the
# cell that calls `tts.tts_to_file(...)`.
google_tts = gTTS("This is a test for Google text to speech external api", lang="en")

# Make sure the target folder exists so saving also works on a fresh checkout.
Path("outputs").mkdir(parents=True, exist_ok=True)
google_tts.save("outputs/google_tts_output.mp3")

---

# Selected Speech Task: speech to speech translation

An easy pipeline for this task would be the following:

1. First, use Whisper to transcribe the audio to text.

2. Then, translate the text (Spanish to English).

3. Finally, use Coqui TTS (a fork of Mozilla TTS) to create the new audio.

However, this approach adds a lot of latency because it calls three models and can be slow, so I will try to use ``NVIDIA NeMo``, a robust toolkit for speech-to-speech models based on PyTorch.

## Update: changing to first approach

NVIDIA NeMo is causing many problems because it depends on tools that must be compiled and that require WSL or a Linux distribution. Therefore, I will switch to the first approach, even though it is not the most efficient one.

In [None]:
import torch
import whisper
from transformers import MarianMTModel, MarianTokenizer
from TTS.api import TTS
import argparse

### 1.1 Loading Models 

In [None]:
# --------------------------------------------------------
# Speech-to-Speech Translation Script (Spanish -> English)

def load_models(whisper_model_name: str, mt_model_name: str,
                tts_model_name: str, use_gpu: bool):
    """
    Load all required models and return them.

    Args:
        whisper_model_name: Name passed to ``whisper.load_model``.
        mt_model_name: Identifier passed to ``MarianTokenizer.from_pretrained``
            and ``MarianMTModel.from_pretrained``.
        tts_model_name: Model name passed to the Coqui ``TTS`` constructor.
        use_gpu: Request CUDA. When CUDA is requested but
            ``torch.cuda.is_available()`` is False, a warning is printed and
            every model is loaded on the CPU instead of raising.

    Returns:
        Tuple ``(asr_model, tokenizer, mt_model, tts, device)`` where
        ``device`` is the device string actually used ("cuda" or "cpu").
    """
    # Only honour the GPU request when CUDA is really usable; `torch` comes from
    # the notebook's import cell.
    cuda_enabled = bool(use_gpu) and torch.cuda.is_available()
    if use_gpu and not cuda_enabled:
        print("[WARN] CUDA requested but not available; falling back to CPU")

    device = "cuda" if cuda_enabled else "cpu"
    print(f"[INFO] Device: {device}")

    # 1) Whisper ASR model
    asr_model = whisper.load_model(whisper_model_name, device=device)

    # 2) MarianMT tokenizer and model
    tokenizer = MarianTokenizer.from_pretrained(mt_model_name)
    mt_model = MarianMTModel.from_pretrained(mt_model_name).to(device)

    # 3) Coqui TTS -- pass the effective flag so TTS agrees with `device`
    tts = TTS(model_name=tts_model_name, progress_bar=False, gpu=cuda_enabled)

    return asr_model, tokenizer, mt_model, tts, device





In [None]:
def transcribe(asr_model, audio_path: str, device: str) -> str:
    """
    Run speech recognition on a Spanish recording and return its transcript.

    Args:
        asr_model: Object exposing ``transcribe(path, language=...)`` that
            returns a mapping with a ``"text"`` entry.
        audio_path: Path of the audio file to transcribe.
        device: Accepted as part of the signature; not read by this function.

    Returns:
        The recognized text with leading and trailing whitespace removed.
    """
    print(f"[ASR] Transcribing {audio_path}...")

    # Force Spanish as the source language and keep only the text field.
    text = asr_model.transcribe(audio_path, language="es")["text"].strip()

    print(f"[ASR] Result: {text}")
    return text

