## Text-to-Speech

In [None]:
%%capture
!pip install speechbrain
!pip install transformers

In [None]:
import torchaudio
import speechbrain as sb
from IPython.display import Audio
from speechbrain.pretrained import Tacotron2
from speechbrain.pretrained import HIFIGAN
from speechbrain.dataio.dataio import read_audio
from speechbrain.pretrained import SepformerSeparation as separator

In [None]:
# Initialize the TTS model (Tacotron2) and the vocoder (HiFi-GAN) from the
# pretrained LJSpeech sources; each is stored under its own `savedir`.
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmpdir_tts",
)
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="tmpdir_vocoder",
)

Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading model.ckpt:   0%|          | 0.00/113M [00:00<?, ?B/s]

Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Downloading generator.ckpt:   0%|          | 0.00/55.8M [00:00<?, ?B/s]

In [None]:
# Run the TTS model on the input sentence; it returns the mel output, its
# length, and the alignment.
mel_output, mel_length, alignment = tacotron2.encode_text(
    "if you really like this video, so please subscribe our youtube channel Research Rocks"
)

In [None]:
# Run the HiFi-GAN vocoder: decode the mel output from the TTS step into waveforms.
waveforms = hifi_gan.decode_batch(mel_output)

In [None]:
# Detach from the autograd graph, move to the CPU, and drop singleton
# dimensions before handing the samples to the audio player.
# 22050 Hz is presumably the sample rate of the LJSpeech-trained models --
# TODO confirm against the model card.
Audio(waveforms.detach().cpu().squeeze(), rate=22050)

## **Speech Enhancement**


The goal of speech enhancement is to remove the noise that affects a recording. SpeechBrain provides several systems for speech enhancement. The following example is processed by SepFormer (the version trained to perform enhancement):

In [None]:
# Load the pretrained SepFormer enhancement model (WHAMR! variant) and keep
# its files under `pretrained_models/`.
model = separator.from_hparams(
    source="speechbrain/sepformer-whamr-enhancement",
    savedir="pretrained_models/sepformer-whamr-enhancement",
)

Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

Downloading encoder.ckpt:   0%|          | 0.00/17.3k [00:00<?, ?B/s]

Downloading masknet.ckpt:   0%|          | 0.00/113M [00:00<?, ?B/s]

Downloading decoder.ckpt:   0%|          | 0.00/17.3k [00:00<?, ?B/s]

In [None]:
# Enhance the example recording that ships with the model source.
# NOTE(review): a later cell reads `example_whamr.wav` from the working
# directory, so this call presumably fetches the file there -- confirm.
enhanced_speech = model.separate_file(
    path="speechbrain/sepformer-whamr-enhancement/example_whamr.wav"
)

Downloading (…)in/example_whamr.wav:   0%|          | 0.00/164k [00:00<?, ?B/s]

In [None]:
# Load the original (noisy) recording so it can be compared with the enhanced output.
# NOTE(review): relies on `example_whamr.wav` being present in the working
# directory, presumably fetched there by the earlier `separate_file` call -- confirm.
signal = read_audio("example_whamr.wav").squeeze()
# 8000 Hz playback rate; presumably the sample rate the WHAMR! model works at -- TODO confirm.
Audio(signal, rate=8000)

In [None]:
# Play the enhanced output at the same rate used for the noisy input above.
# The tensor is detached, moved to the CPU, and squeezed before playback.
Audio(
    enhanced_speech.detach().cpu().squeeze(),
    rate=8000,
)