In [1]:
!pip install --upgrade vocos encodec librosa safetensors

Collecting vocos
  Downloading vocos-0.1.0-py3-none-any.whl.metadata (4.8 kB)
Collecting encodec
  Downloading encodec-0.1.1.tar.gz (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading vocos-0.1.0-py3-none-any.whl (24 kB)
Building wheels for collected packages: encodec
  Building wheel for encodec (setup.py) ... [?25l[?25hdone
  Created wheel for encodec: filename=encodec-0.1.1-py3-none-any.whl size=45760 sha256=cb60b109ca0cb7f526299d793ce1e77297fc6494410b126430623c305142b4e4
  Stored in directory: /root/.cache/pip/wheels/fc/36/cb/81af8b985a5f5e0815312d5e52b41263237af07b977e6bcbf3
Successfully built encodec
Installing collected packages: encodec, vocos
Successfully installed encodec-0.1.1 vocos-0.1.0


In [2]:
import pprint
import IPython.display as ipd
import torch
import librosa

In [3]:
# Fetch the `mars5_english` entry point from the Camb-ai/mars5-tts repository
# through torch.hub. The call returns two objects: the model itself and the
# class used further down to build inference configs.
# trust_repo=True loads the repository without torch.hub's confirmation prompt.
mars5, config_class = torch.hub.load(
    'Camb-ai/mars5-tts',
    'mars5_english',
    trust_repo=True,
)

Downloading: "https://github.com/Camb-ai/mars5-tts/zipball/master" to /root/.cache/torch/hub/master.zip
100%|██████████| 1.42G/1.42G [01:01<00:00, 24.9MB/s]
100%|██████████| 863M/863M [00:32<00:00, 27.6MB/s]
  WeightNorm.apply(module, name, dim)
Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to /root/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th
100%|██████████| 88.9M/88.9M [00:02<00:00, 31.2MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.yaml:   0%|          | 0.00/503 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/40.4M [00:00<?, ?B/s]

  state_dict = torch.load(model_path, map_location="cpu")


Now that the model is loaded, pick a reference audio clip to clone the voice from. If you want to use deep clone, also provide the transcript of that clip.

In [4]:
# Download the example reference clip from the mars5-tts repository and save it
# as example.wav in the current working directory (-O sets the output filename).
!wget -O example.wav https://github.com/Camb-ai/mars5-tts/raw/master/docs/assets/example_ref.wav

--2024-11-24 17:23:26--  https://github.com/Camb-ai/mars5-tts/raw/master/docs/assets/example_ref.wav
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Camb-ai/mars5-tts/master/docs/assets/example_ref.wav [following]
--2024-11-24 17:23:27--  https://raw.githubusercontent.com/Camb-ai/mars5-tts/master/docs/assets/example_ref.wav
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 137392 (134K) [audio/wav]
Saving to: ‘example.wav’


2024-11-24 17:23:27 (1.12 MB/s) - ‘example.wav’ saved [137392/137392]



In [5]:
# Read the reference clip as mono audio at the sample rate given by `mars5.sr`.
ref_samples, sr = librosa.load('./example.wav', sr=mars5.sr, mono=True)

# The synthesis call further down receives `wav` as a torch tensor, so wrap the
# NumPy samples (from_numpy shares memory with the array; no copy is made).
wav = torch.from_numpy(ref_samples)

# Transcript of what is spoken in the reference clip.
ref_transcript = "We actually haven't managed to meet demand."

print("Reference audio:")
ipd.display(ipd.Audio(ref_samples, rate=mars5.sr))
print(f"Reference transcript: {ref_transcript}")

Reference audio:


In [6]:
# Set deep_clone to False if you don't know the transcript of the reference
# clip, or if you want faster inference.
deep_clone = True

# Inference settings. Other fields (top_p, etc.) can be tuned the same way by
# passing them as keyword arguments.
cfg = config_class(
    deep_clone=deep_clone,
    temperature=0.7,
    top_k=100,
    freq_penalty=3,
    rep_penalty_window=100,
)

# Text to synthesize in the voice of the reference clip.
synthesis_text = "The quick brown rat."
ar_codes, wav_out = mars5.tts(synthesis_text, wav, ref_transcript, cfg=cfg)

print('Synthesized output audio:')
ipd.Audio(wav_out.numpy(), rate=mars5.sr)

Reference transcript: We actually haven't managed to meet demand.
Note: using deep clone. Assuming input `c_phones` is concatenated prompt and output phones. Also assuming no padded indices in `c_codes`.
New x: torch.Size([1, 1025, 8]) | new x_known: torch.Size([1, 1025, 8]) . Base prompt: torch.Size([1, 215, 8]). New padding mask: torch.Size([1, 1025]) | m shape: torch.Size([1, 1025, 8])
Synthesized output audio:


The cell below prints every tunable setting in the inference config, along with its default value:

In [7]:
# Build a config with no arguments so that every field shows its default value.
default_cfg = config_class()
pprint.pprint(default_cfg)

InferenceConfig(temperature=0.7,
                top_k=200,
                top_p=0.2,
                typical_p=1.0,
                freq_penalty=3,
                presence_penalty=0.4,
                rep_penalty_window=80,
                eos_penalty_decay=0.5,
                eos_penalty_factor=1,
                eos_estimated_gen_length_factor=1.0,
                timesteps=200,
                x_0_temp=0.7,
                q0_override_steps=20,
                nar_guidance_w=3,
                max_prompt_dur=12,
                generate_max_len_override=-1,
                deep_clone=True,
                use_kv_cache=True,
                trim_db=27,
                beam_width=1,
                ref_audio_pad=0)


You can also listen to the vocoded raw coarse codes, for debugging purposes:

In [None]:
# Move the codes returned by mars5.tts to the CPU and insert a new axis at
# position 1 before handing them to the vocoder.
coarse_codes = ar_codes.cpu()[:, None]
ar_wav = mars5.vocode(coarse_codes)

# Play back the vocoded coarse codes at the model's sample rate.
ipd.Audio(ar_wav.numpy(), rate=mars5.sr)

: 