# Setup
Mount Google Drive and install Coqui TTS.

In [1]:
from google.colab import drive

# Location in the Colab filesystem under which Google Drive is mounted;
# the dataset and training outputs used below live beneath it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
# Sanity check: list the dataset folder on the mounted Drive to confirm that
# the metadata file, the wav directories and the output directory are visible.
!ls -l /content/drive/MyDrive/NigerVoltaLTI/open-bible-yo-tts/

total 1941
-rw------- 1 root root    1092 Dec 27 05:09 io_preprocessing.py
-rw------- 1 root root 1967170 Dec 11 04:44 metadata.tsv
drwx------ 2 root root    4096 Dec 28 03:34 processing
-rw------- 1 root root     901 Dec 11 05:10 README.josh.txt
-rw------- 1 root root     275 Dec 30 09:59 test_sentences.txt
drwx------ 2 root root    4096 Dec 29 02:22 tts_outputs
drwx------ 2 root root    4096 Dec 28 03:34 wavs
drwx------ 2 root root    4096 Dec 29 19:15 wavs_16kHz


In [3]:
!pip install TTS

Collecting TTS
  Downloading TTS-0.4.2.tar.gz (1.4 MB)
[?25l[K     |▎                               | 10 kB 37.8 MB/s eta 0:00:01[K     |▌                               | 20 kB 44.6 MB/s eta 0:00:01[K     |▊                               | 30 kB 25.7 MB/s eta 0:00:01[K     |█                               | 40 kB 19.4 MB/s eta 0:00:01[K     |█▏                              | 51 kB 16.9 MB/s eta 0:00:01[K     |█▍                              | 61 kB 15.6 MB/s eta 0:00:01[K     |█▊                              | 71 kB 14.6 MB/s eta 0:00:01[K     |██                              | 81 kB 16.1 MB/s eta 0:00:01[K     |██▏                             | 92 kB 16.0 MB/s eta 0:00:01[K     |██▍                             | 102 kB 14.1 MB/s eta 0:00:01[K     |██▋                             | 112 kB 14.1 MB/s eta 0:00:01[K     |██▉                             | 122 kB 14.1 MB/s eta 0:00:01[K     |███▏                            | 133 kB 14.1 MB/s eta 0:00:01[K     |██

In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Fri Dec 31 21:42:25 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import os

from TTS.config.shared_configs import BaseAudioConfig
from TTS.trainer import Trainer, TrainingArgs
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.shared_configs import CharactersConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
from pathlib import Path


# Formatter
Convert the Yorùbá Bible corpus `metadata.tsv` file into a list of items of the form:
`[text, wav_file_path, speaker_name]`

Note that the corpus has a single speaker/reader, so every item carries the same speaker name.



In [6]:
def bible_formatter(root_path, meta_file, *, wav_dir="wavs_16kHz", **kwargs):  # pylint: disable=unused-argument
    """Normalize the Open Bible dataset metadata into TTS training items.

    Each non-blank line of ``meta_file`` is expected to be tab-separated, with
    the wav file name in the first column and its transcript in the second.
    Any further columns are ignored.

    Args:
        root_path: Dataset root directory, with or without a trailing slash.
        meta_file: Name of the metadata file, relative to ``root_path``.
        wav_dir: Sub-directory of ``root_path`` that holds the audio files.
        **kwargs: Ignored; accepted so callers may pass extra options.

    Returns:
        A list of ``[text, wav_file_path, speaker_name]`` lists, one for every
        line whose wav file exists and whose transcript is non-empty. The text
        is stripped and lowercased; the speaker name is the stem of
        ``root_path``.
    """
    txt_file = os.path.join(root_path, meta_file)
    # Single-speaker corpus: the dataset folder name doubles as the speaker name.
    speaker_name = Path(root_path).stem
    # Join the components instead of concatenating strings so that a root path
    # without a trailing slash still resolves to the right directory.
    wav_root = os.path.join(root_path, wav_dir)
    items = []
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line_no, line in enumerate(ttf, start=1):
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            cols = line.split("\t")
            if len(cols) < 2:
                print("> Line %d has no text column, skipping" % line_no)
                continue
            wav_file = os.path.join(wav_root, cols[0].strip())
            # Strip before the emptiness check so whitespace-only transcripts
            # are rejected rather than stored as empty strings.
            text = cols[1].strip().lower()     # lowercase chars
            if not os.path.isfile(wav_file):
                # A header row in the metadata file also lands here, because
                # its first column does not name an existing wav file.
                print("> File %s does not exist!" % (wav_file))
                continue
            if not text:
                print("text len <= 0, empty text?")
                continue
            items.append([text, wav_file, speaker_name])
    return items


# Main code

In [8]:
# set experiment paths
# The Drive location is spelled out once; the output directory and the test
# sentence file are derived from it so the three paths cannot drift apart.
dataset_path = "/content/drive/MyDrive/NigerVoltaLTI/open-bible-yo-tts/"
output_path = os.path.join(dataset_path, "tts_outputs/")
test_sentences_path = os.path.join(dataset_path, "test_sentences.txt")


# define dataset config
dataset_config = BaseDatasetConfig(name="open-bible-yo", meta_file_train="metadata.tsv", path=dataset_path)

# define audio config
# ❗ resample the dataset externally using `TTS/bin/resample.py` and set `resample=False` for faster training
# audio_config = BaseAudioConfig(sample_rate=48000, resample=False, do_trim_silence=True, trim_db=23.0)
# The 16 kHz rate matches the `wavs_16kHz` directory that `bible_formatter` reads from.
audio_config = BaseAudioConfig(sample_rate=16000, resample=False, do_trim_silence=True, trim_db=23.0)

# Character set, specifically for Yorùbá
# NOTE(review): the punctuation marks appear in both "characters" and
# "punctuations"; "unique": True presumably de-duplicates the resulting symbol
# list — confirm against CharactersConfig.
char_dict = {
        "pad": "_",
        "eos": "~",
        "bos": "^",
        "characters": "aàábdeèéẹè̩é̩fghiìíjklmnǹńoóòọò̩ọ́prsṣtuùúwy!'(),-.:;? ",
        "punctuations": "!'(),-.:;? ",
        "unique": True 
        }

# define model config
config = GlowTTSConfig(
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="basic_cleaners",            # Only a basic text cleaner
    use_phonemes=False,
    phoneme_language=None,                    # No g2p, not yet
    phoneme_cache_path=None,
    print_step=25,
    print_eval=True,
    audio=audio_config,                       # audio config
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    use_speaker_embedding=False,              # Single speaker model
    characters=CharactersConfig(**char_dict), # Ingest Yorùbá char_dict above
    test_sentences_file=test_sentences_path
)

# init audio processor
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
# `bible_formatter` turns metadata.tsv into [text, wav_file_path, speaker_name] items;
# eval_split=True asks for a separate evaluation list alongside the training list.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True,
                                               formatter=bible_formatter)


 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:23.0
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
> File /content/drive/MyDrive/NigerVoltaLTI/open-bible-yo-tts/wavs_16kHz/filename does not exist!
 | > Found 10267 files in /content/drive/MyDrive/NigerVoltaLTI/open-bible-yo-tts


In [None]:
# Multi-speaker setup, kept for reference only: the Yorùbá bible corpus has a
# single speaker, so no speaker manager is created. For multi-speaker training
# it would map speaker-id to speaker-name in the model and data-loader:
# speaker_manager = SpeakerManager()
# speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
# config.num_speakers = speaker_manager.num_speakers

# Build the GlowTTS model from the configuration defined above.
model = GlowTTS(config)

# Default training arguments, plus the audio processor handed over as a training asset.
trainer_args = TrainingArgs()
training_assets = {"audio_processor": ap}

# Assemble the trainer and start training 🚀
trainer = Trainer(
    trainer_args,
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets=training_assets,
)
trainer.fit()

 > Using CUDA:  True
 > Number of GPUs:  1

 > Model has 28596241 parameters

[4m[1m > EPOCH: 0/1000[0m
 --> /content/drive/MyDrive/NigerVoltaLTI/open-bible-yo-tts/tts_outputs/coqui_tts-December-31-2021_09+45PM-0000000

 > DataLoader initialization
 | > Use phonemes: False
 | > Number of instances : 10165
 | > Max length sequence: 354
 | > Min length sequence: 11
 | > Avg length sequence: 125.75395966551893
 | > Num. instances discarded by max-min (max=500, min=3) seq limits: 0
 | > Batch group size: 0.

[1m > TRAINING (2021-12-31 21:45:23) [0m
