# Generating samples

In [None]:
# Imports
import csv
import os
import shlex

from google.colab import drive

In [None]:
# Mount Google Drive: the model, config, transcriptions and outputs are all stored there.
# If you are running this notebook locally, comment out the mount and point
# PROJECT_DIR at your local copy of the project folder.
drive.mount('/content/drive')

# Single project root. Every path below is derived from it, so relocating the
# project only requires changing this one line.
PROJECT_DIR = "/content/drive/MyDrive/thesis_training_and_adaptation"

# Model, config and transcriptions path
MODEL_PATH = os.path.join(PROJECT_DIR, "models", "vits_two.pth")       # model checkpoint passed to `tts --model_path`
CONFIG_PATH = os.path.join(PROJECT_DIR, "config.json")                 # configuration file passed to `tts --config_path`
TRANS = os.path.join(PROJECT_DIR, "trans", "transcriptions_vits.csv")  # pipe-delimited transcription file
OUT_PATH = os.path.join(PROJECT_DIR, "output")                         # output directory for wav files

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Dependencies needed for synthesizing the speech
!pip install git+https://github.com/coqui-ai/TTS
!pip install torch torchaudio
!apt install espeak-ng

Collecting git+https://github.com/coqui-ai/TTS
  Cloning https://github.com/coqui-ai/TTS to /tmp/pip-req-build-kzrbe71a
  Running command git clone --filter=blob:none --quiet https://github.com/coqui-ai/TTS /tmp/pip-req-build-kzrbe71a
  Resolved https://github.com/coqui-ai/TTS to commit dbf1a08a0d4e47fdad6172e433eeb34bc6b13b4e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting anyascii>=0.3.0 (from TTS==0.22.0)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting mutagen==1.47.0 (from TTS==0.22.0)
  Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)
Collecting pysbd>=0.3.4 (from TTS==0.22.0)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting pandas<2.0,>=1.4 (from TTS==0.22.0)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting trainer>=0.0.36

## Synthesizing speech using the default VITS model

In [None]:
#This command lets you synthesize speech using the default vits moodel trained on ljspeech
#!tts --model_name "tts_models/en/ljspeech/vits" --text "Sample text" --out_path "/content/drive/MyDrive/output/sample.wav"

#Synthesizing from a transcription file
OUTPUT_FOLDER = os.path.join(OUT_PATH, "vits_default")  #Output folder for synthesized speech
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

with open(TRANS, 'r') as f:
    reader = csv.reader(f, delimiter="|") # each line is in format: audio_name.wav|transcription
    for row in reader:
        filename, text, norm = row
        output = os.path.join(OUTPUT_FOLDER, filename)
        !tts --model_name "tts_models/en/ljspeech/vits" --text "$norm" --out_path $output

 > Downloading model to /root/.local/share/tts/tts_models--en--ljspeech--vits
 98% 143M/146M [00:01<00:00, 81.8MiB/s] > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Text: When

## Synthesizing speech using a trained or adapted model

In [None]:
#This command lets you synthesize speech using your trained model
#!tts --model_path $MODEL_PATH --text "Sample text" --out_path "/content/drive/MyDrive/output/sample.wav" --config_path $CONFIG_PATH
# Can also work on default vits model if you have it downloaded

OUTPUT_FOLDER = os.path.join(OUT_PATH, "vits_two_v2")  #Output folder for synthesized speech
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
#This part is used to generate 100 sentences (each with different transcription) and store them on google drive
with open(TRANS, 'r') as f:
    reader = csv.reader(f, delimiter="|") # each line is in format: audio_name.wav|transcription
    for row in reader:
        filename, text, norm = row
        output = os.path.join(OUTPUT_FOLDER, filename)
        !tts --model_path $MODEL_PATH --text "$norm" --out_path $output --config_path $CONFIG_PATH



 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Text: the good woman had taken into her retirement an ideal of gentlemanliness as exhibited by the patrons of private-saloon bars.
 > Text splitted to sentences.
['the good woman had taken into her retirement an ideal of gentlemanl