In [17]:
import IPython
import torch
import soundfile as sf

from IPython.display import Audio

from TTS.config import load_config
from TTS.tts.models import setup_model
from TTS.tts.utils.synthesis import synthesis
from TTS.utils.audio import AudioProcessor

In [48]:
# Training-run directory holding the checkpoint, its config and the id maps.
# The trailing "/" matters: the paths below are built by plain concatenation.
GENERAL_PATH = '/home/julian/workspace/train/VC-MSC-TTS/vits_tts-portuguese-September-27-2021_10+02AM-e6143fd5/'

# Artefacts resolved relative to the run directory.
MODEL_PATH = f"{GENERAL_PATH}checkpoint_789000.pth.tar"
CONFIG_PATH = f"{GENERAL_PATH}config.json"
TTS_LANGUAGES = f"{GENERAL_PATH}language_ids.json"
TTS_SPEAKERS = f"{GENERAL_PATH}speakers.json"

# Alternatives kept for reference:
#MODEL_PATH = '/home/julian/workspace/train/VC-MSC-TTS/FT-mode-2-best_model.pth.tar'
#TTS_SPEAKERS = '/media/julian/Datasets1/Test-Dataset/new_se.json'

# True only when a CUDA device is available; used to decide whether to move the model to the GPU.
USE_CUDA = torch.cuda.is_available()

In [49]:
# Read the training configuration and build the audio processor from its
# `audio` section.
C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

speaker_embedding = None

# These two model arguments are overridden before the model is constructed:
# the d-vector file points at TTS_SPEAKERS and the speaker-encoder loss flag
# is set to False.
C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

# Load the checkpoint onto the CPU, then drop every weight whose key mentions
# "speaker_encoder" from a shallow copy of the state dict before loading it.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
model_weights = cp['model'].copy()
speaker_encoder_keys = [name for name in model_weights if "speaker_encoder" in name]
for name in speaker_encoder_keys:
    model_weights.pop(name)

model.load_state_dict(model_weights)

# Switch to evaluation mode, and move to the GPU when one is available.
model.eval()
if USE_CUDA:
    model = model.cuda()

use_griffin_lim = True

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:False
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Using model: vits
 > Speaker manager is loaded with 105 speakers: ED, Juilan, Mohammed, bernard, ezwa, gilles, nadine, p226, p227, p228, p229, p230, p231, p232, p233, p236, p237, p239, p240, p241, p243, p244, p246, p247, p249, p250, p251, p252, p253, p254, p255, p256, p257, p258, p259, p260, p262, p263, p264, p265, p266, p267, p268, p269, p270,

In [72]:
# Choose the target voice by name and ask the model's speaker manager for the
# mean d-vector stored under that name.
speaker = "Mohammed"
speaker_manager = model.speaker_manager
d_vector = speaker_manager.get_mean_d_vector(speaker)

In [73]:
# Display the language-name -> id mapping held by the model's language manager.
model.language_manager.language_id_mapping

{'en': 0, 'fr-fr': 1, 'pt-br': 2}

In [74]:
# Scales set on the model as plain attributes, in the order listed.
# NOTE(review): `noise_scale` / `noise_scale_w` carry the same descriptions as
# the `inference_*` entries — confirm which attributes the model actually reads.
inference_scales = {
    "noise_scale": 0.0,             # noise variance applied to the random z vector at inference
    "length_scale": 1.5,            # duration-predictor scaler; larger values give slower speech
    "noise_scale_w": 0.0,           # noise variance applied to the duration predictor z vector at inference
    "inference_noise_scale": 0.3,   # noise variance applied to the random z vector at inference
    "inference_noise_scale_dp": 0,  # noise variance applied to the duration predictor z vector at inference
}
for scale_name, scale_value in inference_scales.items():
    setattr(model, scale_name, scale_value)

In [75]:
# Prompt to synthesise. The commented lines are alternative prompts.
text = "J'apprécie beaucoup les travaux de votre équipe, ces derniers peuve avoir un grand intérêt pour le groupe sopra."
#text = "On s'appuie sur un ensemble de données historiques pour donner des prédictions." #, et ça marche supère bien."
#text = "I'm very glad to introduce the text to speech system that we made."
#text = "Ele pensou que tinha detectado um cheiro agradável de ervas, como as que sua mãe tinha nas tigelas de sua casa."
#text = "to be clear, in charge of product doesn't mean they get to decide what's being built."
#text = "Pendant la Seconde Guerre mondiale, la production d'armes atomiques était la principale raison d'être de l'industrie nucléaire."

# Numeric language id passed to the synthesiser (see the language-id mapping
# displayed earlier in the notebook).
language_id = 1

# Tell the synthesiser to use CUDA exactly when the model's parameters live on a CUDA device.
on_gpu = "cuda" in str(next(model.parameters()).device)

outputs = synthesis(
    model,
    text,
    C,
    on_gpu,
    ap,
    speaker_id=None,
    d_vector=d_vector,
    style_wav=None,
    language_id=language_id,
    enable_eos_bos_chars=C.enable_eos_bos_chars,
    use_griffin_lim=True,
    do_trim_silence=False,
)
# The result is unpacked positionally: four values are expected and only the
# first two are kept.
wav, alignment, _, _ = outputs.values()

# Play the result inline.
IPython.display.display(Audio(wav, rate=ap.sample_rate))

# Save as "<speaker>-<inference_noise_scale>-<inference_noise_scale_dp>.wav".
file_name = "-".join(
    [speaker, str(model.inference_noise_scale), str(model.inference_noise_scale_dp)]
) + ".wav"
sf.write('/home/julian/workspace/out/' + file_name, wav, ap.sample_rate)


In [8]:
import os
import librosa
import numpy as np

from pydub import effects  
from pydub import AudioSegment

from TTS.tts.utils.speakers import SpeakerManager

# Locations of the external speaker-encoder config and checkpoint.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# CONFIG_SE_PATH — confirm both paths exist before running.
CONFIG_SE_PATH = "/media/julian/Datasets/encoder/new_se/config.json"
CHECKPOINT_SE_PATH = "/media/julian/Datasets/encoder/new_se/converted_checkpoint.pth.tar"

# Speaker manager built around that encoder; it runs on CUDA when available.
our_SE_speaker_manager = SpeakerManager(
    encoder_config_path=CONFIG_SE_PATH,
    encoder_model_path=CHECKPOINT_SE_PATH,
    use_cuda=torch.cuda.is_available(),
)

def compute_emb(path, normalize=True, target_dbfs=-27): #target_dbfs=-24.632442475923607
    """Compute a speaker embedding (d-vector) for the audio file at ``path``.

    Args:
        path: Audio file to embed (``str`` or path-like).
        normalize: When true, the clip's gain is shifted so that its loudness
            equals ``target_dbfs`` and the result is written back over
            ``path`` before the embedding is computed.
        target_dbfs: Loudness target, in dBFS, used when ``normalize`` is true.

    Returns:
        Whatever ``our_SE_speaker_manager.compute_d_vector_from_clip`` returns
        for ``path``.

    Raises:
        ValueError: If ``normalize`` is true and ``path`` has no file
            extension, so the export format cannot be determined.
    """
    if normalize:
        # Take the export format from the real extension instead of the last
        # three characters, so ".flac" -> "flac" (not "lac") and ".WAV" -> "wav".
        audio_format = os.path.splitext(path)[1].lstrip(".").lower()
        if not audio_format:
            raise ValueError(
                "Cannot infer the audio format of %r: the file has no extension." % (path,)
            )

        song = AudioSegment.from_file(path)
        change_in_dBFS = target_dbfs - song.dBFS
        normalized_sound = song.apply_gain(change_in_dBFS)
        # WARNING: this overwrites the input file in place with the
        # gain-adjusted audio; keep a copy of the original if it is needed.
        normalized_sound.export(path, format=audio_format)

    embed = our_SE_speaker_manager.compute_d_vector_from_clip(path)

    return embed

FileNotFoundError: [Errno 2] No such file or directory: '/media/julian/Datasets/encoder/new_se/config.json'

In [66]:
# Reference recording to embed. Remember to resample and normalise the audio.
# Note: with its default normalize=True, compute_emb writes the gain-adjusted
# audio back over this file.
ref_wav = "/home/julian/workspace/reference_wavs/julian.wav"

# Replaces any previously selected d-vector with the one computed from the clip.
d_vector = compute_emb(path=ref_wav)

In [27]:
# Scales set on the model as plain attributes, in the order listed.
# NOTE(review): `noise_scale` / `noise_scale_w` carry the same descriptions as
# the `inference_*` entries — confirm which attributes the model actually reads.
inference_scales = {
    "noise_scale": 0.0,               # noise variance applied to the random z vector at inference
    "length_scale": 1.0,              # duration-predictor scaler; larger values give slower speech
    "noise_scale_w": 0.0,             # noise variance applied to the duration predictor z vector at inference
    "inference_noise_scale": 0.3,     # noise variance applied to the random z vector at inference
    "inference_noise_scale_dp": 0.3,  # noise variance applied to the duration predictor z vector at inference
}
for scale_name, scale_value in inference_scales.items():
    setattr(model, scale_name, scale_value)

In [35]:
# Prompt to synthesise. The commented line is an alternative prompt.
text = "J'apprécie beaucoup les travaux de l'équipe Innovation de esse Bé est-ce, ces derniers peuvent avoir un grand intérêt pour le Groupe. "
#text = "I'm the president of the united states"

# Numeric language id passed to the synthesiser (see the language-id mapping
# displayed earlier in the notebook).
language_id = 1

# Tell the synthesiser to use CUDA exactly when the model's parameters live on a CUDA device.
on_gpu = "cuda" in str(next(model.parameters()).device)

outputs = synthesis(
    model,
    text,
    C,
    on_gpu,
    ap,
    speaker_id=None,
    d_vector=d_vector,
    style_wav=None,
    language_id=language_id,
    enable_eos_bos_chars=C.enable_eos_bos_chars,
    use_griffin_lim=True,
    do_trim_silence=False,
)
# The result is unpacked positionally: four values are expected and only the
# first two are kept.
wav, alignment, _, _ = outputs.values()

# Play the result inline.
IPython.display.display(Audio(wav, rate=ap.sample_rate))