# StarGANv2-VC Demo (VCTK 20 Speakers)

### Utils

In [14]:
# load packages
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import soundfile as sf

from Utils.ASR.models import ASRCNN
from Utils.JDC.model import JDCNet

from models import Generator

import os

from meldataset import build_dataloader

%matplotlib inline

In [2]:
# Source: http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is18/en_speaker_used.txt
# Source: https://github.com/jjery2243542/voice_conversion

## Let's only do 4 speakers in the beginning
#speakers = ['SF1', 'SF2', 'SF3', 'SM1', 'SM2', 'TM1', 'TM2', 'TM3', 'TF1', 'TF2']
speakers = ['p225', 'p226']
directory_conv = f"Data/"
directory_ref = f"Data/"
model_path = f"Models/epoch_00102.pth"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
mean, std = -4, 4

def preprocess(wave):
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor

def build_model(model_params={}):
    args = Munch(model_params)
    generator_a = Generator(args.dim_in, args.max_conv_dim, w_hpf=args.w_hpf, F0_channel=args.F0_channel)
    generator_b = Generator(args.dim_in, args.max_conv_dim, w_hpf=args.w_hpf, F0_channel=args.F0_channel)
    
    nets_ema = Munch(generator_a=generator_a,
                     generator_b=generator_b)

    return nets_ema

### Load models

In [4]:
# load pretrained F0 model
F0_model = JDCNet(num_class=1, seq_len=192)
params = torch.load("Utils/JDC/bst.t7", map_location='cpu')['net']
F0_model.load_state_dict(params)
F0_model = F0_model.to(device)

In [5]:
# load vocoder
from parallel_wavegan.utils import load_model
vocoder = load_model("Vocoder/checkpoint-400000steps.pkl").to('cuda').eval()
vocoder.remove_weight_norm()
_ = vocoder.eval()



In [6]:
# load pretrained ASR model
with open('Utils/ASR/config.yml') as f:
        ASR_config = yaml.safe_load(f)
ASR_model_config = ASR_config['model_params']
ASR_model = ASRCNN(**ASR_model_config)
params = torch.load('Utils/ASR/epoch_00100.pth', map_location='cpu')['model']
ASR_model.load_state_dict(params)
_ = ASR_model.eval()
ASR_model = ASR_model.to('cuda')

In [15]:
# load starganv2
with open(f'Models/config.yml') as f:
    starganv2_config = yaml.safe_load(f)
starganv2 = build_model(model_params=starganv2_config["model_params"])
params = torch.load(model_path, map_location='cpu')
params = params['model_ema']
_ = [starganv2[key].load_state_dict(params[key]) for key in starganv2]
_ = [starganv2[key].eval() for key in starganv2]
starganv2.generator_a = starganv2.generator_a.to('cuda')
starganv2.generator_b = starganv2.generator_b.to('cuda')

### Reconstruction

In [17]:
# load input deepfake wave
k = random.choice(speakers)
wav_path_a = os.path.join(directory_conv, f"{speakers[0]}/1.wav")
wav_path_b = os.path.join(directory_ref, f"{speakers[1]}/1.wav")

wave_a, sr_a = sf.read(wav_path_a)
mel_tensor_a = preprocess(wave_a).to('cuda')
mel_length = mel_tensor_a.size(2)

max_mel_length = 192
if mel_length > max_mel_length: # this is just a cutting operation, it should be kept
    mel_tensor_a = mel_tensor_a[:,:, : max_mel_length]
    
x_a = mel_tensor_a.unsqueeze(0)
    
wave_b, sr_b = sf.read(wav_path_b)
mel_tensor_b = preprocess(wave_b).to('cuda')
mel_length = mel_tensor_b.size(2)
if mel_length > max_mel_length: # this is just a cutting operation, it should be kept
    mel_tensor_b = mel_tensor_b[:,:, : max_mel_length]
x_b = mel_tensor_b.unsqueeze(0)
x_b.to('cuda')

tensor([[[[ 1.1065,  1.2497,  1.4650,  ...,  1.4276,  1.5203,  1.3801],
          [ 1.0809,  1.0233,  0.8850,  ...,  1.7169,  1.8120,  1.9087],
          [ 0.6111,  0.6313,  0.3530,  ...,  2.7010,  2.7178,  2.7052],
          ...,
          [-1.1691, -1.1775, -1.0895,  ..., -0.2841, -0.4726, -0.6430],
          [-1.0818, -1.0916, -1.0120,  ..., -0.2076, -0.3942, -0.6946],
          [-1.1472, -1.2101, -1.1437,  ..., -0.2456, -0.4602, -0.6913]]]],
       device='cuda:0')

In [20]:
# reconstruction
with torch.no_grad():
    F0_a, GAN_F0_a, cyc_F0_a = F0_model(x_a)
    x_fake_b = starganv2.generator_b(x_a, F0=GAN_F0_a, masks=None)
    x_fake_b = x_fake_b.transpose(-1, -2).squeeze()
    
    c = x_fake_b.squeeze().to('cuda')
    y_out = vocoder.inference(c)
    fake_b = y_out.view(-1).cpu()
    
    F0_b, GAN_F0_b, cyc_F0_b = F0_model(x_b)
    x_fake_a = starganv2.generator_a(x_b, F0=GAN_F0_b, masks=None)
    x_fake_a = x_fake_a.transpose(-1, -2).squeeze()
    
    c = x_fake_a.squeeze().to('cuda')
    y_out = vocoder.inference(c)
    fake_a = y_out.view(-1).cpu()


import IPython.display as ipd
print('Converted: %s' % speakers[1])
display(ipd.Audio(fake_b, rate=sr_a))
print('Converted: %s' % speakers[0])
display(ipd.Audio(fake_a, rate=sr_b))

Converted: p226


Converted: p225
