# StarGANv2-VC Demo (VCTK 20 Speakers)

### Utils

In [1]:
## Notes: Zbyněk Lička
## Altered implementation of the original StarGANv2-VC inference.ipynb, located under the Demo/ folder
## For reference, check the original implementation: https://github.com/yl4579/StarGANv2-VC/blob/main/Demo/inference.ipynb
## Non-original comments are marked with "##"
# load packages
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import soundfile as sf

from Utils.ASR.models import ASRCNN
from Utils.JDC.model import JDCNet

from models import Generator

import os

from meldataset import build_dataloader

%matplotlib inline

In [2]:
# Source: http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is18/en_speaker_used.txt
# Source: https://github.com/jjery2243542/voice_conversion

## Parameters moved here for easy access
# The two speakers handled by this notebook. Index 0 names the output folder
# for the "a" conversions and index 1 the folder for the "b" conversions.
speakers = ['p225', 'p226']

# NOTE(review): directory_conv / directory_ref are not read by any of the
# cells shown in this notebook -- confirm whether they are still needed.
directory_conv = "Data/"
directory_ref = "Data/"

# Checkpoint from which the StarGANv2 generators are restored.
model_path = "Models/latest.pth"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Mel-spectrogram front end used by preprocess(): 80 mel bins, a 2048-point
# FFT, a 1200-sample window and a 300-sample hop.
to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80,
    n_fft=2048,
    win_length=1200,
    hop_length=300,
)

# Constants used by preprocess() to normalise the log-mel values.
mean = -4
std = 4

def preprocess(wave):
    """Turn a NumPy waveform into a normalised log-mel tensor.

    The waveform is converted to a float tensor, passed through the
    module-level ``to_mel`` transform, given a new leading axis, mapped to
    log scale (with a 1e-5 offset so the log never sees zero) and finally
    normalised with the module-level ``mean`` and ``std``.

    Args:
        wave: NumPy array holding the audio samples
            (assumed mono, 1-D -- TODO confirm against the callers).

    Returns:
        The normalised log-mel spectrogram as a torch tensor with one extra
        leading dimension compared to the output of ``to_mel``.
    """
    samples = torch.from_numpy(wave).float()
    mel = to_mel(samples)
    log_mel = torch.log(1e-5 + mel.unsqueeze(0))
    return (log_mel - mean) / std

## Changed it to build two generators
def build_model(model_params={}):
    """Create the two generators used for conversion.

    Args:
        model_params: mapping that provides ``dim_in``, ``max_conv_dim``,
            ``w_hpf`` and ``F0_channel``; these are forwarded to ``Generator``.

    Returns:
        A ``Munch`` with two independently constructed ``Generator``
        instances under the keys ``generator_a`` and ``generator_b``.
    """
    cfg = Munch(model_params)

    def new_generator():
        # Both generators share the same hyper-parameters but own their weights.
        return Generator(
            cfg.dim_in,
            cfg.max_conv_dim,
            w_hpf=cfg.w_hpf,
            F0_channel=cfg.F0_channel,
        )

    first = new_generator()
    second = new_generator()
    return Munch(generator_a=first, generator_b=second)
## Removed compute_style()

### Load models

In [4]:
# load pretrained F0 model
F0_model = JDCNet(num_class=1, seq_len=192)
# The checkpoint is deserialised on the CPU and the weights live under its
# 'net' entry. torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
params = torch.load("Utils/JDC/bst.t7", map_location='cpu')['net']
F0_model.load_state_dict(params)
# The F0 network is only used for inference here, so switch it to eval mode:
# any dropout / batch-norm layers it contains must not run in training mode,
# otherwise the extracted F0 features depend on the batch and on random masks.
F0_model = F0_model.to(device).eval()

In [5]:
# load vocoder
from parallel_wavegan.utils import load_model

# Place the vocoder on the device selected in the configuration cell instead
# of a hard-coded 'cuda', so the notebook also runs on CPU-only machines.
vocoder = load_model("Vocoder/checkpoint-400000steps.pkl").to(device)
vocoder.remove_weight_norm()
# Inference only: put the vocoder in eval mode (assigning the result to `_`
# keeps the cell from echoing the module repr).
_ = vocoder.eval()



In [6]:
# load starganv2
with open('Models/config.yml') as f:
    starganv2_config = yaml.safe_load(f)
starganv2 = build_model(model_params=starganv2_config["model_params"])

# Deserialise on the CPU; only the 'model_ema' entry of the checkpoint is used.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
params = torch.load(model_path, map_location='cpu')
params = params['model_ema']

# Restore each generator from the checkpoint entry with the same key, switch
# it to eval mode and move it to the configured device (nn.Module.to moves the
# module in place, so no re-assignment is needed).
for key in list(starganv2):
    net = starganv2[key]
    net.load_state_dict(params[key])
    net.eval()
    net.to(device)

### Reconstruction

In [7]:
## From here on forward, the code is severely altered
## Check the original for reference: https://github.com/yl4579/StarGANv2-VC/blob/main/Demo/inference.ipynb
# Each non-empty line of the list file holds at least two '|'-separated
# fields; the first two are kept as a (path_a, path_b) pair.
test_wavs = []
with open("Data/test_list.txt") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # failing with an IndexError on the missing second field.
            continue
        splitted = line.split('|')
        test_wavs.append((splitted[0], splitted[1]))

# One output folder per speaker under "converted/".
for speaker in speakers[:2]:
    os.makedirs(os.path.join("converted", speaker), exist_ok=True)

In [8]:
## This section was heavily modified, check the original for reference
mode = "norm_cyc"
# Mel spectrograms longer than this many frames are cut down to this length.
max_mel_length = 192


def _load_mel(wav_path):
    """Read a wav file and return (model input, sample rate).

    The waveform is run through preprocess(), moved to the configured
    device, truncated to at most ``max_mel_length`` frames along its last
    axis and given an extra leading batch axis.
    """
    wave, sample_rate = sf.read(wav_path)
    mel = preprocess(wave).to(device)
    # Slicing is a no-op when the spectrogram is already short enough.
    mel = mel[:, :, :max_mel_length]
    return mel.unsqueeze(0), sample_rate


def _convert(generator, x):
    """Convert one mel input with ``generator`` and vocode it to a waveform.

    The F0 network provides the pitch features passed to the generator; the
    generated mel is transposed, squeezed and fed to the vocoder. Returns the
    waveform as a flat CPU tensor.
    """
    with torch.no_grad():
        # Only the second output of the F0 network is used by the generator.
        _, gan_f0, _ = F0_model(x)
        mel_fake = generator(x, F0=gan_f0, masks=None)
        mel_fake = mel_fake.transpose(-1, -2).squeeze()
        return vocoder.inference(mel_fake).view(-1).cpu()


# reconstruction
for i, (wav_path_a, wav_path_b) in enumerate(test_wavs):
    x_a, sr_a = _load_mel(wav_path_a)
    x_b, sr_b = _load_mel(wav_path_b)

    # generator_b converts the "a" utterance, generator_a the "b" utterance.
    fake_b = _convert(starganv2.generator_b, x_a)
    fake_a = _convert(starganv2.generator_a, x_b)

    # NOTE(review): each file is written with the sample rate of the *source*
    # recording it was converted from -- confirm this matches the vocoder's
    # output rate.
    sf.write(os.path.join("converted", f"{speakers[0]}", f'converted_a_{mode}_{i}.wav'), fake_a.numpy(), sr_b)
    sf.write(os.path.join("converted", f"{speakers[1]}", f'converted_b_{mode}_{i}.wav'), fake_b.numpy(), sr_a)


import IPython.display as ipd

# Play back the conversions produced by the last iteration of the loop above:
# first the one for speakers[1], then the one for speakers[0]. Each is played
# at the sample rate of the source recording it was converted from.
for speaker, audio, rate in (
    (speakers[1], fake_b, sr_a),
    (speakers[0], fake_a, sr_b),
):
    print('Converted: %s' % speaker)
    display(ipd.Audio(audio, rate=rate))

Converted: p226


Converted: p225
