In [43]:
# Conversion is the actual voice-conversion step, which happens after a model has been trained,
# so a pretrained AutoVC model is provided to demonstrate it.

import os
import pickle
import torch
import numpy as np
from math import ceil
from model_vc import Generator

def pad_seq(x, base=32):
    """Zero-pad a 2-D array along its first axis up to a multiple of ``base``.

    Args:
        x: 2-D numpy array; rows are appended at the end of axis 0.
        base: the returned array's first dimension is the smallest multiple
            of this value that is >= ``x.shape[0]``.

    Returns:
        A tuple ``(padded, len_pad)`` where ``padded`` is ``x`` followed by
        ``len_pad`` rows of zeros.
    """
    n_rows = x.shape[0]
    # Distance from n_rows up to the next multiple of `base` (0 if already aligned).
    len_pad = int(-n_rows % base)
    assert len_pad >= 0
    padded = np.pad(x, ((0, len_pad), (0, 0)), 'constant')
    return padded, len_pad

# Run on the first GPU when CUDA is available; otherwise fall back to the CPU so the
# notebook can still be executed on a machine without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# one_hot selects how speakers are represented when they are fed to the generator.
one_hot = False
# Only the second Generator argument differs between the two modes.
# NOTE(review): presumably this is the speaker-embedding size — confirm against model_vc.Generator.
dim_emb = 20 if one_hot else 256
G = Generator(32, dim_emb, 512, 32).eval().to(device)

# Which trained model and which checkpoint of it to use.
ckpt_iters = 100000
model_dir = '20Spkr32Dim32Freq0Shot'
root = '/homes/bdoc3/my_autovc/model_data'
checkpoint_path = '{}/{}/ckpts/ckpt_{}.pth.tar'.format(root, model_dir, ckpt_iters)

# Output directory for generated audio; exist_ok avoids the check-then-create race and
# makes the cell safe to re-run.
subdir_for_wavs = '{}/{}/generated_wavs/{}iters'.format(root, model_dir, ckpt_iters)
os.makedirs(subdir_for_wavs, exist_ok=True)

In [44]:
# Load the trained AutoVC checkpoint into the generator G.
# map_location places every tensor stored in the checkpoint on `device`, so loading does
# not depend on which device the checkpoint was saved from and no manual `.cuda()` pass
# over the optimizer state is needed.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
g_checkpoint = torch.load(checkpoint_path, map_location=device)
G.load_state_dict(g_checkpoint['model_state_dict'])

# The optimizer state is restored alongside the weights; the conversion loop in this
# notebook runs under torch.no_grad() and never steps it.
g_optimizer = torch.optim.Adam(G.parameters(), 0.0001)
g_optimizer.load_state_dict(g_checkpoint['optimizer_state_dict'])

In [48]:
# See which speakers are in both the training and the pretrained meta data.
def _load_pickle(path):
    """Unpickle the file at `path`, closing the file handle afterwards.

    NOTE: pickle can execute arbitrary code while loading — only open trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


training_meta_data = _load_pickle(root +'/' +model_dir +'/training_meta_data.pkl')
pretrained_test_data = _load_pickle('metadata.pkl')
all_meta_data = _load_pickle('all_meta_data.pkl')

# The first element of every meta-data entry is the speaker id.
for training_entry in training_meta_data:
    print('Training speaker: ', training_entry[0])
    for pretrained_entry in pretrained_test_data:
        if training_entry[0] == pretrained_entry[0]:
            print('Speaker in common with pretrained meta data: ', training_entry[0])

Training speaker:  p244
Training speaker:  p303
Training speaker:  p376
Training speaker:  p360
Training speaker:  p341
Training speaker:  p233
Training speaker:  p259
Training speaker:  p241
Training speaker:  p293
Training speaker:  p284
Training speaker:  p287
Training speaker:  p315
Training speaker:  p275
Training speaker:  p347
Training speaker:  p253
Training speaker:  p238
Training speaker:  p292
Training speaker:  p228
Speaker in common with pretrained meta data:  p228
Training speaker:  p364
Training speaker:  p276


In [63]:
# Choose which speakers to use as test speakers and, in one-hot mode, find their
# corresponding positions in the training meta data.

# p360  19  M    American  New  Jersey
# p259  23  M    English    Nottingham
# p233  23  F    English    Staffordshire
# p228  22  F    English    Southern  England
test_speakers = ['p360', 'p259', 'p233']
trained_spkr = 'p228'
test_speakers.append(trained_spkr)
if one_hot:
    # Map each speaker id to the first position at which it occurs in the training meta data.
    training_idx_by_id = {}
    for idx, training_entry in enumerate(training_meta_data):
        training_idx_by_id.setdefault(training_entry[0], idx)

    test_speaker_idx = []
    for spkr_id in test_speakers:
        # A speaker that is silently skipped would leave test_speaker_idx shorter than
        # test_speakers and pair later speakers with the wrong index, so fail loudly.
        if spkr_id not in training_idx_by_id:
            raise ValueError('test speaker {} is not in the training meta data'.format(spkr_id))
        print(training_idx_by_id[spkr_id], spkr_id)
        test_speaker_idx.append(training_idx_by_id[spkr_id])

In [66]:
# Determine one embedding per test speaker, in the same order as test_speakers.
num_training_spkrs = len(training_meta_data)

test_spkr_embs = []
if one_hot:
    # Row k of the identity matrix is the one-hot vector for training speaker k.
    one_hot_array = np.eye(num_training_spkrs)
    for spkr_idx in test_speaker_idx:
        test_spkr_embs.append(one_hot_array[spkr_idx])
else:
    # Index the meta data by speaker id (first occurrence wins); the second element of
    # each entry is used as that speaker's embedding.
    emb_by_id = {}
    for spkr_data in all_meta_data:
        emb_by_id.setdefault(spkr_data[0], spkr_data[1])

    # A speaker that is silently skipped would shift every later embedding onto the
    # wrong speaker when the lists are combined by position, so fail loudly instead.
    missing_spkrs = [spkr_id for spkr_id in test_speakers if spkr_id not in emb_by_id]
    if missing_spkrs:
        raise ValueError('no embedding found in all_meta_data for: {}'.format(missing_spkrs))
    test_spkr_embs = [emb_by_id[spkr_id] for spkr_id in test_speakers]

[array([ 6.93282634e-02, -1.01436982e-02,  1.21917101e-02,  2.21594591e-02,
        -3.41714695e-02, -4.04114053e-02, -4.54880372e-02, -2.06909627e-02,
         8.27469453e-02, -5.72406128e-02, -7.61412736e-03,  2.64559500e-02,
         3.45057882e-02,  2.50040106e-02, -2.14785747e-02, -7.63055170e-03,
        -9.44781303e-02, -6.96726665e-02, -1.35129886e-02,  3.05525362e-02,
         2.09923834e-02, -2.14075204e-02,  1.48701668e-01,  2.42711641e-02,
         9.46623087e-02, -6.19609058e-02, -4.53179553e-02, -3.43416706e-02,
        -1.40791563e-02,  5.82508855e-02,  4.88353893e-02,  5.48112318e-02,
        -3.99652049e-02,  3.28298029e-03,  6.39875070e-04,  8.42629522e-02,
         9.24306829e-03,  1.48851229e-02, -5.17848209e-02,  5.12772575e-02,
         7.31534977e-03,  3.38990577e-02,  5.61777428e-02, -5.51385395e-02,
        -3.62390205e-02,  2.54919976e-02, -1.44954948e-02,  2.18365155e-02,
        -2.59912219e-02,  4.35634442e-02, -1.56426424e-04, -5.33796139e-02,
         6.7

In [70]:
# For every test speaker, load the spectrogram of the chosen utterance and record
# the path of the matching audio file in the database.
example_name = '001'
spec_dir = './spmel'
database_root = '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed'
test_spkr_specs, database_paths = [], []
for spkr in test_speakers:
    # Shared "<speaker>/<speaker>_<utterance>_mic1" stem of both files
    uttr_stem = '{0}/{0}_{1}_mic1'.format(spkr, example_name)
    spec_path = '{}/{}.npy'.format(spec_dir, uttr_stem)
    audio_path = '{}/{}.flac'.format(database_root, uttr_stem)
    database_paths.append(audio_path)
    test_spkr_specs.append(np.load(spec_path))
database_paths

['/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p360/p360_001_mic1.flac',
 '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p259/p259_001_mic1.flac',
 '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p233/p233_001_mic1.flac',
 '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p228/p228_001_mic1.flac']

In [71]:
# Build the converter's input: a list of (speaker id, speaker embedding, mel spectrogram)
# tuples, one per test speaker.
# The three lists are combined by position, so a length mismatch means some speaker
# would be paired with another speaker's data — refuse to continue in that case.
if not (len(test_speakers) == len(test_spkr_embs) == len(test_spkr_specs)):
    raise ValueError(
        'misaligned test data: {} speakers, {} embeddings, {} spectrograms'.format(
            len(test_speakers), len(test_spkr_embs), len(test_spkr_specs)))

training_data_for_synthesis = list(zip(test_speakers, test_spkr_embs, test_spkr_specs))

In [73]:
# spect_vc collects the converted spectrograms, to be used later for spec2wav conversion
# via the wavenet model; x_org_list keeps the unpadded source spectrograms.
spect_vc = []
x_org_list = []


def _emb_to_tensor(emb):
    """Return speaker embedding `emb` as a tensor with a leading batch axis, on `device`."""
    emb_tensor = torch.from_numpy(emb[np.newaxis, :]).to(device)
    # In one-hot mode the embedding is cast with .float() before being given to G.
    return emb_tensor.float() if one_hot else emb_tensor


# Every entry acts as both source and target, so build each embedding tensor once up
# front instead of once per (source, target) pair.
emb_tensors = [_emb_to_tensor(sbmt[1]) for sbmt in training_data_for_synthesis]

# This is conversion, not training: no gradients are needed anywhere in the loop.
with torch.no_grad():
    # each sbmt_i has a speaker ID, a speaker embedding, and a spectrogram
    for sbmt_i, emb_org in zip(training_data_for_synthesis, emb_tensors):
        # Source spectrogram: stored unpadded, then padded before being fed to G.
        x_org = sbmt_i[2]
        x_org_list.append( ('{}'.format(sbmt_i[0]), x_org) )
        x_org, len_pad = pad_seq(x_org)
        # The source utterance is the padded spectrogram with a leading batch axis.
        uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
        # Slice end that drops the padded frames again (None keeps everything).
        trim_end = -len_pad if len_pad else None

        # Convert the source utterance towards every speaker, itself included, so
        # N entries produce N*N conversions.
        for sbmt_j, emb_trg in zip(training_data_for_synthesis, emb_tensors):
            # G receives the source utterance, the source embedding and the target embedding;
            # only its second output is kept.
            _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)
            uttr_trg = x_identic_psnt[0, 0, :trim_end, :].cpu().numpy()
            # Each result is stored under a '<source>x<target>' name.
            spect_vc.append( ('{}x{}'.format(sbmt_i[0], sbmt_j[0]), uttr_trg) )


with open('results.pkl', 'wb') as handle:
    pickle.dump(spect_vc, handle)

with open('x_org.pkl', 'wb') as handle:
    pickle.dump(x_org_list, handle)

print('done')

done


In [74]:
# Prepare the wavenet model used to turn spectrograms into audio.
# Imports come first so that this cell does not rely on an earlier cell having
# imported torch before torch.cuda.empty_cache() is called.
import torch
import librosa
import soundfile as sf
import pickle
from synthesis import build_model
from synthesis import wavegen

# Release cached GPU memory before the wavenet model is loaded.
torch.cuda.empty_cache()

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = build_model().to(device)
# map_location places the checkpoint tensors on `device` regardless of where they were saved.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
checkpoint = torch.load("checkpoint_step001000000_ema.pth", map_location=device)
model.load_state_dict(checkpoint["state_dict"])

<All keys matched successfully>

In [None]:
# Generate a waveform for every spectrogram stored in results.pkl and write each one
# out as a 16 kHz wav file.
# NOTE: pickle can execute arbitrary code while loading — only open trusted files.
with open('results.pkl', 'rb') as handle:
    results = pickle.load(handle)

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
subdir_for_conversion = subdir_for_wavs +'/conversions'
os.makedirs(subdir_for_conversion, exist_ok=True)

# Each result is a (name, spectrogram) pair; the name becomes the wav file name.
for name, c in results:
    print(name)
    waveform = wavegen(model, c=c)
    sf.write(subdir_for_conversion +'/' +name +'.wav', waveform, samplerate=16000)

  0%|          | 6/35072 [00:00<10:02, 58.16it/s]

p360xp360


 37%|███▋      | 13066/35072 [03:34<06:16, 58.41it/s]