In [12]:
# Conversion performs the actual voice conversion that happens after a model is trained,
# so we've been provided with a pretrained AutoVC model to demonstrate this.

import os
import pickle
import torch
import numpy as np
from math import ceil
from model_vc import Generator

def pad_seq(x, base=32):
    """Zero-pad ``x`` along its first axis up to the next multiple of ``base``.

    Args:
        x: 2-D numpy array. Only axis 0 is padded; the pad specification
            handed to ``np.pad`` covers exactly two axes.
        base: the padded length is the smallest multiple of ``base`` that is
            not less than ``x.shape[0]``.

    Returns:
        A ``(padded, len_pad)`` tuple: the padded array and the number of
        zero rows that were appended at the end of axis 0.
    """
    n_rows = x.shape[0]
    # Distance from n_rows up to the next multiple of base (0 when already aligned).
    len_pad = (-n_rows) % base
    assert len_pad >= 0
    padded = np.pad(x, ((0, len_pad), (0, 0)), 'constant')
    return padded, len_pad

# Inference configuration. The GPU is addressed explicitly; there is no CPU fallback.
device = 'cuda:0'

# one_hot selects the second Generator constructor argument (20 vs. 256) --
# presumably the speaker-embedding size; confirm against model_vc.Generator.
one_hot = True
emb_size = 20 if one_hot else 256
G = Generator(32, emb_size, 512, 32).eval().to(device)

# Checkpoint to convert with: <root>/<model_dir>/ckpts/ckpt_<ckpt_iters>.pth.tar
ckpt_iters = 575000
model_dir = 'Default1HotFrom400kIters'
root = '/homes/bdoc3/my_autovc/model_data'
checkpoint_path = '{}/{}/ckpts/ckpt_{}.pth.tar'.format(root, model_dir, ckpt_iters)

# Output directory for generated audio, keyed by the checkpoint's iteration count.
subdir_for_wavs = '{}/{}/generated_wavs/{}iters'.format(root, model_dir, ckpt_iters)
# exist_ok=True avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(subdir_for_wavs, exist_ok=True)

In [13]:
# Load the trained AutoVC weights from checkpoint_path into the generator G.
# map_location pins every tensor in the checkpoint to `device`, so a checkpoint
# that was saved from a different GPU index still loads here.
g_checkpoint = torch.load(checkpoint_path, map_location=device)
G.load_state_dict(g_checkpoint['model_state_dict'])
# The optimizer state is restored too, although the conversion loop in this
# notebook runs under torch.no_grad() and never steps the optimizer.
g_optimizer = torch.optim.Adam(G.parameters(), 0.0001)
g_optimizer.load_state_dict(g_checkpoint['optimizer_state_dict'])

In [14]:
# See which speakers are in both the training metadata and the pretrained metadata.
# NOTE: pickle can execute arbitrary code while loading -- only open metadata files you trust.
training_meta_path = root +'/' +model_dir +'/training_meta_data.pkl'
# `with` closes each file as soon as it has been read instead of leaking the handle.
with open(training_meta_path, "rb") as meta_file:
    training_meta_data = pickle.load(meta_file)
with open('metadata.pkl', "rb") as meta_file:
    pretrained_test_data = pickle.load(meta_file)

# Element 0 of every metadata entry is the speaker id that gets printed and compared.
for spkr_i in training_meta_data:
    print('Training speaker: ', spkr_i[0])
    for spkr_j in pretrained_test_data:
        if spkr_i[0]==spkr_j[0]:
            print('Speaker in common with pretrained meta data: ', spkr_i[0])

Training speaker:  p244
Training speaker:  p303
Training speaker:  p376
Training speaker:  p360
Training speaker:  p341
Training speaker:  p233
Training speaker:  p259
Training speaker:  p241
Training speaker:  p293
Training speaker:  p284
Training speaker:  p287
Training speaker:  p315
Training speaker:  p275
Training speaker:  p347
Training speaker:  p253
Training speaker:  p238
Training speaker:  p292
Training speaker:  p228
Speaker in common with pretrained meta data:  p228
Training speaker:  p364
Training speaker:  p276


In [15]:
# Determine test speakers

# p360  19  M    American  New  Jersey
# p259  23  M    English    Nottingham
# p233  23  F    English    Staffordshire
# p228  22  F    English    Southern  England
test_speakers = ['p360', 'p259', 'p233']
trained_spkr = 'p228'
test_speakers.append(trained_spkr)

# Find each test speaker's position in training_meta_data (element 0 of an entry
# is the speaker id). A speaker that is missing raises immediately: silently
# skipping it would leave test_speaker_idx shorter than test_speakers and
# misalign every list that is later paired with it by position.
training_spkr_ids = [entry[0] for entry in training_meta_data]
test_speaker_idx = []
for spkr_i in test_speakers:
    if spkr_i not in training_spkr_ids:
        raise ValueError('Test speaker {} is not in training_meta_data'.format(spkr_i))
    j = training_spkr_ids.index(spkr_i)
    print(j, spkr_i)
    test_speaker_idx.append(j)
test_speaker_idx

3 p360
6 p259
5 p233
17 p228


[3, 6, 5, 17]

In [16]:
# Determine embeddings
# Only the one-hot scheme is implemented. Fail with an explicit, descriptive
# error rather than relying on a NameError from an undefined name.
if not one_hot:
    raise NotImplementedError('Only one-hot speaker embeddings are supported in this notebook')

# Row k of the identity matrix is the one-hot vector for training speaker k.
num_training_spkrs = len(training_meta_data)
one_hot_array = np.eye(num_training_spkrs)

# One embedding per test speaker, in the same order as test_speaker_idx.
test_spkr_embs = [one_hot_array[spkr_idx] for spkr_idx in test_speaker_idx]
test_spkr_embs

[array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0.])]

In [17]:
# Gather, per test speaker, the stored spectrogram of one example utterance
# (for the converter) and the path of the matching audio file in the database.
example_name = '001'
spec_dir = './spmel'
database_root = '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed'
test_spkr_specs = []
database_paths = []
for spkr in test_speakers:
    print(spkr)
    # Both files share the stem <spkr>/<spkr>_<example>_mic1 and differ only in root and extension.
    utt_stem = '{0}/{0}_{1}_mic1'.format(spkr, example_name)
    spec_path = '{}/{}.npy'.format(spec_dir, utt_stem)
    audio_path = '{}/{}.flac'.format(database_root, utt_stem)
    database_paths.append(audio_path)
    spec = np.load(spec_path)
    print(spec)
    test_spkr_specs.append(spec)
database_paths

p360
[[0.22176196 0.26607546 0.16936074 ... 0.         0.         0.00643862]
 [0.26866588 0.25565267 0.24472088 ... 0.00692705 0.         0.00536238]
 [0.27034816 0.23925517 0.24312149 ... 0.01256773 0.         0.        ]
 ...
 [0.38358983 0.25486168 0.21697247 ... 0.05973994 0.07492047 0.07607802]
 [0.3766503  0.28793564 0.24837269 ... 0.04760766 0.0669216  0.07356098]
 [0.35273877 0.33902618 0.2798415  ... 0.04621968 0.04384156 0.03151707]]
p259
[[0.32901496 0.30260512 0.27612418 ... 0.03612921 0.         0.        ]
 [0.33875662 0.29561654 0.28199044 ... 0.01817036 0.         0.        ]
 [0.29525036 0.25718814 0.18256567 ... 0.         0.         0.        ]
 ...
 [0.40179682 0.31568745 0.33146933 ... 0.07257041 0.08366439 0.07716304]
 [0.31301412 0.26171017 0.30939654 ... 0.0401157  0.05747946 0.0258889 ]
 [0.29566148 0.29055786 0.33375525 ... 0.05467255 0.0603699  0.        ]]
p233
[[0.3925055  0.4009754  0.2377748  ... 0.         0.         0.        ]
 [0.41015998 0.36489874 

['/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p360/p360_001_mic1.flac',
 '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p259/p259_001_mic1.flac',
 '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p233/p233_001_mic1.flac',
 '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p228/p228_001_mic1.flac']

In [18]:
# Bundle the three parallel lists into (speaker id, embedding, spectrogram)
# tuples, one per test speaker, in test_speakers order.
training_data_for_synthesis = [
    (test_speakers[k], test_spkr_embs[k], test_spkr_specs[k])
    for k in range(len(test_speakers))
]

for entry in training_data_for_synthesis:
    print(entry)

('p360', array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), array([[0.22176196, 0.26607546, 0.16936074, ..., 0.        , 0.        ,
        0.00643862],
       [0.26866588, 0.25565267, 0.24472088, ..., 0.00692705, 0.        ,
        0.00536238],
       [0.27034816, 0.23925517, 0.24312149, ..., 0.01256773, 0.        ,
        0.        ],
       ...,
       [0.38358983, 0.25486168, 0.21697247, ..., 0.05973994, 0.07492047,
        0.07607802],
       [0.3766503 , 0.28793564, 0.24837269, ..., 0.04760766, 0.0669216 ,
        0.07356098],
       [0.35273877, 0.33902618, 0.2798415 , ..., 0.04621968, 0.04384156,
        0.03151707]], dtype=float32))
('p259', array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), array([[0.32901496, 0.30260512, 0.27612418, ..., 0.03612921, 0.        ,
        0.        ],
       [0.33875662, 0.29561654, 0.28199044, ..., 0.01817036, 0.        ,
        0.        ],
       [0

In [19]:
# Convert every source utterance to every target speaker with the generator G.
# With N entries in training_data_for_synthesis this produces N*N conversions.
# Only the one-hot scheme is implemented, so fail early with an explicit error.
if not one_hot:
    raise NotImplementedError('Conversion is only implemented for one-hot speaker embeddings')

# spect_vc collects ('<source>x<target>', converted spectrogram) pairs to be used
# later for spec2wav conversion via the wavenet model
spect_vc = []
# x_org_list keeps (speaker id, unpadded source spectrogram) pairs
x_org_list = []

# each sbmt_i has a speaker ID, a speaker embedding, and a spectrogram
for sbmt_i in training_data_for_synthesis:

    # x origin - 80Mel spectrogram; stored before padding
    x_org = sbmt_i[2]
    x_org_list.append( ('{}'.format(sbmt_i[0]), x_org) )
    x_org, len_pad = pad_seq(x_org)
    # utterance origin is just the padded spec in tensor form, with a leading batch axis
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    # source speaker embedding, cast to float32 once per source rather than once per target
    emb_org = torch.from_numpy(sbmt_i[1][np.newaxis, :]).float().to(device)
    # frames to keep from the generator output: everything except the len_pad padded frames
    keep_frames = slice(None, -len_pad) if len_pad else slice(None)

    # use the embedding of each entry, including the source itself, as the conversion target
    for sbmt_j in training_data_for_synthesis:
        # target speaker embedding (one-hot in this notebook)
        emb_trg = torch.from_numpy(sbmt_j[1][np.newaxis, :]).float().to(device)

        # as this is conversion not training, no gradients are needed
        with torch.no_grad():
            # x_identic_psnt = converted utterance produced by the Generator
            _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)

        # utterance target is the converted speech with the padding trimmed off
        uttr_trg = x_identic_psnt[0, 0, keep_frames, :].cpu().numpy()
        # utterance is saved in a list, along with source and target ID
        spect_vc.append( ('{}x{}'.format(sbmt_i[0], sbmt_j[0]), uttr_trg) )


# Persist the conversions and the original spectrograms for the vocoder cells.
with open('results.pkl', 'wb') as results_file:
    pickle.dump(spect_vc, results_file)

with open('x_org.pkl', 'wb') as x_org_file:
    pickle.dump(x_org_list, x_org_file)

print('done')

done


In [20]:
# prepare wavenet operations
# Imports come first so that the torch call below does not depend on `torch`
# having been imported by an earlier cell.
import torch
import librosa
import soundfile as sf
import pickle
from synthesis import build_model
from synthesis import wavegen

# Release cached, unused GPU memory before the vocoder is loaded.
torch.cuda.empty_cache()

device = torch.device("cuda:0")
model = build_model().to(device)
# map_location keeps the checkpoint tensors on `device` regardless of where they were saved.
checkpoint = torch.load("checkpoint_step001000000_ema.pth", map_location=device)
model.load_state_dict(checkpoint["state_dict"])

<All keys matched successfully>

In [21]:
# Sanity check: reload the saved conversions and show the shape of the first
# converted spectrogram. Each result is a (name, spectrogram) pair.
with open('results.pkl', 'rb') as results_file:
    results = pickle.load(results_file)
result = results[0]
result[1].shape

(137, 80)

In [22]:
# Generate a waveform for every saved conversion with the wavenet model and
# write it to <subdir_for_wavs>/conversions/<name>.wav at 16 kHz.
with open('results.pkl', 'rb') as results_file:
    results = pickle.load(results_file)
subdir_for_conversion = subdir_for_wavs +'/conversions'
# exist_ok=True avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(subdir_for_conversion, exist_ok=True)

for result in results:
    # each result is a (name, spectrogram) pair; the name becomes the file name
    name = result[0]
    c = result[1]
    print(name)
    waveform = wavegen(model, c=c)
    # soundfile does the writing: librosa.output.write_wav was removed in librosa 0.8
    sf.write(subdir_for_conversion +'/' +name +'.wav', waveform, samplerate=16000)

  0%|          | 8/35072 [00:00<07:50, 74.59it/s]

p360xp360


100%|██████████| 35072/35072 [07:38<00:00, 76.44it/s]
  0%|          | 8/35072 [00:00<07:25, 78.67it/s]

p360xp259


100%|██████████| 35072/35072 [07:35<00:00, 77.06it/s]
  0%|          | 8/35072 [00:00<07:19, 79.73it/s]

p360xp233


100%|██████████| 35072/35072 [07:32<00:00, 77.56it/s]
  0%|          | 8/35072 [00:00<07:30, 77.87it/s]

p360xp228


100%|██████████| 35072/35072 [07:37<00:00, 76.70it/s]
  0%|          | 6/34304 [00:00<10:08, 56.32it/s]

p259xp360


100%|██████████| 34304/34304 [07:25<00:00, 77.02it/s]
  0%|          | 6/34304 [00:00<10:29, 54.47it/s]

p259xp259


100%|██████████| 34304/34304 [07:27<00:00, 76.65it/s]
  0%|          | 6/34304 [00:00<10:20, 55.25it/s]

p259xp233


100%|██████████| 34304/34304 [07:25<00:00, 77.08it/s]
  0%|          | 8/34304 [00:00<07:15, 78.82it/s]

p259xp228


100%|██████████| 34304/34304 [07:45<00:00, 73.62it/s]
  0%|          | 8/41984 [00:00<08:53, 78.75it/s]

p233xp360


100%|██████████| 41984/41984 [09:13<00:00, 75.79it/s]
  0%|          | 6/41984 [00:00<12:03, 58.01it/s]

p233xp259


 29%|██▉       | 12117/41984 [02:37<06:24, 77.64it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 96%|█████████▌| 40191/41984 [08:42<00:23, 77.52it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 66%|██████▌   | 28115/42752 [06:12<03:11, 76.62it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp

p228xp233


 29%|██▉       | 12338/42752 [02:40<06:34, 77.01it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 98%|█████████▊| 41779/42752 [09:05<00:12, 76.91it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

