In [33]:
# Conversion is the actual voice-conversion step that takes place after a model has been trained,
# so a pretrained AutoVC model has been provided to demonstrate it.

import os
import pickle
import torch
import numpy as np
from math import ceil
from model_vc import Generator

def pad_seq(x, base=32):
    """Zero-pad ``x`` along its first axis up to the next multiple of ``base``.

    Args:
        x: 2-D array; rows are appended at the end of axis 0, axis 1 is untouched.
        base: The padded length of axis 0 is the smallest multiple of ``base``
            that is not shorter than the input.

    Returns:
        Tuple ``(padded, len_pad)``: the padded array and the number of rows
        that were appended (0 when the input length is already a multiple).
    """
    n_rows = x.shape[0]
    target_rows = int(base * ceil(float(n_rows) / base))
    len_pad = target_rows - n_rows
    assert len_pad >= 0
    padded = np.pad(x, ((0, len_pad), (0, 0)), 'constant')
    return padded, len_pad

# --- Run configuration and generator construction ---------------------------
device = 'cuda:0'

# one_hot selects how speakers are represented. It changes only the second
# Generator argument (20 vs. 256) -- presumably the speaker-embedding size;
# TODO confirm against model_vc.Generator.
one_hot = True
emb_dim = 20 if one_hot else 256
G = Generator(32, emb_dim, 512, 32).eval().to(device)

# Location of the trained checkpoint to convert with.
ckpt_iters = 100000
model_dir = 'Default1Hot'
root = '/homes/bdoc3/my_autovc/model_saves'
checkpoint_path = os.path.join(root, model_dir, 'ckpts', 'ckpt_' + str(ckpt_iters) + '.pth.tar')

# Output directory for generated audio, one sub-directory per checkpoint.
# exist_ok=True creates it if needed without a separate (racy) existence check.
subdir_for_wavs = os.path.join(root, model_dir, 'generated_wavs', str(ckpt_iters) + 'iters')
os.makedirs(subdir_for_wavs, exist_ok=True)

In [34]:
# Restore the trained AutoVC weights into G.
# map_location remaps the stored tensors onto `device`, so the checkpoint also
# loads when it was saved from a different device than the one used here.
g_checkpoint = torch.load(checkpoint_path, map_location=device)
G.load_state_dict(g_checkpoint['model_state_dict'])

# The optimizer state is restored alongside the weights; the conversion cells
# shown in this notebook never step it.
g_optimizer = torch.optim.Adam(G.parameters(), 0.0001)
g_optimizer.load_state_dict(g_checkpoint['optimizer_state_dict'])

In [35]:
# See which speakers appear in both the training metadata and the pretrained metadata.
# NOTE: pickle.load can execute arbitrary code -- only load metadata files you trust.
# The `with` blocks make sure both files are closed again after reading.
with open(root +'/' +model_dir +'/training_meta_data.pkl', "rb") as meta_file:
    training_meta_data = pickle.load(meta_file)
with open('metadata.pkl', "rb") as meta_file:
    pretrained_test_data = pickle.load(meta_file)

# Element 0 of every metadata entry is the speaker ID.
for spkr_i in training_meta_data:
    print('Training speaker: ', spkr_i[0])
    for spkr_j in pretrained_test_data:
        if spkr_i[0]==spkr_j[0]:
            print('Speaker in common with pretrained meta data: ', spkr_i[0])

Training speaker:  p244
Training speaker:  p303
Training speaker:  p376
Training speaker:  p360
Training speaker:  p341
Training speaker:  p233
Training speaker:  p259
Training speaker:  p241
Training speaker:  p293
Training speaker:  p284
Training speaker:  p287
Training speaker:  p315
Training speaker:  p275
Training speaker:  p347
Training speaker:  p253
Training speaker:  p238
Training speaker:  p292
Training speaker:  p228
Speaker in common with pretrained meta data:  p228
Training speaker:  p364
Training speaker:  p276


In [36]:
# Determine test speakers

# Speaker details kept for reference:
# p360  19  M    American  New  Jersey
# p259  23  M    English    Nottingham
# p233  23  F    English    Staffordshire
# p228  22  F    English    Southern  England
test_speakers = ['p360', 'p259', 'p233']
trained_spkr = 'p228'
test_speakers.append(trained_spkr)

# Map each speaker ID to its position in training_meta_data (first occurrence wins).
spkr_to_idx = {}
for j, spkr_j in enumerate(training_meta_data):
    spkr_to_idx.setdefault(spkr_j[0], j)

# test_speaker_idx must stay parallel to test_speakers (one index per speaker,
# same order), so fail loudly instead of silently skipping an unknown speaker.
missing_spkrs = [spkr for spkr in test_speakers if spkr not in spkr_to_idx]
if missing_spkrs:
    raise ValueError('Test speakers missing from training_meta_data: {}'.format(missing_spkrs))

test_speaker_idx = []
for spkr_i in test_speakers:
    j = spkr_to_idx[spkr_i]
    print(j, spkr_i)
    test_speaker_idx.append(j)
test_speaker_idx

3 p360
6 p259
5 p233
17 p228


[3, 6, 5, 17]

In [37]:
# Determine embeddings
num_training_spkrs = len(training_meta_data)

# Only the one-hot speaker representation is implemented in this notebook.
if not one_hot:
    raise NotImplementedError('Only one-hot speaker embeddings are supported (one_hot must be True)')

# Row k of the identity matrix is the one-hot vector of training speaker k.
one_hot_array = np.eye(num_training_spkrs)

# One embedding per test speaker, in the same order as test_speaker_idx.
test_spkr_embs = [one_hot_array[spkr_idx] for spkr_idx in test_speaker_idx]
test_spkr_embs

[array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0.])]

In [38]:
# Gather the ingredients the converter needs for every test speaker: the stored
# spectrogram of one example utterance, plus the path of the matching audio file
# in the database (the paths are kept in database_paths).
example_name = '001'
spec_dir = './spmel'
database_root = '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed'
test_spkr_specs = []
database_paths = []
for spkr in test_speakers:
    print(spkr)
    spkr_id = str(spkr)
    # File stem shared by the spectrogram and the audio file of this utterance.
    utt_stem = '{}_{}_mic1'.format(spkr_id, example_name)
    spec_path = '/'.join([spec_dir, spkr_id, utt_stem + '.npy'])
    audio_path = '/'.join([database_root, spkr_id, utt_stem + '.flac'])
    database_paths.append(audio_path)
    spec = np.load(spec_path)
    print(spec)
    test_spkr_specs.append(spec)
database_paths

p360
[[0.22176196 0.26607546 0.16936074 ... 0.         0.         0.00643862]
 [0.26866588 0.25565267 0.24472088 ... 0.00692705 0.         0.00536238]
 [0.27034816 0.23925517 0.24312149 ... 0.01256773 0.         0.        ]
 ...
 [0.38358983 0.25486168 0.21697247 ... 0.05973994 0.07492047 0.07607802]
 [0.3766503  0.28793564 0.24837269 ... 0.04760766 0.0669216  0.07356098]
 [0.35273877 0.33902618 0.2798415  ... 0.04621968 0.04384156 0.03151707]]
p259
[[0.32901496 0.30260512 0.27612418 ... 0.03612921 0.         0.        ]
 [0.33875662 0.29561654 0.28199044 ... 0.01817036 0.         0.        ]
 [0.29525036 0.25718814 0.18256567 ... 0.         0.         0.        ]
 ...
 [0.40179682 0.31568745 0.33146933 ... 0.07257041 0.08366439 0.07716304]
 [0.31301412 0.26171017 0.30939654 ... 0.0401157  0.05747946 0.0258889 ]
 [0.29566148 0.29055786 0.33375525 ... 0.05467255 0.0603699  0.        ]]
p233
[[0.3925055  0.4009754  0.2377748  ... 0.         0.         0.        ]
 [0.41015998 0.36489874 

['/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p360/p360_001_mic1.flac',
 '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p259/p259_001_mic1.flac',
 '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p233/p233_001_mic1.flac',
 '/import/c4dm-datasets/VCTK-Corpus-0.92/wav48_silence_trimmed/p228/p228_001_mic1.flac']

In [39]:
# Bundle every test speaker as a (speaker ID, speaker embedding, spectrogram) tuple.
# The three lists are parallel, so check their lengths first: a mismatch would
# otherwise pair speakers with the wrong data or fail part-way through.
if not (len(test_speakers) == len(test_spkr_embs) == len(test_spkr_specs)):
    raise ValueError(
        'Parallel lists differ in length: {} speakers, {} embeddings, {} spectrograms'.format(
            len(test_speakers), len(test_spkr_embs), len(test_spkr_specs)))

training_data_for_synthesis = list(zip(test_speakers, test_spkr_embs, test_spkr_specs))

for entry in training_data_for_synthesis:
    print(entry)

('p360', array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), array([[0.22176196, 0.26607546, 0.16936074, ..., 0.        , 0.        ,
        0.00643862],
       [0.26866588, 0.25565267, 0.24472088, ..., 0.00692705, 0.        ,
        0.00536238],
       [0.27034816, 0.23925517, 0.24312149, ..., 0.01256773, 0.        ,
        0.        ],
       ...,
       [0.38358983, 0.25486168, 0.21697247, ..., 0.05973994, 0.07492047,
        0.07607802],
       [0.3766503 , 0.28793564, 0.24837269, ..., 0.04760766, 0.0669216 ,
        0.07356098],
       [0.35273877, 0.33902618, 0.2798415 , ..., 0.04621968, 0.04384156,
        0.03151707]], dtype=float32))
('p259', array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.]), array([[0.32901496, 0.30260512, 0.27612418, ..., 0.03612921, 0.        ,
        0.        ],
       [0.33875662, 0.29561654, 0.28199044, ..., 0.01817036, 0.        ,
        0.        ],
       [0

In [40]:
# Convert every source utterance to every target speaker.
# spect_vc collects the converted spectrograms, to be used later for the
# spectrogram-to-waveform conversion; x_org_list keeps the unpadded originals.

# Only the one-hot speaker representation is implemented; fail before doing any work.
if not one_hot:
    raise NotImplementedError('Only one-hot speaker embeddings are supported (one_hot must be True)')

spect_vc = []
x_org_list = []

# each sbmt_i has a speaker ID, a speaker embedding, and a spectrogram
for sbmt_i in training_data_for_synthesis:

    # source spectrogram, stored unpadded under the source speaker ID
    x_org = sbmt_i[2]
    x_org_list.append( ('{}'.format(sbmt_i[0]), x_org) )
    x_org, len_pad = pad_seq(x_org)
    # source utterance: the padded spectrogram with a leading batch axis, as a tensor
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    # source speaker embedding, cast to float32 once per source utterance
    emb_org = torch.from_numpy(sbmt_i[1][np.newaxis, :]).to(device).float()

    # Every entry also serves as a conversion target, so N entries yield N*N
    # conversions (including each speaker converted to itself).
    for sbmt_j in training_data_for_synthesis:
        # target speaker embedding
        emb_trg = torch.from_numpy(sbmt_j[1][np.newaxis, :]).to(device).float()

        # inference only, so no gradients are needed
        with torch.no_grad():
            # the second output of G is the converted utterance
            _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)

        # Drop the frames that pad_seq appended (a no-op when len_pad == 0).
        n_frames = x_identic_psnt.shape[2] - len_pad
        uttr_trg = x_identic_psnt[0, 0, :n_frames, :].cpu().numpy()
        # store the conversion under the key '<source ID>x<target ID>'
        spect_vc.append( ('{}x{}'.format(sbmt_i[0], sbmt_j[0]), uttr_trg) )


with open('results.pkl', 'wb') as handle:
    pickle.dump(spect_vc, handle)

with open('x_org.pkl', 'wb') as handle:
    pickle.dump(x_org_list, handle)

print('done')

done


In [41]:
# prepare wavenet operations

import torch
import librosa
import soundfile as sf
import pickle
from synthesis import build_model
from synthesis import wavegen


# Build the vocoder model and restore its weights.
device = torch.device("cuda:0")
model = build_model().to(device)
# map_location puts the checkpoint tensors on `device` regardless of the device
# they were saved from.
checkpoint = torch.load("checkpoint_step001000000_ema.pth", map_location=device)
model.load_state_dict(checkpoint["state_dict"])

<All keys matched successfully>

In [42]:
# Reproduce the original audio from the pre-computed (noise-injected) spectrograms
torch.cuda.empty_cache()

# x_org.pkl holds (speaker ID, spectrogram) pairs written by the conversion cell.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('x_org.pkl', 'rb') as handle:
    x_orgs = pickle.load(handle)

subdir_for_x_origin = subdir_for_wavs +'/x_origins'
# exist_ok=True creates the directory if needed without a separate existence check
os.makedirs(subdir_for_x_origin, exist_ok=True)

for name, c in x_orgs:
    file_path = subdir_for_x_origin +'/' +name +'.wav'
    print(name, file_path)
    # vocode the spectrogram and store the result as a 16 kHz wav file
    waveform = wavegen(model, c=c)
    sf.write(file_path, waveform, samplerate=16000)

  0%|          | 6/35072 [00:00<11:12, 52.10it/s]

p360 /homes/bdoc3/my_autovc/model_saves/Default1Hot/generated_wavs/100000iters/x_origins/p360.wav


100%|██████████| 35072/35072 [09:51<00:00, 59.28it/s]
  0%|          | 6/34304 [00:00<10:22, 55.10it/s]

p259 /homes/bdoc3/my_autovc/model_saves/Default1Hot/generated_wavs/100000iters/x_origins/p259.wav


100%|██████████| 34304/34304 [09:45<00:00, 58.58it/s]
  0%|          | 6/41984 [00:00<12:25, 56.28it/s]

p233 /homes/bdoc3/my_autovc/model_saves/Default1Hot/generated_wavs/100000iters/x_origins/p233.wav


100%|██████████| 41984/41984 [11:43<00:00, 59.71it/s]
  0%|          | 7/42752 [00:00<10:18, 69.10it/s]

p228 /homes/bdoc3/my_autovc/model_saves/Default1Hot/generated_wavs/100000iters/x_origins/p228.wav


100%|██████████| 42752/42752 [12:09<00:00, 58.60it/s]


In [43]:
import os, pdb, time, shutil, crepe, librosa, pickle, random
import numpy as np
import soundfile as sf
from scipy import signal
from scipy.signal import get_window, medfilt
from librosa.filters import mel
from numpy.random import RandomState

def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cutoff frequency, in the same unit as ``fs``.
        fs: Sampling rate of the signal the filter will be applied to.
        order: Filter order.

    Returns:
        Tuple ``(b, a)`` of numerator and denominator coefficients.
    """
    # scipy expects the cutoff relative to the Nyquist frequency (fs / 2).
    nyquist = 0.5 * fs
    numer, denom = signal.butter(order, cutoff / nyquist, btype='high', analog=False)
    return numer, denom

def pySTFT(x, fft_length=1024, hop_length=256):
    """Magnitude short-time Fourier transform of ``x`` with a Hann window.

    The signal is reflect-padded by ``fft_length // 2`` samples on each side,
    cut into overlapping frames of ``fft_length`` samples that start
    ``hop_length`` samples apart, windowed, and passed through a real FFT.

    Args:
        x: Input signal (the callers in this notebook pass a 1-D array).
        fft_length: Frame length and FFT size.
        hop_length: Distance in samples between the starts of adjacent frames.

    Returns:
        Array of magnitudes; for 1-D input, axis 0 indexes the
        ``fft_length // 2 + 1`` frequency bins and axis 1 indexes the frames.
    """
    padded = np.pad(x, int(fft_length//2), mode='reflect')

    overlap = fft_length - hop_length
    n_frames = (padded.shape[-1] - overlap) // hop_length
    sample_stride = padded.strides[-1]

    # Framing without copying: each frame is a strided view into `padded`
    # that starts hop_length samples after the previous one.
    frames = np.lib.stride_tricks.as_strided(
        padded,
        shape=padded.shape[:-1] + (n_frames, fft_length),
        strides=padded.strides[:-1] + (hop_length * sample_stride, sample_stride),
    )

    window = get_window('hann', fft_length, fftbins=True)
    spectrum = np.fft.rfft(window * frames, n=fft_length).T

    return np.abs(spectrum)

# Recompute spectrograms straight from the database audio. The random-noise
# augmentation is deliberately left out here (hence "no_augs").

# Keyword arguments keep these librosa calls valid on releases where the
# parameters are keyword-only, and they are accepted by older releases as well.
mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
# amplitude floor of -100 dB, i.e. 10 ** (-100 / 20)
min_level = np.exp(-100 / 20 * np.log(10))
b, a = butter_highpass(30, 16000, order=5)

origins_no_augs = []

for file_path in database_paths:
    # ensure that only mic1 files are processed
    if not file_path.endswith('mic1.flac'):
        continue
    # Read audio file
    audio, sr = sf.read(file_path)
    # Remove drifting noise
    y = signal.filtfilt(b, a, audio)
    # resample from the file's sampling rate to 16kHz
    resampled_wav = librosa.resample(y, orig_sr=sr, target_sr=16000)
    # Compute spect
    D = pySTFT(resampled_wav).T
    # Convert to mel and normalize
    D_mel = np.dot(D, mel_basis)
    #Author mentioned min level -100 and ref level 16 dB in https://github.com/auspicious3000/autovc/issues/4
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    S = np.clip((D_db + 100) / 100, 0, 1)
    # Utterance name = file name without the '_mic1.flac' suffix. Taking it
    # from the basename also works for speaker IDs that are not 4 characters long.
    utt_name = os.path.basename(file_path)[:-len('_mic1.flac')]
    # save spect
    origins_no_augs.append((utt_name, S))
origins_no_augs

[('p360_001',
  array([[2.25306970e-01, 2.69609861e-01, 1.72910747e-01, ...,
          0.00000000e+00, 0.00000000e+00, 1.00404809e-02],
         [2.72209540e-01, 2.59184080e-01, 2.48264116e-01, ...,
          1.04524651e-02, 3.56945777e-04, 8.89657316e-03],
         [2.73895064e-01, 2.42805972e-01, 2.46666074e-01, ...,
          1.61198731e-02, 0.00000000e+00, 0.00000000e+00],
         ...,
         [3.87137681e-01, 2.58410766e-01, 2.20526963e-01, ...,
          6.32682574e-02, 7.84395061e-02, 7.96247342e-02],
         [3.80199271e-01, 2.91485559e-01, 2.51932819e-01, ...,
          5.11101733e-02, 7.04774190e-02, 7.71211486e-02],
         [3.56290588e-01, 3.42577357e-01, 2.83377008e-01, ...,
          4.97290658e-02, 4.74290419e-02, 3.50742196e-02]])),
 ('p259_001',
  array([[0.33257237, 0.30615085, 0.27965283, ..., 0.03968096, 0.        ,
          0.        ],
         [0.34230928, 0.29916488, 0.28552444, ..., 0.02172501, 0.        ,
          0.        ],
         [0.29879044, 0.260

In [44]:
# Vocode the spectrograms recomputed from the database audio (no augmentation).
torch.cuda.empty_cache()

subdir_for_x_origins_no_augs = subdir_for_wavs +'/x_origins_no_augs'
# exist_ok=True creates the directory if needed without a separate existence check
os.makedirs(subdir_for_x_origins_no_augs, exist_ok=True)

# each entry is a (name, spectrogram) pair
for name, c in origins_no_augs:
    print(name)
    # vocode the spectrogram and store the result as a 16 kHz wav file
    waveform = wavegen(model, c=c)
    sf.write(subdir_for_x_origins_no_augs +'/' +name +'.wav', waveform, samplerate=16000)

  0%|          | 7/35072 [00:00<08:37, 67.78it/s]

p360_001


100%|██████████| 35072/35072 [10:00<00:00, 58.41it/s]
  0%|          | 6/34304 [00:00<10:04, 56.71it/s]

p259_001


100%|██████████| 34304/34304 [09:59<00:00, 57.19it/s]
  0%|          | 8/41984 [00:00<09:26, 74.12it/s]

p233_001


100%|██████████| 41984/41984 [11:57<00:00, 58.55it/s]
  0%|          | 7/42752 [00:00<11:16, 63.22it/s]

p228_001


100%|██████████| 42752/42752 [11:31<00:00, 61.78it/s]
