# USE THIS NOTEBOOK AFTER TRAINING THE MODEL ON SINGING VOICE

In [1]:
from model_vc_og import Generator
import torch
import torch.nn.functional as F
import os


In [2]:
# Generator checkpoint used by the interactive cells below (the file name
# encodes the experiment: 24x256x512x32, DAMP, step 350000).
ckpt_loc = './processed_data/trained_models/checkpoint_experiment24x256x512x32_DAMP_step350000_trainloss_0.00040587130934000015.pth'
device = 'cuda:0'
# G = Generator(32,256,512,32).to(device)
# The model must live on `device` because later cells move their input
# tensors to `device` before calling G.  eval() selects inference behaviour
# for layers that distinguish train/eval (standard nn.Module semantics;
# assumes Generator is an nn.Module -- it is used with load_state_dict here).
G = Generator(24,256,512,32).to(device).eval()
g_checkpoint = torch.load(ckpt_loc, map_location=device)
# Kept as the last expression so the cell still echoes the key-matching result.
G.load_state_dict(g_checkpoint['state_dict'])
# G = torch.jit.load(ckpt_loc).to(device)


<All keys matched successfully>

HELPER FUNCTIONS

In [3]:
import os
import pickle
import torch
import torch.nn as nn
import numpy as np
from math import ceil
from model_vc_og import Generator
import soundfile as sf
import librosa
import numpy as np
from collections import OrderedDict
import pyworld
import scipy.signal

def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cut-off frequency, in the same unit as ``fs``.
        fs: Sampling rate of the signal that will be filtered.
        order: Order of the filter.

    Returns:
        The ``(b, a)`` numerator/denominator coefficient pair produced by
        ``scipy.signal.butter``.
    """
    from scipy.signal import butter

    # scipy takes the cut-off as a fraction of the Nyquist frequency.
    nyquist = 0.5 * fs
    return butter(order, cutoff / nyquist, btype='high', analog=False)

def pySTFT(x, fft_length=1024, hop_length=256):
    """Magnitude short-time Fourier transform with a periodic Hann window.

    The input is reflect-padded by ``fft_length // 2`` samples, cut into
    overlapping frames of ``fft_length`` samples taken every ``hop_length``
    samples along the last axis, windowed and transformed with a real FFT.

    Args:
        x: Signal array.
        fft_length: Frame length and FFT size.
        hop_length: Step between the starts of consecutive frames.

    Returns:
        Absolute values of the transposed FFT result; for a 1-D input the
        shape is ``(fft_length // 2 + 1, n_frames)``.
    """
    import numpy as np
    from scipy.signal import get_window

    padded = np.pad(x, int(fft_length // 2), mode='reflect')

    # Number of complete frames that fit in the padded signal.
    overlap = fft_length - hop_length
    n_frames = (padded.shape[-1] - overlap) // hop_length

    # Row i of the index table lists the samples belonging to frame i.
    frame_starts = hop_length * np.arange(n_frames)
    sample_idx = frame_starts[:, np.newaxis] + np.arange(fft_length)
    frames = padded[..., sample_idx]

    window = get_window('hann', fft_length, fftbins=True)
    spectrum = np.fft.rfft(window * frames, n=fft_length)

    return np.abs(spectrum.T)
    
def get_mel_spec(x):
    """Compute a normalised 80-band log-mel spectrogram of a waveform.

    The filter design and the mel filter bank are built for a 16 kHz
    sampling rate (30 Hz high-pass, mel bands between 90 Hz and 7600 Hz,
    1024-point FFT).
    NOTE(review): callers in this notebook pass audio loaded at 44.1 kHz;
    confirm that this matches how the speaker encoder was trained.

    Args:
        x: 1-D waveform array.

    Returns:
        Array of shape ``(n_frames, 80)`` with values clipped to ``[0, 1]``.
    """
    import numpy as np
    from scipy import signal
    from librosa.filters import mel
    from numpy.random import RandomState

    # Fixed seed so the dithering noise below is reproducible.
    prng = RandomState(42)
    # sr / n_fft are passed by keyword: positional use is deprecated in
    # librosa (it triggers the warning seen in this notebook's output) and
    # newer releases accept them as keywords only.
    mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
    # Amplitude floor equal to 10 ** (-100 / 20), i.e. -100 dB.
    min_level = np.exp(-100 / 20 * np.log(10))
    b, a = butter_highpass(30, 16000, order=5)
    y = signal.filtfilt(b, a, x)
    # Add a little random noise for model robustness
    wav = y * 0.96 + (prng.rand(y.shape[0])-0.5)*1e-06
    # Compute spect
    D = pySTFT(wav).T
    # Convert to mel and normalize
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    # Map the [-100, 0] dB range onto [0, 1].
    S = np.clip((D_db + 100) / 100, 0, 1)
    return S

def get_embedding(C,x):
    """Embed a random 128-frame crop of a spectrogram with encoder ``C``.

    Args:
        C: Callable speaker encoder.  When it exposes ``parameters()`` the
            crop is moved to the device of its first parameter; otherwise
            the crop is moved to CUDA as before.
        x: Array whose first axis is the frame axis; it needs at least
            128 frames.

    Returns:
        The encoder output for the crop, squeezed and converted to a numpy
        array.

    Raises:
        ValueError: If ``x`` has fewer than 128 frames.
    """
    import torch
    import numpy as np

    len_crop = 128
    n_frames = x.shape[0]
    if n_frames < len_crop:
        raise ValueError(
            'need at least %d frames to compute an embedding, got %d'
            % (len_crop, n_frames))
    # randint's upper bound is exclusive: the +1 keeps the last valid offset
    # reachable and lets an input of exactly len_crop frames through.
    left = np.random.randint(0, n_frames - len_crop + 1)

    try:
        device = next(C.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cuda')

    melsp = torch.from_numpy(x[np.newaxis, left:left+len_crop, : ]).to(device)
    # No gradients are needed for embedding extraction.
    with torch.no_grad():
        emb = C(melsp).detach().squeeze().cpu().numpy()
    return emb

class D_VECTOR(nn.Module):
    """LSTM speaker encoder that outputs L2-normalised d-vector embeddings."""

    def __init__(self, num_layers=3, dim_input=40, dim_cell=256, dim_emb=64):
        super().__init__()
        # Attribute names are part of the state-dict keys; keep them as is.
        self.lstm = nn.LSTM(
            input_size=dim_input,
            hidden_size=dim_cell,
            num_layers=num_layers,
            batch_first=True,
        )
        self.embedding = nn.Linear(dim_cell, dim_emb)

    def forward(self, x):
        """Return one unit-norm embedding per batch item.

        ``x`` is fed to a batch-first LSTM; the output at the last time
        step is projected by the linear layer and divided by its L2 norm.
        """
        self.lstm.flatten_parameters()
        hidden_seq, _ = self.lstm(x)
        last_step = hidden_seq[:, -1, :]
        projected = self.embedding(last_step)
        length = projected.norm(p=2, dim=-1, keepdim=True)
        return projected / length

def pad_seq(x, base=32):
    """Zero-pad a 2-D array along axis 0 up to the next multiple of ``base``.

    Args:
        x: 2-D array; rows are appended at the end of the first axis.
        base: The padded row count is the smallest multiple of this value
            that is not below the current row count.

    Returns:
        Tuple ``(padded, len_pad)`` where ``len_pad`` is the number of rows
        that were appended.
    """
    n_rows = x.shape[0]
    target_rows = int(base * ceil(float(n_rows) / base))
    len_pad = target_rows - n_rows
    assert len_pad >= 0
    padded = np.pad(x, ((0, len_pad), (0, 0)), mode='constant')
    return padded, len_pad
    
def get_pyworld(wav, fs):
    """Run the pyworld analysis chain on a waveform.

    Args:
        wav: Waveform handed to pyworld unchanged (callers in this notebook
            cast it to float64 first).
        fs: Sampling rate of ``wav``.

    Returns:
        Tuple ``(f0, sp, ap)``: the refined pitch contour, the spectrogram
        from ``cheaptrick`` and the aperiodicity from ``d4c``.
    """
    # Raw pitch extraction; also yields the frame time axis used below.
    raw_f0, frame_times = pyworld.dio(wav, fs)
    # Pitch refinement.
    refined_f0 = pyworld.stonemask(wav, raw_f0, frame_times, fs)
    # Spectrogram.
    spectrogram = pyworld.cheaptrick(wav, refined_f0, frame_times, fs)
    # Aperiodicity.
    aperiodicity = pyworld.d4c(wav, refined_f0, frame_times, fs)

    return refined_f0, spectrogram, aperiodicity

APPLYING MODEL

In [4]:
def run_model(source_sp, source_emb, target_emb, ckpt_loc, device = 'cuda:0',
              generator_dims=(32, 256, 512, 32)):
    """Convert coded spectral frames with a generator checkpoint.

    Args:
        source_sp: 2-D array of coded spectral frames, frames on axis 0.
        source_emb: 1-D embedding of the source speaker.
        target_emb: 1-D embedding of the target speaker.
        ckpt_loc: Path of a checkpoint holding the weights under the
            ``'state_dict'`` key.
        device: Torch device used for the model and its inputs.
        generator_dims: Positional constructor arguments of ``Generator``;
            they have to match the architecture stored in the checkpoint
            (e.g. ``(24, 256, 512, 32)`` for a 24x256x512x32 experiment).

    Returns:
        Numpy array with the second generator output for the single batch
        item, with the frames that were added for padding removed.
    """
    G = Generator(*generator_dims).to(device)
    g_checkpoint = torch.load(ckpt_loc, map_location=device)
    G.load_state_dict(g_checkpoint['state_dict'])
    # Inference only: switch off training-time behaviour of layers that
    # distinguish train/eval (standard nn.Module semantics; assumes
    # Generator is an nn.Module).
    G.eval()

    # The sequence is padded to a multiple of pad_seq's default base (32).
    spect, len_pad = pad_seq(source_sp)
    sp = torch.from_numpy(spect[np.newaxis, :, :]).float().to(device)
    emb_source = torch.from_numpy(source_emb[np.newaxis, :]).float().to(device)
    emb_target = torch.from_numpy(target_emb[np.newaxis, :]).float().to(device)
    with torch.no_grad():
        _, x_identic_psnt, _ = G(sp, emb_source, emb_target)

    # Trim the padded frames again; a stop of -0 would empty the slice, so
    # use None when nothing was padded.
    stop = -len_pad if len_pad else None
    uttr_trg = x_identic_psnt[0][:stop].cpu().numpy()

    return uttr_trg
    # sf.write('converted_file.wav', uttr_trg, 44100)

def _load_speaker_encoder(ckpt_file='3000000-BL.ckpt'):
    """Build the d-vector speaker encoder on the GPU and load its weights."""
    encoder = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()
    # Load to CPU first so the file opens regardless of the device it was
    # saved from; load_state_dict copies the values onto the encoder.
    checkpoint = torch.load(ckpt_file, map_location='cpu')
    # The first 7 characters of every key are dropped (presumably a
    # 'module.' prefix -- verify against the checkpoint).
    state = OrderedDict(
        (key[7:], val) for key, val in checkpoint['model_b'].items())
    encoder.load_state_dict(state)
    return encoder


def convert_voice(source_path, target_path, ckpt_path):
    """Convert the voice in ``source_path`` towards the one in ``target_path``.

    Both files are loaded at 44.1 kHz.  Speaker embeddings are computed from
    mel spectrograms of both files, the source is analysed with pyworld, its
    80-dimensional coded spectral envelope is passed through the generator
    and the result is re-synthesised with the source pitch and aperiodicity.

    Args:
        source_path: Audio file providing content, pitch and aperiodicity.
        target_path: Audio file providing the target speaker embedding.
        ckpt_path: Generator checkpoint forwarded to ``run_model``.

    Returns:
        The waveform returned by ``pyworld.synthesize`` at 44.1 kHz.
    """
    fs = 44100
    # LOAD AUDIO FILES (pyworld is fed float64 below)
    source, sr = librosa.load(source_path, sr=fs)
    target, sr = librosa.load(target_path, sr=fs)
    source = source.astype(np.float64)
    target = target.astype(np.float64)

    # EXTRACT EMBEDDINGS
    C = _load_speaker_encoder()

    # NOTE(review): get_mel_spec builds its filters for 16 kHz while the
    # audio here is sampled at 44.1 kHz -- confirm this is intended.
    S_source = get_mel_spec(source).astype(np.float32)
    S_target = get_mel_spec(target).astype(np.float32)

    source_emb = get_embedding(C, S_source)
    target_emb = get_embedding(C, S_target)

    # GET PYWORLD
    f0, sp, ap = get_pyworld(source, fs=fs)
    sp_coded = pyworld.code_spectral_envelope(sp, fs, 80)

    # CONVERT VOICE
    converted_sp_coded = run_model(sp_coded, source_emb, target_emb, ckpt_path)

    # Decode with the FFT size of the analysis (the envelope has
    # fft_size // 2 + 1 bins) so the result always lines up with `ap`;
    # this equals the previously hard-coded 2048 at 44.1 kHz.
    fft_size = (sp.shape[1] - 1) * 2
    converted_sp = pyworld.decode_spectral_envelope(
        converted_sp_coded.astype(np.double), fs, fft_size)

    # RE-SYNTHESIZE VOICE
    y = pyworld.synthesize(f0, converted_sp, ap, fs)

    return y



In [5]:
# Convert the f1 arpeggio recording towards the m8 singer with the
# 32x256x512x32 checkpoint; `out` holds the synthesised waveform.
out = convert_voice(
    source_path='../resources/data/f1_arpeggios_straight_a.wav',
    target_path='../resources/data/m8_arpeggios_straight_a.wav',
    ckpt_path='./processed_data/trained_models/checkpoint_experiment32x256x512x32_DAMP_step6000_trainloss_0.0030806278809905052.pth',
)

  mel_basis = mel(16000, 1024, fmin=90, fmax=7600, n_mels=80).T


In [None]:
import soundfile as sf

# Save the converted waveform at the 44.1 kHz rate it was synthesised at.
sf.write('f1_to_m8.wav', out, samplerate=44100)

In [30]:
# Re-save the m8 reference recording as mono 44.1 kHz for comparison.
m8_audio, _m8_rate = librosa.load('../resources/data/m8_arpeggios_straight_a.wav', sr=44100, mono=True)
sf.write('m8.wav', m8_audio, 44100)

# TESTING

In [None]:
# import soundfile as sf
# sf.write('225_to_225.wav',y,22050)

In [None]:
# sf.write('225.wav',source,22050)

## CODED SP TESTING

In [None]:
# sp_coded = pyworld.code_spectral_envelope(sp[0].cpu().numpy().astype(float), 22050, 80)

In [None]:
# sp[0].shape

In [None]:
# sp_coded.shape

In [None]:
# sp_decoded = pyworld.decode_spectral_envelope(sp_coded, 22050, 2048)

In [None]:
# sp_decoded.shape

In [12]:
# Alternative inputs kept for reference:
# source_path='../resources/data/f1_arpeggios_straight_a.wav'
# target_path = '../resources/data/m8_arpeggios_straight_a.wav'
# target_path = './input_data/p225/p225_001_mic1.flac'
source_path = './input_data/p225/p225_001_mic1.flac'
target_path = './input_data/p231/p231_001_mic1.flac'

# Load both utterances at 44.1 kHz and cast to float64 for pyworld.
source, sr = librosa.load(source_path, sr=44100)
target, sr = librosa.load(target_path, sr=44100)
source, target = source.astype(np.float64), target.astype(np.float64)

# SPEAKER ENCODER: the first 7 characters of every checkpoint key are
# dropped before the weights are loaded.
C = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()
c_checkpoint = torch.load('3000000-BL.ckpt')
new_state_dict = OrderedDict(
    (key[7:], val) for key, val in c_checkpoint['model_b'].items()
)
C.load_state_dict(new_state_dict)

# MEL SPECTROGRAMS (float32, the input of the speaker encoder)
S_source = get_mel_spec(source).astype(np.float32)
S_target = get_mel_spec(target).astype(np.float32)

# EMBEDDINGS
source_emb = get_embedding(C, S_source)
target_emb = get_embedding(C, S_target)

# PYWORLD ANALYSIS of the source and its 80-dimensional coded envelope
f0, sp, ap = get_pyworld(source, sr)
coded_sp = pyworld.code_spectral_envelope(sp, sr, 80)


  mel_basis = mel(16000, 1024, fmin=90, fmax=7600, n_mels=80).T


In [13]:
# Pad the coded envelope, add a batch axis to every input and move it to
# `device`, then run the generator without tracking gradients.
spect, len_pad = pad_seq(coded_sp)
sp = torch.from_numpy(spect[None]).float().to(device)
emb_source = torch.from_numpy(source_emb[None]).float().to(device)
emb_target = torch.from_numpy(target_emb[None]).float().to(device)
with torch.no_grad():
    _, x_identic_psnt, _ = G(sp, emb_source, emb_target)

# Cut the padded frames off again (None keeps everything when len_pad is 0).
trim_stop = None if len_pad == 0 else -len_pad
uttr_trg = x_identic_psnt[0][:trim_stop].cpu().numpy()

In [14]:
# Decode with the FFT size of the analysis so the envelope has as many bins
# as `ap` (fft_size // 2 + 1); the former constant 1024 belongs to 22.05 kHz
# audio, while this notebook analyses at sr = 44100.
fft_size = 2 * (ap.shape[1] - 1)
sp_mod = pyworld.decode_spectral_envelope(uttr_trg.astype(np.double), sr, fft_size)

In [15]:
y = pyworld.synthesize(f0, sp_mod, ap, sr)
import soundfile as sf
# Write with the rate the audio was synthesised at; a hard-coded 22050
# header on 44.1 kHz samples plays back at half speed.
sf.write('225_to_231_475kepoch.wav',y,sr)

In [None]:
# `target` was loaded at `sr`, so it is saved with that same rate.
sf.write('231.wav',target,sr)

In [None]:
import pandas as pd

# Training table with the pyworld features.  read_pickle unpickles the file,
# so only load files produced by this project.
train_pkl_path = './processed_data/pyworld/train.pkl'
df = pd.read_pickle(train_pkl_path)

In [None]:
# Code every spectral envelope in the 'sp' column down to 80 dimensions
# (44.1 kHz), keeping the row order.
enc = [
    pyworld.code_spectral_envelope(envelope, 44100, 80)
    for envelope in df['sp']
]

In [None]:
# Add the coded envelopes built above as column 'sp_coded' at column index 3.
df.insert(3, 'sp_coded', enc)

In [None]:
# Write the extended table back to the same path it was read from.
df.to_pickle('./processed_data/pyworld/train.pkl')

In [None]:
# Scratch value used by the two expression cells that follow.
i = 34399


In [None]:
# Chained comparison: True when i + 1 lies strictly between 10000 and 100000.
10000 < (i+1) < 100000

In [None]:
# Remainder of i + 1 divided by 10000 (0 only on exact multiples of 10000).
(i+1) % 10000