In [2]:
import math
from torch.autograd import Variable
from voice_impersonation_utils import *
from voice_impersonation_model import *

In [3]:
input_files = "voice_impersonation_input/"
content_file = input_files + "male_voice.wav"
style_file = input_files + "Eleanor_Roosevelt.wav"

In [4]:
audio_content, sampling_rate = wav2spectrum(content_file)
audio_style, sampling_rate = wav2spectrum(style_file)
audio_content_torch = torch.from_numpy(audio_content)[None, None, :, :]
audio_style_torch = torch.from_numpy(audio_style)[None, None, :, :]
voice_impersonation_model = RandomCNN()
voice_impersonation_model.eval()

RandomCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 1), stride=(1, 1))
  (LeakyReLU): LeakyReLU(negative_slope=0.2)
)

In [5]:
audio_content_variable = Variable(audio_content_torch, requires_grad=False).float()
audio_style_variable = Variable(audio_style_torch, requires_grad=False).float()
audio_content = voice_impersonation_model(audio_content_variable)
audio_style = voice_impersonation_model(audio_style_variable)

learning_rate = 0.003
audio_G_var = Variable(
    torch.randn(audio_content_torch.shape) * 1e-3, requires_grad=True
)
opt = torch.optim.Adam([audio_G_var])

style_param = 1
content_param = 5e2

num_epochs = 500
print_frequency = 50

In [6]:
for epoch in range(1, num_epochs + 1):
    opt.zero_grad()
    audio_G = voice_impersonation_model(audio_G_var)

    content_loss = content_param * compute_content_loss(audio_content, audio_G)
    style_loss = style_param * compute_layer_style_loss(audio_style, audio_G)
    loss = content_loss + style_loss
    loss.backward()
    opt.step()

    if epoch % print_frequency == 0:
        print("epoch: " + str(epoch))
        print("content loss: " + str(content_loss.item()))
        print("style loss: " + str(style_loss.item()))
        print("loss: " + str(loss.item()))

epoch: 50
content loss: 46.61996841430664
style loss: 546.4484252929688
loss: 593.0684204101562
epoch: 100
content loss: 42.155818939208984
style loss: 523.2269287109375
loss: 565.3827514648438
epoch: 150
content loss: 38.336341857910156
style loss: 488.7856750488281
loss: 527.1220092773438
epoch: 200
content loss: 35.36016082763672
style loss: 449.9360046386719
loss: 485.2961730957031
epoch: 250
content loss: 32.985469818115234
style loss: 411.0228271484375
loss: 444.00830078125
epoch: 300
content loss: 30.958057403564453
style loss: 373.3154296875
loss: 404.27349853515625
epoch: 350
content loss: 29.16558837890625
style loss: 336.8212890625
loss: 365.98687744140625
epoch: 400
content loss: 27.59528160095215
style loss: 301.69134521484375
loss: 329.28662109375
epoch: 450
content loss: 26.24138069152832
style loss: 268.2904052734375
loss: 294.53179931640625
epoch: 500
content loss: 25.084564208984375
style loss: 236.98492431640625
loss: 262.0694885253906


In [7]:
gen_spectrum = audio_G_var.cpu().data.numpy().squeeze()
output_audio_name = "Eleanor_saying_there_was_a_change_now.wav"
spectrum2wav(gen_spectrum, sampling_rate, output_audio_name)